33 changes: 23 additions & 10 deletions parquet/src/arrow/arrow_reader/mod.rs
@@ -286,7 +286,10 @@ impl<T> ArrowReaderBuilder<T> {
pub struct ArrowReaderOptions {
/// Should the reader strip any user defined metadata from the Arrow schema
skip_arrow_metadata: bool,
/// If provided used as the schema for the file, otherwise the schema is read from the file
/// If provided, used as the schema hint when determining the Arrow schema,
/// otherwise the schema hint is read from the [ARROW_SCHEMA_META_KEY]
///
/// [ARROW_SCHEMA_META_KEY]: crate::arrow::ARROW_SCHEMA_META_KEY
supplied_schema: Option<SchemaRef>,
/// If true, attempt to read `OffsetIndex` and `ColumnIndex`
pub(crate) page_index: bool,
@@ -314,17 +317,27 @@ impl ArrowReaderOptions {
}
}

/// Provide a schema to use when reading the parquet file. If provided it
/// takes precedence over the schema inferred from the file or the schema defined
/// in the file's metadata. If the schema is not compatible with the file's
/// schema an error will be returned when constructing the builder.
/// Provide a schema hint to use when reading the Parquet file.
///
/// If provided, this schema takes precedence over any Arrow schema embedded
/// in the metadata (see the [`arrow`] documentation for more details).
///
/// If the provided schema is not compatible with the schema of the data
/// stored in the Parquet file, an error will be returned when constructing
/// the builder.
///
/// This option is only required if you want to cast columns to a different type.
/// For example, if you wanted to cast from an Int64 in the Parquet file to a Timestamp
/// in the Arrow schema.
/// This option is only required if you want to explicitly control the
/// conversion of Parquet types to Arrow types, such as casting a column to
/// a different type. For example, you can use it to read an Int64 column in
/// a Parquet file as a [`TimestampMicrosecondArray`].
///
/// [`arrow`]: crate::arrow
/// [`TimestampMicrosecondArray`]: arrow_array::TimestampMicrosecondArray
///
/// # Notes
///
/// The supplied schema must have the same number of columns as the parquet schema and
/// the column names need to be the same.
/// The provided schema must have the same number of columns as the parquet schema and
/// the column names must be the same.
///
/// # Example
/// ```
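A minimal sketch of the hint in action, reading an Int64 column as microsecond timestamps. The file name `data.parquet`, the column name `t`, and the non-nullable Int64 column holding microseconds since the epoch are all assumptions for illustration:

```rust
use std::fs::File;
use std::sync::Arc;
use arrow_schema::{DataType, Field, Schema, TimeUnit};
use parquet::arrow::arrow_reader::{ArrowReaderOptions, ParquetRecordBatchReaderBuilder};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Assumed: data.parquet contains a single required Int64 column "t"
    let file = File::open("data.parquet")?;
    // Hint that "t" should be read as microsecond timestamps
    // instead of the default Int64
    let schema = Arc::new(Schema::new(vec![Field::new(
        "t",
        DataType::Timestamp(TimeUnit::Microsecond, None),
        false,
    )]));
    let options = ArrowReaderOptions::new().with_schema(schema);
    let reader =
        ParquetRecordBatchReaderBuilder::try_new_with_options(file, options)?.build()?;
    for batch in reader {
        println!("read {} rows", batch?.num_rows());
    }
    Ok(())
}
```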
56 changes: 46 additions & 10 deletions parquet/src/arrow/mod.rs
@@ -15,13 +15,48 @@
// specific language governing permissions and limitations
// under the License.

//! API for reading/writing
//! Arrow [RecordBatch](arrow_array::RecordBatch)es and
//! [Array](arrow_array::Array)s to/from Parquet Files.
//! API for reading/writing Arrow [`RecordBatch`]es and [`Array`]s to/from
//! Parquet files.
//!
//! See the [crate-level documentation](crate) for more details.
//! See the [crate-level documentation](crate) for more details on other APIs.
//!
//! # Example of writing Arrow record batch to Parquet file
//! # Schema Conversion
//!
//! These APIs ensure that data in Arrow [`RecordBatch`]es written to Parquet is
//! read back as [`RecordBatch`]es with the exact same types and values.
//!
//! Parquet and Arrow have different type systems, and there is not
//! always a one-to-one mapping between them. For example, data
//! stored as a Parquet [`BYTE_ARRAY`] can be read as either an Arrow
//! [`BinaryViewArray`] or [`BinaryArray`].
//!
//! To recover the original Arrow types, the writers in this module add a "hint" to
//! the metadata in the [`ARROW_SCHEMA_META_KEY`] key which records the original Arrow
//! schema. The metadata hint follows the same convention as arrow-cpp-based
//! implementations such as `pyarrow`. The reader looks for the schema hint in the
//! metadata to determine Arrow types, and if it is not present, infers the Arrow schema
//! from the Parquet schema.
//!
//! In situations where the embedded Arrow schema is not compatible with the Parquet
//! schema, the Parquet schema takes precedence and no error is raised.
//! See [#1663](https://github.com/apache/arrow-rs/issues/1663).
//!
//! You can also control the type conversion process in more detail using:
//!
//! * [`ArrowSchemaConverter`] to control the conversion of Arrow types to
//! Parquet types.
//!
//! * [`ArrowReaderOptions::with_schema`] to explicitly specify your own Arrow schema hint
//! to use when reading Parquet, overriding any metadata that may be present.
//!
//! [`RecordBatch`]: arrow_array::RecordBatch
//! [`Array`]: arrow_array::Array
//! [`BYTE_ARRAY`]: crate::basic::Type::BYTE_ARRAY
//! [`BinaryViewArray`]: arrow_array::BinaryViewArray
//! [`BinaryArray`]: arrow_array::BinaryArray
//! [`ArrowReaderOptions::with_schema`]: arrow_reader::ArrowReaderOptions::with_schema
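To make the hint mechanism concrete, a minimal sketch that checks whether a file carries the embedded schema hint and prints the Arrow schema the reader resolves. The file name `data.parquet` is an assumption, and the file is assumed to have been written by `ArrowWriter`:

```rust
use std::fs::File;
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
use parquet::arrow::ARROW_SCHEMA_META_KEY;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let file = File::open("data.parquet")?; // assumed to exist
    let builder = ParquetRecordBatchReaderBuilder::try_new(file)?;
    // Writers store the serialized Arrow schema under the ARROW:schema key
    let has_hint = builder
        .metadata()
        .file_metadata()
        .key_value_metadata()
        .map(|kvs| kvs.iter().any(|kv| kv.key == ARROW_SCHEMA_META_KEY))
        .unwrap_or(false);
    println!("schema hint present: {has_hint}");
    // The resolved schema reflects the hint when present, otherwise it is
    // inferred from the Parquet schema
    println!("resolved Arrow schema: {:?}", builder.schema());
    Ok(())
}
```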
//!
//! # Example: Writing Arrow `RecordBatch` to Parquet file
//!
//!```rust
//! # use arrow_array::{Int32Array, ArrayRef};
@@ -53,7 +88,7 @@
//! writer.close().unwrap();
//! ```
//!
//! # Example of reading parquet file into arrow record batch
//! # Example: Reading Parquet file into Arrow `RecordBatch`
//!
//! ```rust
//! # use std::fs::File;
@@ -93,11 +128,10 @@
//! println!("Read {} records.", record_batch.num_rows());
//! ```
//!
//! # Example of reading non-uniformly encrypted parquet file into arrow record batch
//! # Example: Reading non-uniformly encrypted Parquet file into Arrow `RecordBatch`
//!
//! Note: This requires the experimental `encryption` feature to be enabled at compile time.
//!
//!
#![cfg_attr(feature = "encryption", doc = "```rust")]
#![cfg_attr(not(feature = "encryption"), doc = "```ignore")]
//! # use arrow_array::{Int32Array, ArrayRef};
@@ -168,7 +202,6 @@ pub use self::async_reader::ParquetRecordBatchStreamBuilder;
pub use self::async_writer::AsyncArrowWriter;
use crate::schema::types::{SchemaDescriptor, Type};
use arrow_schema::{FieldRef, Schema};

// continue to export deprecated methods until they are removed
#[allow(deprecated)]
pub use self::schema::arrow_to_parquet_schema;
@@ -178,7 +211,10 @@ pub use self::schema::{
parquet_to_arrow_schema, parquet_to_arrow_schema_by_columns, ArrowSchemaConverter, FieldLevels,
};

/// Schema metadata key used to store serialized Arrow IPC schema
/// Schema metadata key used to store serialized Arrow schema
///
/// The Arrow schema is encoded using the Arrow IPC format, and then base64
/// encoded. This is the same format used by arrow-cpp systems, such as pyarrow.
pub const ARROW_SCHEMA_META_KEY: &str = "ARROW:schema";
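As a sketch of that encoding, the stored value can be decoded by hand, assuming the `base64` crate and the `arrow-ipc` crate's `try_schema_from_ipc_buffer` helper (which understands the length-prefixed IPC message):

```rust
use arrow_ipc::convert::try_schema_from_ipc_buffer;
use arrow_schema::{ArrowError, Schema};
use base64::prelude::*;

/// Decode the string stored under the "ARROW:schema" metadata key
fn decode_schema_hint(value: &str) -> Result<Schema, ArrowError> {
    // The value is a base64-encoded, length-prefixed Arrow IPC schema message
    let bytes = BASE64_STANDARD
        .decode(value)
        .map_err(|e| ArrowError::ParseError(e.to_string()))?;
    try_schema_from_ipc_buffer(&bytes)
}
```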

/// The value of this metadata key, if present on [`Field::metadata`], will be used
19 changes: 17 additions & 2 deletions parquet/src/arrow/schema/mod.rs
@@ -105,6 +105,12 @@ pub struct FieldLevels {
///
/// Columns not included within [`ProjectionMask`] will be ignored.
///
/// The optional `hint` parameter is the desired Arrow schema. See the
/// [`arrow`] module documentation for more information.
///
/// [`arrow`]: crate::arrow
///
/// # Notes
///
/// Where a field type in `hint` is compatible with the corresponding parquet type in `schema`,
/// it will be used; otherwise, the default arrow type for the given parquet column type will
/// be used.
///
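A minimal sketch of the hint parameter in practice; the one-column message type is made up for illustration:

```rust
use std::sync::Arc;
use arrow_schema::{DataType, Field, Fields, TimeUnit};
use parquet::arrow::{parquet_to_arrow_field_levels, ProjectionMask};
use parquet::schema::{parser::parse_message_type, types::SchemaDescriptor};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let descr = SchemaDescriptor::new(Arc::new(parse_message_type(
        "message schema { required int64 t; }",
    )?));
    // Without a hint, "t" maps to the default Arrow type (Int64)
    let _default = parquet_to_arrow_field_levels(&descr, ProjectionMask::all(), None)?;
    // With a hint, the compatible Timestamp type is used instead
    let hint = Fields::from(vec![Field::new(
        "t",
        DataType::Timestamp(TimeUnit::Microsecond, None),
        false,
    )]);
    let _hinted = parquet_to_arrow_field_levels(&descr, ProjectionMask::all(), Some(&hint))?;
    Ok(())
}
```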
@@ -192,8 +198,12 @@ pub fn encode_arrow_schema(schema: &Schema) -> String {
BASE64_STANDARD.encode(&len_prefix_schema)
}

/// Mutates writer metadata by storing the encoded Arrow schema.
/// Mutates writer metadata by storing the encoded Arrow schema hint in
/// [`ARROW_SCHEMA_META_KEY`].
///
/// If Arrow schema metadata already exists under this key, it is replaced.
///
/// [`ARROW_SCHEMA_META_KEY`]: crate::arrow::ARROW_SCHEMA_META_KEY
pub fn add_encoded_arrow_schema_to_metadata(schema: &Schema, props: &mut WriterProperties) {
let encoded = encode_arrow_schema(schema);

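For context, `ArrowWriter` applies this function internally when it is constructed (unless configured to skip Arrow metadata), so user-supplied metadata under other keys is preserved while the `ARROW:schema` entry is managed by the writer. A sketch, where the `origin` key and the `example.parquet` path are made up:

```rust
use std::fs::File;
use std::sync::Arc;
use arrow_array::{ArrayRef, Int64Array, RecordBatch};
use parquet::arrow::ArrowWriter;
use parquet::file::properties::WriterProperties;
use parquet::format::KeyValue;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let batch = RecordBatch::try_from_iter([(
        "id",
        Arc::new(Int64Array::from(vec![1, 2, 3])) as ArrayRef,
    )])?;
    // Custom metadata survives; the writer adds the ARROW:schema hint itself
    let props = WriterProperties::builder()
        .set_key_value_metadata(Some(vec![KeyValue::new(
            "origin".to_string(),
            "example".to_string(),
        )]))
        .build();
    let file = File::create("example.parquet")?;
    let mut writer = ArrowWriter::try_new(file, batch.schema(), Some(props))?;
    writer.write(&batch)?;
    writer.close()?;
    Ok(())
}
```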
@@ -224,7 +234,12 @@ pub fn add_encoded_arrow_schema_to_metadata(schema: &Schema, props: &mut WriterProperties) {

/// Converter for Arrow schema to Parquet schema
///
/// Example:
/// See the [`arrow`] module documentation for background information on how
/// the Arrow schema is represented in Parquet.
///
/// [`arrow`]: crate::arrow
///
/// # Example
/// ```
/// # use std::sync::Arc;
/// # use arrow_schema::{Field, Schema, DataType};
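A minimal sketch of the conversion; the schema contents are made up for illustration:

```rust
use arrow_schema::{DataType, Field, Schema};
use parquet::arrow::ArrowSchemaConverter;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let schema = Schema::new(vec![
        Field::new("id", DataType::Int64, false),
        Field::new("name", DataType::Utf8, true),
    ]);
    // Convert the Arrow schema into a Parquet SchemaDescriptor
    let parquet_schema = ArrowSchemaConverter::new().convert(&schema)?;
    println!("{} leaf columns", parquet_schema.num_columns());
    Ok(())
}
```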