diff --git a/parquet/src/arrow/array_reader/builder.rs b/parquet/src/arrow/array_reader/builder.rs index 945f62526a7e..5ada61e93d62 100644 --- a/parquet/src/arrow/array_reader/builder.rs +++ b/parquet/src/arrow/array_reader/builder.rs @@ -289,9 +289,12 @@ fn build_primitive_reader( } _ => make_byte_array_reader(page_iterator, column_desc, arrow_type)?, }, - PhysicalType::FIXED_LEN_BYTE_ARRAY => { - make_fixed_len_byte_array_reader(page_iterator, column_desc, arrow_type)? - } + PhysicalType::FIXED_LEN_BYTE_ARRAY => match arrow_type { + Some(DataType::Dictionary(_, _)) => { + make_byte_array_dictionary_reader(page_iterator, column_desc, arrow_type)? + } + _ => make_fixed_len_byte_array_reader(page_iterator, column_desc, arrow_type)?, + }, }; Ok(Some(reader)) } diff --git a/parquet/src/arrow/array_reader/byte_array_dictionary.rs b/parquet/src/arrow/array_reader/byte_array_dictionary.rs index 440db641a242..757d3df8a82b 100644 --- a/parquet/src/arrow/array_reader/byte_array_dictionary.rs +++ b/parquet/src/arrow/array_reader/byte_array_dictionary.rs @@ -90,21 +90,21 @@ pub fn make_byte_array_dictionary_reader( ArrowType::Dictionary(key_type, value_type) => { make_reader! { (pages, column_desc, data_type) => match (key_type.as_ref(), value_type.as_ref()) { - (ArrowType::UInt8, ArrowType::Binary | ArrowType::Utf8) => (u8, i32), + (ArrowType::UInt8, ArrowType::Binary | ArrowType::Utf8 | ArrowType::FixedSizeBinary(_)) => (u8, i32), (ArrowType::UInt8, ArrowType::LargeBinary | ArrowType::LargeUtf8) => (u8, i64), - (ArrowType::Int8, ArrowType::Binary | ArrowType::Utf8) => (i8, i32), + (ArrowType::Int8, ArrowType::Binary | ArrowType::Utf8 | ArrowType::FixedSizeBinary(_)) => (i8, i32), (ArrowType::Int8, ArrowType::LargeBinary | ArrowType::LargeUtf8) => (i8, i64), - (ArrowType::UInt16, ArrowType::Binary | ArrowType::Utf8) => (u16, i32), + (ArrowType::UInt16, ArrowType::Binary | ArrowType::Utf8 | ArrowType::FixedSizeBinary(_)) => (u16, i32), (ArrowType::UInt16, ArrowType::LargeBinary | ArrowType::LargeUtf8) => (u16, i64), - (ArrowType::Int16, ArrowType::Binary | ArrowType::Utf8) => (i16, i32), + (ArrowType::Int16, ArrowType::Binary | ArrowType::Utf8 | ArrowType::FixedSizeBinary(_)) => (i16, i32), (ArrowType::Int16, ArrowType::LargeBinary | ArrowType::LargeUtf8) => (i16, i64), - (ArrowType::UInt32, ArrowType::Binary | ArrowType::Utf8) => (u32, i32), + (ArrowType::UInt32, ArrowType::Binary | ArrowType::Utf8 | ArrowType::FixedSizeBinary(_)) => (u32, i32), (ArrowType::UInt32, ArrowType::LargeBinary | ArrowType::LargeUtf8) => (u32, i64), - (ArrowType::Int32, ArrowType::Binary | ArrowType::Utf8) => (i32, i32), + (ArrowType::Int32, ArrowType::Binary | ArrowType::Utf8 | ArrowType::FixedSizeBinary(_)) => (i32, i32), (ArrowType::Int32, ArrowType::LargeBinary | ArrowType::LargeUtf8) => (i32, i64), - (ArrowType::UInt64, ArrowType::Binary | ArrowType::Utf8) => (u64, i32), + (ArrowType::UInt64, ArrowType::Binary | ArrowType::Utf8 | ArrowType::FixedSizeBinary(_)) => (u64, i32), (ArrowType::UInt64, ArrowType::LargeBinary | ArrowType::LargeUtf8) => (u64, i64), - (ArrowType::Int64, ArrowType::Binary | ArrowType::Utf8) => (i64, i32), + (ArrowType::Int64, ArrowType::Binary | ArrowType::Utf8 | ArrowType::FixedSizeBinary(_)) => (i64, i32), (ArrowType::Int64, ArrowType::LargeBinary | ArrowType::LargeUtf8) => (i64, i64), } } diff --git a/parquet/src/arrow/arrow_writer/byte_array.rs b/parquet/src/arrow/arrow_writer/byte_array.rs index 2d23ad8510f9..9767ec98e636 100644 --- a/parquet/src/arrow/arrow_writer/byte_array.rs +++ b/parquet/src/arrow/arrow_writer/byte_array.rs @@ -27,8 +27,8 @@ use crate::schema::types::ColumnDescPtr; use crate::util::bit_util::num_required_bits; use crate::util::interner::{Interner, Storage}; use arrow_array::{ - Array, ArrayAccessor, BinaryArray, BinaryViewArray, DictionaryArray, LargeBinaryArray, - LargeStringArray, StringArray, StringViewArray, + Array, ArrayAccessor, BinaryArray, BinaryViewArray, DictionaryArray, FixedSizeBinaryArray, + LargeBinaryArray, LargeStringArray, StringArray, StringViewArray, }; use arrow_schema::DataType; @@ -85,6 +85,9 @@ macro_rules! downcast_op { DataType::LargeBinary => { downcast_dict_op!(key, LargeBinaryArray, $array, $op$(, $arg)*) } + DataType::FixedSizeBinary(_) => { + downcast_dict_op!(key, FixedSizeBinaryArray, $array, $op$(, $arg)*) + } d => unreachable!("cannot downcast {} dictionary value to byte array", d), }, d => unreachable!("cannot downcast {} to byte array", d), diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 1e1054c9a063..66e1b06fa799 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -989,6 +989,9 @@ impl ArrowColumnWriterFactory { ArrowDataType::Utf8View | ArrowDataType::BinaryView => { out.push(bytes(leaves.next().unwrap())?) } + ArrowDataType::FixedSizeBinary(_) => { + out.push(bytes(leaves.next().unwrap())?) + } _ => { out.push(col(leaves.next().unwrap())?) } @@ -1333,6 +1336,7 @@ mod tests { use arrow_buffer::{i256, IntervalDayTime, IntervalMonthDayNano, NullBuffer}; use arrow_schema::Fields; use half::f16; + use num::{FromPrimitive, ToPrimitive}; use crate::basic::Encoding; use crate::data_type::AsBytes; @@ -1911,6 +1915,50 @@ mod tests { roundtrip(batch, Some(SMALL_SIZE / 2)); } + #[test] + fn test_fixed_size_binary_in_dict() { + fn test_fixed_size_binary_in_dict_inner() + where + K: ArrowDictionaryKeyType, + K::Native: FromPrimitive + ToPrimitive + TryFrom, + <::Native as TryFrom>::Error: std::fmt::Debug, + { + let field = Field::new( + "a", + DataType::Dictionary( + Box::new(K::DATA_TYPE), + Box::new(DataType::FixedSizeBinary(4)), + ), + false, + ); + let schema = Schema::new(vec![field]); + + let keys: Vec = vec![ + K::Native::try_from(0u8).unwrap(), + K::Native::try_from(0u8).unwrap(), + K::Native::try_from(1u8).unwrap(), + ]; + let keys = PrimitiveArray::::from_iter_values(keys); + let values = FixedSizeBinaryArray::try_from_iter( + vec![vec![0, 0, 0, 0], vec![1, 1, 1, 1]].into_iter(), + ) + .unwrap(); + + let data = DictionaryArray::::new(keys, Arc::new(values)); + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(data)]).unwrap(); + roundtrip(batch, None); + } + + test_fixed_size_binary_in_dict_inner::(); + test_fixed_size_binary_in_dict_inner::(); + test_fixed_size_binary_in_dict_inner::(); + test_fixed_size_binary_in_dict_inner::(); + test_fixed_size_binary_in_dict_inner::(); + test_fixed_size_binary_in_dict_inner::(); + test_fixed_size_binary_in_dict_inner::(); + test_fixed_size_binary_in_dict_inner::(); + } + #[test] fn test_empty_dict() { let struct_fields = Fields::from(vec![Field::new( diff --git a/parquet/src/arrow/buffer/dictionary_buffer.rs b/parquet/src/arrow/buffer/dictionary_buffer.rs index 59f1cfa056a1..386177639356 100644 --- a/parquet/src/arrow/buffer/dictionary_buffer.rs +++ b/parquet/src/arrow/buffer/dictionary_buffer.rs @@ -154,6 +154,15 @@ impl DictionaryBuffer { } } + let ArrowType::Dictionary(_, value_type) = data_type else { + unreachable!() + }; + let values = if let ArrowType::FixedSizeBinary(size) = **value_type { + arrow_cast::cast(&values, &ArrowType::FixedSizeBinary(size)).unwrap() + } else { + values + }; + let builder = ArrayDataBuilder::new(data_type.clone()) .len(keys.len()) .add_buffer(Buffer::from_vec(keys))