From 058c669a23276b59446144736bf0f426217ef880 Mon Sep 17 00:00:00 2001 From: albertlockett Date: Sun, 27 Apr 2025 14:30:03 -0400 Subject: [PATCH 1/9] support FixedSizedBinary in dict encoding --- parquet/src/arrow/array_reader/builder.rs | 9 ++++++-- .../array_reader/fixed_len_byte_array.rs | 5 ++++ parquet/src/arrow/arrow_writer/byte_array.rs | 5 +++- parquet/src/arrow/arrow_writer/mod.rs | 23 +++++++++++++++++++ 4 files changed, 39 insertions(+), 3 deletions(-) diff --git a/parquet/src/arrow/array_reader/builder.rs b/parquet/src/arrow/array_reader/builder.rs index 945f62526a7e..347bf2955349 100644 --- a/parquet/src/arrow/array_reader/builder.rs +++ b/parquet/src/arrow/array_reader/builder.rs @@ -289,8 +289,13 @@ fn build_primitive_reader( } _ => make_byte_array_reader(page_iterator, column_desc, arrow_type)?, }, - PhysicalType::FIXED_LEN_BYTE_ARRAY => { - make_fixed_len_byte_array_reader(page_iterator, column_desc, arrow_type)? + PhysicalType::FIXED_LEN_BYTE_ARRAY => match arrow_type { + Some(DataType::Dictionary(_, _)) => { + make_byte_array_reader(page_iterator, column_desc, arrow_type)? + } + _ => { + make_fixed_len_byte_array_reader(page_iterator, column_desc, arrow_type)? + } } }; Ok(Some(reader)) diff --git a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs index 6b437be943d4..c92c59ca7e8b 100644 --- a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs +++ b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs @@ -64,6 +64,9 @@ pub fn make_fixed_len_byte_array_reader( }; match &data_type { ArrowType::FixedSizeBinary(_) => {} + ArrowType::Dictionary(_,_) => { + // todo + } ArrowType::Decimal128(_, _) => { if byte_length > 16 { return Err(general_err!( @@ -156,6 +159,8 @@ impl ArrayReader for FixedLenByteArrayReader { fn consume_batch(&mut self) -> Result { let record_data = self.record_reader.consume_record_data(); + // println("{:?}") + let array_data = ArrayDataBuilder::new(ArrowType::FixedSizeBinary(self.byte_length as i32)) .len(self.record_reader.num_values()) .add_buffer(Buffer::from_vec(record_data.buffer)) diff --git a/parquet/src/arrow/arrow_writer/byte_array.rs b/parquet/src/arrow/arrow_writer/byte_array.rs index 2d23ad8510f9..aa8bef4f71d4 100644 --- a/parquet/src/arrow/arrow_writer/byte_array.rs +++ b/parquet/src/arrow/arrow_writer/byte_array.rs @@ -28,7 +28,7 @@ use crate::util::bit_util::num_required_bits; use crate::util::interner::{Interner, Storage}; use arrow_array::{ Array, ArrayAccessor, BinaryArray, BinaryViewArray, DictionaryArray, LargeBinaryArray, - LargeStringArray, StringArray, StringViewArray, + LargeStringArray, StringArray, StringViewArray, FixedSizeBinaryArray }; use arrow_schema::DataType; @@ -85,6 +85,9 @@ macro_rules! downcast_op { DataType::LargeBinary => { downcast_dict_op!(key, LargeBinaryArray, $array, $op$(, $arg)*) } + DataType::FixedSizeBinary(_) => { + downcast_dict_op!(key, FixedSizeBinaryArray, $array, $op$(, $arg)*) + } d => unreachable!("cannot downcast {} dictionary value to byte array", d), }, d => unreachable!("cannot downcast {} to byte array", d), diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 1e1054c9a063..77ce5d275c93 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -989,6 +989,9 @@ impl ArrowColumnWriterFactory { ArrowDataType::Utf8View | ArrowDataType::BinaryView => { out.push(bytes(leaves.next().unwrap())?) } + ArrowDataType::FixedSizeBinary(_) => { + out.push(bytes(leaves.next().unwrap())?) + } _ => { out.push(col(leaves.next().unwrap())?) } @@ -1911,6 +1914,26 @@ mod tests { roundtrip(batch, Some(SMALL_SIZE / 2)); } + #[test] + fn test_fixed_size_binary_in_dict() { + let field = Field::new( + "a", + DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::FixedSizeBinary(4))), + false, + ); + + let schema = Schema::new(vec![field]); + + let keys = UInt8Array::from_iter_values([0, 0, 1]); + let values = FixedSizeBinaryArray::try_from_iter(vec![ + vec![0, 0, 0, 0], + vec![1, 1, 1, 1], + ].into_iter()).unwrap(); + let data = UInt8DictionaryArray::new(keys, Arc::new(values)); + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(data)]).unwrap(); + roundtrip(batch, None); + } + #[test] fn test_empty_dict() { let struct_fields = Fields::from(vec![Field::new( From ab3d7d39cb1c3e9d40af75a9afd085f6f68656f2 Mon Sep 17 00:00:00 2001 From: albertlockett Date: Sun, 27 Apr 2025 14:48:57 -0400 Subject: [PATCH 2/9] roundtrip works --- parquet/src/arrow/array_reader/builder.rs | 2 +- parquet/src/arrow/array_reader/byte_array.rs | 3 ++- .../array_reader/byte_array_dictionary.rs | 2 +- parquet/src/arrow/buffer/dictionary_buffer.rs | 21 +++++++++++++++++++ 4 files changed, 25 insertions(+), 3 deletions(-) diff --git a/parquet/src/arrow/array_reader/builder.rs b/parquet/src/arrow/array_reader/builder.rs index 347bf2955349..a31a530056fe 100644 --- a/parquet/src/arrow/array_reader/builder.rs +++ b/parquet/src/arrow/array_reader/builder.rs @@ -291,7 +291,7 @@ fn build_primitive_reader( }, PhysicalType::FIXED_LEN_BYTE_ARRAY => match arrow_type { Some(DataType::Dictionary(_, _)) => { - make_byte_array_reader(page_iterator, column_desc, arrow_type)? + make_byte_array_dictionary_reader(page_iterator, column_desc, arrow_type)? } _ => { make_fixed_len_byte_array_reader(page_iterator, column_desc, arrow_type)? diff --git a/parquet/src/arrow/array_reader/byte_array.rs b/parquet/src/arrow/array_reader/byte_array.rs index 92583155605b..1240e6feb1fe 100644 --- a/parquet/src/arrow/array_reader/byte_array.rs +++ b/parquet/src/arrow/array_reader/byte_array.rs @@ -55,7 +55,8 @@ pub fn make_byte_array_reader( ArrowType::Binary | ArrowType::Utf8 | ArrowType::Decimal128(_, _) - | ArrowType::Decimal256(_, _) => { + | ArrowType::Decimal256(_, _) + | ArrowType::Dictionary(_, _) => { let reader = GenericRecordReader::new(column_desc); Ok(Box::new(ByteArrayReader::::new( pages, data_type, reader, diff --git a/parquet/src/arrow/array_reader/byte_array_dictionary.rs b/parquet/src/arrow/array_reader/byte_array_dictionary.rs index 440db641a242..99bde81759c6 100644 --- a/parquet/src/arrow/array_reader/byte_array_dictionary.rs +++ b/parquet/src/arrow/array_reader/byte_array_dictionary.rs @@ -90,7 +90,7 @@ pub fn make_byte_array_dictionary_reader( ArrowType::Dictionary(key_type, value_type) => { make_reader! { (pages, column_desc, data_type) => match (key_type.as_ref(), value_type.as_ref()) { - (ArrowType::UInt8, ArrowType::Binary | ArrowType::Utf8) => (u8, i32), + (ArrowType::UInt8, ArrowType::Binary | ArrowType::Utf8 | ArrowType::FixedSizeBinary(_)) => (u8, i32), (ArrowType::UInt8, ArrowType::LargeBinary | ArrowType::LargeUtf8) => (u8, i64), (ArrowType::Int8, ArrowType::Binary | ArrowType::Utf8) => (i8, i32), (ArrowType::Int8, ArrowType::LargeBinary | ArrowType::LargeUtf8) => (i8, i64), diff --git a/parquet/src/arrow/buffer/dictionary_buffer.rs b/parquet/src/arrow/buffer/dictionary_buffer.rs index 59f1cfa056a1..81daaf3a95fd 100644 --- a/parquet/src/arrow/buffer/dictionary_buffer.rs +++ b/parquet/src/arrow/buffer/dictionary_buffer.rs @@ -133,6 +133,8 @@ impl DictionaryBuffer { match self { Self::Dict { keys, values } => { + println!("keys = {:?}", keys); + println!("values = {:?}", values); // Validate keys unless dictionary is empty if !values.is_empty() { let min = K::from_usize(0).unwrap(); @@ -154,6 +156,25 @@ impl DictionaryBuffer { } } + let values = match data_type { + ArrowType::Dictionary(_, value_type) => { + match **value_type{ + ArrowType::FixedSizeBinary(size) => { + arrow_cast::cast( + &values, + &ArrowType::FixedSizeBinary(size), + ).unwrap() + }, + _ => { + values + } + } + } + _ => { + values + } + }; + let builder = ArrayDataBuilder::new(data_type.clone()) .len(keys.len()) .add_buffer(Buffer::from_vec(keys)) From 9e09d0e613995c0a6fea36c588090f7bacdfb965 Mon Sep 17 00:00:00 2001 From: albertlockett Date: Sun, 27 Apr 2025 14:56:08 -0400 Subject: [PATCH 3/9] cleanup --- .../array_reader/fixed_len_byte_array.rs | 5 ---- parquet/src/arrow/buffer/dictionary_buffer.rs | 24 ++++--------------- 2 files changed, 5 insertions(+), 24 deletions(-) diff --git a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs index c92c59ca7e8b..6b437be943d4 100644 --- a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs +++ b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs @@ -64,9 +64,6 @@ pub fn make_fixed_len_byte_array_reader( }; match &data_type { ArrowType::FixedSizeBinary(_) => {} - ArrowType::Dictionary(_,_) => { - // todo - } ArrowType::Decimal128(_, _) => { if byte_length > 16 { return Err(general_err!( @@ -159,8 +156,6 @@ impl ArrayReader for FixedLenByteArrayReader { fn consume_batch(&mut self) -> Result { let record_data = self.record_reader.consume_record_data(); - // println("{:?}") - let array_data = ArrayDataBuilder::new(ArrowType::FixedSizeBinary(self.byte_length as i32)) .len(self.record_reader.num_values()) .add_buffer(Buffer::from_vec(record_data.buffer)) diff --git a/parquet/src/arrow/buffer/dictionary_buffer.rs b/parquet/src/arrow/buffer/dictionary_buffer.rs index 81daaf3a95fd..20f0c6cf081d 100644 --- a/parquet/src/arrow/buffer/dictionary_buffer.rs +++ b/parquet/src/arrow/buffer/dictionary_buffer.rs @@ -133,8 +133,6 @@ impl DictionaryBuffer { match self { Self::Dict { keys, values } => { - println!("keys = {:?}", keys); - println!("values = {:?}", values); // Validate keys unless dictionary is empty if !values.is_empty() { let min = K::from_usize(0).unwrap(); @@ -156,23 +154,11 @@ impl DictionaryBuffer { } } - let values = match data_type { - ArrowType::Dictionary(_, value_type) => { - match **value_type{ - ArrowType::FixedSizeBinary(size) => { - arrow_cast::cast( - &values, - &ArrowType::FixedSizeBinary(size), - ).unwrap() - }, - _ => { - values - } - } - } - _ => { - values - } + let ArrowType::Dictionary(_, value_type) = data_type else { unreachable!() }; + let values = if let ArrowType::FixedSizeBinary(size) = **value_type { + arrow_cast::cast(&values, &ArrowType::FixedSizeBinary(size)).unwrap() + } else { + values }; let builder = ArrayDataBuilder::new(data_type.clone()) From 9eef5a597a96a475b59c829ff37c4affaa301f8e Mon Sep 17 00:00:00 2001 From: albertlockett Date: Sun, 27 Apr 2025 14:58:17 -0400 Subject: [PATCH 4/9] clippy and linter --- parquet/src/arrow/arrow_writer/byte_array.rs | 4 ++-- parquet/src/arrow/arrow_writer/mod.rs | 15 +++++++++------ parquet/src/arrow/buffer/dictionary_buffer.rs | 4 +++- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/byte_array.rs b/parquet/src/arrow/arrow_writer/byte_array.rs index aa8bef4f71d4..9767ec98e636 100644 --- a/parquet/src/arrow/arrow_writer/byte_array.rs +++ b/parquet/src/arrow/arrow_writer/byte_array.rs @@ -27,8 +27,8 @@ use crate::schema::types::ColumnDescPtr; use crate::util::bit_util::num_required_bits; use crate::util::interner::{Interner, Storage}; use arrow_array::{ - Array, ArrayAccessor, BinaryArray, BinaryViewArray, DictionaryArray, LargeBinaryArray, - LargeStringArray, StringArray, StringViewArray, FixedSizeBinaryArray + Array, ArrayAccessor, BinaryArray, BinaryViewArray, DictionaryArray, FixedSizeBinaryArray, + LargeBinaryArray, LargeStringArray, StringArray, StringViewArray, }; use arrow_schema::DataType; diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 77ce5d275c93..baa4c797161c 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -1917,18 +1917,21 @@ mod tests { #[test] fn test_fixed_size_binary_in_dict() { let field = Field::new( - "a", - DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::FixedSizeBinary(4))), + "a", + DataType::Dictionary( + Box::new(DataType::UInt8), + Box::new(DataType::FixedSizeBinary(4)), + ), false, ); let schema = Schema::new(vec![field]); let keys = UInt8Array::from_iter_values([0, 0, 1]); - let values = FixedSizeBinaryArray::try_from_iter(vec![ - vec![0, 0, 0, 0], - vec![1, 1, 1, 1], - ].into_iter()).unwrap(); + let values = FixedSizeBinaryArray::try_from_iter( + vec![vec![0, 0, 0, 0], vec![1, 1, 1, 1]].into_iter(), + ) + .unwrap(); let data = UInt8DictionaryArray::new(keys, Arc::new(values)); let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(data)]).unwrap(); roundtrip(batch, None); diff --git a/parquet/src/arrow/buffer/dictionary_buffer.rs b/parquet/src/arrow/buffer/dictionary_buffer.rs index 20f0c6cf081d..386177639356 100644 --- a/parquet/src/arrow/buffer/dictionary_buffer.rs +++ b/parquet/src/arrow/buffer/dictionary_buffer.rs @@ -154,7 +154,9 @@ impl DictionaryBuffer { } } - let ArrowType::Dictionary(_, value_type) = data_type else { unreachable!() }; + let ArrowType::Dictionary(_, value_type) = data_type else { + unreachable!() + }; let values = if let ArrowType::FixedSizeBinary(size) = **value_type { arrow_cast::cast(&values, &ArrowType::FixedSizeBinary(size)).unwrap() } else { From cb7a6748453aa494e1e9c2ce75865e7b2139e633 Mon Sep 17 00:00:00 2001 From: albertlockett Date: Sun, 27 Apr 2025 15:14:53 -0400 Subject: [PATCH 5/9] support all types of keys in byte_array_dictionary --- .../arrow/array_reader/byte_array_dictionary.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/parquet/src/arrow/array_reader/byte_array_dictionary.rs b/parquet/src/arrow/array_reader/byte_array_dictionary.rs index 99bde81759c6..757d3df8a82b 100644 --- a/parquet/src/arrow/array_reader/byte_array_dictionary.rs +++ b/parquet/src/arrow/array_reader/byte_array_dictionary.rs @@ -92,19 +92,19 @@ pub fn make_byte_array_dictionary_reader( (pages, column_desc, data_type) => match (key_type.as_ref(), value_type.as_ref()) { (ArrowType::UInt8, ArrowType::Binary | ArrowType::Utf8 | ArrowType::FixedSizeBinary(_)) => (u8, i32), (ArrowType::UInt8, ArrowType::LargeBinary | ArrowType::LargeUtf8) => (u8, i64), - (ArrowType::Int8, ArrowType::Binary | ArrowType::Utf8) => (i8, i32), + (ArrowType::Int8, ArrowType::Binary | ArrowType::Utf8 | ArrowType::FixedSizeBinary(_)) => (i8, i32), (ArrowType::Int8, ArrowType::LargeBinary | ArrowType::LargeUtf8) => (i8, i64), - (ArrowType::UInt16, ArrowType::Binary | ArrowType::Utf8) => (u16, i32), + (ArrowType::UInt16, ArrowType::Binary | ArrowType::Utf8 | ArrowType::FixedSizeBinary(_)) => (u16, i32), (ArrowType::UInt16, ArrowType::LargeBinary | ArrowType::LargeUtf8) => (u16, i64), - (ArrowType::Int16, ArrowType::Binary | ArrowType::Utf8) => (i16, i32), + (ArrowType::Int16, ArrowType::Binary | ArrowType::Utf8 | ArrowType::FixedSizeBinary(_)) => (i16, i32), (ArrowType::Int16, ArrowType::LargeBinary | ArrowType::LargeUtf8) => (i16, i64), - (ArrowType::UInt32, ArrowType::Binary | ArrowType::Utf8) => (u32, i32), + (ArrowType::UInt32, ArrowType::Binary | ArrowType::Utf8 | ArrowType::FixedSizeBinary(_)) => (u32, i32), (ArrowType::UInt32, ArrowType::LargeBinary | ArrowType::LargeUtf8) => (u32, i64), - (ArrowType::Int32, ArrowType::Binary | ArrowType::Utf8) => (i32, i32), + (ArrowType::Int32, ArrowType::Binary | ArrowType::Utf8 | ArrowType::FixedSizeBinary(_)) => (i32, i32), (ArrowType::Int32, ArrowType::LargeBinary | ArrowType::LargeUtf8) => (i32, i64), - (ArrowType::UInt64, ArrowType::Binary | ArrowType::Utf8) => (u64, i32), + (ArrowType::UInt64, ArrowType::Binary | ArrowType::Utf8 | ArrowType::FixedSizeBinary(_)) => (u64, i32), (ArrowType::UInt64, ArrowType::LargeBinary | ArrowType::LargeUtf8) => (u64, i64), - (ArrowType::Int64, ArrowType::Binary | ArrowType::Utf8) => (i64, i32), + (ArrowType::Int64, ArrowType::Binary | ArrowType::Utf8 | ArrowType::FixedSizeBinary(_)) => (i64, i32), (ArrowType::Int64, ArrowType::LargeBinary | ArrowType::LargeUtf8) => (i64, i64), } } From d3701ab37a55e5d02f05138dbb0ee06a4895a1ef Mon Sep 17 00:00:00 2001 From: albertlockett Date: Sun, 27 Apr 2025 15:31:30 -0400 Subject: [PATCH 6/9] back out change included by mistake --- parquet/src/arrow/array_reader/byte_array.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/parquet/src/arrow/array_reader/byte_array.rs b/parquet/src/arrow/array_reader/byte_array.rs index 1240e6feb1fe..92583155605b 100644 --- a/parquet/src/arrow/array_reader/byte_array.rs +++ b/parquet/src/arrow/array_reader/byte_array.rs @@ -55,8 +55,7 @@ pub fn make_byte_array_reader( ArrowType::Binary | ArrowType::Utf8 | ArrowType::Decimal128(_, _) - | ArrowType::Decimal256(_, _) - | ArrowType::Dictionary(_, _) => { + | ArrowType::Decimal256(_, _) => { let reader = GenericRecordReader::new(column_desc); Ok(Box::new(ByteArrayReader::::new( pages, data_type, reader, From 47c6f7e563e3a2d56569b9e9b3c6f6d9db83f48c Mon Sep 17 00:00:00 2001 From: albertlockett Date: Mon, 28 Apr 2025 20:09:40 -0300 Subject: [PATCH 7/9] linter --- parquet/src/arrow/array_reader/builder.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/parquet/src/arrow/array_reader/builder.rs b/parquet/src/arrow/array_reader/builder.rs index a31a530056fe..5ada61e93d62 100644 --- a/parquet/src/arrow/array_reader/builder.rs +++ b/parquet/src/arrow/array_reader/builder.rs @@ -293,10 +293,8 @@ fn build_primitive_reader( Some(DataType::Dictionary(_, _)) => { make_byte_array_dictionary_reader(page_iterator, column_desc, arrow_type)? } - _ => { - make_fixed_len_byte_array_reader(page_iterator, column_desc, arrow_type)? - } - } + _ => make_fixed_len_byte_array_reader(page_iterator, column_desc, arrow_type)?, + }, }; Ok(Some(reader)) } From 4fdc3c93cd69b6cd605f1c78cae8b44017848317 Mon Sep 17 00:00:00 2001 From: albertlockett Date: Sun, 4 May 2025 10:59:26 -0400 Subject: [PATCH 8/9] PR feedback before cleanup --- parquet/src/arrow/arrow_writer/mod.rs | 57 +++++++++++++++++++-------- 1 file changed, 40 insertions(+), 17 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index baa4c797161c..1d075977b0a7 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -1336,6 +1336,7 @@ mod tests { use arrow_buffer::{i256, IntervalDayTime, IntervalMonthDayNano, NullBuffer}; use arrow_schema::Fields; use half::f16; + use num::{FromPrimitive, ToPrimitive}; use crate::basic::Encoding; use crate::data_type::AsBytes; @@ -1916,25 +1917,47 @@ mod tests { #[test] fn test_fixed_size_binary_in_dict() { - let field = Field::new( - "a", - DataType::Dictionary( - Box::new(DataType::UInt8), - Box::new(DataType::FixedSizeBinary(4)), - ), - false, - ); + fn test_fixed_size_binary_in_dict_inner() + where + K: ArrowDictionaryKeyType, + K::Native: FromPrimitive + ToPrimitive + From, + T: TryFrom, + >::Error: std::fmt::Debug, + { + let field = Field::new( + "a", + DataType::Dictionary( + Box::new(K::DATA_TYPE), + Box::new(DataType::FixedSizeBinary(4)), + ), + false, + ); + let schema = Schema::new(vec![field]); + + let keys: Vec = vec![ + K::Native::try_from(TryInto::::try_into(0u8).unwrap()).unwrap(), + K::Native::try_from(TryInto::::try_into(0u8).unwrap()).unwrap(), + K::Native::try_from(TryInto::::try_into(1u8).unwrap()).unwrap(), + ]; + let keys = PrimitiveArray::::from_iter_values(keys); + let values = FixedSizeBinaryArray::try_from_iter( + vec![vec![0, 0, 0, 0], vec![1, 1, 1, 1]].into_iter(), + ) + .unwrap(); - let schema = Schema::new(vec![field]); + let data = DictionaryArray::::new(keys, Arc::new(values)); + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(data)]).unwrap(); + roundtrip(batch, None); + } - let keys = UInt8Array::from_iter_values([0, 0, 1]); - let values = FixedSizeBinaryArray::try_from_iter( - vec![vec![0, 0, 0, 0], vec![1, 1, 1, 1]].into_iter(), - ) - .unwrap(); - let data = UInt8DictionaryArray::new(keys, Arc::new(values)); - let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(data)]).unwrap(); - roundtrip(batch, None); + test_fixed_size_binary_in_dict_inner::(); + test_fixed_size_binary_in_dict_inner::(); + test_fixed_size_binary_in_dict_inner::(); + test_fixed_size_binary_in_dict_inner::(); + test_fixed_size_binary_in_dict_inner::(); + test_fixed_size_binary_in_dict_inner::(); + test_fixed_size_binary_in_dict_inner::(); + test_fixed_size_binary_in_dict_inner::(); } #[test] From 7f2a44eaca86aff23ca9730fd209586973a1cf1c Mon Sep 17 00:00:00 2001 From: albertlockett Date: Sun, 4 May 2025 11:04:09 -0400 Subject: [PATCH 9/9] PR feedback from Weston --- parquet/src/arrow/arrow_writer/mod.rs | 29 +++++++++++++-------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 1d075977b0a7..66e1b06fa799 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -1917,12 +1917,11 @@ mod tests { #[test] fn test_fixed_size_binary_in_dict() { - fn test_fixed_size_binary_in_dict_inner() + fn test_fixed_size_binary_in_dict_inner() where K: ArrowDictionaryKeyType, - K::Native: FromPrimitive + ToPrimitive + From, - T: TryFrom, - >::Error: std::fmt::Debug, + K::Native: FromPrimitive + ToPrimitive + TryFrom, + <::Native as TryFrom>::Error: std::fmt::Debug, { let field = Field::new( "a", @@ -1935,9 +1934,9 @@ mod tests { let schema = Schema::new(vec![field]); let keys: Vec = vec![ - K::Native::try_from(TryInto::::try_into(0u8).unwrap()).unwrap(), - K::Native::try_from(TryInto::::try_into(0u8).unwrap()).unwrap(), - K::Native::try_from(TryInto::::try_into(1u8).unwrap()).unwrap(), + K::Native::try_from(0u8).unwrap(), + K::Native::try_from(0u8).unwrap(), + K::Native::try_from(1u8).unwrap(), ]; let keys = PrimitiveArray::::from_iter_values(keys); let values = FixedSizeBinaryArray::try_from_iter( @@ -1950,14 +1949,14 @@ mod tests { roundtrip(batch, None); } - test_fixed_size_binary_in_dict_inner::(); - test_fixed_size_binary_in_dict_inner::(); - test_fixed_size_binary_in_dict_inner::(); - test_fixed_size_binary_in_dict_inner::(); - test_fixed_size_binary_in_dict_inner::(); - test_fixed_size_binary_in_dict_inner::(); - test_fixed_size_binary_in_dict_inner::(); - test_fixed_size_binary_in_dict_inner::(); + test_fixed_size_binary_in_dict_inner::(); + test_fixed_size_binary_in_dict_inner::(); + test_fixed_size_binary_in_dict_inner::(); + test_fixed_size_binary_in_dict_inner::(); + test_fixed_size_binary_in_dict_inner::(); + test_fixed_size_binary_in_dict_inner::(); + test_fixed_size_binary_in_dict_inner::(); + test_fixed_size_binary_in_dict_inner::(); } #[test]