From f37b8ae0f16a7b98f19c090615d472ce6bc38ce6 Mon Sep 17 00:00:00 2001 From: seidl Date: Wed, 14 Aug 2024 11:29:53 -0700 Subject: [PATCH 1/8] add unary to FixedSizeBinaryArray; use unary for transformations --- .../src/array/fixed_size_binary_array.rs | 40 +++++++++- .../array_reader/fixed_len_byte_array.rs | 61 ++++---------- .../src/arrow/array_reader/primitive_array.rs | 80 ++++++------------- 3 files changed, 80 insertions(+), 101 deletions(-) diff --git a/arrow-array/src/array/fixed_size_binary_array.rs b/arrow-array/src/array/fixed_size_binary_array.rs index e393e2b15ae6..06fd25347108 100644 --- a/arrow-array/src/array/fixed_size_binary_array.rs +++ b/arrow-array/src/array/fixed_size_binary_array.rs @@ -15,11 +15,13 @@ // specific language governing permissions and limitations // under the License. +use crate::array::primitive_array::PrimitiveArray; use crate::array::print_long_array; use crate::iterator::FixedSizeBinaryIter; +use crate::types::ArrowPrimitiveType; use crate::{Array, ArrayAccessor, ArrayRef, FixedSizeListArray, Scalar}; use arrow_buffer::buffer::NullBuffer; -use arrow_buffer::{bit_util, ArrowNativeType, BooleanBuffer, Buffer, MutableBuffer}; +use arrow_buffer::{bit_util, ArrowNativeType, BooleanBuffer, Buffer, MutableBuffer, ScalarBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType}; use std::any::Any; @@ -464,6 +466,42 @@ impl FixedSizeBinaryArray { pub fn iter(&self) -> FixedSizeBinaryIter<'_> { FixedSizeBinaryIter::new(self) } + + /// Applies a unary infallible function to a fixed-size binary array, producing a + /// new array of potentially different type. + /// + /// This is the fastest way to perform an operation on a primitive array + /// when the benefits of a vectorized operation outweigh the cost of + /// branching nulls and non-nulls. + /// + /// # Null Handling + /// + /// Applies the function for all values, including those on null slots. This + /// will often allow the compiler to generate faster vectorized code, but + /// requires that the operation must be infallible (not error/panic) for any + /// value of the corresponding type or this function may panic. + pub fn unary(&self, op: F) -> PrimitiveArray + where + O: ArrowPrimitiveType, + F: Fn(&[u8]) -> O::Native, + { + let num_vals = self.len(); + let length = self.value_length as usize; + let src = self.value_data.as_slice(); + let mut dst = vec![O::Native::default(); num_vals]; + + // Performance note: not using src.chunks() as that was considerably slower than + // calculating slices of src directly. + for (i, dsti) in dst.iter_mut().enumerate().take(num_vals) { + let idx = length * i; + *dsti = op(&src[idx..idx + length]) + } + + PrimitiveArray::new( + ScalarBuffer::new(Buffer::from_vec(dst), 0, num_vals), + self.nulls().cloned(), + ) + } } impl From for FixedSizeBinaryArray { diff --git a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs index 3b2600c54795..e3e9192bdff9 100644 --- a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs +++ b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs @@ -27,7 +27,7 @@ use crate::column::reader::decoder::ColumnValueDecoder; use crate::errors::{ParquetError, Result}; use crate::schema::types::ColumnDescPtr; use arrow_array::{ - Array, ArrayRef, Decimal128Array, Decimal256Array, FixedSizeBinaryArray, Float16Array, + ArrayRef, Decimal128Array, Decimal256Array, FixedSizeBinaryArray, Float16Array, IntervalDayTimeArray, IntervalYearMonthArray, }; use arrow_buffer::{i256, Buffer, IntervalDayTime}; @@ -165,53 +165,31 @@ impl ArrayReader for FixedLenByteArrayReader { // TODO: An improvement might be to do this conversion on read let array: ArrayRef = match &self.data_type { ArrowType::Decimal128(p, s) => { - // We can simply reuse the null buffer from `binary` rather than recomputing it - // (as was the case when we simply used `collect` to produce the new array). - // The same applies to the transformations below. - let nulls = binary.nulls().cloned(); - let decimal = binary.iter().map(|o| match o { - Some(b) => i128::from_be_bytes(sign_extend_be(b)), - None => i128::default(), - }); - let decimal = Decimal128Array::from_iter_values_with_nulls(decimal, nulls) - .with_precision_and_scale(*p, *s)?; - Arc::new(decimal) + let f = |b: &[u8]| i128::from_be_bytes(sign_extend_be(b)); + Arc::new((binary.unary(&f) as Decimal128Array).with_precision_and_scale(*p, *s)?) + as ArrayRef } ArrowType::Decimal256(p, s) => { - let nulls = binary.nulls().cloned(); - let decimal = binary.iter().map(|o| match o { - Some(b) => i256::from_be_bytes(sign_extend_be(b)), - None => i256::default(), - }); - let decimal = Decimal256Array::from_iter_values_with_nulls(decimal, nulls) - .with_precision_and_scale(*p, *s)?; - Arc::new(decimal) + let f = |b: &[u8]| i256::from_be_bytes(sign_extend_be(b)); + Arc::new((binary.unary(&f) as Decimal256Array).with_precision_and_scale(*p, *s)?) + as ArrayRef } ArrowType::Interval(unit) => { - let nulls = binary.nulls().cloned(); // An interval is stored as 3x 32-bit unsigned integers storing months, days, // and milliseconds match unit { IntervalUnit::YearMonth => { - let iter = binary.iter().map(|o| match o { - Some(b) => i32::from_le_bytes(b[0..4].try_into().unwrap()), - None => i32::default(), - }); - let interval = - IntervalYearMonthArray::from_iter_values_with_nulls(iter, nulls); - Arc::new(interval) as ArrayRef + let f = |b: &[u8]| i32::from_le_bytes(b[0..4].try_into().unwrap()); + Arc::new(binary.unary(&f) as IntervalYearMonthArray) as ArrayRef } IntervalUnit::DayTime => { - let iter = binary.iter().map(|o| match o { - Some(b) => IntervalDayTime::new( + let f = |b: &[u8]| { + IntervalDayTime::new( i32::from_le_bytes(b[4..8].try_into().unwrap()), i32::from_le_bytes(b[8..12].try_into().unwrap()), - ), - None => IntervalDayTime::default(), - }); - let interval = - IntervalDayTimeArray::from_iter_values_with_nulls(iter, nulls); - Arc::new(interval) as ArrayRef + ) + }; + Arc::new(binary.unary(&f) as IntervalDayTimeArray) as ArrayRef } IntervalUnit::MonthDayNano => { return Err(nyi_err!("MonthDayNano intervals not supported")); @@ -219,13 +197,8 @@ impl ArrayReader for FixedLenByteArrayReader { } } ArrowType::Float16 => { - let nulls = binary.nulls().cloned(); - let f16s = binary.iter().map(|o| match o { - Some(b) => f16::from_le_bytes(b[..2].try_into().unwrap()), - None => f16::default(), - }); - let f16s = Float16Array::from_iter_values_with_nulls(f16s, nulls); - Arc::new(f16s) as ArrayRef + let f = |b: &[u8]| f16::from_le_bytes(b.try_into().unwrap()); + Arc::new(binary.unary(&f) as Float16Array) as ArrayRef } _ => Arc::new(binary) as ArrayRef, }; @@ -488,8 +461,8 @@ mod tests { use crate::arrow::ArrowWriter; use arrow::datatypes::Field; use arrow::error::Result as ArrowResult; - use arrow_array::RecordBatch; use arrow_array::{Array, ListArray}; + use arrow_array::{Decimal256Array, RecordBatch}; use bytes::Bytes; use std::sync::Arc; diff --git a/parquet/src/arrow/array_reader/primitive_array.rs b/parquet/src/arrow/array_reader/primitive_array.rs index 5e0e09212c7e..ab5d415bec87 100644 --- a/parquet/src/arrow/array_reader/primitive_array.rs +++ b/parquet/src/arrow/array_reader/primitive_array.rs @@ -217,35 +217,19 @@ where arrow_cast::cast(&a, target_type)? } ArrowType::Decimal128(p, s) => { - // We can simply reuse the null buffer from `array` rather than recomputing it - // (as was the case when we simply used `collect` to produce the new array). - let nulls = array.nulls().cloned(); let array = match array.data_type() { - ArrowType::Int32 => { - let decimal = array - .as_any() - .downcast_ref::() - .unwrap() - .iter() - .map(|v| match v { - Some(i) => i as i128, - None => i128::default(), - }); - Decimal128Array::from_iter_values_with_nulls(decimal, nulls) - } - - ArrowType::Int64 => { - let decimal = array - .as_any() - .downcast_ref::() - .unwrap() - .iter() - .map(|v| match v { - Some(i) => i as i128, - None => i128::default(), - }); - Decimal128Array::from_iter_values_with_nulls(decimal, nulls) - } + ArrowType::Int32 => array + .as_any() + .downcast_ref::() + .unwrap() + .unary(|i| i as i128) + as Decimal128Array, + ArrowType::Int64 => array + .as_any() + .downcast_ref::() + .unwrap() + .unary(|i| i as i128) + as Decimal128Array, _ => { return Err(arrow_err!( "Cannot convert {:?} to decimal", @@ -258,35 +242,19 @@ where Arc::new(array) as ArrayRef } ArrowType::Decimal256(p, s) => { - // We can simply reuse the null buffer from `array` rather than recomputing it - // (as was the case when we simply used `collect` to produce the new array). - let nulls = array.nulls().cloned(); let array = match array.data_type() { - ArrowType::Int32 => { - let decimal = array - .as_any() - .downcast_ref::() - .unwrap() - .iter() - .map(|v| match v { - Some(i) => i256::from_i128(i as i128), - None => i256::default(), - }); - Decimal256Array::from_iter_values_with_nulls(decimal, nulls) - } - - ArrowType::Int64 => { - let decimal = array - .as_any() - .downcast_ref::() - .unwrap() - .iter() - .map(|v| match v { - Some(i) => i256::from_i128(i as i128), - None => i256::default(), - }); - Decimal256Array::from_iter_values_with_nulls(decimal, nulls) - } + ArrowType::Int32 => array + .as_any() + .downcast_ref::() + .unwrap() + .unary(|i| i256::from_i128(i as i128)) + as Decimal256Array, + ArrowType::Int64 => array + .as_any() + .downcast_ref::() + .unwrap() + .unary(|i| i256::from_i128(i as i128)) + as Decimal256Array, _ => { return Err(arrow_err!( "Cannot convert {:?} to decimal", From 69b42d95187093b83577b6026206aa63136ac3b5 Mon Sep 17 00:00:00 2001 From: seidl Date: Wed, 14 Aug 2024 15:22:30 -0700 Subject: [PATCH 2/8] clean up documentation some --- arrow-array/src/array/fixed_size_binary_array.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow-array/src/array/fixed_size_binary_array.rs b/arrow-array/src/array/fixed_size_binary_array.rs index 06fd25347108..2faa93d7b977 100644 --- a/arrow-array/src/array/fixed_size_binary_array.rs +++ b/arrow-array/src/array/fixed_size_binary_array.rs @@ -468,9 +468,9 @@ impl FixedSizeBinaryArray { } /// Applies a unary infallible function to a fixed-size binary array, producing a - /// new array of potentially different type. + /// new primitive array. /// - /// This is the fastest way to perform an operation on a primitive array + /// This is the fastest way to perform an operation on a fixed-size binary array /// when the benefits of a vectorized operation outweigh the cost of /// branching nulls and non-nulls. /// From 5db457c2c1e724e9f8689dccb8b6d73bcbee786d Mon Sep 17 00:00:00 2001 From: seidl Date: Mon, 19 Aug 2024 13:11:19 -0700 Subject: [PATCH 3/8] add from_unary to PrimitiveArray --- .../src/array/fixed_size_binary_array.rs | 40 +------------------ arrow-array/src/array/primitive_array.rs | 26 ++++++++++++ .../array_reader/fixed_len_byte_array.rs | 12 +++--- 3 files changed, 33 insertions(+), 45 deletions(-) diff --git a/arrow-array/src/array/fixed_size_binary_array.rs b/arrow-array/src/array/fixed_size_binary_array.rs index 2faa93d7b977..e393e2b15ae6 100644 --- a/arrow-array/src/array/fixed_size_binary_array.rs +++ b/arrow-array/src/array/fixed_size_binary_array.rs @@ -15,13 +15,11 @@ // specific language governing permissions and limitations // under the License. -use crate::array::primitive_array::PrimitiveArray; use crate::array::print_long_array; use crate::iterator::FixedSizeBinaryIter; -use crate::types::ArrowPrimitiveType; use crate::{Array, ArrayAccessor, ArrayRef, FixedSizeListArray, Scalar}; use arrow_buffer::buffer::NullBuffer; -use arrow_buffer::{bit_util, ArrowNativeType, BooleanBuffer, Buffer, MutableBuffer, ScalarBuffer}; +use arrow_buffer::{bit_util, ArrowNativeType, BooleanBuffer, Buffer, MutableBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType}; use std::any::Any; @@ -466,42 +464,6 @@ impl FixedSizeBinaryArray { pub fn iter(&self) -> FixedSizeBinaryIter<'_> { FixedSizeBinaryIter::new(self) } - - /// Applies a unary infallible function to a fixed-size binary array, producing a - /// new primitive array. - /// - /// This is the fastest way to perform an operation on a fixed-size binary array - /// when the benefits of a vectorized operation outweigh the cost of - /// branching nulls and non-nulls. - /// - /// # Null Handling - /// - /// Applies the function for all values, including those on null slots. This - /// will often allow the compiler to generate faster vectorized code, but - /// requires that the operation must be infallible (not error/panic) for any - /// value of the corresponding type or this function may panic. - pub fn unary(&self, op: F) -> PrimitiveArray - where - O: ArrowPrimitiveType, - F: Fn(&[u8]) -> O::Native, - { - let num_vals = self.len(); - let length = self.value_length as usize; - let src = self.value_data.as_slice(); - let mut dst = vec![O::Native::default(); num_vals]; - - // Performance note: not using src.chunks() as that was considerably slower than - // calculating slices of src directly. - for (i, dsti) in dst.iter_mut().enumerate().take(num_vals) { - let idx = length * i; - *dsti = op(&src[idx..idx + length]) - } - - PrimitiveArray::new( - ScalarBuffer::new(Buffer::from_vec(dst), 0, num_vals), - self.nulls().cloned(), - ) - } } impl From for FixedSizeBinaryArray { diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index db14845b08d9..354538f70736 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -1016,6 +1016,32 @@ impl PrimitiveArray { PrimitiveArray::new(values, Some(nulls)) } + /// Applies a unary infallible function to each value in an array, producing a + /// new primitive array. + /// + /// # Null Handling + /// + /// Applies the function for all values, including those on null slots. This + /// will often allow the compiler to generate faster vectorized code, but + /// requires that the operation must be infallible (not error/panic) for any + /// value of the corresponding type or this function may panic. + pub fn from_unary(left: U, mut op: F) -> Self + where + F: FnMut(U::Item) -> T::Native, + { + let nulls = left.logical_nulls(); + let mut values: Vec = vec![T::Native::default(); left.len()]; + + for (i, val) in values.iter_mut().enumerate().take(left.len()) { + // SAFETY: i in range 0..len + unsafe { + *val = op(left.value_unchecked(i)); + } + } + let values = ScalarBuffer::from(values); + Self::new(values, nulls) + } + /// Returns a `PrimitiveBuilder` for this array, suitable for mutating values /// in place. /// diff --git a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs index e3e9192bdff9..7731ba08937c 100644 --- a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs +++ b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs @@ -166,12 +166,12 @@ impl ArrayReader for FixedLenByteArrayReader { let array: ArrayRef = match &self.data_type { ArrowType::Decimal128(p, s) => { let f = |b: &[u8]| i128::from_be_bytes(sign_extend_be(b)); - Arc::new((binary.unary(&f) as Decimal128Array).with_precision_and_scale(*p, *s)?) + Arc::new(Decimal128Array::from_unary(&binary, f).with_precision_and_scale(*p, *s)?) as ArrayRef } ArrowType::Decimal256(p, s) => { let f = |b: &[u8]| i256::from_be_bytes(sign_extend_be(b)); - Arc::new((binary.unary(&f) as Decimal256Array).with_precision_and_scale(*p, *s)?) + Arc::new(Decimal256Array::from_unary(&binary, f).with_precision_and_scale(*p, *s)?) as ArrayRef } ArrowType::Interval(unit) => { @@ -180,7 +180,7 @@ impl ArrayReader for FixedLenByteArrayReader { match unit { IntervalUnit::YearMonth => { let f = |b: &[u8]| i32::from_le_bytes(b[0..4].try_into().unwrap()); - Arc::new(binary.unary(&f) as IntervalYearMonthArray) as ArrayRef + Arc::new(IntervalYearMonthArray::from_unary(&binary, f)) as ArrayRef } IntervalUnit::DayTime => { let f = |b: &[u8]| { @@ -189,7 +189,7 @@ impl ArrayReader for FixedLenByteArrayReader { i32::from_le_bytes(b[8..12].try_into().unwrap()), ) }; - Arc::new(binary.unary(&f) as IntervalDayTimeArray) as ArrayRef + Arc::new(IntervalDayTimeArray::from_unary(&binary, f)) as ArrayRef } IntervalUnit::MonthDayNano => { return Err(nyi_err!("MonthDayNano intervals not supported")); @@ -197,8 +197,8 @@ impl ArrayReader for FixedLenByteArrayReader { } } ArrowType::Float16 => { - let f = |b: &[u8]| f16::from_le_bytes(b.try_into().unwrap()); - Arc::new(binary.unary(&f) as Float16Array) as ArrayRef + let f = |b: &[u8]| f16::from_le_bytes(b[..2].try_into().unwrap()); + Arc::new(Float16Array::from_unary(&binary, f)) as ArrayRef } _ => Arc::new(binary) as ArrayRef, }; From c388d30eb0ef99cb9349e5ae1c10227eee144c05 Mon Sep 17 00:00:00 2001 From: seidl Date: Mon, 19 Aug 2024 17:16:45 -0700 Subject: [PATCH 4/8] use from_unary for converting byte array to decimal --- parquet/src/arrow/array_reader/byte_array.rs | 26 +++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/parquet/src/arrow/array_reader/byte_array.rs b/parquet/src/arrow/array_reader/byte_array.rs index ed5961586d91..24a4fb6b75b3 100644 --- a/parquet/src/arrow/array_reader/byte_array.rs +++ b/parquet/src/arrow/array_reader/byte_array.rs @@ -123,23 +123,25 @@ impl ArrayReader for ByteArrayReader { ArrowType::Decimal128(p, s) => { let array = buffer.into_array(null_buffer, ArrowType::Binary); let binary = array.as_any().downcast_ref::().unwrap(); - let decimal = binary - .iter() - .map(|opt| Some(i128::from_be_bytes(sign_extend_be(opt?)))) - .collect::() - .with_precision_and_scale(p, s)?; - + // Null slots will have 0 length, so we need to check for that in the lambda + // or sign_extend_be will panic. + let decimal = Decimal128Array::from_unary(binary, |x| match x.len() { + 0 => i128::default(), + _ => i128::from_be_bytes(sign_extend_be(x)), + }) + .with_precision_and_scale(p, s)?; Arc::new(decimal) } ArrowType::Decimal256(p, s) => { let array = buffer.into_array(null_buffer, ArrowType::Binary); let binary = array.as_any().downcast_ref::().unwrap(); - let decimal = binary - .iter() - .map(|opt| Some(i256::from_be_bytes(sign_extend_be(opt?)))) - .collect::() - .with_precision_and_scale(p, s)?; - + // Null slots will have 0 length, so we need to check for that in the lambda + // or sign_extend_be will panic. + let decimal = Decimal256Array::from_unary(binary, |x| match x.len() { + 0 => i256::default(), + _ => i256::from_be_bytes(sign_extend_be(x)), + }) + .with_precision_and_scale(p, s)?; Arc::new(decimal) } _ => buffer.into_array(null_buffer, self.data_type.clone()), From cf625bf4dd0fdd2714b6f2b4c40d550291528276 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 20 Aug 2024 13:41:07 -0700 Subject: [PATCH 5/8] rework from_unary to skip vector initialization --- arrow-array/src/array/primitive_array.rs | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 354538f70736..ac32b11b4f78 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -1030,16 +1030,14 @@ impl PrimitiveArray { F: FnMut(U::Item) -> T::Native, { let nulls = left.logical_nulls(); - let mut values: Vec = vec![T::Native::default(); left.len()]; + let buffer = unsafe { + // SAFETY: i in range 0..left.len() + let iter = (0..left.len()).map(|i| op(left.value_unchecked(i))); + // SAFETY: upper bound is trusted because `iter` is over a range + Buffer::from_trusted_len_iter(iter) + }; - for (i, val) in values.iter_mut().enumerate().take(left.len()) { - // SAFETY: i in range 0..len - unsafe { - *val = op(left.value_unchecked(i)); - } - } - let values = ScalarBuffer::from(values); - Self::new(values, nulls) + PrimitiveArray::new(buffer.into(), nulls) } /// Returns a `PrimitiveBuilder` for this array, suitable for mutating values From 4836237d08dbc10d4cb55f99f137bdd78e8cdfe3 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 20 Aug 2024 16:16:54 -0700 Subject: [PATCH 6/8] add example to from_unary docstring --- arrow-array/src/array/primitive_array.rs | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index ac32b11b4f78..107b9a659550 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -1021,10 +1021,17 @@ impl PrimitiveArray { /// /// # Null Handling /// - /// Applies the function for all values, including those on null slots. This - /// will often allow the compiler to generate faster vectorized code, but - /// requires that the operation must be infallible (not error/panic) for any - /// value of the corresponding type or this function may panic. + /// See [`Self::unary`] for more information on null handling. + /// + /// # Example: create an [`Int16Array`] from a [`FixedSizeBinaryArray`] + /// + /// ``` + /// use arrow_array::{Array, FixedSizeBinaryArray, Int16Array}; + /// let input_arg = vec![ vec![1, 0], vec![2, 0], vec![3, 0] ]; + /// let arr = FixedSizeBinaryArray::try_from_iter(input_arg.into_iter()).unwrap(); + /// let c = Int16Array::from_unary(&arr, |x| i16::from_le_bytes(x[..2].try_into().unwrap())); + /// assert_eq!(c, Int16Array::from(vec![Some(1i16), Some(2i16), Some(3i16)])); + /// ``` pub fn from_unary(left: U, mut op: F) -> Self where F: FnMut(U::Item) -> T::Native, From 7d1fac85a8e7bf748db560648c9758f351e892fd Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 20 Aug 2024 16:49:28 -0700 Subject: [PATCH 7/8] fix broken link --- arrow-array/src/array/primitive_array.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 107b9a659550..521ef088e361 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -1023,8 +1023,7 @@ impl PrimitiveArray { /// /// See [`Self::unary`] for more information on null handling. /// - /// # Example: create an [`Int16Array`] from a [`FixedSizeBinaryArray`] - /// + /// # Example: create an [`Int16Array`] from an [`ArrayAccessor`] with item type `&[u8]` /// ``` /// use arrow_array::{Array, FixedSizeBinaryArray, Int16Array}; /// let input_arg = vec![ vec![1, 0], vec![2, 0], vec![3, 0] ]; From 96ad209c797d8ac45296cf525f851b51ceb18ece Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 21 Aug 2024 09:02:41 -0700 Subject: [PATCH 8/8] add comments per review suggestion --- parquet/src/arrow/array_reader/byte_array.rs | 3 +++ parquet/src/arrow/array_reader/fixed_len_byte_array.rs | 3 +++ parquet/src/arrow/array_reader/primitive_array.rs | 4 ++++ 3 files changed, 10 insertions(+) diff --git a/parquet/src/arrow/array_reader/byte_array.rs b/parquet/src/arrow/array_reader/byte_array.rs index 24a4fb6b75b3..92583155605b 100644 --- a/parquet/src/arrow/array_reader/byte_array.rs +++ b/parquet/src/arrow/array_reader/byte_array.rs @@ -120,6 +120,9 @@ impl ArrayReader for ByteArrayReader { self.record_reader.reset(); let array: ArrayRef = match self.data_type { + // Apply conversion to all elements regardless of null slots as the conversions + // are infallible. This improves performance by avoiding a branch in the inner + // loop (see docs for `PrimitiveArray::from_unary`). ArrowType::Decimal128(p, s) => { let array = buffer.into_array(null_buffer, ArrowType::Binary); let binary = array.as_any().downcast_ref::().unwrap(); diff --git a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs index 7731ba08937c..01692c242713 100644 --- a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs +++ b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs @@ -163,6 +163,9 @@ impl ArrayReader for FixedLenByteArrayReader { let binary = FixedSizeBinaryArray::from(unsafe { array_data.build_unchecked() }); // TODO: An improvement might be to do this conversion on read + // Note the conversions below apply to all elements regardless of null slots as the + // conversion lambdas are all infallible. This improves performance by avoiding a branch in + // the inner loop (see docs for `PrimitiveArray::from_unary`). let array: ArrayRef = match &self.data_type { ArrowType::Decimal128(p, s) => { let f = |b: &[u8]| i128::from_be_bytes(sign_extend_be(b)); diff --git a/parquet/src/arrow/array_reader/primitive_array.rs b/parquet/src/arrow/array_reader/primitive_array.rs index ab5d415bec87..010e9c2eed3f 100644 --- a/parquet/src/arrow/array_reader/primitive_array.rs +++ b/parquet/src/arrow/array_reader/primitive_array.rs @@ -217,6 +217,9 @@ where arrow_cast::cast(&a, target_type)? } ArrowType::Decimal128(p, s) => { + // Apply conversion to all elements regardless of null slots as the conversion + // to `i128` is infallible. This improves performance by avoiding a branch in + // the inner loop (see docs for `PrimitiveArray::unary`). let array = match array.data_type() { ArrowType::Int32 => array .as_any() @@ -242,6 +245,7 @@ where Arc::new(array) as ArrayRef } ArrowType::Decimal256(p, s) => { + // See above comment. Conversion to `i256` is likewise infallible. let array = match array.data_type() { ArrowType::Int32 => array .as_any()