-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Closed
Labels
Description
Describe the bug
I'm not sure whether this is a bug in parquet or in datafusion. If it turns out to be a datafusion bug, I'll close this issue and open a new one in that repo.
If I write a column of type `Dictionary(u8, FixedSizeBinary(_))` to a parquet file and then try to read it using datafusion, I get the error:
thread 'main' panicked at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/parquet-55.1.0/src/arrow/buffer/offset_buffer.rs:133:48:
called `Result::unwrap()` on an `Err` value: InvalidArgumentError("Expected 1 buffers in array of type FixedSizeBinary(8), got 2")
To Reproduce
use std::sync::Arc;
use arrow::{
datatypes::{DataType, Field, Schema},
util::pretty::print_batches,
};
use arrow_array::{FixedSizeBinaryArray, RecordBatch, UInt8Array, UInt8DictionaryArray};
use datafusion::{
prelude::{ParquetReadOptions, SessionContext},
sql::TableReference,
};
use object_store::{local::LocalFileSystem, path::Path};
use parquet::{
arrow::{arrow_reader::ParquetRecordBatchReaderBuilder, async_writer::ParquetObjectWriter, AsyncArrowWriter},
file::properties::WriterProperties,
};
/// Minimal reproduction for reading a `Dictionary(UInt8, FixedSizeBinary(8))`
/// column back from a parquet file.
///
/// Steps:
/// 1. Build a three-row batch with a single nullable dictionary column `a`.
/// 2. Write it to `/tmp/test.parquet` through an `object_store`-backed
///    `AsyncArrowWriter`.
/// 3. Read it back with the synchronous parquet reader (reported as working).
/// 4. Register the same file in a DataFusion `SessionContext` and run
///    `select * from tab` (reported as panicking).
///
/// # Panics
///
/// Every fallible step is unwrapped, so any failure while building, writing,
/// or reading the batch aborts the program with the underlying error.
#[tokio::main]
async fn main() {
    // Single nullable column "a": UInt8 keys into 8-byte fixed-size binary values.
    let schema = Arc::new(Schema::new(vec![Field::new(
        "a",
        DataType::Dictionary(
            Box::new(DataType::UInt8),
            Box::new(DataType::FixedSizeBinary(8)),
        ),
        true,
    )]));

    // Keys [0, 0, 1] index into the two 8-byte dictionary values built below.
    let keys = UInt8Array::from_iter_values(vec![0, 0, 1]);
    let values = FixedSizeBinaryArray::try_from_iter(
        vec![
            // A range is already an iterator, so it can be collected directly.
            (0u8..8u8).collect::<Vec<u8>>(),
            (24u8..32u8).collect::<Vec<u8>>(),
        ]
        .into_iter(),
    )
    .unwrap();
    let arr = UInt8DictionaryArray::new(keys, Arc::new(values));
    let batch = RecordBatch::try_new(schema, vec![Arc::new(arr)]).unwrap();

    // write batch to parquet
    let object_store = Arc::new(LocalFileSystem::new_with_prefix("/tmp").unwrap());
    let parquet_object_writer =
        ParquetObjectWriter::new(object_store.clone(), Path::from("test.parquet"));
    let mut parquet_writer = AsyncArrowWriter::try_new(
        parquet_object_writer,
        // `schema()` already hands back an owned `SchemaRef`; no extra clone needed.
        batch.schema(),
        Some(WriterProperties::default()),
    )
    .unwrap();
    parquet_writer.write(&batch).await.unwrap();
    parquet_writer.close().await.unwrap();

    // read directly using parquet (this works)
    let file = std::fs::File::open("/tmp/test.parquet").unwrap();
    let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
    let mut reader = builder.build().unwrap();
    let read_batch = reader.next().unwrap().unwrap();
    print_batches(&[read_batch]).unwrap();

    // read using datafusion (this does not work)
    let ctx = SessionContext::new();
    ctx.register_parquet(
        TableReference::bare("tab"),
        "/tmp/test.parquet",
        ParquetReadOptions::default(),
    )
    .await
    .unwrap();
    let df = ctx.sql("select * from tab").await.unwrap();
    let batches = df.collect().await.unwrap();
    print_batches(&batches).unwrap();
}
Expected behavior
I think I should be able to read the column in this table.
Additional context
Full stack trace:
thread 'main' panicked at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/parquet-55.1.0/src/arrow/buffer/offset_buffer.rs:133:48:
called `Result::unwrap()` on an `Err` value: InvalidArgumentError("Expected 1 buffers in array of type FixedSizeBinary(8), got 2")
stack backtrace:
0: rust_begin_unwind
at /rustc/05f9846f893b09a1be1fc8560e33fc3c815cfecb/library/std/src/panicking.rs:695:5
1: core::panicking::panic_fmt
at /rustc/05f9846f893b09a1be1fc8560e33fc3c815cfecb/library/core/src/panicking.rs:75:14
2: core::result::unwrap_failed
at /rustc/05f9846f893b09a1be1fc8560e33fc3c815cfecb/library/core/src/result.rs:1704:5
3: core::result::Result<T,E>::unwrap
at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/result.rs:1109:23
4: parquet::arrow::buffer::offset_buffer::OffsetBuffer<I>::into_array
at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/parquet-55.1.0/src/arrow/buffer/offset_buffer.rs:133:21
5: parquet::arrow::buffer::dictionary_buffer::DictionaryBuffer<K,V>::into_array
at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/parquet-55.1.0/src/arrow/buffer/dictionary_buffer.rs:187:39
6: <parquet::arrow::array_reader::byte_array_dictionary::ByteArrayDictionaryReader<K,V> as parquet::arrow::array_reader::ArrayReader>::consume_batch
at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/parquet-55.1.0/src/arrow/array_reader/byte_array_dictionary.rs:170:21
7: <parquet::arrow::array_reader::struct_array::StructArrayReader as parquet::arrow::array_reader::ArrayReader>::consume_batch::{{closure}}
at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/parquet-55.1.0/src/arrow/array_reader/struct_array.rs:111:27
8: core::iter::adapters::map::map_try_fold::{{closure}}
at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/iter/adapters/map.rs:95:28
9: core::iter::traits::iterator::Iterator::try_fold
at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/iter/traits/iterator.rs:2370:21
10: <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::try_fold
at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/iter/adapters/map.rs:121:9
11: <core::iter::adapters::GenericShunt<I,R> as core::iter::traits::iterator::Iterator>::try_fold
at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/iter/adapters/mod.rs:191:9
12: core::iter::traits::iterator::Iterator::try_for_each
at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/iter/traits/iterator.rs:2431:9
13: <core::iter::adapters::GenericShunt<I,R> as core::iter::traits::iterator::Iterator>::next
at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/iter/adapters/mod.rs:174:14
14: <alloc::vec::Vec<T> as alloc::vec::spec_from_iter_nested::SpecFromIterNested<T,I>>::from_iter
at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/alloc/src/vec/spec_from_iter_nested.rs:25:32
15: <alloc::vec::Vec<T> as alloc::vec::spec_from_iter::SpecFromIter<T,I>>::from_iter
at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/alloc/src/vec/spec_from_iter.rs:34:9
16: <alloc::vec::Vec<T> as core::iter::traits::collect::FromIterator<T>>::from_iter
at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/alloc/src/vec/mod.rs:3424:9
17: core::iter::traits::iterator::Iterator::collect
at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/iter/traits/iterator.rs:1971:9
18: <core::result::Result<V,E> as core::iter::traits::collect::FromIterator<core::result::Result<A,E>>>::from_iter::{{closure}}
at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/result.rs:1985:51
19: core::iter::adapters::try_process
at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/iter/adapters/mod.rs:160:17
20: <core::result::Result<V,E> as core::iter::traits::collect::FromIterator<core::result::Result<A,E>>>::from_iter
at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/result.rs:1985:9
21: core::iter::traits::iterator::Iterator::collect
at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/iter/traits/iterator.rs:1971:9
22: <parquet::arrow::array_reader::struct_array::StructArrayReader as parquet::arrow::array_reader::ArrayReader>::consume_batch
at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/parquet-55.1.0/src/arrow/array_reader/struct_array.rs:108:30
23: <parquet::arrow::arrow_reader::ParquetRecordBatchReader as core::iter::traits::iterator::Iterator>::next
at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/parquet-55.1.0/src/arrow/arrow_reader/mod.rs:855:15
24: <parquet::arrow::async_reader::ParquetRecordBatchStream<T> as futures_core::stream::Stream>::poll_next
at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/parquet-55.1.0/src/arrow/async_reader/mod.rs:811:62
25: <S as futures_core::stream::TryStream>::try_poll_next
at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/futures-core-0.3.31/src/stream.rs:206:9
26: <futures_util::stream::try_stream::into_stream::IntoStream<St> as futures_core::stream::Stream>::poll_next
at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/futures-util-0.3.31/src/stream/try_stream/into_stream.rs:38:9
27: <futures_util::stream::stream::map::Map<St,F> as futures_core::stream::Stream>::poll_next
at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/futures-util-0.3.31/src/stream/stream/map.rs:58:26
28: <futures_util::stream::try_stream::MapErr<St,F> as futures_core::stream::Stream>::poll_next
at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/futures-util-0.3.31/src/lib.rs:97:13
29: <futures_util::stream::stream::map::Map<St,F> as futures_core::stream::Stream>::poll_next
at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/futures-util-0.3.31/src/stream/stream/map.rs:58:26
30: <core::pin::Pin<P> as futures_core::stream::Stream>::poll_next
at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/futures-core-0.3.31/src/stream.rs:130:9
31: futures_util::stream::stream::StreamExt::poll_next_unpin
at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/futures-util-0.3.31/src/stream/stream/mod.rs:1638:9
32: datafusion_datasource::file_stream::FileStream::poll_inner
at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/datafusion-datasource-47.0.0/src/file_stream.rs:220:34
33: <datafusion_datasource::file_stream::FileStream as futures_core::stream::Stream>::poll_next
at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/datafusion-datasource-47.0.0/src/file_stream.rs:333:22
34: <core::pin::Pin<P> as futures_core::stream::Stream>::poll_next
at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/futures-core-0.3.31/src/stream.rs:130:9
35: <S as futures_core::stream::TryStream>::try_poll_next
at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/futures-core-0.3.31/src/stream.rs:206:9
36: <futures_util::stream::try_stream::try_collect::TryCollect<St,C> as core::future::future::Future>::poll
at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/futures-util-0.3.31/src/stream/try_stream/try_collect.rs:46:26
37: datafusion_physical_plan::common::collect::{{closure}}
at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/datafusion-physical-plan-47.0.0/src/common.rs:45:36
38: datafusion_physical_plan::execution_plan::collect::{{closure}}
at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/datafusion-physical-plan-47.0.0/src/execution_plan.rs:868:36
39: datafusion::dataframe::DataFrame::collect::{{closure}}
at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/datafusion-47.0.0/src/dataframe/mod.rs:1351:33
40: parquet_bug_repro::main::{{closure}}
at ./src/bin/parquet_bug_repro.rs:72:32
41: <core::pin::Pin<P> as core::future::future::Future>::poll
at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/future/future.rs:124:9
42: tokio::runtime::park::CachedParkThread::block_on::{{closure}}
at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/tokio-1.45.0/src/runtime/park.rs:284:60
43: tokio::task::coop::with_budget
at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/tokio-1.45.0/src/task/coop/mod.rs:167:5
44: tokio::task::coop::budget
at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/tokio-1.45.0/src/task/coop/mod.rs:133:5
45: tokio::runtime::park::CachedParkThread::block_on
at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/tokio-1.45.0/src/runtime/park.rs:284:31
46: tokio::runtime::context::blocking::BlockingRegionGuard::block_on
at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/tokio-1.45.0/src/runtime/context/blocking.rs:66:9
47: tokio::runtime::scheduler::multi_thread::MultiThread::block_on::{{closure}}
at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/tokio-1.45.0/src/runtime/scheduler/multi_thread/mod.rs:87:13
48: tokio::runtime::context::runtime::enter_runtime
at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/tokio-1.45.0/src/runtime/context/runtime.rs:65:16
49: tokio::runtime::scheduler::multi_thread::MultiThread::block_on
at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/tokio-1.45.0/src/runtime/scheduler/multi_thread/mod.rs:86:9
50: tokio::runtime::runtime::Runtime::block_on_inner
at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/tokio-1.45.0/src/runtime/runtime.rs:358:45
51: tokio::runtime::runtime::Runtime::block_on
at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/tokio-1.45.0/src/runtime/runtime.rs:328:13
52: parquet_bug_repro::main
at ./src/bin/parquet_bug_repro.rs:73:5
53: core::ops::function::FnOnce::call_once
at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/ops/function.rs:250:5
note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose backtrace.
Versions:
arrow = { version = "55", features = ["prettyprint", "chrono-tz"] }
arrow-array = "55"
datafusion = "47"
parquet = { version = "55", features = ["arrow", "async", "object_store"]}
object_store = "0.12"
tokio = { version = "1", features = ["full"] }