Skip to content

Unable to read Dictionary(u8, FixedSizeBinary(_)) using datafusion. #7545

@albertlockett

Description

@albertlockett

Describe the bug
I'm not sure whether this is a bug in parquet or in DataFusion. If it turns out to be a DataFusion bug, I'll close this issue and open one in that repo instead.

If I write a column of type Dictionary(u8, FixedSizeBinary(_)), and try to read it using datafusion, I get the error:

thread 'main' panicked at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/parquet-55.1.0/src/arrow/buffer/offset_buffer.rs:133:48:
called `Result::unwrap()` on an `Err` value: InvalidArgumentError("Expected 1 buffers in array of type FixedSizeBinary(8), got 2")

To Reproduce

use std::sync::Arc;

use arrow::{
    datatypes::{DataType, Field, Schema},
    util::pretty::print_batches,
};
use arrow_array::{FixedSizeBinaryArray, RecordBatch, UInt8Array, UInt8DictionaryArray};
use datafusion::{
    prelude::{ParquetReadOptions, SessionContext},
    sql::TableReference,
};
use object_store::{local::LocalFileSystem, path::Path};
use parquet::{
    arrow::{arrow_reader::ParquetRecordBatchReaderBuilder, async_writer::ParquetObjectWriter, AsyncArrowWriter},
    file::properties::WriterProperties,
};

/// Reproduces the panic reported when a Parquet file containing a
/// `Dictionary(UInt8, FixedSizeBinary(8))` column is read through DataFusion.
///
/// Steps:
/// 1. Build a single-column record batch of that dictionary type.
/// 2. Write it to `/tmp/test.parquet` with `AsyncArrowWriter`.
/// 3. Read the file back with `ParquetRecordBatchReaderBuilder` (reported to work).
/// 4. Register the file with a DataFusion `SessionContext` and run
///    `select * from tab` (reported to panic).
///
/// # Panics
/// Every fallible step is unwrapped on purpose: this is a minimal reproduction,
/// so any failure should abort immediately at the failing call.
#[tokio::main]
async fn main() {
    let schema = Arc::new(Schema::new(vec![Field::new(
        "a",
        DataType::Dictionary(
            Box::new(DataType::UInt8),
            Box::new(DataType::FixedSizeBinary(8)),
        ),
        true,
    )]));

    // Three rows that reference two distinct 8-byte dictionary values.
    let keys = UInt8Array::from_iter_values(vec![0, 0, 1]);
    let values = FixedSizeBinaryArray::try_from_iter(
        vec![
            // Ranges are already iterators, so they can be collected directly.
            (0u8..8u8).collect::<Vec<u8>>(),
            (24u8..32u8).collect::<Vec<u8>>(),
        ]
        .into_iter(),
    )
    .unwrap();
    let arr = UInt8DictionaryArray::new(keys, Arc::new(values));
    let batch = RecordBatch::try_new(schema, vec![Arc::new(arr)]).unwrap();

    // write batch to parquet
    let object_store = Arc::new(LocalFileSystem::new_with_prefix("/tmp").unwrap());
    let parquet_object_writer =
        ParquetObjectWriter::new(object_store.clone(), Path::from("test.parquet"));
    let mut parquet_writer = AsyncArrowWriter::try_new(
        parquet_object_writer,
        // `schema()` already hands back an owned `SchemaRef`; no extra clone needed.
        batch.schema(),
        Some(WriterProperties::default()),
    )
    .unwrap();
    parquet_writer.write(&batch).await.unwrap();
    parquet_writer.close().await.unwrap();

    // read directly using parquet (this works)
    let file = std::fs::File::open("/tmp/test.parquet").unwrap();
    let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
    let mut reader = builder.build().unwrap();
    let read_batch = reader.next().unwrap().unwrap();
    print_batches(&[read_batch]).unwrap();

    // read using datafusion (this does not work)
    let ctx = SessionContext::new();
    ctx.register_parquet(
        TableReference::bare("tab"),
        "/tmp/test.parquet",
        ParquetReadOptions::default(),
    )
    .await
    .unwrap();
    let df = ctx.sql("select * from tab").await.unwrap();
    let batches = df.collect().await.unwrap();
    print_batches(&batches).unwrap();
}

Expected behavior
I think I should be able to read the column in this table.

Additional context
Full stack trace:

thread 'main' panicked at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/parquet-55.1.0/src/arrow/buffer/offset_buffer.rs:133:48:
called `Result::unwrap()` on an `Err` value: InvalidArgumentError("Expected 1 buffers in array of type FixedSizeBinary(8), got 2")
stack backtrace:
   0: rust_begin_unwind
             at /rustc/05f9846f893b09a1be1fc8560e33fc3c815cfecb/library/std/src/panicking.rs:695:5
   1: core::panicking::panic_fmt
             at /rustc/05f9846f893b09a1be1fc8560e33fc3c815cfecb/library/core/src/panicking.rs:75:14
   2: core::result::unwrap_failed
             at /rustc/05f9846f893b09a1be1fc8560e33fc3c815cfecb/library/core/src/result.rs:1704:5
   3: core::result::Result<T,E>::unwrap
             at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/result.rs:1109:23
   4: parquet::arrow::buffer::offset_buffer::OffsetBuffer<I>::into_array
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/parquet-55.1.0/src/arrow/buffer/offset_buffer.rs:133:21
   5: parquet::arrow::buffer::dictionary_buffer::DictionaryBuffer<K,V>::into_array
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/parquet-55.1.0/src/arrow/buffer/dictionary_buffer.rs:187:39
   6: <parquet::arrow::array_reader::byte_array_dictionary::ByteArrayDictionaryReader<K,V> as parquet::arrow::array_reader::ArrayReader>::consume_batch
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/parquet-55.1.0/src/arrow/array_reader/byte_array_dictionary.rs:170:21
   7: <parquet::arrow::array_reader::struct_array::StructArrayReader as parquet::arrow::array_reader::ArrayReader>::consume_batch::{{closure}}
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/parquet-55.1.0/src/arrow/array_reader/struct_array.rs:111:27
   8: core::iter::adapters::map::map_try_fold::{{closure}}
             at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/iter/adapters/map.rs:95:28
   9: core::iter::traits::iterator::Iterator::try_fold
             at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/iter/traits/iterator.rs:2370:21
  10: <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::try_fold
             at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/iter/adapters/map.rs:121:9
  11: <core::iter::adapters::GenericShunt<I,R> as core::iter::traits::iterator::Iterator>::try_fold
             at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/iter/adapters/mod.rs:191:9
  12: core::iter::traits::iterator::Iterator::try_for_each
             at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/iter/traits/iterator.rs:2431:9
  13: <core::iter::adapters::GenericShunt<I,R> as core::iter::traits::iterator::Iterator>::next
             at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/iter/adapters/mod.rs:174:14
  14: <alloc::vec::Vec<T> as alloc::vec::spec_from_iter_nested::SpecFromIterNested<T,I>>::from_iter
             at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/alloc/src/vec/spec_from_iter_nested.rs:25:32
  15: <alloc::vec::Vec<T> as alloc::vec::spec_from_iter::SpecFromIter<T,I>>::from_iter
             at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/alloc/src/vec/spec_from_iter.rs:34:9
  16: <alloc::vec::Vec<T> as core::iter::traits::collect::FromIterator<T>>::from_iter
             at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/alloc/src/vec/mod.rs:3424:9
  17: core::iter::traits::iterator::Iterator::collect
             at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/iter/traits/iterator.rs:1971:9
  18: <core::result::Result<V,E> as core::iter::traits::collect::FromIterator<core::result::Result<A,E>>>::from_iter::{{closure}}
             at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/result.rs:1985:51
  19: core::iter::adapters::try_process
             at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/iter/adapters/mod.rs:160:17
  20: <core::result::Result<V,E> as core::iter::traits::collect::FromIterator<core::result::Result<A,E>>>::from_iter
             at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/result.rs:1985:9
  21: core::iter::traits::iterator::Iterator::collect
             at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/iter/traits/iterator.rs:1971:9
  22: <parquet::arrow::array_reader::struct_array::StructArrayReader as parquet::arrow::array_reader::ArrayReader>::consume_batch
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/parquet-55.1.0/src/arrow/array_reader/struct_array.rs:108:30
  23: <parquet::arrow::arrow_reader::ParquetRecordBatchReader as core::iter::traits::iterator::Iterator>::next
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/parquet-55.1.0/src/arrow/arrow_reader/mod.rs:855:15
  24: <parquet::arrow::async_reader::ParquetRecordBatchStream<T> as futures_core::stream::Stream>::poll_next
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/parquet-55.1.0/src/arrow/async_reader/mod.rs:811:62
  25: <S as futures_core::stream::TryStream>::try_poll_next
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/futures-core-0.3.31/src/stream.rs:206:9
  26: <futures_util::stream::try_stream::into_stream::IntoStream<St> as futures_core::stream::Stream>::poll_next
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/futures-util-0.3.31/src/stream/try_stream/into_stream.rs:38:9
  27: <futures_util::stream::stream::map::Map<St,F> as futures_core::stream::Stream>::poll_next
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/futures-util-0.3.31/src/stream/stream/map.rs:58:26
  28: <futures_util::stream::try_stream::MapErr<St,F> as futures_core::stream::Stream>::poll_next
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/futures-util-0.3.31/src/lib.rs:97:13
  29: <futures_util::stream::stream::map::Map<St,F> as futures_core::stream::Stream>::poll_next
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/futures-util-0.3.31/src/stream/stream/map.rs:58:26
  30: <core::pin::Pin<P> as futures_core::stream::Stream>::poll_next
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/futures-core-0.3.31/src/stream.rs:130:9
  31: futures_util::stream::stream::StreamExt::poll_next_unpin
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/futures-util-0.3.31/src/stream/stream/mod.rs:1638:9
  32: datafusion_datasource::file_stream::FileStream::poll_inner
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/datafusion-datasource-47.0.0/src/file_stream.rs:220:34
  33: <datafusion_datasource::file_stream::FileStream as futures_core::stream::Stream>::poll_next
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/datafusion-datasource-47.0.0/src/file_stream.rs:333:22
  34: <core::pin::Pin<P> as futures_core::stream::Stream>::poll_next
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/futures-core-0.3.31/src/stream.rs:130:9
  35: <S as futures_core::stream::TryStream>::try_poll_next
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/futures-core-0.3.31/src/stream.rs:206:9
  36: <futures_util::stream::try_stream::try_collect::TryCollect<St,C> as core::future::future::Future>::poll
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/futures-util-0.3.31/src/stream/try_stream/try_collect.rs:46:26
  37: datafusion_physical_plan::common::collect::{{closure}}
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/datafusion-physical-plan-47.0.0/src/common.rs:45:36
  38: datafusion_physical_plan::execution_plan::collect::{{closure}}
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/datafusion-physical-plan-47.0.0/src/execution_plan.rs:868:36
  39: datafusion::dataframe::DataFrame::collect::{{closure}}
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/datafusion-47.0.0/src/dataframe/mod.rs:1351:33
  40: parquet_bug_repro::main::{{closure}}
             at ./src/bin/parquet_bug_repro.rs:72:32
  41: <core::pin::Pin<P> as core::future::future::Future>::poll
             at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/future/future.rs:124:9
  42: tokio::runtime::park::CachedParkThread::block_on::{{closure}}
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/tokio-1.45.0/src/runtime/park.rs:284:60
  43: tokio::task::coop::with_budget
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/tokio-1.45.0/src/task/coop/mod.rs:167:5
  44: tokio::task::coop::budget
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/tokio-1.45.0/src/task/coop/mod.rs:133:5
  45: tokio::runtime::park::CachedParkThread::block_on
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/tokio-1.45.0/src/runtime/park.rs:284:31
  46: tokio::runtime::context::blocking::BlockingRegionGuard::block_on
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/tokio-1.45.0/src/runtime/context/blocking.rs:66:9
  47: tokio::runtime::scheduler::multi_thread::MultiThread::block_on::{{closure}}
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/tokio-1.45.0/src/runtime/scheduler/multi_thread/mod.rs:87:13
  48: tokio::runtime::context::runtime::enter_runtime
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/tokio-1.45.0/src/runtime/context/runtime.rs:65:16
  49: tokio::runtime::scheduler::multi_thread::MultiThread::block_on
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/tokio-1.45.0/src/runtime/scheduler/multi_thread/mod.rs:86:9
  50: tokio::runtime::runtime::Runtime::block_on_inner
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/tokio-1.45.0/src/runtime/runtime.rs:358:45
  51: tokio::runtime::runtime::Runtime::block_on
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/tokio-1.45.0/src/runtime/runtime.rs:328:13
  52: parquet_bug_repro::main
             at ./src/bin/parquet_bug_repro.rs:73:5
  53: core::ops::function::FnOnce::call_once
             at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/ops/function.rs:250:5
note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose backtrace.

Versions:

arrow = { version = "55", features = ["prettyprint", "chrono-tz"] }
arrow-array = "55"
datafusion = "47"
parquet = { version = "55", features = ["arrow", "async", "object_store"]}
object_store = "0.12"
tokio = { version = "1", features = ["full"] }

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug, parquet (Changes to the parquet crate)

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions