From 6f720b08b6871f27c3f37253bbe539407c53f332 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 15 Aug 2024 17:31:28 -0400 Subject: [PATCH 1/3] Add tests for bad parquet files --- parquet-testing | 2 +- parquet/tests/arrow_reader/bad_data.rs | 132 +++++++++++++++++++++++++ parquet/tests/arrow_reader/mod.rs | 1 + 3 files changed, 134 insertions(+), 1 deletion(-) create mode 100644 parquet/tests/arrow_reader/bad_data.rs diff --git a/parquet-testing b/parquet-testing index 9b48ff4f94dc..cb7a9674142c 160000 --- a/parquet-testing +++ b/parquet-testing @@ -1 +1 @@ -Subproject commit 9b48ff4f94dc5e89592d46a119884dbb88100884 +Subproject commit cb7a9674142c137367bf75a01b79c6e214a73199 diff --git a/parquet/tests/arrow_reader/bad_data.rs b/parquet/tests/arrow_reader/bad_data.rs new file mode 100644 index 000000000000..f561a5a492bc --- /dev/null +++ b/parquet/tests/arrow_reader/bad_data.rs @@ -0,0 +1,132 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Tests that reading invalid parquet files returns an error + +use arrow::util::test_util::parquet_test_data; +use parquet::arrow::arrow_reader::ArrowReaderBuilder; +use parquet::errors::ParquetError; +use std::collections::HashSet; +use std::path::PathBuf; + +static KNOWN_FILES: &[&str] = &[ + "PARQUET-1481.parquet", + "ARROW-GH-41321.parquet", + "ARROW-RS-GH-6229-DICTHEADER.parquet", + "ARROW-RS-GH-6229-LEVELS.parquet", + "README.md", +]; + +/// Returns the path to 'parquet-testing/bad_data' +fn bad_data_dir() -> PathBuf { + // points to parquet-testing/data + let parquet_testing_data = parquet_test_data(); + PathBuf::from(parquet_testing_data) + .parent() + .expect("was in parquet-testing/data") + .join("bad_data") +} + +#[test] +// Ensure that if we add a new test the files are added to the tests. +fn test_invalid_files() { + let known_files: HashSet<_> = KNOWN_FILES.iter().cloned().collect(); + let mut seen_files = HashSet::new(); + + let files = std::fs::read_dir(bad_data_dir()).unwrap(); + + for file in files { + let file_name = file + .unwrap() + .path() + .file_name() + .unwrap() + .to_str() + .unwrap() + .to_string(); + // If you see this error, please add a test for the new file following the model below + assert!( + known_files.contains(file_name.as_str()), + "Found new file in bad_data, please add test: {file_name}" + ); + seen_files.insert(file_name); + } + for expected_file in known_files { + assert!( + seen_files.contains(expected_file), + "Expected file not found in bad_data directory: {expected_file}" + ); + } +} + +#[test] +fn test_parquet_1481() { + let err = read_file("PARQUET-1481.parquet").unwrap_err(); + assert_eq!( + err.to_string(), + "Parquet error: unexpected parquet type: -7" + ); +} + +#[test] +#[should_panic(expected = "assertion failed: self.current_value.is_some()")] +fn test_arrow_gh_41321() { + let err = read_file("ARROW-GH-41321.parquet").unwrap_err(); + assert_eq!( + err.to_string(), + "TBD (currently panics)" + ); +} + + +#[test] +fn test_arrow_rs_gh_6229_dict_header() { + let err = read_file("ARROW-RS-GH-6229-DICTHEADER.parquet").unwrap_err(); + assert_eq!( + err.to_string(), + "External: Parquet argument error: EOF: eof decoding byte array" + ); +} + +/* INFINITE LOOP +https://github.com/apache/arrow-rs/issues/6229 +#[test] +fn test_arrow_rs_gh_6229_dict_levels() { + let err = read_file("ARROW-RS-GH-6229-LEVELS.parquet").unwrap_err(); + assert_eq!( + err.to_string(), + "External: Parquet argument error: EOF: eof decoding byte array" + ); +} + */ + +/// Reads the file and tries to return the total row count +/// Returns an error if the file is invalid +fn read_file(name: &str) -> Result { + let path = bad_data_dir().join(name); + println!("Reading file: {:?}", path); + + let file = std::fs::File::open(&path).unwrap(); + let reader = ArrowReaderBuilder::try_new(file)?.build()?; + + let mut num_rows = 0; + for batch in reader { + let batch = batch?; + num_rows += batch.num_rows(); + } + Ok(num_rows) +} diff --git a/parquet/tests/arrow_reader/mod.rs b/parquet/tests/arrow_reader/mod.rs index 7e979dcf3ec0..cc4c8f3c977b 100644 --- a/parquet/tests/arrow_reader/mod.rs +++ b/parquet/tests/arrow_reader/mod.rs @@ -35,6 +35,7 @@ use parquet::file::properties::{EnabledStatistics, WriterProperties}; use std::sync::Arc; use tempfile::NamedTempFile; +mod bad_data; mod statistics; // returns a struct array with columns "int32_col", "float32_col" and "float64_col" with the specified values From 6a1a7102e3be748045c3b31ee9e462336705f3ea Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 15 Aug 2024 18:13:50 -0400 Subject: [PATCH 2/3] Reenable test --- parquet/tests/arrow_reader/bad_data.rs | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/parquet/tests/arrow_reader/bad_data.rs b/parquet/tests/arrow_reader/bad_data.rs index f561a5a492bc..de168a12cd9f 100644 --- a/parquet/tests/arrow_reader/bad_data.rs +++ b/parquet/tests/arrow_reader/bad_data.rs @@ -86,13 +86,9 @@ fn test_parquet_1481() { #[should_panic(expected = "assertion failed: self.current_value.is_some()")] fn test_arrow_gh_41321() { let err = read_file("ARROW-GH-41321.parquet").unwrap_err(); - assert_eq!( - err.to_string(), - "TBD (currently panics)" - ); + assert_eq!(err.to_string(), "TBD (currently panics)"); } - #[test] fn test_arrow_rs_gh_6229_dict_header() { let err = read_file("ARROW-RS-GH-6229-DICTHEADER.parquet").unwrap_err(); @@ -102,17 +98,14 @@ fn test_arrow_rs_gh_6229_dict_header() { ); } -/* INFINITE LOOP -https://github.com/apache/arrow-rs/issues/6229 #[test] fn test_arrow_rs_gh_6229_dict_levels() { let err = read_file("ARROW-RS-GH-6229-LEVELS.parquet").unwrap_err(); assert_eq!( err.to_string(), - "External: Parquet argument error: EOF: eof decoding byte array" + "External: Parquet argument error: Parquet error: Insufficient repetition levels read from column" ); } - */ /// Reads the file and tries to return the total row count /// Returns an error if the file is invalid From 6779cf95ba97297697cad43941acc8ef81b8622a Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 15 Aug 2024 18:23:10 -0400 Subject: [PATCH 3/3] Add test for very subltley different file --- parquet/tests/arrow_reader/bad_data.rs | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/parquet/tests/arrow_reader/bad_data.rs b/parquet/tests/arrow_reader/bad_data.rs index de168a12cd9f..6e325f119710 100644 --- a/parquet/tests/arrow_reader/bad_data.rs +++ b/parquet/tests/arrow_reader/bad_data.rs @@ -25,6 +25,7 @@ use std::path::PathBuf; static KNOWN_FILES: &[&str] = &[ "PARQUET-1481.parquet", + "ARROW-GH-41317.parquet", "ARROW-GH-41321.parquet", "ARROW-RS-GH-6229-DICTHEADER.parquet", "ARROW-RS-GH-6229-LEVELS.parquet", @@ -58,6 +59,7 @@ fn test_invalid_files() { .to_str() .unwrap() .to_string(); + // If you see this error, please add a test for the new file following the model below assert!( known_files.contains(file_name.as_str()), @@ -89,6 +91,15 @@ fn test_arrow_gh_41321() { assert_eq!(err.to_string(), "TBD (currently panics)"); } +#[test] +fn test_arrow_gh_41317() { + let err = read_file("ARROW-GH-41317.parquet").unwrap_err(); + assert_eq!( + err.to_string(), + "External: Parquet argument error: External: bad data" + ); +} + #[test] fn test_arrow_rs_gh_6229_dict_header() { let err = read_file("ARROW-RS-GH-6229-DICTHEADER.parquet").unwrap_err();