From a944d030f9365b5080586bd2e522555ab8cf20a7 Mon Sep 17 00:00:00 2001 From: Joy Haldar Date: Sat, 17 Jan 2026 03:01:30 +0530 Subject: [PATCH 1/4] API: Optimize NOT IN and != predicates for single-value partition manifests --- .../expressions/ManifestEvaluator.java | 50 +++++++++++++ .../TestInclusiveManifestEvaluator.java | 75 ++++++++++++++++++- 2 files changed, 123 insertions(+), 2 deletions(-) diff --git a/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java index fc3d394203ff..3d031c3eb838 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java @@ -269,6 +269,14 @@ public Boolean eq(BoundReference ref, Literal lit) { public Boolean notEq(BoundReference ref, Literal lit) { // because the bounds are not necessarily a min or max value, this cannot be answered using // them. notEq(col, X) with (X, Y) doesn't guarantee that X is a value in col. + // However, when lower == upper and the manifest has no nulls or NaN values, we can safely + // prune if that value equals the literal. + T value = uniqueValue(ref); + + if (value != null && lit.comparator().compare(value, lit.value()) == 0) { + return ROWS_CANNOT_MATCH; + } + return ROWS_MIGHT_MATCH; } @@ -313,6 +321,14 @@ public Boolean in(BoundReference ref, Set literalSet) { public Boolean notIn(BoundReference ref, Set literalSet) { // because the bounds are not necessarily a min or max value, this cannot be answered using // them. notIn(col, {X, ...}) with (X, Y) doesn't guarantee that X is a value in col. + // However, when lower == upper and the manifest has no nulls or NaN values, we can safely + // prune if that value is in the exclusion set. + T value = uniqueValue(ref); + + if (value != null && literalSet.contains(value)) { + return ROWS_CANNOT_MATCH; + } + return ROWS_MIGHT_MATCH; } @@ -400,6 +416,40 @@ public Boolean notStartsWith(BoundReference ref, Literal lit) { return ROWS_MIGHT_MATCH; } + /** + * Returns the partition field's single value if all partitions contain the same value. Defined + * as a partition field with no nulls, no NaNs, and lower bound equals upper bound. Returns null + * otherwise. + */ + private T uniqueValue(BoundReference ref) { + int pos = Accessors.toPosition(ref.accessor()); + PartitionFieldSummary fieldStats = stats.get(pos); + + if (fieldStats.containsNull()) { + return null; + } + + if (fieldStats.containsNaN() != null && fieldStats.containsNaN()) { + return null; + } + + ByteBuffer lowerBound = fieldStats.lowerBound(); + ByteBuffer upperBound = fieldStats.upperBound(); + + if (lowerBound == null || upperBound == null) { + return null; + } + + T lower = Conversions.fromByteBuffer(ref.type(), lowerBound); + T upper = Conversions.fromByteBuffer(ref.type(), upperBound); + + if (ref.comparator().compare(lower, upper) != 0) { + return null; + } + + return lower; + } + private boolean allValuesAreNull(PartitionFieldSummary summary, Type.TypeID typeId) { // containsNull encodes whether at least one partition value is null, // lowerBound is null if all partition values are null diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java index 6c4944e9cd3a..bbd0d5329162 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java @@ -66,7 +66,8 @@ public class TestInclusiveManifestEvaluator { optional(12, "no_nan_or_null", Types.DoubleType.get()), optional(13, "all_nulls_missing_nan_float", Types.FloatType.get()), optional(14, "all_same_value_or_null", Types.StringType.get()), - optional(15, "no_nulls_same_value_a", Types.StringType.get())); + optional(15, "no_nulls_same_value_a", Types.StringType.get()), + optional(16, "single_value_with_nan", Types.FloatType.get())); private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA) @@ -84,6 +85,7 @@ public class TestInclusiveManifestEvaluator { .identity("all_nulls_missing_nan_float") .identity("all_same_value_or_null") .identity("no_nulls_same_value_a") + .identity("single_value_with_nan") .build(); private static final int INT_MIN_VALUE = 30; @@ -128,7 +130,12 @@ public class TestInclusiveManifestEvaluator { toByteBuffer(Types.FloatType.get(), 20F)), new TestHelpers.TestFieldSummary(true, null, null), new TestHelpers.TestFieldSummary(true, STRING_MIN, STRING_MIN), - new TestHelpers.TestFieldSummary(false, STRING_MIN, STRING_MIN)), + new TestHelpers.TestFieldSummary(false, STRING_MIN, STRING_MIN), + new TestHelpers.TestFieldSummary( + false, + true, + toByteBuffer(Types.FloatType.get(), 5.0F), + toByteBuffer(Types.FloatType.get(), 5.0F))), null); @Test @@ -753,4 +760,68 @@ public void testIntegerNotIn() { ManifestEvaluator.forRowFilter(notIn("no_nulls", "abc", "def"), SPEC, true).eval(FILE); assertThat(shouldRead).as("Should read: notIn on no nulls column").isTrue(); } + + @Test + public void testNotEqWithSingleValue() { + boolean shouldRead = + ManifestEvaluator.forRowFilter(notEqual("no_nulls", "a"), SPEC, true).eval(FILE); + assertThat(shouldRead) + .as("Should read: manifest has range of partition values, cannot exclude based on literal") + .isTrue(); + shouldRead = + ManifestEvaluator.forRowFilter(notEqual("no_nulls_same_value_a", "a"), SPEC, true) + .eval(FILE); + assertThat(shouldRead) + .as("Should not read: manifest contains single partition value equal to literal") + .isFalse(); + shouldRead = + ManifestEvaluator.forRowFilter(notEqual("no_nulls_same_value_a", "b"), SPEC, true) + .eval(FILE); + assertThat(shouldRead) + .as("Should read: manifest contains single partition value not equal to literal") + .isTrue(); + shouldRead = + ManifestEvaluator.forRowFilter(notEqual("all_same_value_or_null", "a"), SPEC, true) + .eval(FILE); + assertThat(shouldRead).as("Should read: manifest has nulls which match != predicate").isTrue(); + shouldRead = + ManifestEvaluator.forRowFilter(notEqual("single_value_with_nan", 5.0F), SPEC, true) + .eval(FILE); + assertThat(shouldRead) + .as("Should read: manifest has NaN values which match != predicate") + .isTrue(); + } + + @Test + public void testNotInWithSingleValue() { + boolean shouldRead = + ManifestEvaluator.forRowFilter(notIn("no_nulls", "a", "b"), SPEC, true).eval(FILE); + assertThat(shouldRead) + .as("Should read: manifest has range of partition values, cannot exclude based on literal") + .isTrue(); + shouldRead = + ManifestEvaluator.forRowFilter(notIn("no_nulls_same_value_a", "a", "b"), SPEC, true) + .eval(FILE); + assertThat(shouldRead) + .as("Should not read: manifest contains single partition value in exclusion list") + .isFalse(); + shouldRead = + ManifestEvaluator.forRowFilter(notIn("no_nulls_same_value_a", "b", "c"), SPEC, true) + .eval(FILE); + assertThat(shouldRead) + .as("Should read: manifest contains single partition value not in exclusion list") + .isTrue(); + shouldRead = + ManifestEvaluator.forRowFilter(notIn("all_same_value_or_null", "a", "b"), SPEC, true) + .eval(FILE); + assertThat(shouldRead) + .as("Should read: manifest has nulls which match NOT IN predicate") + .isTrue(); + shouldRead = + ManifestEvaluator.forRowFilter(notIn("single_value_with_nan", 5.0F, 10.0F), SPEC, true) + .eval(FILE); + assertThat(shouldRead) + .as("Should read: manifest has NaN values which match NOT IN predicate") + .isTrue(); + } } From 0718b85d75d501e775243256d72ec099fe4633fb Mon Sep 17 00:00:00 2001 From: Joy Haldar Date: Tue, 20 Jan 2026 10:26:23 +0530 Subject: [PATCH 2/4] Treat null containsNaN as unknown and skip pruning --- .../expressions/ManifestEvaluator.java | 2 +- .../TestInclusiveManifestEvaluator.java | 34 ++++++++++++------- 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java index 3d031c3eb838..11d086a77c96 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java @@ -429,7 +429,7 @@ private T uniqueValue(BoundReference ref) { return null; } - if (fieldStats.containsNaN() != null && fieldStats.containsNaN()) { + if (fieldStats.containsNaN() == null || fieldStats.containsNaN()) { return null; } diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java index bbd0d5329162..9d13cc1eb052 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java @@ -67,7 +67,8 @@ public class TestInclusiveManifestEvaluator { optional(13, "all_nulls_missing_nan_float", Types.FloatType.get()), optional(14, "all_same_value_or_null", Types.StringType.get()), optional(15, "no_nulls_same_value_a", Types.StringType.get()), - optional(16, "single_value_with_nan", Types.FloatType.get())); + optional(16, "single_value_with_nan", Types.FloatType.get()), + optional(17, "single_value_nan_unknown", Types.FloatType.get())); private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA) @@ -86,6 +87,7 @@ public class TestInclusiveManifestEvaluator { .identity("all_same_value_or_null") .identity("no_nulls_same_value_a") .identity("single_value_with_nan") + .identity("single_value_nan_unknown") .build(); private static final int INT_MIN_VALUE = 30; @@ -130,11 +132,15 @@ public class TestInclusiveManifestEvaluator { toByteBuffer(Types.FloatType.get(), 20F)), new TestHelpers.TestFieldSummary(true, null, null), new TestHelpers.TestFieldSummary(true, STRING_MIN, STRING_MIN), - new TestHelpers.TestFieldSummary(false, STRING_MIN, STRING_MIN), + new TestHelpers.TestFieldSummary(false, false, STRING_MIN, STRING_MIN), new TestHelpers.TestFieldSummary( false, true, toByteBuffer(Types.FloatType.get(), 5.0F), + toByteBuffer(Types.FloatType.get(), 5.0F)), + new TestHelpers.TestFieldSummary( + false, + toByteBuffer(Types.FloatType.get(), 5.0F), toByteBuffer(Types.FloatType.get(), 5.0F))), null); @@ -765,20 +771,18 @@ public void testIntegerNotIn() { public void testNotEqWithSingleValue() { boolean shouldRead = ManifestEvaluator.forRowFilter(notEqual("no_nulls", "a"), SPEC, true).eval(FILE); - assertThat(shouldRead) - .as("Should read: manifest has range of partition values, cannot exclude based on literal") - .isTrue(); + assertThat(shouldRead).as("Should read: manifest has range of values").isTrue(); shouldRead = ManifestEvaluator.forRowFilter(notEqual("no_nulls_same_value_a", "a"), SPEC, true) .eval(FILE); assertThat(shouldRead) - .as("Should not read: manifest contains single partition value equal to literal") + .as("Should not read: manifest contains single value equal to literal") .isFalse(); shouldRead = ManifestEvaluator.forRowFilter(notEqual("no_nulls_same_value_a", "b"), SPEC, true) .eval(FILE); assertThat(shouldRead) - .as("Should read: manifest contains single partition value not equal to literal") + .as("Should read: manifest contains single value not equal to literal") .isTrue(); shouldRead = ManifestEvaluator.forRowFilter(notEqual("all_same_value_or_null", "a"), SPEC, true) @@ -790,26 +794,28 @@ public void testNotEqWithSingleValue() { assertThat(shouldRead) .as("Should read: manifest has NaN values which match != predicate") .isTrue(); + shouldRead = + ManifestEvaluator.forRowFilter(notEqual("single_value_nan_unknown", 5.0F), SPEC, true) + .eval(FILE); + assertThat(shouldRead).as("Should read: manifest has unknown NaN info").isTrue(); } @Test public void testNotInWithSingleValue() { boolean shouldRead = ManifestEvaluator.forRowFilter(notIn("no_nulls", "a", "b"), SPEC, true).eval(FILE); - assertThat(shouldRead) - .as("Should read: manifest has range of partition values, cannot exclude based on literal") - .isTrue(); + assertThat(shouldRead).as("Should read: manifest has range of values").isTrue(); shouldRead = ManifestEvaluator.forRowFilter(notIn("no_nulls_same_value_a", "a", "b"), SPEC, true) .eval(FILE); assertThat(shouldRead) - .as("Should not read: manifest contains single partition value in exclusion list") + .as("Should not read: manifest contains single value in exclusion list") .isFalse(); shouldRead = ManifestEvaluator.forRowFilter(notIn("no_nulls_same_value_a", "b", "c"), SPEC, true) .eval(FILE); assertThat(shouldRead) - .as("Should read: manifest contains single partition value not in exclusion list") + .as("Should read: manifest contains single value not in exclusion list") .isTrue(); shouldRead = ManifestEvaluator.forRowFilter(notIn("all_same_value_or_null", "a", "b"), SPEC, true) @@ -823,5 +829,9 @@ public void testNotInWithSingleValue() { assertThat(shouldRead) .as("Should read: manifest has NaN values which match NOT IN predicate") .isTrue(); + shouldRead = + ManifestEvaluator.forRowFilter(notIn("single_value_nan_unknown", 5.0F, 10.0F), SPEC, true) + .eval(FILE); + assertThat(shouldRead).as("Should read: manifest has unknown NaN info").isTrue(); } } From f3520a732ec56f3e32a7cd7cad964d3bb3ef69ec Mon Sep 17 00:00:00 2001 From: Joy Haldar Date: Tue, 20 Jan 2026 15:05:34 +0530 Subject: [PATCH 3/4] Combine null and NaN checks in uniqueValue. --- .../org/apache/iceberg/expressions/ManifestEvaluator.java | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java index 11d086a77c96..c62969641b12 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java @@ -425,11 +425,9 @@ private T uniqueValue(BoundReference ref) { int pos = Accessors.toPosition(ref.accessor()); PartitionFieldSummary fieldStats = stats.get(pos); - if (fieldStats.containsNull()) { - return null; - } - - if (fieldStats.containsNaN() == null || fieldStats.containsNaN()) { + if (fieldStats.containsNull() + || fieldStats.containsNaN() == null + || fieldStats.containsNaN()) { return null; } From 963104ed58ce5f85d24e93470dad95b792ad3720 Mon Sep 17 00:00:00 2001 From: Joy Haldar Date: Wed, 21 Jan 2026 10:13:40 +0530 Subject: [PATCH 4/4] Only check containsNaN for floating-point types. --- .../expressions/ManifestEvaluator.java | 15 ++++++++---- .../TestInclusiveManifestEvaluator.java | 23 +++++++++++++++++-- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java index c62969641b12..21762946ca33 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java @@ -418,19 +418,24 @@ public Boolean notStartsWith(BoundReference ref, Literal lit) { /** * Returns the partition field's single value if all partitions contain the same value. Defined - * as a partition field with no nulls, no NaNs, and lower bound equals upper bound. Returns null - * otherwise. + * as a partition field with no nulls, no NaNs (for floating-point types), and lower bound + * equals upper bound. Returns null otherwise. */ private T uniqueValue(BoundReference ref) { int pos = Accessors.toPosition(ref.accessor()); PartitionFieldSummary fieldStats = stats.get(pos); - if (fieldStats.containsNull() - || fieldStats.containsNaN() == null - || fieldStats.containsNaN()) { + if (fieldStats.containsNull()) { return null; } + Type.TypeID typeId = ref.type().typeId(); + if (Type.TypeID.FLOAT.equals(typeId) || Type.TypeID.DOUBLE.equals(typeId)) { + if (fieldStats.containsNaN() == null || fieldStats.containsNaN()) { + return null; + } + } + ByteBuffer lowerBound = fieldStats.lowerBound(); ByteBuffer upperBound = fieldStats.upperBound(); diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java index 9d13cc1eb052..78e6064eb427 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java @@ -68,7 +68,8 @@ public class TestInclusiveManifestEvaluator { optional(14, "all_same_value_or_null", Types.StringType.get()), optional(15, "no_nulls_same_value_a", Types.StringType.get()), optional(16, "single_value_with_nan", Types.FloatType.get()), - optional(17, "single_value_nan_unknown", Types.FloatType.get())); + optional(17, "single_value_nan_unknown", Types.FloatType.get()), + optional(18, "single_value_no_nan", Types.FloatType.get())); private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA) @@ -88,6 +89,7 @@ public class TestInclusiveManifestEvaluator { .identity("no_nulls_same_value_a") .identity("single_value_with_nan") .identity("single_value_nan_unknown") + .identity("single_value_no_nan") .build(); private static final int INT_MIN_VALUE = 30; @@ -132,13 +134,18 @@ public class TestInclusiveManifestEvaluator { toByteBuffer(Types.FloatType.get(), 20F)), new TestHelpers.TestFieldSummary(true, null, null), new TestHelpers.TestFieldSummary(true, STRING_MIN, STRING_MIN), - new TestHelpers.TestFieldSummary(false, false, STRING_MIN, STRING_MIN), + new TestHelpers.TestFieldSummary(false, STRING_MIN, STRING_MIN), new TestHelpers.TestFieldSummary( false, true, toByteBuffer(Types.FloatType.get(), 5.0F), toByteBuffer(Types.FloatType.get(), 5.0F)), new TestHelpers.TestFieldSummary( + false, + toByteBuffer(Types.FloatType.get(), 5.0F), + toByteBuffer(Types.FloatType.get(), 5.0F)), + new TestHelpers.TestFieldSummary( + false, false, toByteBuffer(Types.FloatType.get(), 5.0F), toByteBuffer(Types.FloatType.get(), 5.0F))), @@ -798,6 +805,12 @@ public void testNotEqWithSingleValue() { ManifestEvaluator.forRowFilter(notEqual("single_value_nan_unknown", 5.0F), SPEC, true) .eval(FILE); assertThat(shouldRead).as("Should read: manifest has unknown NaN info").isTrue(); + shouldRead = + ManifestEvaluator.forRowFilter(notEqual("single_value_no_nan", 5.0F), SPEC, true) + .eval(FILE); + assertThat(shouldRead) + .as("Should not read: manifest contains single float value with no NaNs") + .isFalse(); } @Test @@ -833,5 +846,11 @@ public void testNotInWithSingleValue() { ManifestEvaluator.forRowFilter(notIn("single_value_nan_unknown", 5.0F, 10.0F), SPEC, true) .eval(FILE); assertThat(shouldRead).as("Should read: manifest has unknown NaN info").isTrue(); + shouldRead = + ManifestEvaluator.forRowFilter(notIn("single_value_no_nan", 5.0F, 10.0F), SPEC, true) + .eval(FILE); + assertThat(shouldRead) + .as("Should not read: manifest contains single float value with no NaNs") + .isFalse(); } }