diff --git a/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java
index fc3d394203ff..21762946ca33 100644
--- a/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java
+++ b/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java
@@ -269,6 +269,14 @@ public <T> Boolean eq(BoundReference<T> ref, Literal<T> lit) {
     public <T> Boolean notEq(BoundReference<T> ref, Literal<T> lit) {
       // because the bounds are not necessarily a min or max value, this cannot be answered using
       // them. notEq(col, X) with (X, Y) doesn't guarantee that X is a value in col.
+      // However, when lower == upper and the manifest has no nulls or NaN values, we can safely
+      // prune if that value equals the literal.
+      T value = uniqueValue(ref);
+
+      if (value != null && lit.comparator().compare(value, lit.value()) == 0) {
+        return ROWS_CANNOT_MATCH;
+      }
+
       return ROWS_MIGHT_MATCH;
     }
 
@@ -313,6 +321,14 @@ public <T> Boolean in(BoundReference<T> ref, Set<T> literalSet) {
     public <T> Boolean notIn(BoundReference<T> ref, Set<T> literalSet) {
       // because the bounds are not necessarily a min or max value, this cannot be answered using
       // them. notIn(col, {X, ...}) with (X, Y) doesn't guarantee that X is a value in col.
+      // However, when lower == upper and the manifest has no nulls or NaN values, we can safely
+      // prune if that value is in the exclusion set.
+      T value = uniqueValue(ref);
+
+      if (value != null && literalSet.contains(value)) {
+        return ROWS_CANNOT_MATCH;
+      }
+
       return ROWS_MIGHT_MATCH;
     }
 
@@ -400,6 +416,43 @@ public <T> Boolean notStartsWith(BoundReference<T> ref, Literal<T> lit) {
       return ROWS_MIGHT_MATCH;
     }
 
+    /**
+     * Returns the value shared by every partition of the referenced field, or null if the summary
+     * cannot prove one exists: the field contains nulls, a float or double field has or may have
+     * NaNs, either bound is missing, or the lower and upper bounds are not equal.
+     */
+    private <T> T uniqueValue(BoundReference<T> ref) {
+      int pos = Accessors.toPosition(ref.accessor());
+      PartitionFieldSummary fieldStats = stats.get(pos);
+
+      if (fieldStats.containsNull()) {
+        return null;
+      }
+
+      Type.TypeID typeId = ref.type().typeId();
+      if (Type.TypeID.FLOAT.equals(typeId) || Type.TypeID.DOUBLE.equals(typeId)) {
+        if (fieldStats.containsNaN() == null || fieldStats.containsNaN()) {
+          return null;
+        }
+      }
+
+      ByteBuffer lowerBound = fieldStats.lowerBound();
+      ByteBuffer upperBound = fieldStats.upperBound();
+
+      if (lowerBound == null || upperBound == null) {
+        return null;
+      }
+
+      T lower = Conversions.fromByteBuffer(ref.type(), lowerBound);
+      T upper = Conversions.fromByteBuffer(ref.type(), upperBound);
+
+      if (ref.comparator().compare(lower, upper) != 0) {
+        return null;
+      }
+
+      return lower;
+    }
+
     private boolean allValuesAreNull(PartitionFieldSummary summary, Type.TypeID typeId) {
       // containsNull encodes whether at least one partition value is null,
       // lowerBound is null if all partition values are null
diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java
index 6c4944e9cd3a..78e6064eb427 100644
--- a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java
+++ b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java
@@ -66,7 +66,10 @@ public class TestInclusiveManifestEvaluator {
           optional(12, "no_nan_or_null", Types.DoubleType.get()),
           optional(13, "all_nulls_missing_nan_float", Types.FloatType.get()),
           optional(14, "all_same_value_or_null", Types.StringType.get()),
-          optional(15, "no_nulls_same_value_a", Types.StringType.get()));
+          optional(15, "no_nulls_same_value_a", Types.StringType.get()),
+          optional(16, "single_value_with_nan", Types.FloatType.get()),
+          optional(17, "single_value_nan_unknown", Types.FloatType.get()),
+          optional(18, "single_value_no_nan", Types.FloatType.get()));
 
   private static final PartitionSpec SPEC =
       PartitionSpec.builderFor(SCHEMA)
@@ -84,6 +87,9 @@ public class TestInclusiveManifestEvaluator {
           .identity("all_nulls_missing_nan_float")
           .identity("all_same_value_or_null")
           .identity("no_nulls_same_value_a")
+          .identity("single_value_with_nan")
+          .identity("single_value_nan_unknown")
+          .identity("single_value_no_nan")
           .build();
 
   private static final int INT_MIN_VALUE = 30;
@@ -128,7 +134,21 @@ public class TestInclusiveManifestEvaluator {
                   toByteBuffer(Types.FloatType.get(), 20F)),
               new TestHelpers.TestFieldSummary(true, null, null),
               new TestHelpers.TestFieldSummary(true, STRING_MIN, STRING_MIN),
-              new TestHelpers.TestFieldSummary(false, STRING_MIN, STRING_MIN)),
+              new TestHelpers.TestFieldSummary(false, STRING_MIN, STRING_MIN),
+              new TestHelpers.TestFieldSummary(
+                  false,
+                  true,
+                  toByteBuffer(Types.FloatType.get(), 5.0F),
+                  toByteBuffer(Types.FloatType.get(), 5.0F)),
+              new TestHelpers.TestFieldSummary(
+                  false,
+                  toByteBuffer(Types.FloatType.get(), 5.0F),
+                  toByteBuffer(Types.FloatType.get(), 5.0F)),
+              new TestHelpers.TestFieldSummary(
+                  false,
+                  false,
+                  toByteBuffer(Types.FloatType.get(), 5.0F),
+                  toByteBuffer(Types.FloatType.get(), 5.0F))),
           null);
 
   @Test
@@ -753,4 +773,84 @@ public void testIntegerNotIn() {
         ManifestEvaluator.forRowFilter(notIn("no_nulls", "abc", "def"), SPEC, true).eval(FILE);
     assertThat(shouldRead).as("Should read: notIn on no nulls column").isTrue();
   }
+
+  @Test
+  public void testNotEqWithSingleValue() {
+    boolean shouldRead =
+        ManifestEvaluator.forRowFilter(notEqual("no_nulls", "a"), SPEC, true).eval(FILE);
+    assertThat(shouldRead).as("Should read: manifest has range of values").isTrue();
+    shouldRead =
+        ManifestEvaluator.forRowFilter(notEqual("no_nulls_same_value_a", "a"), SPEC, true)
+            .eval(FILE);
+    assertThat(shouldRead)
+        .as("Should not read: manifest contains single value equal to literal")
+        .isFalse();
+    shouldRead =
+        ManifestEvaluator.forRowFilter(notEqual("no_nulls_same_value_a", "b"), SPEC, true)
+            .eval(FILE);
+    assertThat(shouldRead)
+        .as("Should read: manifest contains single value not equal to literal")
+        .isTrue();
+    shouldRead =
+        ManifestEvaluator.forRowFilter(notEqual("all_same_value_or_null", "a"), SPEC, true)
+            .eval(FILE);
+    assertThat(shouldRead).as("Should read: manifest has nulls which match != predicate").isTrue();
+    shouldRead =
+        ManifestEvaluator.forRowFilter(notEqual("single_value_with_nan", 5.0F), SPEC, true)
+            .eval(FILE);
+    assertThat(shouldRead)
+        .as("Should read: manifest has NaN values which match != predicate")
+        .isTrue();
+    shouldRead =
+        ManifestEvaluator.forRowFilter(notEqual("single_value_nan_unknown", 5.0F), SPEC, true)
+            .eval(FILE);
+    assertThat(shouldRead).as("Should read: manifest has unknown NaN info").isTrue();
+    shouldRead =
+        ManifestEvaluator.forRowFilter(notEqual("single_value_no_nan", 5.0F), SPEC, true)
+            .eval(FILE);
+    assertThat(shouldRead)
+        .as("Should not read: manifest contains single float value with no NaNs")
+        .isFalse();
+  }
+
+  @Test
+  public void testNotInWithSingleValue() {
+    boolean shouldRead =
+        ManifestEvaluator.forRowFilter(notIn("no_nulls", "a", "b"), SPEC, true).eval(FILE);
+    assertThat(shouldRead).as("Should read: manifest has range of values").isTrue();
+    shouldRead =
+        ManifestEvaluator.forRowFilter(notIn("no_nulls_same_value_a", "a", "b"), SPEC, true)
+            .eval(FILE);
+    assertThat(shouldRead)
+        .as("Should not read: manifest contains single value in exclusion list")
+        .isFalse();
+    shouldRead =
+        ManifestEvaluator.forRowFilter(notIn("no_nulls_same_value_a", "b", "c"), SPEC, true)
+            .eval(FILE);
+    assertThat(shouldRead)
+        .as("Should read: manifest contains single value not in exclusion list")
+        .isTrue();
+    shouldRead =
+        ManifestEvaluator.forRowFilter(notIn("all_same_value_or_null", "a", "b"), SPEC, true)
+            .eval(FILE);
+    assertThat(shouldRead)
+        .as("Should read: manifest has nulls which match NOT IN predicate")
+        .isTrue();
+    shouldRead =
+        ManifestEvaluator.forRowFilter(notIn("single_value_with_nan", 5.0F, 10.0F), SPEC, true)
+            .eval(FILE);
+    assertThat(shouldRead)
+        .as("Should read: manifest has NaN values which match NOT IN predicate")
+        .isTrue();
+    shouldRead =
+        ManifestEvaluator.forRowFilter(notIn("single_value_nan_unknown", 5.0F, 10.0F), SPEC, true)
+            .eval(FILE);
+    assertThat(shouldRead).as("Should read: manifest has unknown NaN info").isTrue();
+    shouldRead =
+        ManifestEvaluator.forRowFilter(notIn("single_value_no_nan", 5.0F, 10.0F), SPEC, true)
+            .eval(FILE);
+    assertThat(shouldRead)
+        .as("Should not read: manifest contains single float value with no NaNs")
+        .isFalse();
+  }
 }