Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,14 @@ public <T> Boolean eq(BoundReference<T> ref, Literal<T> lit) {
public <T> Boolean notEq(BoundReference<T> ref, Literal<T> lit) {
  // In general the bounds cannot answer a not-equal predicate: they are not guaranteed to be an
  // actual min or max, so notEq(col, X) against bounds (X, Y) does not prove X occurs in col.
  // The one exception is a field that holds exactly one distinct value with no nulls or NaNs;
  // uniqueValue returns that value, or null when the field does not qualify.
  T single = uniqueValue(ref);
  if (single == null) {
    return ROWS_MIGHT_MATCH;
  }

  // every partition value is `single`, so nothing can differ from a literal that equals it
  boolean sameAsLiteral = lit.comparator().compare(single, lit.value()) == 0;
  if (sameAsLiteral) {
    return ROWS_CANNOT_MATCH;
  }

  return ROWS_MIGHT_MATCH;
}

Expand Down Expand Up @@ -313,6 +321,14 @@ public <T> Boolean in(BoundReference<T> ref, Set<T> literalSet) {
public <T> Boolean notIn(BoundReference<T> ref, Set<T> literalSet) {
  // In general the bounds cannot answer a not-in predicate: they are not guaranteed to be an
  // actual min or max, so notIn(col, {X, ...}) against bounds (X, Y) does not prove X occurs in
  // col. The one exception is a field that holds exactly one distinct value with no nulls or
  // NaNs; uniqueValue returns that value, or null when the field does not qualify.
  T single = uniqueValue(ref);

  // the manifest can be skipped only when that single value is one of the excluded literals
  boolean excluded = single != null && literalSet.contains(single);
  if (excluded) {
    return ROWS_CANNOT_MATCH;
  }

  return ROWS_MIGHT_MATCH;
}

Expand Down Expand Up @@ -400,6 +416,38 @@ public <T> Boolean notStartsWith(BoundReference<T> ref, Literal<T> lit) {
return ROWS_MIGHT_MATCH;
}

/**
 * Looks up the single value shared by the referenced partition field, if there is one.
 *
 * <p>The field qualifies only when its summary reports no nulls, reports a known absence of NaNs,
 * has both bounds present, and the lower bound compares equal to the upper bound.
 *
 * @param ref bound reference identifying the partition field
 * @return the shared value, or null when any of the conditions above is not met
 */
private <T> T uniqueValue(BoundReference<T> ref) {
  PartitionFieldSummary summary = stats.get(Accessors.toPosition(ref.accessor()));

  if (summary.containsNull()) {
    return null;
  }

  // a missing (null) NaN flag is treated the same as "contains NaN"
  Boolean hasNaN = summary.containsNaN();
  if (hasNaN == null || hasNaN) {
    return null;
  }

  ByteBuffer lowerBuf = summary.lowerBound();
  ByteBuffer upperBuf = summary.upperBound();
  if (lowerBuf == null || upperBuf == null) {
    return null;
  }

  T min = Conversions.fromByteBuffer(ref.type(), lowerBuf);
  T max = Conversions.fromByteBuffer(ref.type(), upperBuf);

  // equal bounds leave room for exactly one value
  return ref.comparator().compare(min, max) == 0 ? min : null;
}

private boolean allValuesAreNull(PartitionFieldSummary summary, Type.TypeID typeId) {
// containsNull encodes whether at least one partition value is null,
// lowerBound is null if all partition values are null
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,9 @@ public class TestInclusiveManifestEvaluator {
optional(12, "no_nan_or_null", Types.DoubleType.get()),
optional(13, "all_nulls_missing_nan_float", Types.FloatType.get()),
optional(14, "all_same_value_or_null", Types.StringType.get()),
optional(15, "no_nulls_same_value_a", Types.StringType.get()));
optional(15, "no_nulls_same_value_a", Types.StringType.get()),
optional(16, "single_value_with_nan", Types.FloatType.get()),
optional(17, "single_value_nan_unknown", Types.FloatType.get()));

private static final PartitionSpec SPEC =
PartitionSpec.builderFor(SCHEMA)
Expand All @@ -84,6 +86,8 @@ public class TestInclusiveManifestEvaluator {
.identity("all_nulls_missing_nan_float")
.identity("all_same_value_or_null")
.identity("no_nulls_same_value_a")
.identity("single_value_with_nan")
.identity("single_value_nan_unknown")
.build();

private static final int INT_MIN_VALUE = 30;
Expand Down Expand Up @@ -128,7 +132,16 @@ public class TestInclusiveManifestEvaluator {
toByteBuffer(Types.FloatType.get(), 20F)),
new TestHelpers.TestFieldSummary(true, null, null),
new TestHelpers.TestFieldSummary(true, STRING_MIN, STRING_MIN),
new TestHelpers.TestFieldSummary(false, STRING_MIN, STRING_MIN)),
new TestHelpers.TestFieldSummary(false, false, STRING_MIN, STRING_MIN),
new TestHelpers.TestFieldSummary(
false,
true,
toByteBuffer(Types.FloatType.get(), 5.0F),
toByteBuffer(Types.FloatType.get(), 5.0F)),
new TestHelpers.TestFieldSummary(
false,
toByteBuffer(Types.FloatType.get(), 5.0F),
toByteBuffer(Types.FloatType.get(), 5.0F))),
null);

@Test
Expand Down Expand Up @@ -753,4 +766,72 @@ public void testIntegerNotIn() {
ManifestEvaluator.forRowFilter(notIn("no_nulls", "abc", "def"), SPEC, true).eval(FILE);
assertThat(shouldRead).as("Should read: notIn on no nulls column").isTrue();
}

@Test
public void testNotEqWithSingleValue() {
  // Each scenario evaluates a not-equal filter against the shared manifest fixture and checks
  // whether the manifest is kept (true) or pruned (false).
  boolean rangeOfValues =
      ManifestEvaluator.forRowFilter(notEqual("no_nulls", "a"), SPEC, true).eval(FILE);
  assertThat(rangeOfValues).as("Should read: manifest has range of values").isTrue();

  boolean singleValueEqualsLiteral =
      ManifestEvaluator.forRowFilter(notEqual("no_nulls_same_value_a", "a"), SPEC, true)
          .eval(FILE);
  assertThat(singleValueEqualsLiteral)
      .as("Should not read: manifest contains single value equal to literal")
      .isFalse();

  boolean singleValueDiffersFromLiteral =
      ManifestEvaluator.forRowFilter(notEqual("no_nulls_same_value_a", "b"), SPEC, true)
          .eval(FILE);
  assertThat(singleValueDiffersFromLiteral)
      .as("Should read: manifest contains single value not equal to literal")
      .isTrue();

  boolean singleValueWithNulls =
      ManifestEvaluator.forRowFilter(notEqual("all_same_value_or_null", "a"), SPEC, true)
          .eval(FILE);
  assertThat(singleValueWithNulls)
      .as("Should read: manifest has nulls which match != predicate")
      .isTrue();

  boolean singleValueWithNaN =
      ManifestEvaluator.forRowFilter(notEqual("single_value_with_nan", 5.0F), SPEC, true)
          .eval(FILE);
  assertThat(singleValueWithNaN)
      .as("Should read: manifest has NaN values which match != predicate")
      .isTrue();

  boolean singleValueNaNUnknown =
      ManifestEvaluator.forRowFilter(notEqual("single_value_nan_unknown", 5.0F), SPEC, true)
          .eval(FILE);
  assertThat(singleValueNaNUnknown).as("Should read: manifest has unknown NaN info").isTrue();
}

@Test
public void testNotInWithSingleValue() {
  // Each scenario evaluates a not-in filter against the shared manifest fixture and checks
  // whether the manifest is kept (true) or pruned (false).
  boolean rangeOfValues =
      ManifestEvaluator.forRowFilter(notIn("no_nulls", "a", "b"), SPEC, true).eval(FILE);
  assertThat(rangeOfValues).as("Should read: manifest has range of values").isTrue();

  boolean singleValueExcluded =
      ManifestEvaluator.forRowFilter(notIn("no_nulls_same_value_a", "a", "b"), SPEC, true)
          .eval(FILE);
  assertThat(singleValueExcluded)
      .as("Should not read: manifest contains single value in exclusion list")
      .isFalse();

  boolean singleValueNotExcluded =
      ManifestEvaluator.forRowFilter(notIn("no_nulls_same_value_a", "b", "c"), SPEC, true)
          .eval(FILE);
  assertThat(singleValueNotExcluded)
      .as("Should read: manifest contains single value not in exclusion list")
      .isTrue();

  boolean singleValueWithNulls =
      ManifestEvaluator.forRowFilter(notIn("all_same_value_or_null", "a", "b"), SPEC, true)
          .eval(FILE);
  assertThat(singleValueWithNulls)
      .as("Should read: manifest has nulls which match NOT IN predicate")
      .isTrue();

  boolean singleValueWithNaN =
      ManifestEvaluator.forRowFilter(notIn("single_value_with_nan", 5.0F, 10.0F), SPEC, true)
          .eval(FILE);
  assertThat(singleValueWithNaN)
      .as("Should read: manifest has NaN values which match NOT IN predicate")
      .isTrue();

  boolean singleValueNaNUnknown =
      ManifestEvaluator.forRowFilter(notIn("single_value_nan_unknown", 5.0F, 10.0F), SPEC, true)
          .eval(FILE);
  assertThat(singleValueNaNUnknown).as("Should read: manifest has unknown NaN info").isTrue();
}
}