gradle/libs.versions.toml (2 changes: 1 addition & 1 deletion)
@@ -37,7 +37,7 @@ awssdk-s3accessgrants = "2.3.0"
bson-ver = "4.11.5"
caffeine = "2.9.3"
calcite = "1.40.0"
comet = "0.8.1"
comet = "0.10.0"
datasketches = "6.2.0"
delta-standalone = "3.3.2"
delta-spark = "3.3.2"
@@ -151,7 +151,7 @@ private OrcBatchReadConf orcBatchReadConf() {
// - Parquet vectorization is enabled
// - only primitives or metadata columns are projected
// - all tasks are of FileScanTask type and read only Parquet files
-private boolean useParquetBatchReads() {
+protected boolean useParquetBatchReads() {
return readConf.parquetVectorizationEnabled()
&& expectedSchema.columns().stream().allMatch(this::supportsParquetBatchReads)
&& taskGroups.stream().allMatch(this::supportsParquetBatchReads);
@@ -175,7 +175,7 @@ private boolean supportsParquetBatchReads(Types.NestedField field) {
return field.type().isPrimitiveType() || MetadataColumns.isMetadataColumn(field.fieldId());
}

-private boolean useCometBatchReads() {
+protected boolean useCometBatchReads() {
return readConf.parquetVectorizationEnabled()
&& readConf.parquetReaderType() == ParquetReaderType.COMET
&& expectedSchema.columns().stream().allMatch(this::supportsCometBatchReads)
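The visibility change above lets code outside SparkBatch, such as the SparkScan change later in this diff, ask whether vectorized reads will actually be used. As a side note, the task-level condition from the comment ("all tasks are of FileScanTask type and read only Parquet files") can be sketched roughly as below. This is only an illustrative shape built on public Iceberg scan APIs; the helper name and method are hypothetical, and the PR's actual task-group check is not shown in this diff.

import org.apache.iceberg.FileFormat;
import org.apache.iceberg.ScanTask;
import org.apache.iceberg.ScanTaskGroup;

// Illustrative only: roughly what "all tasks are FileScanTasks reading only
// Parquet files" could look like; the real check in SparkBatch may differ.
final class ParquetTaskCheck {
  private ParquetTaskCheck() {}

  static boolean readsOnlyParquet(ScanTaskGroup<? extends ScanTask> taskGroup) {
    return taskGroup.tasks().stream()
        .allMatch(task -> task.isFileScanTask()
            && task.asFileScanTask().file().format() == FileFormat.PARQUET);
  }
}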
@@ -24,6 +24,7 @@
import java.util.Optional;
import java.util.function.Supplier;
import java.util.stream.Collectors;
+import org.apache.comet.parquet.SupportsComet;
import org.apache.iceberg.BlobMetadata;
import org.apache.iceberg.ScanTask;
import org.apache.iceberg.ScanTaskGroup;
@@ -95,7 +96,7 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

-abstract class SparkScan implements Scan, SupportsReportStatistics {
+abstract class SparkScan implements Scan, SupportsReportStatistics, SupportsComet {
private static final Logger LOG = LoggerFactory.getLogger(SparkScan.class);
private static final String NDV_KEY = "ndv";

@@ -351,4 +352,10 @@ protected long adjustSplitSize(List<? extends ScanTask> tasks, long splitSize) {
return splitSize;
}
}

+@Override
+public boolean isCometEnabled() {
+SparkBatch batch = (SparkBatch) this.toBatch();
+return batch.useCometBatchReads();
+}
}
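With SparkScan now implementing SupportsComet, any component that holds a DataSource V2 Scan can discover Comet eligibility through the interface instead of through Iceberg internals. Below is a minimal caller-side sketch assuming only the SupportsComet.isCometEnabled() contract visible in this diff; the helper class is hypothetical, and how Comet's own planner rules actually consume the interface is not part of this PR.

import org.apache.comet.parquet.SupportsComet;
import org.apache.spark.sql.connector.read.Scan;

// Hypothetical helper: treats a scan as Comet-eligible only when it opts in
// via the SupportsComet marker interface that SparkScan implements in this change.
final class CometScanCheck {
  private CometScanCheck() {}

  static boolean cometEnabled(Scan scan) {
    return scan instanceof SupportsComet && ((SupportsComet) scan).isCometEnabled();
  }
}

Note that the Comet artifact stays a compileOnly dependency in build.gradle (see the spark/v4.0/build.gradle hunk below), so the interface presumably only needs to be on the runtime classpath when Comet is actually in use.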
@@ -151,7 +151,7 @@ private OrcBatchReadConf orcBatchReadConf() {
// - Parquet vectorization is enabled
// - only primitives or metadata columns are projected
// - all tasks are of FileScanTask type and read only Parquet files
-private boolean useParquetBatchReads() {
+protected boolean useParquetBatchReads() {
return readConf.parquetVectorizationEnabled()
&& expectedSchema.columns().stream().allMatch(this::supportsParquetBatchReads)
&& taskGroups.stream().allMatch(this::supportsParquetBatchReads);
@@ -175,19 +175,7 @@ private boolean supportsParquetBatchReads(Types.NestedField field) {
return field.type().isPrimitiveType() || MetadataColumns.isMetadataColumn(field.fieldId());
}

-private boolean useCometBatchReads() {
-return readConf.parquetVectorizationEnabled()
-&& readConf.parquetReaderType() == ParquetReaderType.COMET
-&& expectedSchema.columns().stream().allMatch(this::supportsCometBatchReads)
-&& taskGroups.stream().allMatch(this::supportsParquetBatchReads);
-}

-private boolean supportsCometBatchReads(Types.NestedField field) {
-return field.type().isPrimitiveType()
-&& !field.type().typeId().equals(Type.TypeID.UUID)
-&& field.fieldId() != MetadataColumns.ROW_ID.fieldId()
-&& field.fieldId() != MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.fieldId();
-}

// conditions for using ORC batch reads:
// - ORC vectorization is enabled
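For reference, the Comet column gate shown on the removed side of the hunk above limits Comet batch reads to primitive, non-UUID columns and skips the row-lineage metadata columns. A commented restatement of that check follows; the logic is copied from the diff, while the "why" comments are editorial assumptions rather than statements from the PR.

import org.apache.iceberg.MetadataColumns;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types;

// Same logic as supportsCometBatchReads in the diff, with assumed reasoning
// spelled out in comments.
final class CometColumnCheck {
  private CometColumnCheck() {}

  static boolean supportsCometBatchReads(Types.NestedField field) {
    return field.type().isPrimitiveType()
        // assumption: Comet's vectorized Parquet reader does not handle Iceberg's UUID type
        && !field.type().typeId().equals(Type.TypeID.UUID)
        // row-lineage metadata columns are synthesized by Iceberg rather than read from Parquet
        && field.fieldId() != MetadataColumns.ROW_ID.fieldId()
        && field.fieldId() != MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.fieldId();
  }
}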
@@ -24,6 +24,7 @@
import java.util.Optional;
import java.util.function.Supplier;
import java.util.stream.Collectors;
+import org.apache.comet.parquet.SupportsComet;
import org.apache.iceberg.BlobMetadata;
import org.apache.iceberg.ScanTask;
import org.apache.iceberg.ScanTaskGroup;
@@ -95,7 +96,7 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

-abstract class SparkScan implements Scan, SupportsReportStatistics {
+abstract class SparkScan implements Scan, SupportsReportStatistics, SupportsComet {
private static final Logger LOG = LoggerFactory.getLogger(SparkScan.class);
private static final String NDV_KEY = "ndv";

@@ -351,4 +352,10 @@ protected long adjustSplitSize(List<? extends ScanTask> tasks, long splitSize) {
return splitSize;
}
}

+@Override
+public boolean isCometEnabled() {
+SparkBatch batch = (SparkBatch) this.toBatch();
+return batch.useCometBatchReads();
+}
}
spark/v4.0/build.gradle (4 changes: 2 additions & 2 deletions)
@@ -80,7 +80,7 @@ project(":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}") {
exclude group: 'org.roaringbitmap'
}

compileOnly "org.apache.datafusion:comet-spark-spark3.5_2.13:${libs.versions.comet.get()}"
compileOnly "org.apache.datafusion:comet-spark-spark${sparkMajorVersion}_2.13:${libs.versions.comet.get()}"

implementation libs.parquet.column
implementation libs.parquet.hadoop
@@ -190,7 +190,7 @@ project(":iceberg-spark:iceberg-spark-extensions-${sparkMajorVersion}_${scalaVer
testImplementation libs.avro.avro
testImplementation libs.parquet.hadoop
testImplementation libs.awaitility
testImplementation "org.apache.datafusion:comet-spark-spark3.5_2.13:${libs.versions.comet.get()}"
testImplementation "org.apache.datafusion:comet-spark-spark${sparkMajorVersion}_2.13:${libs.versions.comet.get()}"
testImplementation(testFixtures(project(':iceberg-parquet')))

// Required because we remove antlr plugin dependencies from the compile configuration, see note above
@@ -175,7 +175,7 @@ private boolean supportsParquetBatchReads(Types.NestedField field) {
return field.type().isPrimitiveType() || MetadataColumns.isMetadataColumn(field.fieldId());
}

-private boolean useCometBatchReads() {
+protected boolean useCometBatchReads() {
return readConf.parquetVectorizationEnabled()
&& readConf.parquetReaderType() == ParquetReaderType.COMET
&& expectedSchema.columns().stream().allMatch(this::supportsCometBatchReads)
@@ -24,6 +24,7 @@
import java.util.Optional;
import java.util.function.Supplier;
import java.util.stream.Collectors;
+import org.apache.comet.parquet.SupportsComet;
import org.apache.iceberg.BlobMetadata;
import org.apache.iceberg.ScanTask;
import org.apache.iceberg.ScanTaskGroup;
@@ -95,7 +96,7 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

-abstract class SparkScan implements Scan, SupportsReportStatistics {
+abstract class SparkScan implements Scan, SupportsReportStatistics, SupportsComet {
private static final Logger LOG = LoggerFactory.getLogger(SparkScan.class);
private static final String NDV_KEY = "ndv";

@@ -351,4 +352,10 @@ protected long adjustSplitSize(List<? extends ScanTask> tasks, long splitSize) {
return splitSize;
}
}

+@Override
+public boolean isCometEnabled() {
+SparkBatch batch = (SparkBatch) this.toBatch();
+return batch.useCometBatchReads();
+}
}