Building blocks for Ray DataFusionDatasource

ccciudatu · ccciudatu · commit d605d59f6cab · 2025-02-18T17:55:36.000+02:00
diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py
@@ -708,6 +708,9 @@ def count(self) -> int:
         """
         return self.df.count()
 
+    def distributed_plan(self, num_shards: int):  # split this frame's plan into shards
+        return self.df.distributed_plan(num_shards)  # delegates to the native DataFrame binding
+
     @deprecated("Use :py:func:`unnest_columns` instead.")
     def unnest_column(self, column: str, preserve_nulls: bool = True) -> DataFrame:
         """See :py:func:`unnest_columns`."""
diff --git a/src/dataframe.rs b/src/dataframe.rs
@@ -27,18 +27,27 @@ use arrow::util::display::{ArrayFormatter, FormatOptions};
 use datafusion::arrow::datatypes::Schema;
 use datafusion::arrow::pyarrow::{PyArrowType, ToPyArrow};
 use datafusion::arrow::util::pretty;
-use datafusion::common::UnnestOptions;
-use datafusion::config::{CsvOptions, TableParquetOptions};
+use datafusion::common::stats::Precision;
+use datafusion::common::{DFSchema, UnnestOptions};
+use datafusion::config::{ConfigOptions, CsvOptions, TableParquetOptions};
 use datafusion::dataframe::{DataFrame, DataFrameWriteOptions};
-use datafusion::execution::SendableRecordBatchStream;
+use datafusion::datasource::physical_plan::{FileScanConfig, ParquetExec};
+use datafusion::datasource::physical_plan::parquet::ParquetExecBuilder;
+use datafusion::execution::runtime_env::RuntimeEnvBuilder;
+use datafusion::execution::{SendableRecordBatchStream, TaskContext};
 use datafusion::parquet::basic::{BrotliLevel, Compression, GzipLevel, ZstdLevel};
+use datafusion::physical_plan::{displayable, execute_stream, ExecutionPlan};
 use datafusion::prelude::*;
+use datafusion_expr::registry::MemoryFunctionRegistry;
+use datafusion_proto::physical_plan::{AsExecutionPlan, PhysicalExtensionCodec};
+use datafusion_proto::protobuf::PhysicalPlanNode;
+use deltalake::delta_datafusion::DeltaPhysicalCodec;
+use prost::Message;
 use pyo3::exceptions::{PyTypeError, PyValueError};
 use pyo3::prelude::*;
 use pyo3::pybacked::PyBackedStr;
 use pyo3::types::{PyCapsule, PyTuple, PyTupleMethods};
 use tokio::task::JoinHandle;
-
 use crate::errors::py_datafusion_err;
 use crate::expr::sort_expr::to_sort_expressions;
 use crate::physical_plan::PyExecutionPlan;
@@ -49,6 +58,7 @@ use crate::{
     errors::DataFusionError,
     expr::{sort_expr::PySortExpr, PyExpr},
 };
+use crate::common::df_schema::PyDFSchema;
 
 /// A PyDataFrame is a representation of a logical plan and an API to compose statements.
 /// Use it to build a plan and `.collect()` to execute the plan and collect the result.
@@ -650,6 +660,153 @@ impl PyDataFrame {
     fn count(&self, py: Python) -> PyResult<usize> {
         Ok(wait_for_future(py, self.df.as_ref().clone().count())?)
     }
+
+    /// Splits this DataFrame's physical plan into shards (see `split_physical_plan`).
+    fn distributed_plan(&self, num_shards: usize, py: Python<'_>) -> PyResult<DistributedPlan> {
+        let sharding = split_physical_plan(&self.df, num_shards);
+        wait_for_future(py, sharding).map_err(PyErr::from)
+    }
+
+}
+
+/// Size estimates taken from an execution plan, readable from Python as attributes.
+#[pyclass(get_all)]
+#[derive(Debug, Clone)]
+pub struct Statistics {
+    /// Total byte size reported by the plan; `None` when absent or unavailable.
+    num_bytes: Option<usize>,
+    /// Row count reported by the plan; `None` when absent or unavailable.
+    num_rows: Option<usize>,
+}
+
+impl Statistics {
+    /// Reads `plan.statistics()`, treating exact and inexact values alike.
+    fn new(plan: &dyn ExecutionPlan) -> Self {
+        let known = |prec: Precision<usize>| match prec {
+            Precision::Exact(n) | Precision::Inexact(n) => Some(n),
+            Precision::Absent => None,
+        };
+        let (num_bytes, num_rows) = match plan.statistics() {
+            Ok(stats) => (known(stats.total_byte_size), known(stats.num_rows)),
+            Err(_) => (None, None), // statistics failure is reported as "unknown"
+        };
+        Self { num_bytes, num_rows }
+    }
+}
+
+#[pyclass(get_all)]
+#[derive(Debug, Clone)]
+pub struct Shard {
+    stats: Statistics, // statistics of this shard's plan
+    serialized_plan: Vec<u8>, // protobuf-encoded physical plan
+}
+
+impl Shard {
+    /// Captures `plan`'s statistics and serializes it using the Delta physical codec.
+    pub fn try_new(plan: &Arc<dyn ExecutionPlan>) -> Result<Self, DataFusionError> {
+        let stats = Statistics::new(plan.as_ref());
+        let node = PhysicalPlanNode::try_from_physical_plan(Arc::clone(plan), Self::codec())?;
+        Ok(Self { stats, serialized_plan: node.encode_to_vec() })
+    }
+
+    fn codec() -> &'static dyn PhysicalExtensionCodec {
+        static CODEC: DeltaPhysicalCodec = DeltaPhysicalCodec {};
+        &CODEC
+    }
+}
+#[pymethods]
+impl Shard {
+    pub fn stream(&self) -> PyResult<PyRecordBatchStream> {
+        shard_stream(&self.serialized_plan) // decode and execute the stored plan
+    }
+}
+
+#[pyclass(get_all)] // every field is readable from Python
+#[derive(Debug, Clone)]
+pub struct DistributedPlan {
+    shards: Vec<Shard>, // one entry per plan produced by splitting
+    schema: PyDFSchema, // schema of the unsplit physical plan
+    stats: Statistics, // statistics of the unsplit physical plan
+}
+
+/// Plans `df` and splits the physical plan into independently executable shards.
+///
+/// Plan nodes with more than one child are unsupported and produce an error instead of a panic.
+async fn split_physical_plan(df: &DataFrame, num_shards: usize) -> Result<DistributedPlan, DataFusionError> {
+    type Plans = Vec<Arc<dyn ExecutionPlan>>;
+    /// Repartitions a Parquet scan and turns every resulting file group into its own scan.
+    fn split_parquet(scan: &ParquetExec, num_shards: usize) -> Plans {
+        let scan = match scan.repartitioned(num_shards, &ConfigOptions::default()) {
+            Ok(Some(repartitioned)) => repartitioned.as_any().downcast_ref::<ParquetExec>()
+                .expect("repartitioned parquet is no longer parquet")
+                .clone(),
+            _ => scan.clone(), // repartitioning failed or was not applied
+        };
+        let config = scan.base_config();
+        config.file_groups.iter()
+            .map(|group| FileScanConfig {
+                object_store_url: config.object_store_url.clone(),
+                file_schema: config.file_schema.clone(),
+                file_groups: group.iter().map(|file| vec![file.to_owned()]).collect(), // one partition per file
+                statistics: config.statistics.clone(), // NOTE(review): whole-scan stats, not per shard
+                projection: config.projection.clone(),
+                projection_deep: config.projection_deep.clone(),
+                limit: config.limit,
+                table_partition_cols: config.table_partition_cols.clone(),
+                output_ordering: config.output_ordering.clone(),
+            })
+            .map(|shard_config| {
+                let builder = ParquetExecBuilder::new(shard_config)
+                    .with_table_parquet_options(scan.table_parquet_options().clone());
+                let builder = match scan.predicate() {
+                    Some(predicate) => builder.with_predicate(predicate.clone()),
+                    None => builder,
+                };
+                builder.build_arc() as Arc<dyn ExecutionPlan>
+            })
+            .collect()
+    }
+    /// Splits `plan` recursively, re-applying single-child operators on top of each input shard.
+    fn split(plan: &Arc<dyn ExecutionPlan>, num_shards: usize) -> datafusion::error::Result<Plans> {
+        if let Some(scan) = plan.as_any().downcast_ref::<ParquetExec>() {
+            return Ok(split_parquet(scan, num_shards));
+        }
+        match plan.children().as_slice() {
+            [] => Ok(vec![Arc::clone(plan)]), // TODO: split leaf nodes other than parquet?
+            [child] => split(child, num_shards)?
+                .into_iter()
+                .map(|shard| Arc::clone(plan).with_new_children(vec![shard]))
+                .collect(),
+            // An error (not a panic) so Python callers receive a regular exception.
+            _ => Err(datafusion::error::DataFusionError::NotImplemented(format!(
+                "Only leaf or single-child plans are supported, found {}",
+                displayable(plan.as_ref()).one_line()
+            ))),
+        }
+    }
+    let plan = df.clone().create_physical_plan().await?;
+    let shards = split(&plan, num_shards)?
+        .iter()
+        .map(Shard::try_new)
+        .collect::<Result<Vec<_>, _>>()?;
+    let schema = DFSchema::try_from(plan.schema().as_ref().to_owned())?.into();
+    let stats = Statistics::new(plan.as_ref());
+    Ok(DistributedPlan { shards, schema, stats })
+}
+
+/// Decodes a protobuf-serialized physical plan and starts executing it as a record batch stream.
+#[pyfunction]
+pub fn shard_stream(serialized_shard_plan: &[u8]) -> PyResult<PyRecordBatchStream> {
+    deltalake::ensure_initialized();
+    let registry = MemoryFunctionRegistry::default();
+    let runtime = RuntimeEnvBuilder::new().build()?;
+    let codec = DeltaPhysicalCodec {};
+    let node = PhysicalPlanNode::decode(serialized_shard_plan)
+        .map_err(|e| datafusion::error::DataFusionError::External(Box::new(e)))
+        .map_err(PyErr::from)?;
+    let plan = node.try_into_physical_plan(&registry, &runtime, &codec)?;
+    let ctx = TaskContext::default();
+    execute_stream(plan, Arc::new(ctx)).map(PyRecordBatchStream::new).map_err(PyErr::from)
 }
 
 /// Print DataFrame
diff --git a/src/lib.rs b/src/lib.rs
@@ -115,6 +115,9 @@ fn _internal(py: Python, m: Bound<'_, PyModule>) -> PyResult<()> {
     #[cfg(feature = "substrait")]
     setup_substrait_module(py, &m)?;
 
+    m.add_class::<dataframe::Shard>()?;
+    m.add_class::<dataframe::DistributedPlan>()?;
+    m.add_wrapped(wrap_pyfunction!(dataframe::shard_stream))?;
     Ok(())
 }
 

Original file line number	Diff line number	Diff line change
`@@ -115,6 +115,9 @@ fn _internal(py: Python, m: Bound<'_, PyModule>) -> PyResult<()> {`
`115`	`115`	`#[cfg(feature = "substrait")]`
`116`	`116`	`setup_substrait_module(py, &m)?;`
`117`	`117`
	`118`	`+ m.add_class::<dataframe::Shard>()?;`
	`119`	`+ m.add_class::<dataframe::DistributedPlan>()?;`
	`120`	`+ m.add_wrapped(wrap_pyfunction!(dataframe::shard_stream))?;`
`118`	`121`	`Ok(())`
`119`	`122`	`}`
`120`	`123`