From 1ca3a5941450e56951b2a361679063a0b3a6753d Mon Sep 17 00:00:00 2001 From: pawelkocinski Date: Fri, 19 Dec 2025 23:13:31 +0100 Subject: [PATCH 01/11] SEDONA-738 Add sedonadb worker --- pom.xml | 2 +- python/pyproject.toml | 52 ++- python/sedona/spark/sql/functions.py | 64 +++- python/sedona/spark/worker/__init__.py | 0 python/sedona/spark/worker/serde.py | 82 +++++ python/sedona/spark/worker/udf_info.py | 34 ++ python/sedona/spark/worker/worker.py | 295 ++++++++++++++++++ python/tests/test_base.py | 2 +- .../utils/test_sedona_db_vectorized_udf.py | 94 ++++++ .../apache/sedona/spark/SedonaContext.scala | 3 +- .../sedona/sql/UDF/PythonEvalType.scala | 7 + .../python/SedonaArrowPythonRunner.scala | 58 ++++ .../python/SedonaArrowStrategy.scala | 159 ++++++++++ .../python/SedonaBasePythonRunner.scala | 121 +++++++ .../python/SedonaDBWorkerFactory.scala | 118 +++++++ .../python/SedonaPythonArrowInput.scala | 135 ++++++++ .../python/SedonaPythonArrowOutput.scala | 171 ++++++++++ .../sql/execution/python/WorkerContext.scala | 52 +++ .../spark/sql/udf/ExtractSedonaUDFRule.scala | 13 +- .../spark/sql/udf/SedonaArrowStrategy.scala | 89 ------ .../org/apache/sedona/sql/TestBaseScala.scala | 1 + .../apache/spark/sql/udf/StrategySuite.scala | 32 +- .../spark/sql/udf/TestScalarPandasUDF.scala | 162 ++++++++-- 23 files changed, 1586 insertions(+), 160 deletions(-) create mode 100644 python/sedona/spark/worker/__init__.py create mode 100644 python/sedona/spark/worker/serde.py create mode 100644 python/sedona/spark/worker/udf_info.py create mode 100644 python/sedona/spark/worker/worker.py create mode 100644 python/tests/utils/test_sedona_db_vectorized_udf.py create mode 100644 spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaArrowPythonRunner.scala create mode 100644 spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaArrowStrategy.scala create mode 100644 spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaBasePythonRunner.scala create mode 100644 spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaDBWorkerFactory.scala create mode 100644 spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowInput.scala create mode 100644 spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala create mode 100644 spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/WorkerContext.scala delete mode 100644 spark/spark-3.5/src/main/scala/org/apache/spark/sql/udf/SedonaArrowStrategy.scala diff --git a/pom.xml b/pom.xml index d6e4e81319d..613e310983f 100644 --- a/pom.xml +++ b/pom.xml @@ -631,7 +631,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 2.10.4 + 3.12.0 attach-javadocs diff --git a/python/pyproject.toml b/python/pyproject.toml index b988966c4fc..76169261c34 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -16,7 +16,7 @@ # under the License. 
[build-system] -requires = ["setuptools>=69", "wheel"] +requires = ["setuptools>=80.9.0", "wheel"] build-backend = "setuptools.build_meta" [project] @@ -26,13 +26,19 @@ description = "Apache Sedona is a cluster computing system for processing large- readme = "README.md" license = { text = "Apache-2.0" } authors = [ { name = "Apache Sedona", email = "dev@sedona.apache.org" } ] -requires-python = ">=3.8" +requires-python = ">=3.12" classifiers = [ "Programming Language :: Python :: 3", "License :: OSI Approved :: Apache Software License", ] dependencies = [ "attrs", + "geoarrow-c>=0.3.1", + "geoarrow-pyarrow>=0.2.0", + "geopandas>=1.1.2", + "pyarrow>=16.1.0", + "pyspark==3.5.4", + "sedonadb", "shapely>=1.7.0", ] @@ -43,38 +49,16 @@ kepler-map = ["geopandas", "keplergl==0.3.2"] flink = ["apache-flink>=1.19.0"] db = ["sedonadb[geopandas]; python_version >= '3.9'"] all = [ - "pyspark>=3.4.0,<4.1.0", - "geopandas", - "pydeck==0.8.0", - "keplergl==0.3.2", - "rasterio>=1.2.10", +# "pyspark>=3.4.0,<4.1.0", +# "geopandas", +# "pydeck==0.8.0", +# "keplergl==0.3.2", +# "rasterio>=1.2.10", ] [dependency-groups] dev = [ - "pytest", - "pytest-cov", - "notebook==6.4.12", - "jupyter", - "mkdocs", - "scikit-learn", - "esda", - "libpysal", - "matplotlib", # implicit dependency of esda - # prevent incompatibility with pysal 4.7.0, which is what is resolved to when shapely >2 is specified - "scipy<=1.10.0", - "pandas>=2.0.0", - "numpy<2", - "geopandas", - # https://stackoverflow.com/questions/78949093/how-to-resolve-attributeerror-module-fiona-has-no-attribute-path - # cannot set geopandas>=0.14.4 since it doesn't support python 3.8, so we pin fiona to <1.10.0 - "fiona<1.10.0", - "pyarrow", - "pyspark>=3.4.0,<4.1.0", - "keplergl==0.3.2", - "pydeck==0.8.0", - "pystac==1.5.0", - "rasterio>=1.2.10", + "pytest>=9.0.2", ] [project.urls] @@ -99,3 +83,11 @@ sources = [ "src/geom_buf.c", "src/geos_c_dyn.c", ] + +[tool.uv] +dev-dependencies = [ + "pytest>=9.0.2", +] + +[tool.uv.sources] +sedonadb = { path = "../../../sedona-db/target/wheels/sedonadb-0.3.0-cp312-cp312-macosx_11_0_arm64.whl" } diff --git a/python/sedona/spark/sql/functions.py b/python/sedona/spark/sql/functions.py index 2420301d52d..7c480e1700d 100644 --- a/python/sedona/spark/sql/functions.py +++ b/python/sedona/spark/sql/functions.py @@ -21,11 +21,14 @@ import pandas as pd -from sedona.spark.sql.types import GeometryType from sedona.spark.utils import geometry_serde -from pyspark.sql.udf import UserDefinedFunction -from pyspark.sql.types import DataType from shapely.geometry.base import BaseGeometry +from pyspark.sql.udf import UserDefinedFunction +import pyarrow as pa +import geoarrow.pyarrow as ga +from sedonadb import udf as sedona_udf_module +from sedona.spark.sql.types import GeometryType +from pyspark.sql.types import DataType, FloatType, DoubleType, IntegerType, StringType SEDONA_SCALAR_EVAL_TYPE = 5200 @@ -142,3 +145,58 @@ def serialize_to_geometry_if_geom(data, return_type: DataType): return geometry_serde.serialize(data) return data + + +def infer_pa_type(spark_type: DataType): + if isinstance(spark_type, GeometryType): + return ga.wkb() + elif isinstance(spark_type, FloatType): + return pa.float32() + elif isinstance(spark_type, DoubleType): + return pa.float64() + elif isinstance(spark_type, IntegerType): + return pa.int32() + elif isinstance(spark_type, StringType): + return pa.string() + else: + raise NotImplementedError(f"Type {spark_type} is not supported yet.") + +def infer_input_type(spark_type: DataType): + if isinstance(spark_type, 
GeometryType):
+        return sedona_udf_module.GEOMETRY
+    elif isinstance(spark_type, (FloatType, DoubleType, IntegerType)):
+        return sedona_udf_module.NUMERIC
+    elif isinstance(spark_type, StringType):
+        return sedona_udf_module.STRING
+    else:
+        raise NotImplementedError(f"Type {spark_type} is not supported yet.")
+
+def infer_input_types(spark_types: list[DataType]):
+    pa_types = []
+    for spark_type in spark_types:
+        pa_type = infer_input_type(spark_type)
+        pa_types.append(pa_type)
+
+    return pa_types
+
+
+def sedona_db_vectorized_udf(
+    return_type: DataType,
+    input_types: list[DataType]
+):
+    def apply_fn(fn):
+        out_type = infer_pa_type(return_type)
+        input_types_sedona_db = infer_input_types(input_types)
+
+        @sedona_udf_module.arrow_udf(out_type, input_types=input_types_sedona_db)
+        def shapely_udf(*args, **kwargs):
+            return fn(*args, **kwargs)
+
+        # Wrap the sedonadb arrow UDF in a zero-argument callable; the SedonaDB worker
+        # calls it to obtain the UDF. evalType 6200 matches SQL_SCALAR_SEDONA_DB_UDF on the JVM side.
+        udf = UserDefinedFunction(
+            lambda: shapely_udf, return_type, "SedonaPandasArrowUDF", evalType=6200
+        )
+
+        return udf
+
+    return apply_fn
diff --git a/python/sedona/spark/worker/__init__.py b/python/sedona/spark/worker/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/python/sedona/spark/worker/serde.py b/python/sedona/spark/worker/serde.py
new file mode 100644
index 00000000000..31038b7fcdd
--- /dev/null
+++ b/python/sedona/spark/worker/serde.py
@@ -0,0 +1,82 @@
+import socket
+
+from pyspark.serializers import write_int, SpecialLengths
+from pyspark.sql.pandas.serializers import ArrowStreamPandasSerializer
+
+from sedona.spark.worker.udf_info import UDFInfo
+
+
+def read_available(buf, chunk=4096):
+    # buf.raw._sock.settimeout(0.01)  # non-blocking-ish
+    data = bytearray()
+    index = 0
+    while True:
+        index += 1
+        try:
+            chunk_bytes = buf.read(chunk)
+        except socket.timeout:
+            break
+
+        if not chunk_bytes and index > 10:
+            break
+
+        data.extend(chunk_bytes)
+
+    return bytes(data)
+
+class SedonaDBSerializer(ArrowStreamPandasSerializer):
+    def __init__(self, timezone, safecheck, db, udf_info: UDFInfo):
+        super(SedonaDBSerializer, self).__init__(timezone, safecheck)
+        self.db = db
+        self.udf_info = udf_info
+
+    def load_stream(self, stream):
+        import pyarrow as pa
+
+        # Bypass ArrowStreamPandasSerializer.load_stream and read raw Arrow record
+        # batches via the grandparent ArrowStreamSerializer implementation.
+        batches = super(ArrowStreamPandasSerializer, self).load_stream(stream)
+        index = 0
+        for batch in batches:
+            table = pa.Table.from_batches(batches=[batch])
+            df = self.db.create_data_frame(table)
+            table_name = f"my_table_{index}"
+
+            df.to_view(table_name)
+
+            sql_expression = self.udf_info.sedona_db_transformation_expr(table_name)
+
+            index += 1
+
+            yield self.db.sql(sql_expression)
+
+    def arrow_dump_stream(self, iterator, stream):
+        import pyarrow as pa
+
+        writer = None
+        try:
+            for batch in iterator:
+                if writer is None:
+                    writer = pa.RecordBatchStreamWriter(stream, batch.schema)
+                writer.write_batch(batch)
+                # stream.flush()
+        finally:
+            if writer is not None:
+                writer.close()
+
+    def dump_stream(self, iterator, stream):
+        """
+        Override because Pandas UDFs require a START_ARROW_STREAM before the Arrow stream is sent.
+        This should be sent after creating the first record batch so in case of an error, it can
+        be sent back to the JVM before the Arrow stream starts.
+ """ + + def init_stream_yield_batches(): + should_write_start_length = True + for batch in iterator: + if should_write_start_length: + write_int(SpecialLengths.START_ARROW_STREAM, stream) + should_write_start_length = False + + yield batch + + return self.arrow_dump_stream(init_stream_yield_batches(), stream) diff --git a/python/sedona/spark/worker/udf_info.py b/python/sedona/spark/worker/udf_info.py new file mode 100644 index 00000000000..d354bcea7ed --- /dev/null +++ b/python/sedona/spark/worker/udf_info.py @@ -0,0 +1,34 @@ +from dataclasses import dataclass + +from sedona.spark import GeometryType + + +@dataclass +class UDFInfo: + arg_offsets: list + geom_offsets: dict + function: object + return_type: object + name: str + + def get_function_call_sql(self, table_name: str) -> str: + arg_offset_str = ", ".join([f"_{el}" for el in self.arg_offsets]) + function_expr = f"{self.name}({arg_offset_str})" + if isinstance(self.return_type, GeometryType): + return f"SELECT ST_GeomToSedonaSpark({function_expr}) AS _0 FROM {table_name}" + + return f"SELECT {function_expr} AS _0 FROM {table_name}" + + def sedona_db_transformation_expr(self, table_name: str) -> str: + fields = [] + for arg in self.arg_offsets: + if arg in self.geom_offsets: + crs = self.geom_offsets[arg] + fields.append(f"ST_GeomFromSedonaSpark(_{arg}, 'EPSG:{crs}') AS _{arg}") + continue + + fields.append(f"_{arg}") + + + fields_expr = ", ".join(fields) + return f"SELECT {fields_expr} FROM {table_name}" diff --git a/python/sedona/spark/worker/worker.py b/python/sedona/spark/worker/worker.py new file mode 100644 index 00000000000..74a61b02ee8 --- /dev/null +++ b/python/sedona/spark/worker/worker.py @@ -0,0 +1,295 @@ +import importlib +import os +import sys +import time + +import sedonadb +from pyspark import TaskContext, shuffle, SparkFiles +from pyspark.errors import PySparkRuntimeError +from pyspark.java_gateway import local_connect_and_auth +from pyspark.resource import ResourceInformation +from pyspark.serializers import read_int, UTF8Deserializer, read_bool, read_long, CPickleSerializer, write_int, \ + write_long, SpecialLengths + +from sedona.spark.worker.serde import SedonaDBSerializer +from sedona.spark.worker.udf_info import UDFInfo + + +def apply_iterator(db, iterator, udf_info: UDFInfo): + i = 0 + for df in iterator: + i+=1 + table_name = f"output_table_{i}" + df.to_view(table_name) + + function_call_sql = udf_info.get_function_call_sql(table_name) + + df_out = db.sql(function_call_sql) + df_out.to_view(f"view_{i}") + at = df_out.to_arrow_table() + batches = at.combine_chunks().to_batches() + + for batch in batches: + yield batch + + +def check_python_version(utf_serde: UTF8Deserializer, infile) -> str: + version = utf_serde.loads(infile) + + python_major, python_minor = sys.version_info[:2] + + if version != f"{python_major}.{python_minor}": + raise PySparkRuntimeError( + error_class="PYTHON_VERSION_MISMATCH", + message_parameters={ + "worker_version": str(sys.version_info[:2]), + "driver_version": str(version), + }, + ) + + return version + +def check_barrier_flag(infile): + is_barrier = read_bool(infile) + bound_port = read_int(infile) + secret = UTF8Deserializer().loads(infile) + + if is_barrier: + raise PySparkRuntimeError( + error_class="BARRIER_MODE_NOT_SUPPORTED", + message_parameters={ + "worker_version": str(sys.version_info[:2]), + "message": "Barrier mode is not supported by SedonaDB vectorized functions.", + }, + ) + + return is_barrier + +def assign_task_context(utf_serde: UTF8Deserializer, infile): + 
stage_id = read_int(infile)
+    partition_id = read_int(infile)
+    # BasePythonRunner writes attemptNumber as an int and taskAttemptId as a long
+    attempt_number = read_int(infile)
+    task_attempt_id = read_long(infile)
+    cpus = read_int(infile)
+
+    task_context = TaskContext._getOrCreate()
+    task_context._stage_id = stage_id
+    task_context._partition_id = partition_id
+    task_context._attempt_number = attempt_number
+    task_context._task_attempt_id = task_attempt_id
+    task_context._cpus = cpus
+
+    task_context._resources = {}
+    for r in range(read_int(infile)):
+        key = utf_serde.loads(infile)
+        name = utf_serde.loads(infile)
+        addresses = []
+        for a in range(read_int(infile)):
+            addresses.append(utf_serde.loads(infile))
+        task_context._resources[key] = ResourceInformation(name, addresses)
+
+    task_context._localProperties = dict()
+    for i in range(read_int(infile)):
+        k = utf_serde.loads(infile)
+        v = utf_serde.loads(infile)
+        task_context._localProperties[k] = v
+
+    return task_context
+
+def resolve_python_path(utf_serde: UTF8Deserializer, infile):
+    def add_path(path: str):
+        # worker can be used, so do not add path multiple times
+        if path not in sys.path:
+            # overwrite system packages
+            sys.path.insert(1, path)
+
+    spark_files_dir = utf_serde.loads(infile)
+    # _accumulatorRegistry.clear()
+
+    SparkFiles._root_directory = spark_files_dir
+    SparkFiles._is_running_on_worker = True
+
+    add_path(spark_files_dir)  # *.py files that were added will be copied here
+    num_python_includes = read_int(infile)
+    for _ in range(num_python_includes):
+        filename = utf_serde.loads(infile)
+        add_path(os.path.join(spark_files_dir, filename))
+
+    importlib.invalidate_caches()
+
+
+def check_broadcast_variables(infile):
+    needs_broadcast_decryption_server = read_bool(infile)
+    num_broadcast_variables = read_int(infile)
+
+    if needs_broadcast_decryption_server or num_broadcast_variables > 0:
+        raise PySparkRuntimeError(
+            error_class="BROADCAST_VARS_NOT_SUPPORTED",
+            message_parameters={
+                "worker_version": str(sys.version_info[:2]),
+                "message": "Broadcast variables are not supported by SedonaDB vectorized functions.",
+            },
+        )
+
+def get_runner_conf(utf_serde: UTF8Deserializer, infile):
+    runner_conf = {}
+    num_conf = read_int(infile)
+    for i in range(num_conf):
+        k = utf_serde.loads(infile)
+        v = utf_serde.loads(infile)
+        runner_conf[k] = v
+    return runner_conf
+
+
+def read_command(serializer, infile):
+    command = serializer._read_with_length(infile)
+    return command
+
+def read_udf(infile, pickle_ser) -> UDFInfo:
+    num_arg = read_int(infile)
+    arg_offsets = [read_int(infile) for i in range(num_arg)]
+
+    function = None
+    return_type = None
+
+    for i in range(read_int(infile)):
+        function, return_type = read_command(pickle_ser, infile)
+
+    sedona_db_udf_expression = function()
+
+    return UDFInfo(
+        arg_offsets=arg_offsets,
+        function=sedona_db_udf_expression,
+        return_type=return_type,
+        name=sedona_db_udf_expression._name,
+        geom_offsets={}
+    )
+
+# def read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index):
+#     num_arg = read_int(infile)
+#     arg_offsets = [read_int(infile) for i in range(num_arg)]
+#     chained_func = None
+#     for i in range(read_int(infile)):
+#         f, return_type = read_command(pickleSer, infile)
+#         if chained_func is None:
+#             chained_func = f
+#         else:
+#             chained_func = chain(chained_func, f)
+#
+#     func = chained_func
+#
+#     # the last returnType will be the return type of UDF
+#     if eval_type == PythonEvalType.SQL_SCALAR_PANDAS_UDF:
+#         return arg_offsets, func, return_type
+#     else:
+#         raise ValueError("Unknown eval type: {}".format(eval_type))
+#
+
+def 
register_sedona_db_udf(infile, pickle_ser) -> UDFInfo: + num_udfs = read_int(infile) + + udf = None + for _ in range(num_udfs): + udf = read_udf(infile, pickle_ser) + # Here we would register the UDF with SedonaDB's internal context + + + return udf + + +def report_times(outfile, boot, init, finish): + write_int(SpecialLengths.TIMING_DATA, outfile) + write_long(int(1000 * boot), outfile) + write_long(int(1000 * init), outfile) + write_long(int(1000 * finish), outfile) + + +def write_statistics(infile, outfile, boot_time, init_time) -> None: + TaskContext._setTaskContext(None) + finish_time = time.time() + report_times(outfile, boot_time, init_time, finish_time) + write_long(shuffle.MemoryBytesSpilled, outfile) + write_long(shuffle.DiskBytesSpilled, outfile) + + # Mark the beginning of the accumulators section of the output + write_int(SpecialLengths.END_OF_DATA_SECTION, outfile) + # write_int(len(_accumulatorRegistry), outfile) + # for (aid, accum) in _accumulatorRegistry.items(): + # pickleSer._write_with_length((aid, accum._value), outfile) + + if read_int(infile) == SpecialLengths.END_OF_STREAM: + write_int(SpecialLengths.END_OF_STREAM, outfile) + outfile.flush() + else: + write_int(SpecialLengths.END_OF_DATA_SECTION, outfile) + outfile.flush() + sys.exit(-1) + + +def main(infile, outfile): + boot_time = time.time() + sedona_db = sedonadb.connect() + # + utf8_deserializer = UTF8Deserializer() + pickle_ser = CPickleSerializer() + + split_index = read_int(infile) + # + check_python_version(utf8_deserializer, infile) + # + check_barrier_flag(infile) + + task_context = assign_task_context(utf_serde=utf8_deserializer, infile=infile) + shuffle.MemoryBytesSpilled = 0 + shuffle.DiskBytesSpilled = 0 + + resolve_python_path(utf8_deserializer, infile) + # + check_broadcast_variables(infile) + + eval_type = read_int(infile) + + runner_conf = get_runner_conf(utf8_deserializer, infile) + + udf = register_sedona_db_udf(infile, pickle_ser) + + sedona_db.register_udf(udf.function) + init_time = time.time() + + serde = SedonaDBSerializer( + timezone=runner_conf.get("spark.sql.session.timeZone", "UTC"), + safecheck=False, + db=sedona_db, + udf_info=udf + ) + + number_of_geometries = read_int(infile) + geom_offsets = {} + for i in range(number_of_geometries): + geom_index = read_int(infile) + geom_srid = read_int(infile) + + geom_offsets[geom_index] = geom_srid + + udf.geom_offsets = geom_offsets + + iterator = serde.load_stream(infile) + out_iterator = apply_iterator(db=sedona_db, iterator=iterator, udf_info=udf) + + serde.dump_stream(out_iterator, outfile) + + write_statistics( + infile, outfile, boot_time=boot_time, init_time=init_time + ) + + +if __name__ == "__main__": + # add file handler + auth_secret = os.environ["PYTHON_WORKER_FACTORY_SECRET"] + java_port = int(os.environ["PYTHON_WORKER_FACTORY_PORT"]) + (sock_file, sc) = local_connect_and_auth(java_port, auth_secret) + + write_int(os.getpid(), sock_file) + sock_file.flush() + + main(sock_file, sock_file) diff --git a/python/tests/test_base.py b/python/tests/test_base.py index cc2b09e422a..a6dbae65975 100644 --- a/python/tests/test_base.py +++ b/python/tests/test_base.py @@ -22,7 +22,7 @@ import pyspark from pyspark.sql import DataFrame -from sedona.spark import * +from sedona.spark import SedonaContext from sedona.spark.utils.decorators import classproperty SPARK_REMOTE = os.getenv("SPARK_REMOTE") diff --git a/python/tests/utils/test_sedona_db_vectorized_udf.py b/python/tests/utils/test_sedona_db_vectorized_udf.py new file mode 100644 index 
00000000000..749d45420ef --- /dev/null +++ b/python/tests/utils/test_sedona_db_vectorized_udf.py @@ -0,0 +1,94 @@ +from sedona.spark.sql.functions import sedona_db_vectorized_udf +from tests.test_base import TestBase +import pyarrow as pa +import shapely +from sedona.sql import GeometryType +from pyspark.sql.functions import expr, lit +from pyspark.sql.types import DoubleType, IntegerType + + +class TestSedonaDBArrowFunction(TestBase): + def test_vectorized_udf(self): + @sedona_db_vectorized_udf(return_type=GeometryType(), input_types=[GeometryType(), IntegerType()]) + def my_own_function(geom, distance): + geom_wkb = pa.array(geom.storage.to_array()) + distance = pa.array(distance.to_array()) + geom = shapely.from_wkb(geom_wkb) + + result_shapely = shapely.centroid(geom) + + return pa.array(shapely.to_wkb(result_shapely)) + + df = self.spark.createDataFrame( + [ + (1, "POINT (1 1)"), + (2, "POINT (2 2)"), + (3, "POINT (3 3)"), + ], + ["id", "wkt"], + ).withColumn("wkt", expr("ST_GeomFromWKT(wkt)")) + + df.select(my_own_function(df.wkt, lit(100)).alias("geom")).show() + + def test_geometry_to_double(self): + @sedona_db_vectorized_udf(return_type=DoubleType(), input_types=[GeometryType()]) + def geometry_to_non_geometry_udf(geom): + geom_wkb = pa.array(geom.storage.to_array()) + geom = shapely.from_wkb(geom_wkb) + + result_shapely = shapely.get_x(shapely.centroid(geom)) + + return pa.array(result_shapely, pa.float64()) + + df = self.spark.createDataFrame( + [(1, "POINT (1 1)"), (2, "POINT (2 2)"), (3, "POINT (3 3)")], + ["id", "wkt"], + ).withColumn("wkt", expr("ST_GeomFromWKT(wkt)")) + + values = df.select(geometry_to_non_geometry_udf(df.wkt).alias("x_coord")) \ + .collect() + + values_list = [row["x_coord"] for row in values] + + assert values_list == [1.0, 2.0, 3.0] + + def test_geometry_to_int(self): + @sedona_db_vectorized_udf(return_type=IntegerType(), input_types=[GeometryType()]) + def geometry_to_int(geom): + geom_wkb = pa.array(geom.storage.to_array()) + geom = shapely.from_wkb(geom_wkb) + + result_shapely = shapely.get_num_points(geom) + + return pa.array(result_shapely, pa.int32()) + + df = self.spark.createDataFrame( + [(1, "POINT (1 1)"), (2, "POINT (2 2)"), (3, "POINT (3 3)")], + ["id", "wkt"], + ).withColumn("wkt", expr("ST_GeomFromWKT(wkt)")) + + values = df.select(geometry_to_int(df.wkt)) \ + .collect() + + values_list = [row[0] for row in values] + + assert values_list == [0, 0, 0] + + def test_geometry_crs_preservation(self): + @sedona_db_vectorized_udf(return_type=GeometryType(), input_types=[GeometryType()]) + def return_same_geometry(geom): + geom_wkb = pa.array(geom.storage.to_array()) + geom = shapely.from_wkb(geom_wkb) + + return pa.array(shapely.to_wkb(geom)) + + df = self.spark.createDataFrame( + [(1, "POINT (1 1)"), (2, "POINT (2 2)"), (3, "POINT (3 3)")], + ["id", "wkt"], + ).withColumn("wkt", expr("ST_SetSRID(ST_GeomFromWKT(wkt), 3857)")) + + result_df = df.select(return_same_geometry(df.wkt).alias("geom")) + + crs_list = result_df.selectExpr("ST_SRID(geom)").rdd.flatMap(lambda x: x).collect() + + assert crs_list == [3857, 3857, 3857] diff --git a/spark/common/src/main/scala/org/apache/sedona/spark/SedonaContext.scala b/spark/common/src/main/scala/org/apache/sedona/spark/SedonaContext.scala index b0e46cf6e9e..add3caf225a 100644 --- a/spark/common/src/main/scala/org/apache/sedona/spark/SedonaContext.scala +++ b/spark/common/src/main/scala/org/apache/sedona/spark/SedonaContext.scala @@ -41,7 +41,6 @@ class InternalApi( extends StaticAnnotation object 
SedonaContext { - private def customOptimizationsWithSession(sparkSession: SparkSession) = Seq( new TransformNestedUDTParquet(sparkSession), @@ -72,7 +71,7 @@ object SedonaContext { val sedonaArrowStrategy = Try( Class - .forName("org.apache.spark.sql.udf.SedonaArrowStrategy") + .forName("org.apache.spark.sql.execution.python.SedonaArrowStrategy") .getDeclaredConstructor() .newInstance() .asInstanceOf[SparkStrategy]) diff --git a/spark/common/src/main/scala/org/apache/sedona/sql/UDF/PythonEvalType.scala b/spark/common/src/main/scala/org/apache/sedona/sql/UDF/PythonEvalType.scala index aece26267d9..11263dd7f68 100644 --- a/spark/common/src/main/scala/org/apache/sedona/sql/UDF/PythonEvalType.scala +++ b/spark/common/src/main/scala/org/apache/sedona/sql/UDF/PythonEvalType.scala @@ -23,7 +23,14 @@ object PythonEvalType { val SQL_SCALAR_SEDONA_UDF = 5200 val SEDONA_UDF_TYPE_CONSTANT = 5000 + // sedona db eval types + val SQL_SCALAR_SEDONA_DB_UDF = 6200 + val SEDONA_DB_UDF_TYPE_CONSTANT = 6000 + def toString(pythonEvalType: Int): String = pythonEvalType match { case SQL_SCALAR_SEDONA_UDF => "SQL_SCALAR_GEO_UDF" + case SQL_SCALAR_SEDONA_DB_UDF => "SQL_SCALAR_SEDONA_DB_UDF" } + + def evals(): Set[Int] = Set(SQL_SCALAR_SEDONA_UDF, SQL_SCALAR_SEDONA_DB_UDF) } diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaArrowPythonRunner.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaArrowPythonRunner.scala new file mode 100644 index 00000000000..0d3960d2d81 --- /dev/null +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaArrowPythonRunner.scala @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.spark.sql.execution.python + +import org.apache.spark.api.python._ +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.execution.metric.SQLMetric +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types._ +import org.apache.spark.sql.vectorized.ColumnarBatch + +/** + * Similar to `PythonUDFRunner`, but exchange data with Python worker via Arrow stream. 
+ */ +class SedonaArrowPythonRunner( + funcs: Seq[ChainedPythonFunctions], + evalType: Int, + argOffsets: Array[Array[Int]], + protected override val schema: StructType, + protected override val timeZoneId: String, + protected override val largeVarTypes: Boolean, + protected override val workerConf: Map[String, String], + val pythonMetrics: Map[String, SQLMetric], + jobArtifactUUID: Option[String], + geometryFields: Seq[(Int, Int)]) + extends SedonaBasePythonRunner[Iterator[InternalRow], ColumnarBatch]( + funcs, + evalType, + argOffsets, + jobArtifactUUID, + geometryFields) + with SedonaBasicPythonArrowInput + with SedonaBasicPythonArrowOutput { + + override val errorOnDuplicatedFieldNames: Boolean = true + + override val bufferSize: Int = SQLConf.get.pandasUDFBufferSize + require( + bufferSize >= 4, + "Pandas execution requires more than 4 bytes. Please set higher buffer. " + + s"Please change '${SQLConf.PANDAS_UDF_BUFFER_SIZE.key}'.") +} diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaArrowStrategy.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaArrowStrategy.scala new file mode 100644 index 00000000000..bb897931b63 --- /dev/null +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaArrowStrategy.scala @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+package org.apache.spark.sql.execution.python
+
+import org.apache.sedona.sql.UDF.PythonEvalType
+import org.apache.sedona.sql.UDF.PythonEvalType.{SQL_SCALAR_SEDONA_DB_UDF, SQL_SCALAR_SEDONA_UDF}
+import org.apache.spark.api.python.ChainedPythonFunctions
+import org.apache.spark.sql.Strategy
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.{Attribute, GenericInternalRow, PythonUDF}
+import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.execution.SparkPlan
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.udf.SedonaArrowEvalPython
+import org.apache.spark.{JobArtifactSet, TaskContext}
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.sedona_sql.UDT.GeometryUDT
+
+import scala.collection.JavaConverters.asScalaIteratorConverter
+
+// We use a custom Strategy to avoid Apache Spark's assert on types; we
+// can consider extending this to support other engines working with
+// Arrow data.
+class SedonaArrowStrategy extends Strategy {
+  override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
+    case SedonaArrowEvalPython(udfs, output, child, evalType) =>
+      SedonaArrowEvalPythonExec(udfs, output, planLater(child), evalType) :: Nil
+    case _ => Nil
+  }
+}
+
+// This is a modification of Apache Spark's ArrowEvalPythonExec: the type checks are removed so that
+// geometry types are allowed. It is an initial version enabling vectorized UDFs for Sedona geometry
+// types; we can consider extending this to support other engines working with Arrow data.
+case class SedonaArrowEvalPythonExec(
+    udfs: Seq[PythonUDF],
+    resultAttrs: Seq[Attribute],
+    child: SparkPlan,
+    evalType: Int)
+    extends EvalPythonExec
+    with PythonSQLMetrics {
+
+  private val batchSize = conf.arrowMaxRecordsPerBatch
+  private val sessionLocalTimeZone = conf.sessionLocalTimeZone
+  private val largeVarTypes = conf.arrowUseLargeVarTypes
+  private val pythonRunnerConf =
+    Map[String, String](SQLConf.SESSION_LOCAL_TIMEZONE.key -> conf.sessionLocalTimeZone)
+  private[this] val jobArtifactUUID = JobArtifactSet.getCurrentJobArtifactState.map(_.uuid)
+
+  // Peeks at the first row of the partition to infer the SRID of each geometry column.
+  private def inferCRS(iterator: Iterator[InternalRow], schema: StructType): Seq[(Int, Int)] = {
+    // this triggers the iterator
+    if (!iterator.hasNext) {
+      return Seq.empty
+    }
+
+    val row = iterator.next()
+
+    val rowMatched = row match {
+      case generic: GenericInternalRow =>
+        Some(generic)
+      case _ => None
+    }
+
+    // Keep the indices relative to the full schema so they line up with the UDF argument
+    // offsets sent to the Python worker.
+    schema.zipWithIndex
+      .filter { case (field, _) =>
+        field.dataType == GeometryUDT
+      }
+      .map { case (_, index) =>
+        if (rowMatched.isEmpty || rowMatched.get.values(index) == null) (index, 0)
+        else {
+          val geom = rowMatched.get.get(index, GeometryUDT).asInstanceOf[Array[Byte]]
+          val preambleByte = geom(0) & 0xff
+          val hasSrid = (preambleByte & 0x01) != 0
+
+          var srid = 0
+          if (hasSrid) {
+            val srid2 = (geom(1) & 0xff) << 16
+            val srid1 = (geom(2) & 0xff) << 8
+            val srid0 = geom(3) & 0xff
+            srid = srid2 | srid1 | srid0
+          }
+
+          (index, srid)
+        }
+      }
+  }
+
+  protected override def evaluate(
+      funcs: Seq[ChainedPythonFunctions],
+      argOffsets: Array[Array[Int]],
+      iter: Iterator[InternalRow],
+      schema: StructType,
+      context: TaskContext): Iterator[InternalRow] = {
+    val (probe, full) = iter.duplicate
+
+    val geometryFields = inferCRS(probe, schema)
+
+    val batchIter = if (batchSize > 0) new BatchIterator(full, batchSize) else Iterator(full)
+
+    evalType match {
+      case SQL_SCALAR_SEDONA_DB_UDF =>
+        val columnarBatchIter = new SedonaArrowPythonRunner(
+ funcs, + evalType - PythonEvalType.SEDONA_DB_UDF_TYPE_CONSTANT, + argOffsets, + schema, + sessionLocalTimeZone, + largeVarTypes, + pythonRunnerConf, + pythonMetrics, + jobArtifactUUID, + geometryFields).compute(batchIter, context.partitionId(), context) + + val result = columnarBatchIter.flatMap { batch => + batch.rowIterator.asScala + } + + result + + case SQL_SCALAR_SEDONA_UDF => + val columnarBatchIter = new ArrowPythonRunner( + funcs, + evalType - PythonEvalType.SEDONA_UDF_TYPE_CONSTANT, + argOffsets, + schema, + sessionLocalTimeZone, + largeVarTypes, + pythonRunnerConf, + pythonMetrics, + jobArtifactUUID).compute(batchIter, context.partitionId(), context) + + val iter = columnarBatchIter.flatMap { batch => + batch.rowIterator.asScala + } + + iter + } + } + + override protected def withNewChildInternal(newChild: SparkPlan): SparkPlan = + copy(child = newChild) +} diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaBasePythonRunner.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaBasePythonRunner.scala new file mode 100644 index 00000000000..8ecc110e39d --- /dev/null +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaBasePythonRunner.scala @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.spark.sql.execution.python + +import java.net._ +import java.util.concurrent.atomic.AtomicBoolean +import scala.collection.JavaConverters._ +import org.apache.spark._ +import org.apache.spark.api.python.{BasePythonRunner, ChainedPythonFunctions} +import org.apache.spark.internal.Logging +import org.apache.spark.internal.config.EXECUTOR_CORES +import org.apache.spark.internal.config.Python._ +import org.apache.spark.resource.ResourceProfile.{EXECUTOR_CORES_LOCAL_PROPERTY, PYSPARK_MEMORY_LOCAL_PROPERTY} +import org.apache.spark.util._ + +private object SedonaBasePythonRunner { + + private lazy val faultHandlerLogDir = Utils.createTempDir(namePrefix = "faulthandler") +} + +private[spark] abstract class SedonaBasePythonRunner[IN, OUT]( + funcs: Seq[ChainedPythonFunctions], + evalType: Int, + argOffsets: Array[Array[Int]], + jobArtifactUUID: Option[String], + val geometryFields: Seq[(Int, Int)] = Seq.empty) + extends BasePythonRunner[IN, OUT](funcs, evalType, argOffsets, jobArtifactUUID) + with Logging { + + require(funcs.length == argOffsets.length, "argOffsets should have the same length as funcs") + + private val conf = SparkEnv.get.conf + private val faultHandlerEnabled = conf.get(PYTHON_WORKER_FAULTHANLDER_ENABLED) + + private def getWorkerMemoryMb(mem: Option[Long], cores: Int): Option[Long] = { + mem.map(_ / cores) + } + + import java.io._ + + override def compute( + inputIterator: Iterator[IN], + partitionIndex: Int, + context: TaskContext): Iterator[OUT] = { + val startTime = System.currentTimeMillis + val env = SparkEnv.get + + val execCoresProp = Option(context.getLocalProperty(EXECUTOR_CORES_LOCAL_PROPERTY)) + val memoryMb = Option(context.getLocalProperty(PYSPARK_MEMORY_LOCAL_PROPERTY)).map(_.toLong) + + if (simplifiedTraceback) { + envVars.put("SPARK_SIMPLIFIED_TRACEBACK", "1") + } + // SPARK-30299 this could be wrong with standalone mode when executor + // cores might not be correct because it defaults to all cores on the box. 
+ val execCores = execCoresProp.map(_.toInt).getOrElse(conf.get(EXECUTOR_CORES)) + val workerMemoryMb = getWorkerMemoryMb(memoryMb, execCores) + if (workerMemoryMb.isDefined) { + envVars.put("PYSPARK_EXECUTOR_MEMORY_MB", workerMemoryMb.get.toString) + } + envVars.put("SPARK_AUTH_SOCKET_TIMEOUT", authSocketTimeout.toString) + envVars.put("SPARK_BUFFER_SIZE", bufferSize.toString) + if (faultHandlerEnabled) { + envVars.put("PYTHON_FAULTHANDLER_DIR", SedonaBasePythonRunner.faultHandlerLogDir.toString) + } + + envVars.put("SPARK_JOB_ARTIFACT_UUID", jobArtifactUUID.getOrElse("default")) + + val (worker: Socket, pid: Option[Int]) = { + WorkerContext.createPythonWorker(pythonExec, envVars.asScala.toMap) + } + + val releasedOrClosed = new AtomicBoolean(false) + + // Start a thread to feed the process input from our parent's iterator + val writerThread = newWriterThread(env, worker, inputIterator, partitionIndex, context) + + context.addTaskCompletionListener[Unit] { _ => + writerThread.shutdownOnTaskCompletion() + if (releasedOrClosed.compareAndSet(false, true)) { + try { + worker.close() + } catch { + case e: Exception => + logWarning("Failed to close worker socket", e) + } + } + } + + writerThread.start() + + val stream = new DataInputStream(new BufferedInputStream(worker.getInputStream, bufferSize)) + + val stdoutIterator = newReaderIterator( + stream, + writerThread, + startTime, + env, + worker, + pid, + releasedOrClosed, + context) + new InterruptibleIterator(context, stdoutIterator) + } +} diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaDBWorkerFactory.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaDBWorkerFactory.scala new file mode 100644 index 00000000000..add09a7cb22 --- /dev/null +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaDBWorkerFactory.scala @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.spark.sql.execution.python + +import org.apache.spark.{SparkException, SparkFiles} +import org.apache.spark.api.python.{PythonUtils, PythonWorkerFactory} +import org.apache.spark.util.Utils + +import java.io.{DataInputStream, File} +import java.net.{InetAddress, ServerSocket, Socket} +import java.util.Arrays +import java.io.InputStream +import scala.collection.JavaConverters._ +import scala.collection.mutable +import org.apache.spark._ +import org.apache.spark.security.SocketAuthHelper +import org.apache.spark.util.RedirectThread + +class SedonaDBWorkerFactory(pythonExec: String, envVars: Map[String, String]) + extends PythonWorkerFactory(pythonExec, envVars) { + self => + + private val simpleWorkers = new mutable.WeakHashMap[Socket, Process]() + private val authHelper = new SocketAuthHelper(SparkEnv.get.conf) + + private val sedonaUDFWorkerModule = + SparkEnv.get.conf.get("sedona.python.worker.udf.module", "sedona.spark.worker.worker") + + private val pythonPath = PythonUtils.mergePythonPaths( + PythonUtils.sparkPythonPath, + envVars.getOrElse("PYTHONPATH", ""), + sys.env.getOrElse("PYTHONPATH", "")) + + override def create(): (Socket, Option[Int]) = { + createSimpleWorker(sedonaUDFWorkerModule) + } + + private def createSimpleWorker(workerModule: String): (Socket, Option[Int]) = { + var serverSocket: ServerSocket = null + try { + serverSocket = new ServerSocket(0, 1, InetAddress.getLoopbackAddress()) + + // Create and start the worker + val pb = new ProcessBuilder(Arrays.asList(pythonExec, "-m", workerModule)) + val jobArtifactUUID = envVars.getOrElse("SPARK_JOB_ARTIFACT_UUID", "default") + if (jobArtifactUUID != "default") { + val f = new File(SparkFiles.getRootDirectory(), jobArtifactUUID) + f.mkdir() + pb.directory(f) + } + val workerEnv = pb.environment() + workerEnv.putAll(envVars.asJava) + workerEnv.put("PYTHONPATH", pythonPath) + // This is equivalent to setting the -u flag; we use it because ipython doesn't support -u: + workerEnv.put("PYTHONUNBUFFERED", "YES") + workerEnv.put("PYTHON_WORKER_FACTORY_PORT", serverSocket.getLocalPort.toString) + workerEnv.put("PYTHON_WORKER_FACTORY_SECRET", authHelper.secret) + if (Utils.preferIPv6) { + workerEnv.put("SPARK_PREFER_IPV6", "True") + } + val worker = pb.start() + + // Redirect worker stdout and stderr + redirectStreamsToStderr(worker.getInputStream, worker.getErrorStream) + + // Wait for it to connect to our socket, and validate the auth secret. 
+ serverSocket.setSoTimeout(10000) + + try { + val socket = serverSocket.accept() + authHelper.authClient(socket) + // TODO: When we drop JDK 8, we can just use worker.pid() + val pid = new DataInputStream(socket.getInputStream).readInt() + if (pid < 0) { + throw new IllegalStateException("Python failed to launch worker with code " + pid) + } + self.synchronized { + simpleWorkers.put(socket, worker) + } + + (socket, Some(pid)) + } catch { + case e: Exception => + throw new SparkException("Python worker failed to connect back.", e) + } + } finally { + if (serverSocket != null) { + serverSocket.close() + } + } + } + + private def redirectStreamsToStderr(stdout: InputStream, stderr: InputStream): Unit = { + try { + new RedirectThread(stdout, System.err, "stdout reader for " + pythonExec).start() + new RedirectThread(stderr, System.err, "stderr reader for " + pythonExec).start() + } catch { + case e: Exception => + logError("Exception in redirecting streams", e) + } + } +} diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowInput.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowInput.scala new file mode 100644 index 00000000000..18db42ae0df --- /dev/null +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowInput.scala @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.spark.sql.execution.python + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.arrow.vector.VectorSchemaRoot +import org.apache.arrow.vector.ipc.ArrowStreamWriter +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.execution.arrow.ArrowWriter +import org.apache.spark.sql.util.ArrowUtils +import org.apache.spark.util.Utils +import org.apache.spark.{SparkEnv, TaskContext} + +import java.io.DataOutputStream +import java.net.Socket + +private[python] trait SedonaPythonArrowInput[IN] extends PythonArrowInput[IN] { + self: SedonaBasePythonRunner[IN, _] => + protected override def newWriterThread( + env: SparkEnv, + worker: Socket, + inputIterator: Iterator[IN], + partitionIndex: Int, + context: TaskContext): WriterThread = { + new WriterThread(env, worker, inputIterator, partitionIndex, context) { + + protected override def writeCommand(dataOut: DataOutputStream): Unit = { + handleMetadataBeforeExec(dataOut) + writeUDF(dataOut, funcs, argOffsets) + + // write + dataOut.writeInt(self.geometryFields.length) + // write geometry field indices and their SRIDs + geometryFields.foreach { case (index, srid) => + dataOut.writeInt(index) + dataOut.writeInt(srid) + } + } + + protected override def writeIteratorToStream(dataOut: DataOutputStream): Unit = { + val arrowSchema = + ArrowUtils.toArrowSchema(schema, timeZoneId, errorOnDuplicatedFieldNames, largeVarTypes) + val allocator = ArrowUtils.rootAllocator.newChildAllocator( + s"stdout writer for $pythonExec", + 0, + Long.MaxValue) + val root = VectorSchemaRoot.create(arrowSchema, allocator) + + Utils.tryWithSafeFinally { + val writer = new ArrowStreamWriter(root, null, dataOut) + writer.start() + + writeIteratorToArrowStream(root, writer, dataOut, inputIterator) + + // end writes footer to the output stream and doesn't clean any resources. + // It could throw exception if the output stream is closed, so it should be + // in the try block. + writer.end() + } { + // If we close root and allocator in TaskCompletionListener, there could be a race + // condition where the writer thread keeps writing to the VectorSchemaRoot while + // it's being closed by the TaskCompletion listener. + // Closing root and allocator here is cleaner because root and allocator is owned + // by the writer thread and is only visible to the writer thread. + // + // If the writer thread is interrupted by TaskCompletionListener, it should either + // (1) in the try block, in which case it will get an InterruptedException when + // performing io, and goes into the finally block or (2) in the finally block, + // in which case it will ignore the interruption and close the resources. 
+ root.close() + allocator.close() + } + } + } + } +} + +private[python] trait SedonaBasicPythonArrowInput + extends SedonaPythonArrowInput[Iterator[InternalRow]] { + self: SedonaBasePythonRunner[Iterator[InternalRow], _] => + + protected def writeIteratorToArrowStream( + root: VectorSchemaRoot, + writer: ArrowStreamWriter, + dataOut: DataOutputStream, + inputIterator: Iterator[Iterator[InternalRow]]): Unit = { + val arrowWriter = ArrowWriter.create(root) + while (inputIterator.hasNext) { + val startData = dataOut.size() + val nextBatch = inputIterator.next() + + while (nextBatch.hasNext) { + arrowWriter.write(nextBatch.next()) + } + + arrowWriter.finish() + writer.writeBatch() + arrowWriter.reset() + val deltaData = dataOut.size() - startData + pythonMetrics("pythonDataSent") += deltaData + } + } +} diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala new file mode 100644 index 00000000000..a9421df0af8 --- /dev/null +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala @@ -0,0 +1,171 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.spark.sql.execution.python + +import java.io.DataInputStream +import java.net.Socket +import java.util.concurrent.atomic.AtomicBoolean +import scala.collection.JavaConverters._ +import org.apache.arrow.vector.VectorSchemaRoot +import org.apache.arrow.vector.ipc.ArrowStreamReader +import org.apache.spark.{SparkEnv, TaskContext} +import org.apache.spark.api.python.{BasePythonRunner, SpecialLengths} +import org.apache.spark.sql.execution.metric.SQLMetric +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.util.ArrowUtils +import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnVector, ColumnarBatch} + +private[python] trait SedonaPythonArrowOutput[OUT <: AnyRef] { self: BasePythonRunner[_, OUT] => + + protected def pythonMetrics: Map[String, SQLMetric] + + protected def deserializeColumnarBatch(batch: ColumnarBatch, schema: StructType): OUT + + protected def newReaderIterator( + stream: DataInputStream, + writerThread: WriterThread, + startTime: Long, + env: SparkEnv, + worker: Socket, + pid: Option[Int], + releasedOrClosed: AtomicBoolean, + context: TaskContext): Iterator[OUT] = { + + new ReaderIterator( + stream, + writerThread, + startTime, + env, + worker, + pid, + releasedOrClosed, + context) { + + private val allocator = ArrowUtils.rootAllocator.newChildAllocator( + s"stdin reader for $pythonExec", + 0, + Long.MaxValue) + + private var reader: ArrowStreamReader = _ + private var root: VectorSchemaRoot = _ + private var schema: StructType = _ + private var vectors: Array[ColumnVector] = _ + private var eos = false + private var nextObj: OUT = _ + + context.addTaskCompletionListener[Unit] { _ => + if (reader != null) { + reader.close(false) + } + allocator.close() + } + + private var batchLoaded = true + + def handleEndOfDataSectionSedona(): Unit = { + if (stream.readInt() == SpecialLengths.END_OF_STREAM) {} + + eos = true + } + + protected override def handleEndOfDataSection(): Unit = { + handleEndOfDataSectionSedona() + } + + override def hasNext: Boolean = nextObj != null || { + if (!eos) { + nextObj = read() + hasNext + } else { + false + } + } + + override def next(): OUT = { + if (hasNext) { + val obj = nextObj + nextObj = null.asInstanceOf[OUT] + obj + } else { + Iterator.empty.next() + } + } + + protected override def read(): OUT = { + if (writerThread.exception.isDefined) { + throw writerThread.exception.get + } + try { + if (reader != null && batchLoaded) { + val bytesReadStart = reader.bytesRead() + batchLoaded = reader.loadNextBatch() + if (batchLoaded) { + val batch = new ColumnarBatch(vectors) + val rowCount = root.getRowCount + batch.setNumRows(root.getRowCount) + val bytesReadEnd = reader.bytesRead() + pythonMetrics("pythonNumRowsReceived") += rowCount + pythonMetrics("pythonDataReceived") += bytesReadEnd - bytesReadStart + deserializeColumnarBatch(batch, schema) + } else { + reader.close(false) + allocator.close() + read() + } + } else { + val specialSign = stream.readInt() + + specialSign match { + case SpecialLengths.START_ARROW_STREAM => + reader = new ArrowStreamReader(stream, allocator) + root = reader.getVectorSchemaRoot() + schema = ArrowUtils.fromArrowSchema(root.getSchema()) + vectors = root + .getFieldVectors() + .asScala + .map { vector => + new ArrowColumnVector(vector) + } + .toArray[ColumnVector] + + read() + case SpecialLengths.TIMING_DATA => + handleTimingData() + read() + case SpecialLengths.PYTHON_EXCEPTION_THROWN => + throw handlePythonException() + case SpecialLengths.END_OF_DATA_SECTION => + 
handleEndOfDataSection() + null.asInstanceOf[OUT] + } + } + } catch handleException + } + } + } +} + +private[python] trait SedonaBasicPythonArrowOutput + extends SedonaPythonArrowOutput[ColumnarBatch] { + self: BasePythonRunner[_, ColumnarBatch] => + + protected def deserializeColumnarBatch( + batch: ColumnarBatch, + schema: StructType): ColumnarBatch = batch +} diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/WorkerContext.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/WorkerContext.scala new file mode 100644 index 00000000000..dbad8358d6f --- /dev/null +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/WorkerContext.scala @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.spark.sql.execution.python + +import java.net.Socket +import scala.collection.mutable + +object WorkerContext { + + def createPythonWorker( + pythonExec: String, + envVars: Map[String, String]): (java.net.Socket, Option[Int]) = { + synchronized { + val key = (pythonExec, envVars) + pythonWorkers.getOrElseUpdate(key, new SedonaDBWorkerFactory(pythonExec, envVars)).create() + } + } + + private[spark] def destroyPythonWorker( + pythonExec: String, + envVars: Map[String, String], + worker: Socket): Unit = { + synchronized { + val key = (pythonExec, envVars) + pythonWorkers + .get(key) + .foreach(workerFactory => { + workerFactory.stopWorker(worker) + }) + } + } + + private val pythonWorkers = + mutable.HashMap[(String, Map[String, String]), SedonaDBWorkerFactory]() + +} diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/udf/ExtractSedonaUDFRule.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/udf/ExtractSedonaUDFRule.scala index 3d3301580cc..ebb5a568e14 100644 --- a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/udf/ExtractSedonaUDFRule.scala +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/udf/ExtractSedonaUDFRule.scala @@ -44,9 +44,7 @@ class ExtractSedonaUDFRule extends Rule[LogicalPlan] with Logging { } def isScalarPythonUDF(e: Expression): Boolean = { - e.isInstanceOf[PythonUDF] && e - .asInstanceOf[PythonUDF] - .evalType == PythonEvalType.SQL_SCALAR_SEDONA_UDF + e.isInstanceOf[PythonUDF] && PythonEvalType.evals.contains(e.asInstanceOf[PythonUDF].evalType) } private def collectEvaluableUDFsFromExpressions( @@ -168,13 +166,12 @@ class ExtractSedonaUDFRule extends Rule[LogicalPlan] with Logging { evalTypes.mkString(",")) } val evalType = evalTypes.head - val evaluation = evalType match { - case PythonEvalType.SQL_SCALAR_SEDONA_UDF => - SedonaArrowEvalPython(validUdfs, resultAttrs, child, evalType) - case _ => - throw new IllegalStateException("Unexpected UDF evalType") + if 
(!PythonEvalType.evals().contains(evalType)) { + throw new IllegalStateException(s"Unexpected UDF evalType: $evalType") } + val evaluation = SedonaArrowEvalPython(validUdfs, resultAttrs, child, evalType) + attributeMap ++= validUdfs.map(canonicalizeDeterministic).zip(resultAttrs) evaluation } else { diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/udf/SedonaArrowStrategy.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/udf/SedonaArrowStrategy.scala deleted file mode 100644 index a403fa6b9eb..00000000000 --- a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/udf/SedonaArrowStrategy.scala +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.spark.sql.udf - -import org.apache.sedona.sql.UDF.PythonEvalType -import org.apache.spark.api.python.ChainedPythonFunctions -import org.apache.spark.{JobArtifactSet, TaskContext} -import org.apache.spark.sql.Strategy -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.{Attribute, PythonUDF} -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import org.apache.spark.sql.execution.SparkPlan -import org.apache.spark.sql.execution.python.{ArrowPythonRunner, BatchIterator, EvalPythonExec, PythonSQLMetrics} -import org.apache.spark.sql.types.StructType - -import scala.collection.JavaConverters.asScalaIteratorConverter - -// We use custom Strategy to avoid Apache Spark assert on types, we -// can consider extending this to support other engines working with -// arrow data -class SedonaArrowStrategy extends Strategy { - override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { - case SedonaArrowEvalPython(udfs, output, child, evalType) => - SedonaArrowEvalPythonExec(udfs, output, planLater(child), evalType) :: Nil - case _ => Nil - } -} - -// It's modification og Apache Spark's ArrowEvalPythonExec, we remove the check on the types to allow geometry types -// here, it's initial version to allow the vectorized udf for Sedona geometry types. 
We can consider extending this -// to support other engines working with arrow data -case class SedonaArrowEvalPythonExec( - udfs: Seq[PythonUDF], - resultAttrs: Seq[Attribute], - child: SparkPlan, - evalType: Int) - extends EvalPythonExec - with PythonSQLMetrics { - - private val batchSize = conf.arrowMaxRecordsPerBatch - private val sessionLocalTimeZone = conf.sessionLocalTimeZone - private val largeVarTypes = conf.arrowUseLargeVarTypes - private val pythonRunnerConf = ArrowPythonRunner.getPythonRunnerConfMap(conf) - private[this] val jobArtifactUUID = JobArtifactSet.getCurrentJobArtifactState.map(_.uuid) - - protected override def evaluate( - funcs: Seq[ChainedPythonFunctions], - argOffsets: Array[Array[Int]], - iter: Iterator[InternalRow], - schema: StructType, - context: TaskContext): Iterator[InternalRow] = { - - val batchIter = if (batchSize > 0) new BatchIterator(iter, batchSize) else Iterator(iter) - - val columnarBatchIter = new ArrowPythonRunner( - funcs, - evalType - PythonEvalType.SEDONA_UDF_TYPE_CONSTANT, - argOffsets, - schema, - sessionLocalTimeZone, - largeVarTypes, - pythonRunnerConf, - pythonMetrics, - jobArtifactUUID).compute(batchIter, context.partitionId(), context) - - columnarBatchIter.flatMap { batch => - batch.rowIterator.asScala - } - } - - override protected def withNewChildInternal(newChild: SparkPlan): SparkPlan = - copy(child = newChild) -} diff --git a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala index 28943ff11da..e0b81c5e479 100644 --- a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala +++ b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala @@ -46,6 +46,7 @@ trait TestBaseScala extends FunSpec with BeforeAndAfterAll { // We need to be explicit about broadcasting in tests. 
.config("sedona.join.autoBroadcastJoinThreshold", "-1") .config("spark.sql.extensions", "org.apache.sedona.sql.SedonaSqlExtensions") + .config("sedona.python.worker.udf.module", "sedonaworker.worker") .config(keyParserExtension, ThreadLocalRandom.current().nextBoolean()) .getOrCreate() diff --git a/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/StrategySuite.scala b/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/StrategySuite.scala index 8d41848de98..7719b2199c5 100644 --- a/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/StrategySuite.scala +++ b/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/StrategySuite.scala @@ -20,8 +20,8 @@ package org.apache.spark.sql.udf import org.apache.sedona.sql.TestBaseScala import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.functions.col -import org.apache.spark.sql.udf.ScalarUDF.geoPandasScalaFunction +import org.apache.spark.sql.functions.{col, expr, lit} +import org.apache.spark.sql.udf.ScalarUDF.{geoPandasScalaFunction, sedonaDBGeometryToGeometryFunction} import org.locationtech.jts.io.WKTReader import org.scalatest.matchers.should.Matchers @@ -35,7 +35,8 @@ class StrategySuite extends TestBaseScala with Matchers { import spark.implicits._ - it("sedona geospatial UDF") { + + it("sedona geospatial UDF - geopandas") { val df = Seq( (1, "value", wktReader.read("POINT(21 52)")), (2, "value1", wktReader.read("POINT(20 50)")), @@ -43,11 +44,13 @@ class StrategySuite extends TestBaseScala with Matchers { (4, "value3", wktReader.read("POINT(20 48)")), (5, "value4", wktReader.read("POINT(20 47)"))) .toDF("id", "value", "geom") + + val geopandasUDFDF = df .withColumn("geom_buffer", geoPandasScalaFunction(col("geom"))) - df.count shouldEqual 5 + geopandasUDFDF.count shouldEqual 5 - df.selectExpr("ST_AsText(ST_ReducePrecision(geom_buffer, 2))") + geopandasUDFDF.selectExpr("ST_AsText(ST_ReducePrecision(geom_buffer, 2))") .as[String] .collect() should contain theSameElementsAs Seq( "POLYGON ((20 51, 20 53, 22 53, 22 51, 20 51))", @@ -56,4 +59,23 @@ class StrategySuite extends TestBaseScala with Matchers { "POLYGON ((19 47, 19 49, 21 49, 21 47, 19 47))", "POLYGON ((19 46, 19 48, 21 48, 21 46, 19 46))") } + + it("sedona geospatial UDF - sedona db") { + val df = Seq( + (1, "value", wktReader.read("POINT(21 52)")), + (2, "value1", wktReader.read("POINT(20 50)")), + (3, "value2", wktReader.read("POINT(20 49)")), + (4, "value3", wktReader.read("POINT(20 48)")), + (5, "value4", wktReader.read("POINT(20 47)"))) + .toDF("id", "value", "geom") + + val dfVectorized = df + .withColumn("geometry", expr("ST_SetSRID(geom, '4326')")) + .select(sedonaDBGeometryToGeometryFunction(col("geometry"), lit(100)).alias("geom")) + + dfVectorized.selectExpr("ST_X(ST_Centroid(geom)) AS x") + .selectExpr("sum(x)") + .as[Double] + .collect().head shouldEqual 101 + } } diff --git a/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/TestScalarPandasUDF.scala b/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/TestScalarPandasUDF.scala index c0a2d8f260d..23aac14bbe5 100644 --- a/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/TestScalarPandasUDF.scala +++ b/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/TestScalarPandasUDF.scala @@ -19,11 +19,13 @@ package org.apache.spark.sql.udf import org.apache.sedona.sql.UDF -import org.apache.spark.TestUtils +import org.apache.spark.{SparkEnv, TestUtils} import org.apache.spark.api.python._ import org.apache.spark.broadcast.Broadcast +import 
org.apache.spark.internal.config.Python.{PYTHON_USE_DAEMON, PYTHON_WORKER_MODULE} import org.apache.spark.sql.execution.python.UserDefinedPythonFunction import org.apache.spark.sql.sedona_sql.UDT.GeometryUDT +import org.apache.spark.sql.types.FloatType import org.apache.spark.util.Utils import java.io.File @@ -43,6 +45,9 @@ object ScalarUDF { } } + SparkEnv.get.conf.set(PYTHON_USE_DAEMON, false) + SparkEnv.get.conf.set(PYTHON_WORKER_MODULE, "sedonaworker.work") + private[spark] lazy val pythonPath = sys.env.getOrElse("PYTHONPATH", "") protected lazy val sparkHome: String = { sys.props.getOrElse("spark.test.home", sys.env("SPARK_HOME")) @@ -54,7 +59,7 @@ object ScalarUDF { private lazy val isPythonAvailable: Boolean = TestUtils.testCommandAvailable(pythonExec) - lazy val pythonVer: String = if (isPythonAvailable) { + val pythonVer: String = if (isPythonAvailable) { Process( Seq(pythonExec, "-c", "import sys; print('%d.%d' % sys.version_info[:2])"), None, @@ -70,31 +75,85 @@ object ScalarUDF { finally Utils.deleteRecursively(path) } - val pandasFunc: Array[Byte] = { + val additionalModule = "spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf" + + val vectorizedFunction: Array[Byte] = { + var binaryPandasFunc: Array[Byte] = null + withTempPath { path => + Process( + Seq( + pythonExec, + "-c", + f""" + |from pyspark.sql.types import FloatType + |from pyspark.serializers import CloudPickleSerializer + |f = open('$path', 'wb'); + | + |def apply_function_on_number(x): + | return x + 1.0 + |f.write(CloudPickleSerializer().dumps((apply_function_on_number, FloatType()))) + |""".stripMargin), + None, + "PYTHONPATH" -> s"$pysparkPythonPath:$pythonPath").!! + binaryPandasFunc = Files.readAllBytes(path.toPath) + } + assert(binaryPandasFunc != null) + binaryPandasFunc + } + + val sedonaDBGeometryToGeometryFunctionBytes: Array[Byte] = { + var binaryPandasFunc: Array[Byte] = null + withTempPath { path => + Process( + Seq( + pythonExec, + "-c", + f""" + |import pyarrow as pa + |import shapely + |import geoarrow.pyarrow as ga + |from sedonadb import udf + |from sedona.sql.types import GeometryType + |from pyspark.serializers import CloudPickleSerializer + |from pyspark.sql.types import DoubleType, IntegerType + |from sedonadb import udf as sedona_udf_module + | + |@sedona_udf_module.arrow_udf(ga.wkb(), [udf.GEOMETRY, udf.NUMERIC]) + |def geometry_udf(geom, distance): + | geom_wkb = pa.array(geom.storage.to_array()) + | distance = pa.array(distance.to_array()) + | geom = shapely.from_wkb(geom_wkb) + | result_shapely = shapely.buffer(geom, distance) + | + | return pa.array(shapely.to_wkb(result_shapely)) + | + |f = open('$path', 'wb'); + |f.write(CloudPickleSerializer().dumps((lambda: geometry_udf, GeometryType()))) + |""".stripMargin), + None, + "PYTHONPATH" -> s"$pysparkPythonPath:$pythonPath").!! 
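+      // The helper script above CloudPickles a (UDF factory, return type) pair into the
+      // temp file; the bytes are read back below and become the `command` payload of the
+      // UserDefinedPythonFunction shipped to the Python worker.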
+ binaryPandasFunc = Files.readAllBytes(path.toPath) + } + assert(binaryPandasFunc != null) + binaryPandasFunc + } + + val geopandasNonGeometryToGeometryFunction: Array[Byte] = { var binaryPandasFunc: Array[Byte] = null withTempPath { path => - println(path) Process( Seq( pythonExec, "-c", f""" - |from pyspark.sql.types import IntegerType - |from shapely.geometry import Point - |from sedona.sql.types import GeometryType - |from pyspark.serializers import CloudPickleSerializer - |from sedona.utils import geometry_serde - |from shapely import box - |f = open('$path', 'wb'); - |def w(x): - | def apply_function(w): - | geom, offset = geometry_serde.deserialize(w) - | bounds = geom.buffer(1).bounds - | x = box(*bounds) - | return geometry_serde.serialize(x) - | return x.apply(apply_function) - |f.write(CloudPickleSerializer().dumps((w, GeometryType()))) - |""".stripMargin), + |from sedona.sql.types import GeometryType + |from shapely.wkt import loads + |from pyspark.serializers import CloudPickleSerializer + |f = open('$path', 'wb'); + |def apply_geopandas(x): + | return x.apply(lambda wkt: loads(wkt).buffer(1)) + |f.write(CloudPickleSerializer().dumps((apply_geopandas, GeometryType()))) + |""".stripMargin), None, "PYTHONPATH" -> s"$pysparkPythonPath:$pythonPath").!! binaryPandasFunc = Files.readAllBytes(path.toPath) @@ -104,7 +163,39 @@ object ScalarUDF { } private val workerEnv = new java.util.HashMap[String, String]() - workerEnv.put("PYTHONPATH", s"$pysparkPythonPath:$pythonPath") + + val pandasFunc: Array[Byte] = { + var binaryPandasFunc: Array[Byte] = null + withTempPath { path => + println(path) + Process( + Seq( + pythonExec, + "-c", + f""" + |from pyspark.sql.types import IntegerType + |from shapely.geometry import Point + |from sedona.sql.types import GeometryType + |from pyspark.serializers import CloudPickleSerializer + |from sedona.utils import geometry_serde + |from shapely import box + |f = open('$path', 'wb'); + |def w(x): + | def apply_function(w): + | geom, offset = geometry_serde.deserialize(w) + | bounds = geom.buffer(1).bounds + | x = box(*bounds) + | return geometry_serde.serialize(x) + | return x.apply(apply_function) + |f.write(CloudPickleSerializer().dumps((w, GeometryType()))) + |""".stripMargin), + None, + "PYTHONPATH" -> s"$pysparkPythonPath:$pythonPath").!! 
+ binaryPandasFunc = Files.readAllBytes(path.toPath) + } + assert(binaryPandasFunc != null) + binaryPandasFunc + } val geoPandasScalaFunction: UserDefinedPythonFunction = UserDefinedPythonFunction( name = "geospatial_udf", @@ -119,4 +210,33 @@ object ScalarUDF { dataType = GeometryUDT, pythonEvalType = UDF.PythonEvalType.SQL_SCALAR_SEDONA_UDF, udfDeterministic = true) + + val nonGeometryVectorizedUDF: UserDefinedPythonFunction = UserDefinedPythonFunction( + name = "vectorized_udf", + func = SimplePythonFunction( + command = vectorizedFunction, + envVars = workerEnv.clone().asInstanceOf[java.util.Map[String, String]], + pythonIncludes = List.empty[String].asJava, + pythonExec = pythonExec, + pythonVer = pythonVer, + broadcastVars = List.empty[Broadcast[PythonBroadcast]].asJava, + accumulator = null), + dataType = FloatType, + pythonEvalType = PythonEvalType.SQL_SCALAR_PANDAS_UDF, + udfDeterministic = false) + + val sedonaDBGeometryToGeometryFunction: UserDefinedPythonFunction = UserDefinedPythonFunction( + name = "geospatial_udf", + func = SimplePythonFunction( + command = sedonaDBGeometryToGeometryFunctionBytes, + envVars = workerEnv.clone().asInstanceOf[java.util.Map[String, String]], + pythonIncludes = List.empty[String].asJava, + pythonExec = pythonExec, + pythonVer = pythonVer, + broadcastVars = List.empty[Broadcast[PythonBroadcast]].asJava, + accumulator = null), + dataType = GeometryUDT, + pythonEvalType = UDF.PythonEvalType.SQL_SCALAR_SEDONA_DB_UDF, + udfDeterministic = true) + } From 45c2fd64ca1d18b1947e988a346bac793333f592 Mon Sep 17 00:00:00 2001 From: pawelkocinski Date: Sun, 4 Jan 2026 22:39:13 +0100 Subject: [PATCH 02/11] add sedonadb sedona udf worker example --- python/sedona/spark/worker/serde.py | 27 --------------------------- python/sedona/spark/worker/worker.py | 25 ------------------------- 2 files changed, 52 deletions(-) diff --git a/python/sedona/spark/worker/serde.py b/python/sedona/spark/worker/serde.py index 31038b7fcdd..3954d075b74 100644 --- a/python/sedona/spark/worker/serde.py +++ b/python/sedona/spark/worker/serde.py @@ -1,29 +1,8 @@ -import socket - from pyspark.serializers import write_int, SpecialLengths from pyspark.sql.pandas.serializers import ArrowStreamPandasSerializer from sedona.spark.worker.udf_info import UDFInfo - -def read_available(buf, chunk=4096): - # buf.raw._sock.settimeout(0.01) # non-blocking-ish - data = bytearray() - index = 0 - while True: - index+=1 - try: - chunk_bytes = buf.read(chunk) - except socket.timeout: - break - - if not chunk_bytes and index > 10: - break - - data.extend(chunk_bytes) - - return bytes(data) - class SedonaDBSerializer(ArrowStreamPandasSerializer): def __init__(self, timezone, safecheck, db, udf_info: UDFInfo): super(SedonaDBSerializer, self).__init__(timezone, safecheck) @@ -64,12 +43,6 @@ def arrow_dump_stream(self, iterator, stream): writer.close() def dump_stream(self, iterator, stream): - """ - Override because Pandas UDFs require a START_ARROW_STREAM before the Arrow stream is sent. - This should be sent after creating the first record batch so in case of an error, it can - be sent back to the JVM before the Arrow stream starts. 
- """ - def init_stream_yield_batches(): should_write_start_length = True for batch in iterator: diff --git a/python/sedona/spark/worker/worker.py b/python/sedona/spark/worker/worker.py index 74a61b02ee8..571134f4076 100644 --- a/python/sedona/spark/worker/worker.py +++ b/python/sedona/spark/worker/worker.py @@ -104,7 +104,6 @@ def add_path(path: str): sys.path.insert(1, path) spark_files_dir = utf_serde.loads(infile) - # _accumulatorRegistry.clear() SparkFiles._root_directory = spark_files_dir SparkFiles._is_running_on_worker = True @@ -165,26 +164,6 @@ def read_udf(infile, pickle_ser) -> UDFInfo: geom_offsets=[0] ) -# def read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index): -# num_arg = read_int(infile) -# arg_offsets = [read_int(infile) for i in range(num_arg)] -# chained_func = None -# for i in range(read_int(infile)): -# f, return_type = read_command(pickleSer, infile) -# if chained_func is None: -# chained_func = f -# else: -# chained_func = chain(chained_func, f) -# -# func = chained_func -# -# # the last returnType will be the return type of UDF -# if eval_type == PythonEvalType.SQL_SCALAR_PANDAS_UDF: -# return arg_offsets, func, return_type -# else: -# raise ValueError("Unknown eval type: {}".format(eval_type)) -# - def register_sedona_db_udf(infile, pickle_ser) -> UDFInfo: num_udfs = read_int(infile) @@ -211,11 +190,7 @@ def write_statistics(infile, outfile, boot_time, init_time) -> None: write_long(shuffle.MemoryBytesSpilled, outfile) write_long(shuffle.DiskBytesSpilled, outfile) - # Mark the beginning of the accumulators section of the output write_int(SpecialLengths.END_OF_DATA_SECTION, outfile) - # write_int(len(_accumulatorRegistry), outfile) - # for (aid, accum) in _accumulatorRegistry.items(): - # pickleSer._write_with_length((aid, accum._value), outfile) if read_int(infile) == SpecialLengths.END_OF_STREAM: write_int(SpecialLengths.END_OF_STREAM, outfile) From 880d8c7140c65eb05a91158cf6a6ae4dde7c5a37 Mon Sep 17 00:00:00 2001 From: pawelkocinski Date: Sun, 4 Jan 2026 23:27:06 +0100 Subject: [PATCH 03/11] add sedonadb sedona udf worker example --- python/sedona/spark/worker/worker.py | 2 -- python/tests/utils/test_sedona_db_vectorized_udf.py | 3 ++- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/python/sedona/spark/worker/worker.py b/python/sedona/spark/worker/worker.py index 571134f4076..6b2a18c8f20 100644 --- a/python/sedona/spark/worker/worker.py +++ b/python/sedona/spark/worker/worker.py @@ -170,8 +170,6 @@ def register_sedona_db_udf(infile, pickle_ser) -> UDFInfo: udf = None for _ in range(num_udfs): udf = read_udf(infile, pickle_ser) - # Here we would register the UDF with SedonaDB's internal context - return udf diff --git a/python/tests/utils/test_sedona_db_vectorized_udf.py b/python/tests/utils/test_sedona_db_vectorized_udf.py index 749d45420ef..6021811e916 100644 --- a/python/tests/utils/test_sedona_db_vectorized_udf.py +++ b/python/tests/utils/test_sedona_db_vectorized_udf.py @@ -5,6 +5,7 @@ from sedona.sql import GeometryType from pyspark.sql.functions import expr, lit from pyspark.sql.types import DoubleType, IntegerType +from sedona.spark.sql import ST_X class TestSedonaDBArrowFunction(TestBase): @@ -28,7 +29,7 @@ def my_own_function(geom, distance): ["id", "wkt"], ).withColumn("wkt", expr("ST_GeomFromWKT(wkt)")) - df.select(my_own_function(df.wkt, lit(100)).alias("geom")).show() + df.select(ST_X(my_own_function(df.wkt, lit(100)).alias("geom"))).show() def test_geometry_to_double(self): 
@sedona_db_vectorized_udf(return_type=DoubleType(), input_types=[GeometryType()]) From 53f69e51f964f0e578bd2d97ec9b5971be565da2 Mon Sep 17 00:00:00 2001 From: pawelkocinski Date: Fri, 9 Jan 2026 10:47:48 +0100 Subject: [PATCH 04/11] add sedonadb sedona udf worker example --- python/tests/test_base.py | 2 + .../utils/test_sedona_db_vectorized_udf.py | 41 +++ .../python/SedonaBasePythonRunner.scala | 6 +- .../python/SedonaDBWorkerFactory.scala | 247 +++++++++++++++++- .../python/SedonaPythonArrowOutput.scala | 26 +- .../sql/execution/python/WorkerContext.scala | 24 +- .../sedona/sql/SQLSyntaxTestScala.scala | 8 +- .../org/apache/sedona/sql/TestBaseScala.scala | 24 +- .../apache/spark/sql/udf/StrategySuite.scala | 43 ++- 9 files changed, 373 insertions(+), 48 deletions(-) diff --git a/python/tests/test_base.py b/python/tests/test_base.py index a6dbae65975..911860e416f 100644 --- a/python/tests/test_base.py +++ b/python/tests/test_base.py @@ -70,6 +70,8 @@ def spark(self): "spark.sedona.stac.load.itemsLimitMax", "20", ) + .config("spark.executor.memory", "10G") \ + .config("spark.driver.memory", "10G") \ # Pandas on PySpark doesn't work with ANSI mode, which is enabled by default # in Spark 4 .config("spark.sql.ansi.enabled", "false") diff --git a/python/tests/utils/test_sedona_db_vectorized_udf.py b/python/tests/utils/test_sedona_db_vectorized_udf.py index 6021811e916..904d59a2820 100644 --- a/python/tests/utils/test_sedona_db_vectorized_udf.py +++ b/python/tests/utils/test_sedona_db_vectorized_udf.py @@ -93,3 +93,44 @@ def return_same_geometry(geom): crs_list = result_df.selectExpr("ST_SRID(geom)").rdd.flatMap(lambda x: x).collect() assert crs_list == [3857, 3857, 3857] + + def test_geometry_to_geometry(self): + @sedona_db_vectorized_udf(return_type=GeometryType(), input_types=[GeometryType()]) + def buffer_geometry(geom): + geom_wkb = pa.array(geom.storage.to_array()) + geom = shapely.from_wkb(geom_wkb) + + result_shapely = shapely.buffer(geom, 10) + + return pa.array(shapely.to_wkb(result_shapely)) + + df = self.spark.read.\ + format("geoparquet").\ + load("/Users/pawelkocinski/Desktop/projects/sedona-production/apache-sedona-book/data/warehouse/buildings_large_3") + # 18 24 + # df.union(df).union(df).union(df).union(df).union(df).union(df).\ + # write.format("geoparquet").mode("overwrite").save("/Users/pawelkocinski/Desktop/projects/sedona-production/apache-sedona-book/data/warehouse/buildings_large_3") + + values = df.select(buffer_geometry(df.geometry).alias("geometry")).\ + selectExpr("ST_Area(geometry) as area").\ + selectExpr("Sum(area) as total_area") + + values.show() + + def test_geometry_to_geometry_normal_udf(self): + from pyspark.sql.functions import udf + + def create_buffer(geom): + return geom.buffer(10) + + create_buffer_udf = udf(create_buffer, GeometryType()) + + df = self.spark.read. \ + format("geoparquet"). \ + load("/Users/pawelkocinski/Desktop/projects/sedona-production/apache-sedona-book/data/warehouse/buildings_large_3") + + values = df.select(create_buffer_udf(df.geometry).alias("geometry")). \ + selectExpr("ST_Area(geometry) as area"). 
\ + selectExpr("Sum(area) as total_area") + + values.show() diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaBasePythonRunner.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaBasePythonRunner.scala index 8ecc110e39d..276383a0ee8 100644 --- a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaBasePythonRunner.scala +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaBasePythonRunner.scala @@ -46,6 +46,7 @@ private[spark] abstract class SedonaBasePythonRunner[IN, OUT]( require(funcs.length == argOffsets.length, "argOffsets should have the same length as funcs") private val conf = SparkEnv.get.conf + private val reuseWorker = conf.getBoolean(PYTHON_WORKER_REUSE.key, PYTHON_WORKER_REUSE.defaultValue.get) private val faultHandlerEnabled = conf.get(PYTHON_WORKER_FAULTHANLDER_ENABLED) private def getWorkerMemoryMb(mem: Option[Long], cores: Int): Option[Long] = { @@ -82,6 +83,7 @@ private[spark] abstract class SedonaBasePythonRunner[IN, OUT]( envVars.put("SPARK_JOB_ARTIFACT_UUID", jobArtifactUUID.getOrElse("default")) + println("running the compute for SedonaBasePythonRunner and partition index: " + partitionIndex) val (worker: Socket, pid: Option[Int]) = { WorkerContext.createPythonWorker(pythonExec, envVars.asScala.toMap) } @@ -93,8 +95,10 @@ private[spark] abstract class SedonaBasePythonRunner[IN, OUT]( context.addTaskCompletionListener[Unit] { _ => writerThread.shutdownOnTaskCompletion() - if (releasedOrClosed.compareAndSet(false, true)) { + + if (!reuseWorker || releasedOrClosed.compareAndSet(false, true)) { try { + logInfo("Shutting down worker socket") worker.close() } catch { case e: Exception => diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaDBWorkerFactory.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaDBWorkerFactory.scala index add09a7cb22..93bcaee0c6f 100644 --- a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaDBWorkerFactory.scala +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaDBWorkerFactory.scala @@ -19,36 +19,68 @@ package org.apache.spark.sql.execution.python import org.apache.spark.{SparkException, SparkFiles} -import org.apache.spark.api.python.{PythonUtils, PythonWorkerFactory} +import org.apache.spark.api.python.PythonUtils import org.apache.spark.util.Utils -import java.io.{DataInputStream, File} -import java.net.{InetAddress, ServerSocket, Socket} +import java.io.{DataInputStream, DataOutputStream, EOFException, File, InputStream} +import java.net.{InetAddress, ServerSocket, Socket, SocketException} import java.util.Arrays -import java.io.InputStream import scala.collection.JavaConverters._ import scala.collection.mutable import org.apache.spark._ +import org.apache.spark.errors.SparkCoreErrors +import org.apache.spark.internal.Logging import org.apache.spark.security.SocketAuthHelper +import org.apache.spark.sql.execution.python.SedonaPythonWorkerFactory.PROCESS_WAIT_TIMEOUT_MS import org.apache.spark.util.RedirectThread -class SedonaDBWorkerFactory(pythonExec: String, envVars: Map[String, String]) - extends PythonWorkerFactory(pythonExec, envVars) { +import java.util.concurrent.TimeUnit +import javax.annotation.concurrent.GuardedBy + +class SedonaDBWorkerFactory(pythonExec: String, envVars: Map[String, String]) extends Logging { self => private val simpleWorkers = new mutable.WeakHashMap[Socket, Process]() 
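+  // Worker creation mirrors Spark's PythonWorkerFactory: when
+  // sedona.python.worker.daemon.enabled is set, workers are forked by the Python
+  // daemon module and parked in idleWorkers for reuse; otherwise each create()
+  // launches a standalone worker process tracked in simpleWorkers.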
private val authHelper = new SocketAuthHelper(SparkEnv.get.conf) + @GuardedBy("self") + private var daemon: Process = null + val daemonHost = InetAddress.getLoopbackAddress() + @GuardedBy("self") + private var daemonPort: Int = 0 + @GuardedBy("self") + private val daemonWorkers = new mutable.WeakHashMap[Socket, Int]() + @GuardedBy("self") + private val idleWorkers = new mutable.Queue[Socket]() + @GuardedBy("self") + private var lastActivityNs = 0L + + private val useDaemon: Boolean = + SparkEnv.get.conf.getBoolean("sedona.python.worker.daemon.enabled", false) private val sedonaUDFWorkerModule = SparkEnv.get.conf.get("sedona.python.worker.udf.module", "sedona.spark.worker.worker") + private val sedonaDaemonModule = + SparkEnv.get.conf.get("sedona.python.worker.udf.daemon.module", "sedona.spark.worker.daemon") + private val pythonPath = PythonUtils.mergePythonPaths( PythonUtils.sparkPythonPath, envVars.getOrElse("PYTHONPATH", ""), sys.env.getOrElse("PYTHONPATH", "")) - override def create(): (Socket, Option[Int]) = { - createSimpleWorker(sedonaUDFWorkerModule) + def create(): (Socket, Option[Int]) = { + if (useDaemon) { + self.synchronized { + if (idleWorkers.nonEmpty) { + val worker = idleWorkers.dequeue() + return (worker, daemonWorkers.get(worker)) + } + } + + createThroughDaemon() + } else { + createSimpleWorker(sedonaUDFWorkerModule) + } } private def createSimpleWorker(workerModule: String): (Socket, Option[Int]) = { @@ -115,4 +147,203 @@ class SedonaDBWorkerFactory(pythonExec: String, envVars: Map[String, String]) logError("Exception in redirecting streams", e) } } + + private def createThroughDaemon(): (Socket, Option[Int]) = { + + def createSocket(): (Socket, Option[Int]) = { + val socket = new Socket(daemonHost, daemonPort) + val pid = new DataInputStream(socket.getInputStream).readInt() + if (pid < 0) { + throw new IllegalStateException("Python daemon failed to launch worker with code " + pid) + } + + authHelper.authToServer(socket) + daemonWorkers.put(socket, pid) + (socket, Some(pid)) + } + + self.synchronized { + // Start the daemon if it hasn't been started + startDaemon() + + // Attempt to connect, restart and retry once if it fails + try { + createSocket() + } catch { + case exc: SocketException => + logWarning("Failed to open socket to Python daemon:", exc) + logWarning("Assuming that daemon unexpectedly quit, attempting to restart") + stopDaemon() + startDaemon() + createSocket() + } + } + } + + private def stopDaemon(): Unit = { + logError("daemon stopping called") + self.synchronized { + if (useDaemon) { + cleanupIdleWorkers() + + // Request shutdown of existing daemon by sending SIGTERM + if (daemon != null) { + daemon.destroy() + } + + daemon = null + daemonPort = 0 + } else { + println("Stopping simple workers") + simpleWorkers.mapValues(_.destroy()) + } + } + } + + private def startDaemon(): Unit = { + self.synchronized { + // Is it already running? 
+ if (daemon != null) { + return + } + + try { + // Create and start the daemon + val command = Arrays.asList(pythonExec, "-m", sedonaDaemonModule) + val pb = new ProcessBuilder(command) + val jobArtifactUUID = envVars.getOrElse("SPARK_JOB_ARTIFACT_UUID", "default") + if (jobArtifactUUID != "default") { + val f = new File(SparkFiles.getRootDirectory(), jobArtifactUUID) + f.mkdir() + pb.directory(f) + } + val workerEnv = pb.environment() + workerEnv.putAll(envVars.asJava) + workerEnv.put("PYTHONPATH", pythonPath) + workerEnv.put("PYTHON_WORKER_FACTORY_SECRET", authHelper.secret) + if (Utils.preferIPv6) { + workerEnv.put("SPARK_PREFER_IPV6", "True") + } + // This is equivalent to setting the -u flag; we use it because ipython doesn't support -u: + workerEnv.put("PYTHONUNBUFFERED", "YES") + daemon = pb.start() + + val in = new DataInputStream(daemon.getInputStream) + try { + daemonPort = in.readInt() + } catch { + case _: EOFException if daemon.isAlive => + throw SparkCoreErrors.eofExceptionWhileReadPortNumberError( + sedonaDaemonModule) + case _: EOFException => + throw SparkCoreErrors. + eofExceptionWhileReadPortNumberError(sedonaDaemonModule, Some(daemon.exitValue)) + } + + // test that the returned port number is within a valid range. + // note: this does not cover the case where the port number + // is arbitrary data but is also coincidentally within range + if (daemonPort < 1 || daemonPort > 0xffff) { + val exceptionMessage = f""" + |Bad data in $sedonaDaemonModule's standard output. Invalid port number: + | $daemonPort (0x$daemonPort%08x) + |Python command to execute the daemon was: + | ${command.asScala.mkString(" ")} + |Check that you don't have any unexpected modules or libraries in + |your PYTHONPATH: + | $pythonPath + |Also, check if you have a sitecustomize.py module in your python path, + |or in your python installation, that is printing to standard output""" + throw new SparkException(exceptionMessage.stripMargin) + } + + // Redirect daemon stdout and stderr + redirectStreamsToStderr(in, daemon.getErrorStream) + } catch { + case e: Exception => + + // If the daemon exists, wait for it to finish and get its stderr + val stderr = Option(daemon) + .flatMap { d => Utils.getStderr(d, PROCESS_WAIT_TIMEOUT_MS) } + .getOrElse("") + + stopDaemon() + + if (stderr != "") { + val formattedStderr = stderr.replace("\n", "\n ") + val errorMessage = s""" + |Error from python worker: + | $formattedStderr + |PYTHONPATH was: + | $pythonPath + |$e""" + + // Append error message from python daemon, but keep original stack trace + val wrappedException = new SparkException(errorMessage.stripMargin) + wrappedException.setStackTrace(e.getStackTrace) + throw wrappedException + } else { + throw e + } + } + + // Important: don't close daemon's stdin (daemon.getOutputStream) so it can correctly + // detect our disappearance. + } + } + + private def cleanupIdleWorkers(): Unit = { + while (idleWorkers.nonEmpty) { + val worker = idleWorkers.dequeue() + try { + // the worker will exit after closing the socket + worker.close() + } catch { + case e: Exception => + logWarning("Failed to close worker socket", e) + } + } + } + + def releaseWorker(worker: Socket): Unit = { + if (useDaemon) { + logInfo("Releasing worker back to daemon pool") + self.synchronized { + lastActivityNs = System.nanoTime() + idleWorkers.enqueue(worker) + } + } else { + // Cleanup the worker socket. This will also cause the Python worker to exit. 
+ try { + worker.close() + } catch { + case e: Exception => + logWarning("Failed to close worker socket", e) + } + } + } + + def stopWorker(worker: Socket): Unit = { + self.synchronized { + if (useDaemon) { + if (daemon != null) { + daemonWorkers.get(worker).foreach { pid => + // tell daemon to kill worker by pid + val output = new DataOutputStream(daemon.getOutputStream) + output.writeInt(pid) + output.flush() + daemon.getOutputStream.flush() + } + } + } else { + simpleWorkers.get(worker).foreach(_.destroy()) + } + } + worker.close() + } +} + +private object SedonaPythonWorkerFactory { + val PROCESS_WAIT_TIMEOUT_MS = 10000 + val IDLE_WORKER_TIMEOUT_NS = TimeUnit.MINUTES.toNanos(1) // kill idle workers after 1 minute } diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala index a9421df0af8..0c0b220933f 100644 --- a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala @@ -26,6 +26,7 @@ import org.apache.arrow.vector.VectorSchemaRoot import org.apache.arrow.vector.ipc.ArrowStreamReader import org.apache.spark.{SparkEnv, TaskContext} import org.apache.spark.api.python.{BasePythonRunner, SpecialLengths} +import org.apache.spark.internal.config.Python.PYTHON_WORKER_REUSE import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.ArrowUtils @@ -33,6 +34,8 @@ import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnVector, Columna private[python] trait SedonaPythonArrowOutput[OUT <: AnyRef] { self: BasePythonRunner[_, OUT] => + private val reuseWorker = SparkEnv.get.conf.getBoolean(PYTHON_WORKER_REUSE.key, PYTHON_WORKER_REUSE.defaultValue.get) + protected def pythonMetrics: Map[String, SQLMetric] protected def deserializeColumnarBatch(batch: ColumnarBatch, schema: StructType): OUT @@ -78,11 +81,28 @@ private[python] trait SedonaPythonArrowOutput[OUT <: AnyRef] { self: BasePythonR private var batchLoaded = true - def handleEndOfDataSectionSedona(): Unit = { - if (stream.readInt() == SpecialLengths.END_OF_STREAM) {} - + protected def handleEndOfDataSectionSedona(): Unit = { + // We've finished the data section of the output, but we can still + // read some accumulator updates: +// val numAccumulatorUpdates = stream.readInt() +// (1 to numAccumulatorUpdates).foreach { _ => +// val updateLen = stream.readInt() +// val update = new Array[Byte](updateLen) +// stream.readFully(update) +// } + // Check whether the worker is ready to be re-used. 
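+      // END_OF_STREAM from the Python side means the worker finished this task cleanly;
+      // only then, and only when spark.python.worker.reuse is enabled, is the socket
+      // released back to the SedonaDBWorkerFactory via WorkerContext.releasePythonWorker
+      // (which pools it for reuse in daemon mode) instead of being closed by the
+      // task-completion listener.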
+ if (stream.readInt() == SpecialLengths.END_OF_STREAM) { + if (reuseWorker && releasedOrClosed.compareAndSet(false, true)) { + WorkerContext.releasePythonWorker(pythonExec, envVars.asScala.toMap, worker) + } + } eos = true } +// def handleEndOfDataSectionSedona(): Unit = { +// if (stream.readInt() == SpecialLengths.END_OF_STREAM) {} +// +// eos = true +// } protected override def handleEndOfDataSection(): Unit = { handleEndOfDataSectionSedona() diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/WorkerContext.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/WorkerContext.scala index dbad8358d6f..82fe6deddaf 100644 --- a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/WorkerContext.scala +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/WorkerContext.scala @@ -24,25 +24,27 @@ import scala.collection.mutable object WorkerContext { def createPythonWorker( - pythonExec: String, - envVars: Map[String, String]): (java.net.Socket, Option[Int]) = { + pythonExec: String, + envVars: Map[String, String]): (java.net.Socket, Option[Int]) = { synchronized { val key = (pythonExec, envVars) pythonWorkers.getOrElseUpdate(key, new SedonaDBWorkerFactory(pythonExec, envVars)).create() } } - private[spark] def destroyPythonWorker( - pythonExec: String, - envVars: Map[String, String], - worker: Socket): Unit = { + def destroyPythonWorker(pythonExec: String, + envVars: Map[String, String], worker: Socket): Unit = { synchronized { val key = (pythonExec, envVars) - pythonWorkers - .get(key) - .foreach(workerFactory => { - workerFactory.stopWorker(worker) - }) + pythonWorkers.get(key).foreach(_.stopWorker(worker)) + } + } + + def releasePythonWorker(pythonExec: String, + envVars: Map[String, String], worker: Socket): Unit = { + synchronized { + val key = (pythonExec, envVars) + pythonWorkers.get(key).foreach(_.releaseWorker(worker)) } } diff --git a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/SQLSyntaxTestScala.scala b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/SQLSyntaxTestScala.scala index 6f873d0a087..72a27461f69 100644 --- a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/SQLSyntaxTestScala.scala +++ b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/SQLSyntaxTestScala.scala @@ -47,11 +47,11 @@ class SQLSyntaxTestScala extends TestBaseScala with TableDrivenPropertyChecks { try { sparkSession.sql("CREATE TABLE T_TEST_EXPLICIT_GEOMETRY (GEO_COL GEOMETRY)") sparkSession.catalog.tableExists("T_TEST_EXPLICIT_GEOMETRY") should be(true) - sparkSession.sparkContext.getConf.get(keyParserExtension) should be("true") +// sparkSession.sparkContext.getConf.get(keyParserExtension) should be("true") } catch { case ex: Exception => ex.getClass.getName.endsWith("ParseException") should be(true) - sparkSession.sparkContext.getConf.get(keyParserExtension) should be("false") +// sparkSession.sparkContext.getConf.get(keyParserExtension) should be("false") } } @@ -61,11 +61,11 @@ class SQLSyntaxTestScala extends TestBaseScala with TableDrivenPropertyChecks { sparkSession.sql( "CREATE TABLE T_TEST_EXPLICIT_GEOMETRY_2 (INT_COL INT, GEO_COL GEOMETRY)") sparkSession.catalog.tableExists("T_TEST_EXPLICIT_GEOMETRY_2") should be(true) - sparkSession.sparkContext.getConf.get(keyParserExtension) should be("true") +// sparkSession.sparkContext.getConf.get(keyParserExtension) should be("true") } catch { case ex: Exception => ex.getClass.getName.endsWith("ParseException") should be(true) - 
sparkSession.sparkContext.getConf.get(keyParserExtension) should be("false") +// sparkSession.sparkContext.getConf.get(keyParserExtension) should be("false") } } } diff --git a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala index e0b81c5e479..e64e9dec3bb 100644 --- a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala +++ b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala @@ -30,13 +30,13 @@ import java.io.FileInputStream import java.util.concurrent.ThreadLocalRandom trait TestBaseScala extends FunSpec with BeforeAndAfterAll { - Logger.getRootLogger().setLevel(Level.WARN) - Logger.getLogger("org.apache").setLevel(Level.WARN) - Logger.getLogger("com").setLevel(Level.WARN) - Logger.getLogger("akka").setLevel(Level.WARN) - Logger.getLogger("org.apache.sedona.core").setLevel(Level.WARN) +// Logger.getRootLogger().setLevel(Level.WARN) +// Logger.getLogger("org.apache").setLevel(Level.WARN) +// Logger.getLogger("com").setLevel(Level.WARN) +// Logger.getLogger("akka").setLevel(Level.WARN) +// Logger.getLogger("org.apache.sedona.core").setLevel(Level.WARN) - val keyParserExtension = "spark.sedona.enableParserExtension" +// val keyParserExtension = "spark.sedona.enableParserExtension" val warehouseLocation = System.getProperty("user.dir") + "/target/" val sparkSession = SedonaContext .builder() @@ -47,9 +47,19 @@ trait TestBaseScala extends FunSpec with BeforeAndAfterAll { .config("sedona.join.autoBroadcastJoinThreshold", "-1") .config("spark.sql.extensions", "org.apache.sedona.sql.SedonaSqlExtensions") .config("sedona.python.worker.udf.module", "sedonaworker.worker") - .config(keyParserExtension, ThreadLocalRandom.current().nextBoolean()) + .config("sedona.python.worker.udf.daemon.module", "sedonaworker.daemon") + .config("sedona.python.worker.daemon.enabled", "true") +// .config(keyParserExtension, ThreadLocalRandom.current().nextBoolean()) .getOrCreate() +// private val useDaemon: Boolean = +// SparkEnv.get.conf.getBoolean("sedona.python.worker.daemon.enabled", false) +// +// private val sedonaUDFWorkerModule = +// SparkEnv.get.conf.get("sedona.python.worker.udf.module", "sedona.spark.worker.worker") +// +// private val sedonaDaemonModule = +// SparkEnv.get.conf.get("sedona.python.worker.udf.daemon.module", "sedona.spark.worker.daemon") val sparkSessionMinio = SedonaContext .builder() .master("local[*]") diff --git a/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/StrategySuite.scala b/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/StrategySuite.scala index 7719b2199c5..000c1f55b62 100644 --- a/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/StrategySuite.scala +++ b/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/StrategySuite.scala @@ -61,21 +61,36 @@ class StrategySuite extends TestBaseScala with Matchers { } it("sedona geospatial UDF - sedona db") { - val df = Seq( - (1, "value", wktReader.read("POINT(21 52)")), - (2, "value1", wktReader.read("POINT(20 50)")), - (3, "value2", wktReader.read("POINT(20 49)")), - (4, "value3", wktReader.read("POINT(20 48)")), - (5, "value4", wktReader.read("POINT(20 47)"))) - .toDF("id", "value", "geom") +// val df = Seq( +// (1, "value", wktReader.read("POINT(21 52)")), +// (2, "value1", wktReader.read("POINT(20 50)")), +// (3, "value2", wktReader.read("POINT(20 49)")), +// (4, "value3", wktReader.read("POINT(20 48)")), +// (5, "value4", wktReader.read("POINT(20 47)"))) +// 
.toDF("id", "value", "geom") +// +// val dfVectorized = df +// .withColumn("geometry", expr("ST_SetSRID(geom, '4326')")) +// .select(sedonaDBGeometryToGeometryFunction(col("geometry"), lit(100)).alias("geom")) + +// dfVectorized.selectExpr("ST_X(ST_Centroid(geom)) AS x") +// .selectExpr("sum(x)") +// .as[Double] +// .collect().head shouldEqual 101 + + val dfCopied = sparkSession.read + .format("geoparquet") + .load("/Users/pawelkocinski/Desktop/projects/sedona-production/apache-sedona-book/book/source_data/transportation_barcelona/barcelona.geoparquet") - val dfVectorized = df - .withColumn("geometry", expr("ST_SetSRID(geom, '4326')")) - .select(sedonaDBGeometryToGeometryFunction(col("geometry"), lit(100)).alias("geom")) + val values = dfCopied.unionAll(dfCopied) + .unionAll(dfCopied) +// .unionAll(dfCopied) +// .unionAll(dfCopied) +// .unionAll(dfCopied) + .select(sedonaDBGeometryToGeometryFunction(col("geometry"), lit(10)).alias("geom")) + .selectExpr("ST_Area(geom) as area") + .selectExpr("Sum(area) as total_area") - dfVectorized.selectExpr("ST_X(ST_Centroid(geom)) AS x") - .selectExpr("sum(x)") - .as[Double] - .collect().head shouldEqual 101 + values.show() } } From e00915751a90249f3395dc5d71957e0521f4a640 Mon Sep 17 00:00:00 2001 From: pawelkocinski Date: Tue, 13 Jan 2026 00:25:30 +0100 Subject: [PATCH 05/11] add sedonadb sedona udf worker example --- python/pyproject.toml | 14 +- python/sedona/spark/sql/functions.py | 22 +- python/sedona/spark/utils/geometry_serde.py | 6 +- python/sedona/spark/utils/udf.py | 26 ++ python/sedona/spark/worker/daemon.py | 227 ++++++++++++++++++ python/sedona/spark/worker/serde.py | 6 +- python/sedona/spark/worker/udf_info.py | 9 +- python/sedona/spark/worker/worker.py | 18 +- python/setup.py | 6 + python/src/geom_buf.c | 2 + python/src/geomserde.c | 1 + python/src/geomserde_speedup_module.c | 99 +++++++- python/tests/test_base.py | 2 + .../utils/test_sedona_db_vectorized_udf.py | 132 +++++++++- .../sedona/sql/UDF/PythonEvalType.scala | 4 +- .../python/SedonaArrowPythonRunner.scala | 6 +- .../python/SedonaArrowStrategy.scala | 10 +- .../python/SedonaBasePythonRunner.scala | 12 +- .../python/SedonaDBWorkerFactory.scala | 14 +- .../python/SedonaPythonArrowInput.scala | 3 + .../python/SedonaPythonArrowOutput.scala | 3 +- .../sql/execution/python/WorkerContext.scala | 16 +- .../spark/sql/udf/ExtractSedonaUDFRule.scala | 3 +- .../org/apache/sedona/sql/TestBaseScala.scala | 4 +- .../apache/spark/sql/udf/StrategySuite.scala | 59 +++-- 25 files changed, 602 insertions(+), 102 deletions(-) create mode 100644 python/sedona/spark/utils/udf.py create mode 100644 python/sedona/spark/worker/daemon.py create mode 100644 python/setup.py diff --git a/python/pyproject.toml b/python/pyproject.toml index 76169261c34..37159cf83b9 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -16,7 +16,7 @@ # under the License. 
[build-system] -requires = ["setuptools>=80.9.0", "wheel"] +requires = ["setuptools>=80.9.0", "wheel", "numpy"] build-backend = "setuptools.build_meta" [project] @@ -36,6 +36,7 @@ dependencies = [ "geoarrow-c>=0.3.1", "geoarrow-pyarrow>=0.2.0", "geopandas>=1.1.2", + "numpy>=2.1.3", "pyarrow>=16.1.0", "pyspark==3.5.4", "sedonadb", @@ -79,14 +80,19 @@ exclude = ["*.tests", "*.tests.*", "tests", "tests.*"] name = "sedona.spark.utils.geomserde_speedup" sources = [ "src/geomserde_speedup_module.c", + "src/sedonaserde_vectorized_udf_module.c", "src/geomserde.c", "src/geom_buf.c", "src/geos_c_dyn.c", ] -[tool.uv] -dev-dependencies = [ - "pytest>=9.0.2", +[[tool.setuptools.ext-modules]] +name = "sedona.spark.utils.sedonaserde_vectorized_udf_module" +sources = [ + "src/sedonaserde_vectorized_udf_module.c", + "src/geomserde.c", + "src/geom_buf.c", + "src/geos_c_dyn.c", ] [tool.uv.sources] diff --git a/python/sedona/spark/sql/functions.py b/python/sedona/spark/sql/functions.py index 7c480e1700d..232ccb50a3c 100644 --- a/python/sedona/spark/sql/functions.py +++ b/python/sedona/spark/sql/functions.py @@ -28,8 +28,9 @@ import geoarrow.pyarrow as ga from sedonadb import udf as sedona_udf_module from sedona.spark.sql.types import GeometryType -from pyspark.sql.types import DataType, FloatType, DoubleType, IntegerType, StringType +from pyspark.sql.types import DataType, FloatType, DoubleType, IntegerType, StringType, ByteType +from sedona.spark.utils.udf import has_sedona_serializer_speedup SEDONA_SCALAR_EVAL_TYPE = 5200 SEDONA_PANDAS_ARROW_NAME = "SedonaPandasArrowUDF" @@ -51,7 +52,7 @@ class InvalidSedonaUDFType(Exception): def sedona_vectorized_udf( - return_type: DataType, udf_type: SedonaUDFType = SedonaUDFType.SHAPELY_SCALAR + return_type: DataType, udf_type: SedonaUDFType = SedonaUDFType.SHAPELY_SCALAR ): import geopandas as gpd @@ -92,7 +93,7 @@ def apply_fn(fn): def _apply_shapely_series_udf( - fn, return_type: DataType, serialize_geom: bool, deserialize_geom: bool + fn, return_type: DataType, serialize_geom: bool, deserialize_geom: bool ): def apply(series: pd.Series) -> pd.Series: applied = series.apply( @@ -113,7 +114,7 @@ def apply(series: pd.Series) -> pd.Series: def _apply_geo_series_udf( - fn, return_type: DataType, serialize_geom: bool, deserialize_geom: bool + fn, return_type: DataType, serialize_geom: bool, deserialize_geom: bool ): import geopandas as gpd @@ -161,6 +162,7 @@ def infer_pa_type(spark_type: DataType): else: raise NotImplementedError(f"Type {spark_type} is not supported yet.") + def infer_input_type(spark_type: DataType): if isinstance(spark_type, GeometryType): return sedona_udf_module.GEOMETRY @@ -168,9 +170,12 @@ def infer_input_type(spark_type: DataType): return sedona_udf_module.NUMERIC elif isinstance(spark_type, StringType): return sedona_udf_module.STRING + elif isinstance(spark_type, ByteType): + return sedona_udf_module.BINARY else: raise NotImplementedError(f"Type {spark_type} is not supported yet.") + def infer_input_types(spark_types: list[DataType]): pa_types = [] for spark_type in spark_types: @@ -182,8 +187,12 @@ def infer_input_types(spark_types: list[DataType]): def sedona_db_vectorized_udf( return_type: DataType, - input_types: list[DataType] + input_types: list[DataType], ): + eval_type = 6201 + if has_sedona_serializer_speedup(): + eval_type = 6200 + def apply_fn(fn): out_type = infer_pa_type(return_type) input_types_sedona_db = infer_input_types(input_types) @@ -193,10 +202,9 @@ def shapely_udf(*args, **kwargs): return fn(*args, **kwargs) udf = 
UserDefinedFunction( - lambda: shapely_udf, return_type, "SedonaPandasArrowUDF", evalType=6200 + lambda: shapely_udf, return_type, "SedonaPandasArrowUDF", evalType=eval_type ) return udf - return apply_fn diff --git a/python/sedona/spark/utils/geometry_serde.py b/python/sedona/spark/utils/geometry_serde.py index 103eb498173..0ef3d4ed5cb 100644 --- a/python/sedona/spark/utils/geometry_serde.py +++ b/python/sedona/spark/utils/geometry_serde.py @@ -25,9 +25,6 @@ speedup_enabled = False - -# Use geomserde_speedup when available, otherwise fallback to general pure -# python implementation. try: from . import geomserde_speedup @@ -60,8 +57,9 @@ def find_geos_c_dll(): def deserialize(buf: bytearray) -> Optional[BaseGeometry]: if buf is None: return None - return geomserde_speedup.deserialize(buf) + return geomserde_speedup.deserialize_2(buf) + # Export the from_sedona_func for use with numpy ufuncs speedup_enabled = True elif shapely.__version__.startswith("1."): diff --git a/python/sedona/spark/utils/udf.py b/python/sedona/spark/utils/udf.py new file mode 100644 index 00000000000..01a38a675a9 --- /dev/null +++ b/python/sedona/spark/utils/udf.py @@ -0,0 +1,26 @@ +import shapely + + +def has_sedona_serializer_speedup(): + try: + from . import geomserde_speedup + except ImportError: + return False + return True + +def to_sedona_func(arr): + try: + from . import geomserde_speedup + except ImportError: + return shapely.to_wkb(arr) + + return geomserde_speedup.to_sedona_func(arr) + + +def from_sedona_func(arr): + try: + from . import geomserde_speedup + except ImportError: + return shapely.from_wkb(arr) + + return geomserde_speedup.from_sedona_func(arr) diff --git a/python/sedona/spark/worker/daemon.py b/python/sedona/spark/worker/daemon.py new file mode 100644 index 00000000000..0d64a543c56 --- /dev/null +++ b/python/sedona/spark/worker/daemon.py @@ -0,0 +1,227 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import logging +import numbers +import os +import signal +import select +import socket +import sys +import traceback +import time +import gc +from errno import EINTR, EAGAIN +from socket import AF_INET, AF_INET6, SOCK_STREAM, SOMAXCONN +from signal import SIGHUP, SIGTERM, SIGCHLD, SIG_DFL, SIG_IGN, SIGINT + +from sedona.spark.worker.worker import main as worker_main +from pyspark.serializers import read_int, write_int, write_with_length, UTF8Deserializer + + +def compute_real_exit_code(exit_code): + # SystemExit's code can be integer or string, but os._exit only accepts integers + if isinstance(exit_code, numbers.Integral): + return exit_code + else: + return 1 + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +file_handler = logging.FileHandler("/Users/pawelkocinski/Desktop/projects/sedonaworker/sedonaworker/logs/worker_daemon_main.log", delay=False) +file_handler.flush = file_handler.stream.flush + +logger.addHandler(file_handler) + +def worker(sock, authenticated): + logger.info("Starting worker process with pid =" + str(os.getpid()) + " socket " + str(sock)) + """ + Called by a worker process after the fork(). + """ + signal.signal(SIGHUP, SIG_DFL) + signal.signal(SIGCHLD, SIG_DFL) + signal.signal(SIGTERM, SIG_DFL) + # restore the handler for SIGINT, + # it's useful for debugging (show the stacktrace before exit) + signal.signal(SIGINT, signal.default_int_handler) + + # Read the socket using fdopen instead of socket.makefile() because the latter + # seems to be very slow; note that we need to dup() the file descriptor because + # otherwise writes also cause a seek that makes us miss data on the read side. + buffer_size = int(os.environ.get("SPARK_BUFFER_SIZE", 65536)) + infile = os.fdopen(os.dup(sock.fileno()), "rb", buffer_size) + outfile = os.fdopen(os.dup(sock.fileno()), "wb", buffer_size) + + if not authenticated: + client_secret = UTF8Deserializer().loads(infile) + if os.environ["PYTHON_WORKER_FACTORY_SECRET"] == client_secret: + write_with_length("ok".encode("utf-8"), outfile) + outfile.flush() + else: + write_with_length("err".encode("utf-8"), outfile) + outfile.flush() + sock.close() + return 1 + + exit_code = 0 + try: + worker_main(infile, outfile) + except SystemExit as exc: + exit_code = compute_real_exit_code(exc.code) + finally: + try: + outfile.flush() + except Exception: + pass + return exit_code + + +def manager(): + # Create a new process group to corral our children + os.setpgid(0, 0) + + # Create a listening socket on the loopback interface + if os.environ.get("SPARK_PREFER_IPV6", "false").lower() == "true": + listen_sock = socket.socket(AF_INET6, SOCK_STREAM) + listen_sock.bind(("::1", 0, 0, 0)) + listen_sock.listen(max(1024, SOMAXCONN)) + listen_host, listen_port, _, _ = listen_sock.getsockname() + else: + listen_sock = socket.socket(AF_INET, SOCK_STREAM) + listen_sock.bind(("127.0.0.1", 0)) + listen_sock.listen(max(1024, SOMAXCONN)) + listen_host, listen_port = listen_sock.getsockname() + + # re-open stdin/stdout in 'wb' mode + stdin_bin = os.fdopen(sys.stdin.fileno(), "rb", 4) + stdout_bin = os.fdopen(sys.stdout.fileno(), "wb", 4) + write_int(listen_port, stdout_bin) + stdout_bin.flush() + + def shutdown(code): + signal.signal(SIGTERM, SIG_DFL) + # Send SIGHUP to notify workers of shutdown + os.kill(0, SIGHUP) + sys.exit(code) + + def handle_sigterm(*args): + shutdown(1) + + signal.signal(SIGTERM, handle_sigterm) # Gracefully exit on SIGTERM + signal.signal(SIGHUP, SIG_IGN) # Don't die on SIGHUP + signal.signal(SIGCHLD, SIG_IGN) + + 
reuse = os.environ.get("SPARK_REUSE_WORKER") + + # Initialization complete + try: + while True: + try: + ready_fds = select.select([0, listen_sock], [], [], 1)[0] + except select.error as ex: + if ex[0] == EINTR: + continue + else: + raise + + if 0 in ready_fds: + try: + worker_pid = read_int(stdin_bin) + except EOFError: + # Spark told us to exit by closing stdin + shutdown(0) + try: + os.kill(worker_pid, signal.SIGKILL) + except OSError: + pass # process already died + + if listen_sock in ready_fds: + try: + sock, _ = listen_sock.accept() + except OSError as e: + if e.errno == EINTR: + continue + raise + + # Launch a worker process + try: + pid = os.fork() + except OSError as e: + if e.errno in (EAGAIN, EINTR): + time.sleep(1) + pid = os.fork() # error here will shutdown daemon + else: + outfile = sock.makefile(mode="wb") + write_int(e.errno, outfile) # Signal that the fork failed + outfile.flush() + outfile.close() + sock.close() + continue + + if pid == 0: + # in child process + listen_sock.close() + + # It should close the standard input in the child process so that + # Python native function executions stay intact. + # + # Note that if we just close the standard input (file descriptor 0), + # the lowest file descriptor (file descriptor 0) will be allocated, + # later when other file descriptors should happen to open. + # + # Therefore, here we redirects it to '/dev/null' by duplicating + # another file descriptor for '/dev/null' to the standard input (0). + # See SPARK-26175. + devnull = open(os.devnull, "r") + os.dup2(devnull.fileno(), 0) + devnull.close() + + try: + # Acknowledge that the fork was successful + outfile = sock.makefile(mode="wb") + write_int(os.getpid(), outfile) + outfile.flush() + outfile.close() + authenticated = False + while True: + code = worker(sock, authenticated) + logger.info("Worker exited with code %d", code) + if code == 0: + authenticated = True + if not reuse or code: + # wait for closing + try: + while sock.recv(1024): + pass + except Exception: + pass + break + gc.collect() + except BaseException: + traceback.print_exc() + os._exit(1) + else: + os._exit(0) + else: + sock.close() + + finally: + shutdown(1) + + +if __name__ == "__main__": + manager() diff --git a/python/sedona/spark/worker/serde.py b/python/sedona/spark/worker/serde.py index 3954d075b74..5a33a266105 100644 --- a/python/sedona/spark/worker/serde.py +++ b/python/sedona/spark/worker/serde.py @@ -4,10 +4,11 @@ from sedona.spark.worker.udf_info import UDFInfo class SedonaDBSerializer(ArrowStreamPandasSerializer): - def __init__(self, timezone, safecheck, db, udf_info: UDFInfo): + def __init__(self, timezone, safecheck, db, udf_info: UDFInfo, cast_to_wkb=False): super(SedonaDBSerializer, self).__init__(timezone, safecheck) self.db = db self.udf_info = udf_info + self.cast_to_wkb = cast_to_wkb def load_stream(self, stream): import pyarrow as pa @@ -22,7 +23,7 @@ def load_stream(self, stream): df.to_view(table_name) - sql_expression = self.udf_info.sedona_db_transformation_expr(table_name) + sql_expression = self.udf_info.sedona_db_transformation_expr(table_name, self.cast_to_wkb) index += 1 @@ -37,7 +38,6 @@ def arrow_dump_stream(self, iterator, stream): if writer is None: writer = pa.RecordBatchStreamWriter(stream, batch.schema) writer.write_batch(batch) - # stream.flush() finally: if writer is not None: writer.close() diff --git a/python/sedona/spark/worker/udf_info.py b/python/sedona/spark/worker/udf_info.py index d354bcea7ed..7853133e77f 100644 --- a/python/sedona/spark/worker/udf_info.py 
+++ b/python/sedona/spark/worker/udf_info.py @@ -11,24 +11,23 @@ class UDFInfo: return_type: object name: str - def get_function_call_sql(self, table_name: str) -> str: + def get_function_call_sql(self, table_name: str, cast_to_wkb: bool = False) -> str: arg_offset_str = ", ".join([f"_{el}" for el in self.arg_offsets]) function_expr = f"{self.name}({arg_offset_str})" - if isinstance(self.return_type, GeometryType): + if isinstance(self.return_type, GeometryType) and cast_to_wkb: return f"SELECT ST_GeomToSedonaSpark({function_expr}) AS _0 FROM {table_name}" return f"SELECT {function_expr} AS _0 FROM {table_name}" - def sedona_db_transformation_expr(self, table_name: str) -> str: + def sedona_db_transformation_expr(self, table_name: str, cast_to_wkb: bool = False) -> str: fields = [] for arg in self.arg_offsets: - if arg in self.geom_offsets: + if arg in self.geom_offsets and cast_to_wkb: crs = self.geom_offsets[arg] fields.append(f"ST_GeomFromSedonaSpark(_{arg}, 'EPSG:{crs}') AS _{arg}") continue fields.append(f"_{arg}") - fields_expr = ", ".join(fields) return f"SELECT {fields_expr} FROM {table_name}" diff --git a/python/sedona/spark/worker/worker.py b/python/sedona/spark/worker/worker.py index 6b2a18c8f20..17dae02e637 100644 --- a/python/sedona/spark/worker/worker.py +++ b/python/sedona/spark/worker/worker.py @@ -15,16 +15,17 @@ from sedona.spark.worker.udf_info import UDFInfo -def apply_iterator(db, iterator, udf_info: UDFInfo): +def apply_iterator(db, iterator, udf_info: UDFInfo, cast_to_wkb: bool = False): i = 0 for df in iterator: i+=1 table_name = f"output_table_{i}" df.to_view(table_name) - function_call_sql = udf_info.get_function_call_sql(table_name) + function_call_sql = udf_info.get_function_call_sql(table_name, cast_to_wkb=cast_to_wkb) df_out = db.sql(function_call_sql) + df_out.to_view(f"view_{i}") at = df_out.to_arrow_table() batches = at.combine_chunks().to_batches() @@ -207,9 +208,9 @@ def main(infile, outfile): pickle_ser = CPickleSerializer() split_index = read_int(infile) - # + check_python_version(utf8_deserializer, infile) - # + check_barrier_flag(infile) task_context = assign_task_context(utf_serde=utf8_deserializer, infile=infile) @@ -217,7 +218,7 @@ def main(infile, outfile): shuffle.DiskBytesSpilled = 0 resolve_python_path(utf8_deserializer, infile) - # + check_broadcast_variables(infile) eval_type = read_int(infile) @@ -229,11 +230,14 @@ def main(infile, outfile): sedona_db.register_udf(udf.function) init_time = time.time() + cast_to_wkb = read_bool(infile) + serde = SedonaDBSerializer( timezone=runner_conf.get("spark.sql.session.timeZone", "UTC"), safecheck=False, db=sedona_db, - udf_info=udf + udf_info=udf, + cast_to_wkb=cast_to_wkb ) number_of_geometries = read_int(infile) @@ -247,7 +251,7 @@ def main(infile, outfile): udf.geom_offsets = geom_offsets iterator = serde.load_stream(infile) - out_iterator = apply_iterator(db=sedona_db, iterator=iterator, udf_info=udf) + out_iterator = apply_iterator(db=sedona_db, iterator=iterator, udf_info=udf, cast_to_wkb=cast_to_wkb) serde.dump_stream(out_iterator, outfile) diff --git a/python/setup.py b/python/setup.py new file mode 100644 index 00000000000..66ab74701b0 --- /dev/null +++ b/python/setup.py @@ -0,0 +1,6 @@ +from setuptools import setup +import numpy + +setup( + include_dirs=[numpy.get_include()], +) diff --git a/python/src/geom_buf.c b/python/src/geom_buf.c index 5239de5ae09..d6a51bb3d0e 100644 --- a/python/src/geom_buf.c +++ b/python/src/geom_buf.c @@ -208,6 +208,8 @@ SedonaErrorCode geom_buf_alloc(GeomBuffer 
*geom_buf, return SEDONA_SUCCESS; } +#include + SedonaErrorCode read_geom_buf_header(const char *buf, int buf_size, GeomBuffer *geom_buf, CoordinateSequenceInfo *cs_info, diff --git a/python/src/geomserde.c b/python/src/geomserde.c index c1f74277383..81dafe216f0 100644 --- a/python/src/geomserde.c +++ b/python/src/geomserde.c @@ -718,6 +718,7 @@ static SedonaErrorCode deserialize_geom_buf(GEOSContextHandle_t handle, return SEDONA_SUCCESS; } +#include SedonaErrorCode sedona_deserialize_geom(GEOSContextHandle_t handle, const char *buf, int buf_size, GEOSGeometry **p_geom, diff --git a/python/src/geomserde_speedup_module.c b/python/src/geomserde_speedup_module.c index a95ced29e57..621f956cd03 100644 --- a/python/src/geomserde_speedup_module.c +++ b/python/src/geomserde_speedup_module.c @@ -20,10 +20,15 @@ #define PY_SSIZE_T_CLEAN #include #include +// +//#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION #include "geomserde.h" #include "geos_c_dyn.h" #include "pygeos/c_api.h" +#include +#include +#include PyDoc_STRVAR(module_doc, "Geometry serialization/deserialization module."); @@ -225,7 +230,7 @@ static PyObject *serialize(PyObject *self, PyObject *args) { return do_serialize(geos_geom); } -static PyObject *deserialize(PyObject *self, PyObject *args) { +static PyObject *deserialize_2(PyObject *self, PyObject *args) { GEOSContextHandle_t handle = NULL; int length = 0; GEOSGeometry *geom = do_deserialize(args, &handle, &length); @@ -262,16 +267,106 @@ static PyObject *deserialize_1(PyObject *self, PyObject *args) { return Py_BuildValue("(Kibi)", geom, geom_type_id, has_z, length); } +static PyObject *to_sedona_func(PyObject *self, PyObject *args) { + import_array(); + PyObject *input_obj = NULL; + if (!PyArg_ParseTuple(args, "O", &input_obj)){ + return NULL; + }; + + PyArrayObject *array = (PyArrayObject *)input_obj; + PyObject **objs = (PyObject **)PyArray_DATA(array); + + GEOSContextHandle_t handle = get_geos_context_handle(); + if (handle == NULL) { + return NULL; + } + + npy_intp n = PyArray_SIZE(input_obj); + npy_intp dims[1] = {n}; + PyArrayObject *out = (PyArrayObject *)PyArray_SimpleNew(1, dims, NPY_OBJECT); + for (npy_intp i = 0; i < PyArray_SIZE(array); i++) { + PyObject *obj = objs[i]; + GEOSGeometry *geos_geom = NULL; + char success = PyGEOS_GetGEOSGeometry(obj, &geos_geom); + + PyObject *serialized = do_serialize(geos_geom); + PyArray_SETITEM(out, PyArray_GETPTR1(out, i), serialized); + } + + return out; +} /* Module definition for Shapely 2.x */ +static PyObject *from_sedona_func(PyObject *self, PyObject *args) { + import_array(); + PyObject *input_obj = NULL; + if (!PyArg_ParseTuple(args, "O", &input_obj)){ + return NULL; + }; + + GEOSContextHandle_t handle = get_geos_context_handle(); + + PyArrayObject *array = (PyArrayObject *)input_obj; + PyObject **objs = (PyObject **)PyArray_DATA(array); + + int p_bytes_read = 0; + + npy_intp n = PyArray_SIZE(input_obj); + + npy_intp dims[1] = {n}; + PyArrayObject *out = (PyArrayObject *)PyArray_SimpleNew(1, dims, NPY_OBJECT); + + for (npy_intp i = 0; i < PyArray_SIZE(array); i++) { + PyObject *obj = objs[i]; + if (!PyBytes_Check(obj)) { + PyErr_SetString(PyExc_TypeError, "Expected bytes"); + return NULL; + } + + char *buf = PyBytes_AS_STRING(obj); + + Py_ssize_t len = PyBytes_GET_SIZE(obj); + + GEOSGeometry *geom = NULL; + + SedonaErrorCode err = sedona_deserialize_geom(handle, buf, len, &geom, &p_bytes_read); + if (err != SEDONA_SUCCESS) { + handle_geomserde_error(err); + return NULL; + } + PyObject *pygeom = PyGEOS_CreateGeometry(geom, 
handle); + + PyArray_SETITEM(out, PyArray_GETPTR1(out, i), pygeom); + } + + return out; +} + static PyMethodDef geomserde_methods_shapely_2[] = { {"load_libgeos_c", load_libgeos_c, METH_VARARGS, "Load libgeos_c."}, {"serialize", serialize, METH_VARARGS, "Serialize geometry object as bytearray."}, - {"deserialize", deserialize, METH_VARARGS, + {"deserialize_2", deserialize_2, METH_VARARGS, + "Deserialize bytes-like object to geometry object."}, + {"from_sedona_func", from_sedona_func, METH_VARARGS, + "Deserialize bytes-like object to geometry object."}, + {"to_sedona_func", to_sedona_func, METH_VARARGS, "Deserialize bytes-like object to geometry object."}, {NULL, NULL, 0, NULL}, /* Sentinel */ }; +// +//static int add_from_sedona_func_to_module(PyObject *m) { +// PyObject *capsule = PyCapsule_New((void *)from_sedona_func, "from_sedona_func", NULL); +// if (capsule == NULL) { +// return -1; +// } +// if (PyModule_AddObject(m, "from_sedona_func", capsule) < 0) { +// Py_DECREF(capsule); +// return -1; +// } +// return 0; +//} static struct PyModuleDef geomserde_module_shapely_2 = { PyModuleDef_HEAD_INIT, "geomserde_speedup", module_doc, 0, diff --git a/python/tests/test_base.py b/python/tests/test_base.py index 911860e416f..e240a09758c 100644 --- a/python/tests/test_base.py +++ b/python/tests/test_base.py @@ -72,6 +72,8 @@ def spark(self): ) .config("spark.executor.memory", "10G") \ .config("spark.driver.memory", "10G") \ + .config("sedona.python.worker.udf.daemon.module", "sedona.spark.worker.daemon") \ + .config("sedona.python.worker.daemon.enabled", "false") \ # Pandas on PySpark doesn't work with ANSI mode, which is enabled by default # in Spark 4 .config("spark.sql.ansi.enabled", "false") diff --git a/python/tests/utils/test_sedona_db_vectorized_udf.py b/python/tests/utils/test_sedona_db_vectorized_udf.py index 904d59a2820..4b266384fa9 100644 --- a/python/tests/utils/test_sedona_db_vectorized_udf.py +++ b/python/tests/utils/test_sedona_db_vectorized_udf.py @@ -1,12 +1,106 @@ +import time + +import numpy as np + from sedona.spark.sql.functions import sedona_db_vectorized_udf +from sedona.spark.utils.udf import to_sedona_func, from_sedona_func from tests.test_base import TestBase import pyarrow as pa import shapely from sedona.sql import GeometryType from pyspark.sql.functions import expr, lit -from pyspark.sql.types import DoubleType, IntegerType +from pyspark.sql.types import DoubleType, IntegerType, ByteType from sedona.spark.sql import ST_X - +from shapely._enum import ParamEnum + +def test_m(): + on_invalid="raise" + wkb = b'\x12\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?' + geometry = np.asarray([wkb, wkb], dtype=object) + + DecodingErrorOptions = ParamEnum( + "DecodingErrorOptions", {"ignore": 0, "warn": 1, "raise": 2, "fix": 3} + ) + + # print("sss") + + + # + # object + # C_CONTIGUOUS : True + # F_CONTIGUOUS : True + # OWNDATA : False + # WRITEABLE : True + # ALIGNED : True + # WRITEBACKIFCOPY : False + # print(type(geometry)) + # print(geometry.dtype) + # print(geometry.flags) + + result = from_sedona_func(geometry) + + result2 = to_sedona_func(result) + +# ensure the input has object dtype, to avoid numpy inferring it as a +# fixed-length string dtype (which removes trailing null bytes upon access +# of array elements) + # + # def from_sedona_func(arr): + # try: + # from . 
import sedonaserde_vectorized_udf_module + # print(sedonaserde_vectorized_udf_module.from_sedona_func_3(arr)) + # except Exception as e: + # print("Cannot import sedonaserde_vectorized_udf_module:") + # print(e) + # # print() + # return None +# +# def from_wkb(geometry, on_invalid="raise", **kwargs): +# r"""Create geometries from the Well-Known Binary (WKB) representation. +# +# The Well-Known Binary format is defined in the `OGC Simple Features +# Specification for SQL `__. +# +# Parameters +# ---------- +# geometry : str or array_like +# The WKB byte object(s) to convert. +# on_invalid : {"raise", "warn", "ignore", "fix"}, default "raise" +# Indicates what to do when an invalid WKB is encountered. Note that the +# validations involved are very basic, e.g. the minimum number of points +# for the geometry type. For a thorough check, use :func:`is_valid` after +# conversion to geometries. Valid options are: +# +# - raise: an exception will be raised if any input geometry is invalid. +# - warn: a warning will be raised and invalid WKT geometries will be +# returned as ``None``. +# - ignore: invalid geometries will be returned as ``None`` without a +# warning. +# - fix: an effort is made to fix invalid input geometries (currently just +# unclosed rings). If this is not possible, they are returned as +# ``None`` without a warning. Requires GEOS >= 3.11. +# +# .. versionadded:: 2.1.0 +# **kwargs +# See :ref:`NumPy ufunc docs ` for other keyword arguments. +# +# Examples +# -------- +# >>> import shapely +# >>> shapely.from_wkb(b'\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?') +# +# +# """ # noqa: E501 +# if not np.isscalar(on_invalid): +# raise TypeError("on_invalid only accepts scalar values") +# +# invalid_handler = np.uint8(DecodingErrorOptions.get_value(on_invalid)) +# +# # ensure the input has object dtype, to avoid numpy inferring it as a +# # fixed-length string dtype (which removes trailing null bytes upon access +# # of array elements) +# geometry = np.asarray(geometry, dtype=object) +# return lib.from_wkb(geometry, invalid_handler, **kwargs) class TestSedonaDBArrowFunction(TestBase): def test_vectorized_udf(self): @@ -15,7 +109,6 @@ def my_own_function(geom, distance): geom_wkb = pa.array(geom.storage.to_array()) distance = pa.array(distance.to_array()) geom = shapely.from_wkb(geom_wkb) - result_shapely = shapely.centroid(geom) return pa.array(shapely.to_wkb(result_shapely)) @@ -95,18 +188,26 @@ def return_same_geometry(geom): assert crs_list == [3857, 3857, 3857] def test_geometry_to_geometry(self): - @sedona_db_vectorized_udf(return_type=GeometryType(), input_types=[GeometryType()]) + @sedona_db_vectorized_udf(return_type=GeometryType(), input_types=[ByteType()]) def buffer_geometry(geom): geom_wkb = pa.array(geom.storage.to_array()) - geom = shapely.from_wkb(geom_wkb) + geometry_array = np.asarray(geom_wkb, dtype=object) + geom = from_sedona_func(geometry_array) result_shapely = shapely.buffer(geom, 10) - return pa.array(shapely.to_wkb(result_shapely)) + return pa.array(to_sedona_func(result_shapely)) df = self.spark.read.\ format("geoparquet").\ - load("/Users/pawelkocinski/Desktop/projects/sedona-production/apache-sedona-book/data/warehouse/buildings_large_3") + load("/Users/pawelkocinski/Desktop/projects/sedona-production/apache-sedona-book/book/source_data/transportation_barcelona_dupl_l1") + + # 1 045 770 + # print(df.count()) + + # df.unionAll(df).unionAll(df).unionAll(df).unionAll(df).unionAll(df).\ + # 
unionAll(df).unionAll(df).unionAll(df).unionAll(df).unionAll(df).\ + # write.format("geoparquet").mode("overwrite").save("/Users/pawelkocinski/Desktop/projects/sedona-production/apache-sedona-book/book/source_data/transportation_barcelona_dupl_l2") # 18 24 # df.union(df).union(df).union(df).union(df).union(df).union(df).\ # write.format("geoparquet").mode("overwrite").save("/Users/pawelkocinski/Desktop/projects/sedona-production/apache-sedona-book/data/warehouse/buildings_large_3") @@ -117,6 +218,12 @@ def buffer_geometry(geom): values.show() + # for _ in range(4): + # start_time = time.time() + # values.show() + # end_time = time.time() + # print(f"Execution time: {end_time - start_time} seconds") + def test_geometry_to_geometry_normal_udf(self): from pyspark.sql.functions import udf @@ -127,10 +234,19 @@ def create_buffer(geom): df = self.spark.read. \ format("geoparquet"). \ - load("/Users/pawelkocinski/Desktop/projects/sedona-production/apache-sedona-book/data/warehouse/buildings_large_3") + load("/Users/pawelkocinski/Desktop/projects/sedona-production/apache-sedona-book/book/source_data/transportation_barcelona_dupl_l2") + # print(df.count()) + # df.limit(10).collect() values = df.select(create_buffer_udf(df.geometry).alias("geometry")). \ selectExpr("ST_Area(geometry) as area"). \ selectExpr("Sum(area) as total_area") values.show() + + # for _ in range(4): + # start_time = time.time() + # values.show() + # end_time = time.time() + # print(f"Execution time: {end_time - start_time} seconds") +# 1 045 770 diff --git a/spark/common/src/main/scala/org/apache/sedona/sql/UDF/PythonEvalType.scala b/spark/common/src/main/scala/org/apache/sedona/sql/UDF/PythonEvalType.scala index 11263dd7f68..0f1a5fe0a01 100644 --- a/spark/common/src/main/scala/org/apache/sedona/sql/UDF/PythonEvalType.scala +++ b/spark/common/src/main/scala/org/apache/sedona/sql/UDF/PythonEvalType.scala @@ -25,6 +25,7 @@ object PythonEvalType { // sedona db eval types val SQL_SCALAR_SEDONA_DB_UDF = 6200 + val SQL_SCALAR_SEDONA_DB_NO_SPEEDUP_UDF = 6201 val SEDONA_DB_UDF_TYPE_CONSTANT = 6000 def toString(pythonEvalType: Int): String = pythonEvalType match { @@ -32,5 +33,6 @@ object PythonEvalType { case SQL_SCALAR_SEDONA_DB_UDF => "SQL_SCALAR_SEDONA_DB_UDF" } - def evals(): Set[Int] = Set(SQL_SCALAR_SEDONA_UDF, SQL_SCALAR_SEDONA_DB_UDF) + def evals(): Set[Int] = + Set(SQL_SCALAR_SEDONA_UDF, SQL_SCALAR_SEDONA_DB_UDF, SQL_SCALAR_SEDONA_DB_NO_SPEEDUP_UDF) } diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaArrowPythonRunner.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaArrowPythonRunner.scala index 0d3960d2d81..3055e768b98 100644 --- a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaArrowPythonRunner.scala +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaArrowPythonRunner.scala @@ -38,13 +38,15 @@ class SedonaArrowPythonRunner( protected override val workerConf: Map[String, String], val pythonMetrics: Map[String, SQLMetric], jobArtifactUUID: Option[String], - geometryFields: Seq[(Int, Int)]) + geometryFields: Seq[(Int, Int)], + castGeometryToWKB: Boolean = false) extends SedonaBasePythonRunner[Iterator[InternalRow], ColumnarBatch]( funcs, evalType, argOffsets, jobArtifactUUID, - geometryFields) + geometryFields, + castGeometryToWKB) with SedonaBasicPythonArrowInput with SedonaBasicPythonArrowOutput { diff --git 
a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaArrowStrategy.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaArrowStrategy.scala index bb897931b63..228ddc2cbca 100644 --- a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaArrowStrategy.scala +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaArrowStrategy.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.python import org.apache.sedona.sql.UDF.PythonEvalType -import org.apache.sedona.sql.UDF.PythonEvalType.{SQL_SCALAR_SEDONA_DB_UDF, SQL_SCALAR_SEDONA_UDF} +import org.apache.sedona.sql.UDF.PythonEvalType.{SQL_SCALAR_SEDONA_DB_NO_SPEEDUP_UDF, SQL_SCALAR_SEDONA_DB_UDF, SQL_SCALAR_SEDONA_UDF} import org.apache.spark.api.python.ChainedPythonFunctions import org.apache.spark.sql.Strategy import org.apache.spark.sql.catalyst.InternalRow @@ -115,10 +115,10 @@ case class SedonaArrowEvalPythonExec( val batchIter = if (batchSize > 0) new BatchIterator(full, batchSize) else Iterator(full) evalType match { - case SQL_SCALAR_SEDONA_DB_UDF => + case SQL_SCALAR_SEDONA_DB_UDF | SQL_SCALAR_SEDONA_DB_NO_SPEEDUP_UDF => val columnarBatchIter = new SedonaArrowPythonRunner( funcs, - evalType - PythonEvalType.SEDONA_DB_UDF_TYPE_CONSTANT, + 200, argOffsets, schema, sessionLocalTimeZone, @@ -126,7 +126,9 @@ case class SedonaArrowEvalPythonExec( pythonRunnerConf, pythonMetrics, jobArtifactUUID, - geometryFields).compute(batchIter, context.partitionId(), context) + geometryFields, + evalType == SQL_SCALAR_SEDONA_DB_UDF) + .compute(batchIter, context.partitionId(), context) val result = columnarBatchIter.flatMap { batch => batch.rowIterator.asScala diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaBasePythonRunner.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaBasePythonRunner.scala index 276383a0ee8..055d5db15f1 100644 --- a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaBasePythonRunner.scala +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaBasePythonRunner.scala @@ -39,14 +39,16 @@ private[spark] abstract class SedonaBasePythonRunner[IN, OUT]( evalType: Int, argOffsets: Array[Array[Int]], jobArtifactUUID: Option[String], - val geometryFields: Seq[(Int, Int)] = Seq.empty) + val geometryFields: Seq[(Int, Int)] = Seq.empty, + val castGeometryToWKB: Boolean = false) extends BasePythonRunner[IN, OUT](funcs, evalType, argOffsets, jobArtifactUUID) with Logging { require(funcs.length == argOffsets.length, "argOffsets should have the same length as funcs") private val conf = SparkEnv.get.conf - private val reuseWorker = conf.getBoolean(PYTHON_WORKER_REUSE.key, PYTHON_WORKER_REUSE.defaultValue.get) + private val reuseWorker = + conf.getBoolean(PYTHON_WORKER_REUSE.key, PYTHON_WORKER_REUSE.defaultValue.get) private val faultHandlerEnabled = conf.get(PYTHON_WORKER_FAULTHANLDER_ENABLED) private def getWorkerMemoryMb(mem: Option[Long], cores: Int): Option[Long] = { @@ -81,9 +83,12 @@ private[spark] abstract class SedonaBasePythonRunner[IN, OUT]( envVars.put("PYTHON_FAULTHANDLER_DIR", SedonaBasePythonRunner.faultHandlerLogDir.toString) } + if (reuseWorker) { + envVars.put("SPARK_REUSE_WORKER", "1") + } + envVars.put("SPARK_JOB_ARTIFACT_UUID", jobArtifactUUID.getOrElse("default")) - println("running the compute for SedonaBasePythonRunner and partition index: " + partitionIndex) val (worker: Socket, pid: Option[Int]) = 
{ WorkerContext.createPythonWorker(pythonExec, envVars.asScala.toMap) } @@ -98,7 +103,6 @@ private[spark] abstract class SedonaBasePythonRunner[IN, OUT]( if (!reuseWorker || releasedOrClosed.compareAndSet(false, true)) { try { - logInfo("Shutting down worker socket") worker.close() } catch { case e: Exception => diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaDBWorkerFactory.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaDBWorkerFactory.scala index 93bcaee0c6f..459388856b0 100644 --- a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaDBWorkerFactory.scala +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaDBWorkerFactory.scala @@ -34,7 +34,6 @@ import org.apache.spark.security.SocketAuthHelper import org.apache.spark.sql.execution.python.SedonaPythonWorkerFactory.PROCESS_WAIT_TIMEOUT_MS import org.apache.spark.util.RedirectThread -import java.util.concurrent.TimeUnit import javax.annotation.concurrent.GuardedBy class SedonaDBWorkerFactory(pythonExec: String, envVars: Map[String, String]) extends Logging { @@ -181,7 +180,6 @@ class SedonaDBWorkerFactory(pythonExec: String, envVars: Map[String, String]) ex } private def stopDaemon(): Unit = { - logError("daemon stopping called") self.synchronized { if (useDaemon) { cleanupIdleWorkers() @@ -194,7 +192,6 @@ class SedonaDBWorkerFactory(pythonExec: String, envVars: Map[String, String]) ex daemon = null daemonPort = 0 } else { - println("Stopping simple workers") simpleWorkers.mapValues(_.destroy()) } } @@ -233,11 +230,11 @@ class SedonaDBWorkerFactory(pythonExec: String, envVars: Map[String, String]) ex daemonPort = in.readInt() } catch { case _: EOFException if daemon.isAlive => - throw SparkCoreErrors.eofExceptionWhileReadPortNumberError( - sedonaDaemonModule) + throw SparkCoreErrors.eofExceptionWhileReadPortNumberError(sedonaDaemonModule) case _: EOFException => - throw SparkCoreErrors. - eofExceptionWhileReadPortNumberError(sedonaDaemonModule, Some(daemon.exitValue)) + throw SparkCoreErrors.eofExceptionWhileReadPortNumberError( + sedonaDaemonModule, + Some(daemon.exitValue)) } // test that the returned port number is within a valid range. 
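For context, not part of the patch: the factory above expects the Python daemon to report its listening port on stdout as a 4-byte big-endian integer, which `in.readInt()` consumes; the EOF branches fire when the daemon exits before writing it. A minimal sketch of that handshake on the Python side, assuming the same convention as PySpark's daemon module (the backlog size and variable names are illustrative):

    import socket
    import struct
    import sys

    # Bind an ephemeral loopback port that forked workers will accept connections on.
    listen_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    listen_sock.bind(("127.0.0.1", 0))
    listen_sock.listen(128)  # backlog size is illustrative

    # Report the chosen port to the JVM parent as a big-endian int on stdout,
    # matching what DataInputStream.readInt() expects on the Scala side.
    port = listen_sock.getsockname()[1]
    sys.stdout.buffer.write(struct.pack(">i", port))
    sys.stdout.buffer.flush()
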
@@ -261,7 +258,6 @@ class SedonaDBWorkerFactory(pythonExec: String, envVars: Map[String, String]) ex redirectStreamsToStderr(in, daemon.getErrorStream) } catch { case e: Exception => - // If the daemon exists, wait for it to finish and get its stderr val stderr = Option(daemon) .flatMap { d => Utils.getStderr(d, PROCESS_WAIT_TIMEOUT_MS) } @@ -307,7 +303,6 @@ class SedonaDBWorkerFactory(pythonExec: String, envVars: Map[String, String]) ex def releaseWorker(worker: Socket): Unit = { if (useDaemon) { - logInfo("Releasing worker back to daemon pool") self.synchronized { lastActivityNs = System.nanoTime() idleWorkers.enqueue(worker) @@ -345,5 +340,4 @@ class SedonaDBWorkerFactory(pythonExec: String, envVars: Map[String, String]) ex private object SedonaPythonWorkerFactory { val PROCESS_WAIT_TIMEOUT_MS = 10000 - val IDLE_WORKER_TIMEOUT_NS = TimeUnit.MINUTES.toNanos(1) // kill idle workers after 1 minute } diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowInput.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowInput.scala index 18db42ae0df..2544e63a979 100644 --- a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowInput.scala +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowInput.scala @@ -60,6 +60,9 @@ private[python] trait SedonaPythonArrowInput[IN] extends PythonArrowInput[IN] { handleMetadataBeforeExec(dataOut) writeUDF(dataOut, funcs, argOffsets) + // if speedup is not available and we need to use casting + dataOut.writeBoolean(self.castGeometryToWKB) + // write dataOut.writeInt(self.geometryFields.length) // write geometry field indices and their SRIDs diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala index 0c0b220933f..27764c2a54f 100644 --- a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala @@ -34,7 +34,8 @@ import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnVector, Columna private[python] trait SedonaPythonArrowOutput[OUT <: AnyRef] { self: BasePythonRunner[_, OUT] => - private val reuseWorker = SparkEnv.get.conf.getBoolean(PYTHON_WORKER_REUSE.key, PYTHON_WORKER_REUSE.defaultValue.get) + private val reuseWorker = + SparkEnv.get.conf.getBoolean(PYTHON_WORKER_REUSE.key, PYTHON_WORKER_REUSE.defaultValue.get) protected def pythonMetrics: Map[String, SQLMetric] diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/WorkerContext.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/WorkerContext.scala index 82fe6deddaf..6411bec97e4 100644 --- a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/WorkerContext.scala +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/WorkerContext.scala @@ -24,24 +24,28 @@ import scala.collection.mutable object WorkerContext { def createPythonWorker( - pythonExec: String, - envVars: Map[String, String]): (java.net.Socket, Option[Int]) = { + pythonExec: String, + envVars: Map[String, String]): (java.net.Socket, Option[Int]) = { synchronized { val key = (pythonExec, envVars) pythonWorkers.getOrElseUpdate(key, new SedonaDBWorkerFactory(pythonExec, envVars)).create() } } - def 
destroyPythonWorker(pythonExec: String, - envVars: Map[String, String], worker: Socket): Unit = { + def destroyPythonWorker( + pythonExec: String, + envVars: Map[String, String], + worker: Socket): Unit = { synchronized { val key = (pythonExec, envVars) pythonWorkers.get(key).foreach(_.stopWorker(worker)) } } - def releasePythonWorker(pythonExec: String, - envVars: Map[String, String], worker: Socket): Unit = { + def releasePythonWorker( + pythonExec: String, + envVars: Map[String, String], + worker: Socket): Unit = { synchronized { val key = (pythonExec, envVars) pythonWorkers.get(key).foreach(_.releaseWorker(worker)) diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/udf/ExtractSedonaUDFRule.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/udf/ExtractSedonaUDFRule.scala index ebb5a568e14..3584cb01bd7 100644 --- a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/udf/ExtractSedonaUDFRule.scala +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/udf/ExtractSedonaUDFRule.scala @@ -44,7 +44,8 @@ class ExtractSedonaUDFRule extends Rule[LogicalPlan] with Logging { } def isScalarPythonUDF(e: Expression): Boolean = { - e.isInstanceOf[PythonUDF] && PythonEvalType.evals.contains(e.asInstanceOf[PythonUDF].evalType) + e.isInstanceOf[PythonUDF] && + PythonEvalType.evals.contains(e.asInstanceOf[PythonUDF].evalType) } private def collectEvaluableUDFsFromExpressions( diff --git a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala index e64e9dec3bb..c9b4d6ac289 100644 --- a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala +++ b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala @@ -46,9 +46,9 @@ trait TestBaseScala extends FunSpec with BeforeAndAfterAll { // We need to be explicit about broadcasting in tests. 
.config("sedona.join.autoBroadcastJoinThreshold", "-1") .config("spark.sql.extensions", "org.apache.sedona.sql.SedonaSqlExtensions") - .config("sedona.python.worker.udf.module", "sedonaworker.worker") + .config("sedona.python.worker.udf.module", "sedona.spark.worker.worker") .config("sedona.python.worker.udf.daemon.module", "sedonaworker.daemon") - .config("sedona.python.worker.daemon.enabled", "true") + .config("sedona.python.worker.daemon.enabled", "false") // .config(keyParserExtension, ThreadLocalRandom.current().nextBoolean()) .getOrCreate() diff --git a/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/StrategySuite.scala b/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/StrategySuite.scala index 000c1f55b62..4fe4acfb121 100644 --- a/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/StrategySuite.scala +++ b/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/StrategySuite.scala @@ -35,7 +35,6 @@ class StrategySuite extends TestBaseScala with Matchers { import spark.implicits._ - it("sedona geospatial UDF - geopandas") { val df = Seq( (1, "value", wktReader.read("POINT(21 52)")), @@ -50,7 +49,8 @@ class StrategySuite extends TestBaseScala with Matchers { geopandasUDFDF.count shouldEqual 5 - geopandasUDFDF.selectExpr("ST_AsText(ST_ReducePrecision(geom_buffer, 2))") + geopandasUDFDF + .selectExpr("ST_AsText(ST_ReducePrecision(geom_buffer, 2))") .as[String] .collect() should contain theSameElementsAs Seq( "POLYGON ((20 51, 20 53, 22 53, 22 51, 20 51))", @@ -61,36 +61,33 @@ class StrategySuite extends TestBaseScala with Matchers { } it("sedona geospatial UDF - sedona db") { -// val df = Seq( -// (1, "value", wktReader.read("POINT(21 52)")), -// (2, "value1", wktReader.read("POINT(20 50)")), -// (3, "value2", wktReader.read("POINT(20 49)")), -// (4, "value3", wktReader.read("POINT(20 48)")), -// (5, "value4", wktReader.read("POINT(20 47)"))) -// .toDF("id", "value", "geom") -// -// val dfVectorized = df -// .withColumn("geometry", expr("ST_SetSRID(geom, '4326')")) -// .select(sedonaDBGeometryToGeometryFunction(col("geometry"), lit(100)).alias("geom")) - -// dfVectorized.selectExpr("ST_X(ST_Centroid(geom)) AS x") -// .selectExpr("sum(x)") -// .as[Double] -// .collect().head shouldEqual 101 - - val dfCopied = sparkSession.read - .format("geoparquet") - .load("/Users/pawelkocinski/Desktop/projects/sedona-production/apache-sedona-book/book/source_data/transportation_barcelona/barcelona.geoparquet") + val df = Seq( + (1, "value", wktReader.read("POINT(21 52)")), + (2, "value1", wktReader.read("POINT(20 50)")), + (3, "value2", wktReader.read("POINT(20 49)")), + (4, "value3", wktReader.read("POINT(20 48)")), + (5, "value4", wktReader.read("POINT(20 47)"))) + .toDF("id", "value", "geom") - val values = dfCopied.unionAll(dfCopied) - .unionAll(dfCopied) -// .unionAll(dfCopied) -// .unionAll(dfCopied) -// .unionAll(dfCopied) - .select(sedonaDBGeometryToGeometryFunction(col("geometry"), lit(10)).alias("geom")) - .selectExpr("ST_Area(geom) as area") - .selectExpr("Sum(area) as total_area") + val dfVectorized = df + .withColumn("geometry", expr("ST_SetSRID(geom, '4326')")) + .select(sedonaDBGeometryToGeometryFunction(col("geometry"), lit(100)).alias("geom")) - values.show() + dfVectorized.selectExpr("ST_X(ST_Centroid(geom)) AS x") + .selectExpr("sum(x)") + .as[Double] + .collect().head shouldEqual 101 +// +// val dfCopied = sparkSession.read +// .format("geoparquet") +// .load( +// 
"/Users/pawelkocinski/Desktop/projects/sedona-production/apache-sedona-book/book/source_data/transportation_barcelona/barcelona.geoparquet") +// +// val values = dfCopied +// .select(sedonaDBGeometryToGeometryFunction(col("geometry"), lit(10)).alias("geom")) +// .selectExpr("ST_Area(geom) as area") +// .selectExpr("Sum(area) as total_area") +// +// values.show() } } From 42229e5f66bb0cf032348799b358991c77e3e63a Mon Sep 17 00:00:00 2001 From: pawelkocinski Date: Tue, 13 Jan 2026 23:23:05 +0100 Subject: [PATCH 06/11] add sedonadb sedona udf worker example --- python/sedona/spark/sql/functions.py | 29 ++- python/sedona/spark/utils/udf.py | 22 +- python/sedona/spark/worker/__init__.py | 16 ++ python/sedona/spark/worker/daemon.py | 48 ++-- python/sedona/spark/worker/serde.py | 25 +- python/sedona/spark/worker/udf_info.py | 25 +- python/sedona/spark/worker/worker.py | 56 ++++- python/setup.py | 17 ++ python/src/geomserde_speedup_module.c | 138 +++++------ python/tests/test_base.py | 14 +- .../utils/test_sedona_db_vectorized_udf.py | 218 ++++-------------- .../apache/spark/sql/udf/StrategySuite.scala | 18 +- 12 files changed, 311 insertions(+), 315 deletions(-) diff --git a/python/sedona/spark/sql/functions.py b/python/sedona/spark/sql/functions.py index 232ccb50a3c..d8bf73c1522 100644 --- a/python/sedona/spark/sql/functions.py +++ b/python/sedona/spark/sql/functions.py @@ -28,7 +28,14 @@ import geoarrow.pyarrow as ga from sedonadb import udf as sedona_udf_module from sedona.spark.sql.types import GeometryType -from pyspark.sql.types import DataType, FloatType, DoubleType, IntegerType, StringType, ByteType +from pyspark.sql.types import ( + DataType, + FloatType, + DoubleType, + IntegerType, + StringType, + ByteType, +) from sedona.spark.utils.udf import has_sedona_serializer_speedup @@ -52,7 +59,7 @@ class InvalidSedonaUDFType(Exception): def sedona_vectorized_udf( - return_type: DataType, udf_type: SedonaUDFType = SedonaUDFType.SHAPELY_SCALAR + return_type: DataType, udf_type: SedonaUDFType = SedonaUDFType.SHAPELY_SCALAR ): import geopandas as gpd @@ -93,7 +100,7 @@ def apply_fn(fn): def _apply_shapely_series_udf( - fn, return_type: DataType, serialize_geom: bool, deserialize_geom: bool + fn, return_type: DataType, serialize_geom: bool, deserialize_geom: bool ): def apply(series: pd.Series) -> pd.Series: applied = series.apply( @@ -114,7 +121,7 @@ def apply(series: pd.Series) -> pd.Series: def _apply_geo_series_udf( - fn, return_type: DataType, serialize_geom: bool, deserialize_geom: bool + fn, return_type: DataType, serialize_geom: bool, deserialize_geom: bool ): import geopandas as gpd @@ -166,7 +173,11 @@ def infer_pa_type(spark_type: DataType): def infer_input_type(spark_type: DataType): if isinstance(spark_type, GeometryType): return sedona_udf_module.GEOMETRY - elif isinstance(spark_type, FloatType) or isinstance(spark_type, DoubleType) or isinstance(spark_type, IntegerType): + elif ( + isinstance(spark_type, FloatType) + or isinstance(spark_type, DoubleType) + or isinstance(spark_type, IntegerType) + ): return sedona_udf_module.NUMERIC elif isinstance(spark_type, StringType): return sedona_udf_module.STRING @@ -186,12 +197,12 @@ def infer_input_types(spark_types: list[DataType]): def sedona_db_vectorized_udf( - return_type: DataType, - input_types: list[DataType], + return_type: DataType, + input_types: list[DataType], ): - eval_type = 6201 + eval_type = 6200 if has_sedona_serializer_speedup(): - eval_type = 6200 + eval_type = 6201 def apply_fn(fn): out_type = infer_pa_type(return_type) 
diff --git a/python/sedona/spark/utils/udf.py b/python/sedona/spark/utils/udf.py index 01a38a675a9..0f88ef07f29 100644 --- a/python/sedona/spark/utils/udf.py +++ b/python/sedona/spark/utils/udf.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import shapely @@ -8,7 +25,8 @@ def has_sedona_serializer_speedup(): return False return True -def to_sedona_func(arr): + +def to_sedona(arr): try: from . import geomserde_speedup except ImportError: @@ -17,7 +35,7 @@ def to_sedona_func(arr): return geomserde_speedup.to_sedona_func(arr) -def from_sedona_func(arr): +def from_sedona(arr): try: from . import geomserde_speedup except ImportError: diff --git a/python/sedona/spark/worker/__init__.py b/python/sedona/spark/worker/__init__.py index e69de29bb2d..13a83393a91 100644 --- a/python/sedona/spark/worker/__init__.py +++ b/python/sedona/spark/worker/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/python/sedona/spark/worker/daemon.py b/python/sedona/spark/worker/daemon.py index 0d64a543c56..ce75e376eac 100644 --- a/python/sedona/spark/worker/daemon.py +++ b/python/sedona/spark/worker/daemon.py @@ -1,19 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at # -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# http://www.apache.org/licenses/LICENSE-2.0 # +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import logging import numbers import os @@ -39,16 +40,23 @@ def compute_real_exit_code(exit_code): else: return 1 + logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) -file_handler = logging.FileHandler("/Users/pawelkocinski/Desktop/projects/sedonaworker/sedonaworker/logs/worker_daemon_main.log", delay=False) +file_handler = logging.FileHandler( + "/Users/pawelkocinski/Desktop/projects/sedonaworker/sedonaworker/logs/worker_daemon_main.log", + delay=False, +) file_handler.flush = file_handler.stream.flush logger.addHandler(file_handler) + def worker(sock, authenticated): - logger.info("Starting worker process with pid =" + str(os.getpid()) + " socket " + str(sock)) + logger.info( + "Starting worker process with pid =" + str(os.getpid()) + " socket " + str(sock) + ) """ Called by a worker process after the fork(). """ @@ -69,10 +77,10 @@ def worker(sock, authenticated): if not authenticated: client_secret = UTF8Deserializer().loads(infile) if os.environ["PYTHON_WORKER_FACTORY_SECRET"] == client_secret: - write_with_length("ok".encode("utf-8"), outfile) + write_with_length(b"ok", outfile) outfile.flush() else: - write_with_length("err".encode("utf-8"), outfile) + write_with_length(b"err", outfile) outfile.flush() sock.close() return 1 @@ -132,7 +140,7 @@ def handle_sigterm(*args): while True: try: ready_fds = select.select([0, listen_sock], [], [], 1)[0] - except select.error as ex: + except OSError as ex: if ex[0] == EINTR: continue else: @@ -186,7 +194,7 @@ def handle_sigterm(*args): # Therefore, here we redirects it to '/dev/null' by duplicating # another file descriptor for '/dev/null' to the standard input (0). # See SPARK-26175. - devnull = open(os.devnull, "r") + devnull = open(os.devnull) os.dup2(devnull.fileno(), 0) devnull.close() diff --git a/python/sedona/spark/worker/serde.py b/python/sedona/spark/worker/serde.py index 5a33a266105..52e7b663a58 100644 --- a/python/sedona/spark/worker/serde.py +++ b/python/sedona/spark/worker/serde.py @@ -1,11 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + from pyspark.serializers import write_int, SpecialLengths from pyspark.sql.pandas.serializers import ArrowStreamPandasSerializer from sedona.spark.worker.udf_info import UDFInfo + class SedonaDBSerializer(ArrowStreamPandasSerializer): def __init__(self, timezone, safecheck, db, udf_info: UDFInfo, cast_to_wkb=False): - super(SedonaDBSerializer, self).__init__(timezone, safecheck) + super().__init__(timezone, safecheck) self.db = db self.udf_info = udf_info self.cast_to_wkb = cast_to_wkb @@ -18,12 +36,15 @@ def load_stream(self, stream): for batch in batches: table = pa.Table.from_batches(batches=[batch]) import pyarrow as pa + df = self.db.create_data_frame(table) table_name = f"my_table_{index}" df.to_view(table_name) - sql_expression = self.udf_info.sedona_db_transformation_expr(table_name, self.cast_to_wkb) + sql_expression = self.udf_info.sedona_db_transformation_expr( + table_name, self.cast_to_wkb + ) index += 1 diff --git a/python/sedona/spark/worker/udf_info.py b/python/sedona/spark/worker/udf_info.py index 7853133e77f..eb278a1511c 100644 --- a/python/sedona/spark/worker/udf_info.py +++ b/python/sedona/spark/worker/udf_info.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from dataclasses import dataclass from sedona.spark import GeometryType @@ -15,11 +32,15 @@ def get_function_call_sql(self, table_name: str, cast_to_wkb: bool = False) -> s arg_offset_str = ", ".join([f"_{el}" for el in self.arg_offsets]) function_expr = f"{self.name}({arg_offset_str})" if isinstance(self.return_type, GeometryType) and cast_to_wkb: - return f"SELECT ST_GeomToSedonaSpark({function_expr}) AS _0 FROM {table_name}" + return ( + f"SELECT ST_GeomToSedonaSpark({function_expr}) AS _0 FROM {table_name}" + ) return f"SELECT {function_expr} AS _0 FROM {table_name}" - def sedona_db_transformation_expr(self, table_name: str, cast_to_wkb: bool = False) -> str: + def sedona_db_transformation_expr( + self, table_name: str, cast_to_wkb: bool = False + ) -> str: fields = [] for arg in self.arg_offsets: if arg in self.geom_offsets and cast_to_wkb: diff --git a/python/sedona/spark/worker/worker.py b/python/sedona/spark/worker/worker.py index 17dae02e637..02fedf00581 100644 --- a/python/sedona/spark/worker/worker.py +++ b/python/sedona/spark/worker/worker.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import importlib import os import sys @@ -8,8 +25,16 @@ from pyspark.errors import PySparkRuntimeError from pyspark.java_gateway import local_connect_and_auth from pyspark.resource import ResourceInformation -from pyspark.serializers import read_int, UTF8Deserializer, read_bool, read_long, CPickleSerializer, write_int, \ - write_long, SpecialLengths +from pyspark.serializers import ( + read_int, + UTF8Deserializer, + read_bool, + read_long, + CPickleSerializer, + write_int, + write_long, + SpecialLengths, +) from sedona.spark.worker.serde import SedonaDBSerializer from sedona.spark.worker.udf_info import UDFInfo @@ -18,11 +43,13 @@ def apply_iterator(db, iterator, udf_info: UDFInfo, cast_to_wkb: bool = False): i = 0 for df in iterator: - i+=1 + i += 1 table_name = f"output_table_{i}" df.to_view(table_name) - function_call_sql = udf_info.get_function_call_sql(table_name, cast_to_wkb=cast_to_wkb) + function_call_sql = udf_info.get_function_call_sql( + table_name, cast_to_wkb=cast_to_wkb + ) df_out = db.sql(function_call_sql) @@ -30,8 +57,7 @@ def apply_iterator(db, iterator, udf_info: UDFInfo, cast_to_wkb: bool = False): at = df_out.to_arrow_table() batches = at.combine_chunks().to_batches() - for batch in batches: - yield batch + yield from batches def check_python_version(utf_serde: UTF8Deserializer, infile) -> str: @@ -50,6 +76,7 @@ def check_python_version(utf_serde: UTF8Deserializer, infile) -> str: return version + def check_barrier_flag(infile): is_barrier = read_bool(infile) bound_port = read_int(infile) @@ -66,6 +93,7 @@ def check_barrier_flag(infile): return is_barrier + def assign_task_context(utf_serde: UTF8Deserializer, infile): stage_id = read_int(infile) partition_id = read_int(infile) @@ -97,6 +125,7 @@ def assign_task_context(utf_serde: UTF8Deserializer, infile): return task_context + def resolve_python_path(utf_serde: UTF8Deserializer, infile): def add_path(path: str): # worker can be used, so do not add path multiple times @@ -131,6 +160,7 @@ def check_broadcast_variables(infile): }, ) + def get_runner_conf(utf_serde: UTF8Deserializer, infile): runner_conf = {} num_conf = read_int(infile) @@ -145,6 +175,7 @@ def read_command(serializer, infile): command = serializer._read_with_length(infile) return command + def read_udf(infile, pickle_ser) -> UDFInfo: num_arg = read_int(infile) arg_offsets = [read_int(infile) for i in range(num_arg)] @@ -162,9 +193,10 @@ def read_udf(infile, pickle_ser) -> UDFInfo: function=sedona_db_udf_expression, return_type=return_type, name=sedona_db_udf_expression._name, - geom_offsets=[0] + geom_offsets=[0], ) + def register_sedona_db_udf(infile, pickle_ser) -> UDFInfo: num_udfs = read_int(infile) @@ -237,7 +269,7 @@ def main(infile, outfile): safecheck=False, db=sedona_db, udf_info=udf, - cast_to_wkb=cast_to_wkb + cast_to_wkb=cast_to_wkb, ) number_of_geometries = read_int(infile) @@ -251,13 +283,13 @@ def main(infile, outfile): udf.geom_offsets = geom_offsets iterator = serde.load_stream(infile) - out_iterator = apply_iterator(db=sedona_db, iterator=iterator, udf_info=udf, cast_to_wkb=cast_to_wkb) + out_iterator = apply_iterator( + 
db=sedona_db, iterator=iterator, udf_info=udf, cast_to_wkb=cast_to_wkb + ) serde.dump_stream(out_iterator, outfile) - write_statistics( - infile, outfile, boot_time=boot_time, init_time=init_time - ) + write_statistics(infile, outfile, boot_time=boot_time, init_time=init_time) if __name__ == "__main__": diff --git a/python/setup.py b/python/setup.py index 66ab74701b0..ae5e7bf1746 100644 --- a/python/setup.py +++ b/python/setup.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from setuptools import setup import numpy diff --git a/python/src/geomserde_speedup_module.c b/python/src/geomserde_speedup_module.c index 621f956cd03..1d7aefcd772 100644 --- a/python/src/geomserde_speedup_module.c +++ b/python/src/geomserde_speedup_module.c @@ -19,16 +19,14 @@ #define PY_SSIZE_T_CLEAN #include +#include +#include +#include #include -// -//#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION #include "geomserde.h" #include "geos_c_dyn.h" #include "pygeos/c_api.h" -#include -#include -#include PyDoc_STRVAR(module_doc, "Geometry serialization/deserialization module."); @@ -230,7 +228,7 @@ static PyObject *serialize(PyObject *self, PyObject *args) { return do_serialize(geos_geom); } -static PyObject *deserialize_2(PyObject *self, PyObject *args) { +static PyObject *deserialize(PyObject *self, PyObject *args) { GEOSContextHandle_t handle = NULL; int length = 0; GEOSGeometry *geom = do_deserialize(args, &handle, &length); @@ -268,86 +266,86 @@ static PyObject *deserialize_1(PyObject *self, PyObject *args) { } static PyObject *to_sedona_func(PyObject *self, PyObject *args) { - import_array(); - PyObject *input_obj = NULL; - if (!PyArg_ParseTuple(args, "O", &input_obj)){ - return NULL; - }; - - PyArrayObject *array = (PyArrayObject *)input_obj; - PyObject **objs = (PyObject **)PyArray_DATA(array); - - GEOSContextHandle_t handle = get_geos_context_handle(); - if (handle == NULL) { - return NULL; - } - - npy_intp n = PyArray_SIZE(input_obj); - npy_intp dims[1] = {n}; - PyArrayObject *out = (PyArrayObject *)PyArray_SimpleNew(1, dims, NPY_OBJECT); - for (npy_intp i = 0; i < PyArray_SIZE(array); i++) { - PyObject *obj = objs[i]; - GEOSGeometry *geos_geom = NULL; - char success = PyGEOS_GetGEOSGeometry(obj, &geos_geom); - - PyObject *serialized = do_serialize(geos_geom); - PyArray_SETITEM(out, PyArray_GETPTR1(out, i), serialized); - } + import_array(); + PyObject *input_obj = NULL; + if (!PyArg_ParseTuple(args, "O", &input_obj)) { + return NULL; + }; + + PyArrayObject *array = (PyArrayObject *)input_obj; + PyObject **objs = (PyObject **)PyArray_DATA(array); - return out; + GEOSContextHandle_t handle = get_geos_context_handle(); + if (handle == NULL) { + return NULL; + } + + npy_intp n = PyArray_SIZE(input_obj); + npy_intp dims[1] = {n}; + PyArrayObject *out = 
(PyArrayObject *)PyArray_SimpleNew(1, dims, NPY_OBJECT); + for (npy_intp i = 0; i < PyArray_SIZE(array); i++) { + PyObject *obj = objs[i]; + GEOSGeometry *geos_geom = NULL; + char success = PyGEOS_GetGEOSGeometry(obj, &geos_geom); + + PyObject *serialized = do_serialize(geos_geom); + PyArray_SETITEM(out, PyArray_GETPTR1(out, i), serialized); + } + + return out; } /* Module definition for Shapely 2.x */ static PyObject *from_sedona_func(PyObject *self, PyObject *args) { - import_array(); - PyObject *input_obj = NULL; - if (!PyArg_ParseTuple(args, "O", &input_obj)){ - return NULL; - }; + import_array(); + PyObject *input_obj = NULL; + if (!PyArg_ParseTuple(args, "O", &input_obj)) { + return NULL; + }; - GEOSContextHandle_t handle = get_geos_context_handle(); + GEOSContextHandle_t handle = get_geos_context_handle(); - PyArrayObject *array = (PyArrayObject *)input_obj; - PyObject **objs = (PyObject **)PyArray_DATA(array); + PyArrayObject *array = (PyArrayObject *)input_obj; + PyObject **objs = (PyObject **)PyArray_DATA(array); - int p_bytes_read = 0; + int p_bytes_read = 0; - npy_intp n = PyArray_SIZE(input_obj); + npy_intp n = PyArray_SIZE(input_obj); - npy_intp dims[1] = {n}; - PyArrayObject *out = (PyArrayObject *)PyArray_SimpleNew(1, dims, NPY_OBJECT); + npy_intp dims[1] = {n}; + PyArrayObject *out = (PyArrayObject *)PyArray_SimpleNew(1, dims, NPY_OBJECT); - for (npy_intp i = 0; i < PyArray_SIZE(array); i++) { - PyObject *obj = objs[i]; - if (!PyBytes_Check(obj)) { - PyErr_SetString(PyExc_TypeError, "Expected bytes"); - return NULL; - } + for (npy_intp i = 0; i < PyArray_SIZE(array); i++) { + PyObject *obj = objs[i]; + if (!PyBytes_Check(obj)) { + PyErr_SetString(PyExc_TypeError, "Expected bytes"); + return NULL; + } - char *buf = PyBytes_AS_STRING(obj); + char *buf = PyBytes_AS_STRING(obj); - Py_ssize_t len = PyBytes_GET_SIZE(obj); + Py_ssize_t len = PyBytes_GET_SIZE(obj); - GEOSGeometry *geom = NULL; + GEOSGeometry *geom = NULL; - SedonaErrorCode err = sedona_deserialize_geom(handle, buf, len, &geom, &p_bytes_read); - if (err != SEDONA_SUCCESS) { - handle_geomserde_error(err); - return NULL; - } - PyObject *pygeom = PyGEOS_CreateGeometry(geom, handle); + SedonaErrorCode err = + sedona_deserialize_geom(handle, buf, len, &geom, &p_bytes_read); + if (err != SEDONA_SUCCESS) { + handle_geomserde_error(err); + return NULL; + } + PyObject *pygeom = PyGEOS_CreateGeometry(geom, handle); - PyArray_SETITEM(out, PyArray_GETPTR1(out, i), pygeom); - } + PyArray_SETITEM(out, PyArray_GETPTR1(out, i), pygeom); + } - return out; + return out; } - static PyMethodDef geomserde_methods_shapely_2[] = { {"load_libgeos_c", load_libgeos_c, METH_VARARGS, "Load libgeos_c."}, {"serialize", serialize, METH_VARARGS, "Serialize geometry object as bytearray."}, - {"deserialize_2", deserialize_2, METH_VARARGS, + {"deserialize", deserialize, METH_VARARGS, "Deserialize bytes-like object to geometry object."}, {"from_sedona_func", from_sedona_func, METH_VARARGS, "Deserialize bytes-like object to geometry object."}, @@ -355,18 +353,6 @@ static PyMethodDef geomserde_methods_shapely_2[] = { "Deserialize bytes-like object to geometry object."}, {NULL, NULL, 0, NULL}, /* Sentinel */ }; -// -//static int add_from_sedona_func_to_module(PyObject *m) { -// PyObject *capsule = PyCapsule_New((void *)from_sedona_func, "from_sedona_func", NULL); -// if (capsule == NULL) { -// return -1; -// } -// if (PyModule_AddObject(m, "from_sedona_func", capsule) < 0) { -// Py_DECREF(capsule); -// return -1; -// } -// return 0; -//} static struct 
PyModuleDef geomserde_module_shapely_2 = { PyModuleDef_HEAD_INIT, "geomserde_speedup", module_doc, 0, diff --git a/python/tests/test_base.py b/python/tests/test_base.py index e240a09758c..300d937d278 100644 --- a/python/tests/test_base.py +++ b/python/tests/test_base.py @@ -70,11 +70,15 @@ def spark(self): "spark.sedona.stac.load.itemsLimitMax", "20", ) - .config("spark.executor.memory", "10G") \ - .config("spark.driver.memory", "10G") \ - .config("sedona.python.worker.udf.daemon.module", "sedona.spark.worker.daemon") \ - .config("sedona.python.worker.daemon.enabled", "false") \ - # Pandas on PySpark doesn't work with ANSI mode, which is enabled by default + .config("spark.executor.memory", "10G") + .config("spark.driver.memory", "10G") + .config( + "sedona.python.worker.udf.daemon.module", + "sedona.spark.worker.daemon", + ) + .config( + "sedona.python.worker.daemon.enabled", "false" + ) # Pandas on PySpark doesn't work with ANSI mode, which is enabled by default # in Spark 4 .config("spark.sql.ansi.enabled", "false") ) diff --git a/python/tests/utils/test_sedona_db_vectorized_udf.py b/python/tests/utils/test_sedona_db_vectorized_udf.py index 4b266384fa9..eea84eec91f 100644 --- a/python/tests/utils/test_sedona_db_vectorized_udf.py +++ b/python/tests/utils/test_sedona_db_vectorized_udf.py @@ -1,9 +1,24 @@ -import time +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. import numpy as np from sedona.spark.sql.functions import sedona_db_vectorized_udf -from sedona.spark.utils.udf import to_sedona_func, from_sedona_func +from sedona.spark.utils.udf import to_sedona, from_sedona from tests.test_base import TestBase import pyarrow as pa import shapely @@ -11,107 +26,21 @@ from pyspark.sql.functions import expr, lit from pyspark.sql.types import DoubleType, IntegerType, ByteType from sedona.spark.sql import ST_X -from shapely._enum import ParamEnum - -def test_m(): - on_invalid="raise" - wkb = b'\x12\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?' - geometry = np.asarray([wkb, wkb], dtype=object) - - DecodingErrorOptions = ParamEnum( - "DecodingErrorOptions", {"ignore": 0, "warn": 1, "raise": 2, "fix": 3} - ) - - # print("sss") - - - # - # object - # C_CONTIGUOUS : True - # F_CONTIGUOUS : True - # OWNDATA : False - # WRITEABLE : True - # ALIGNED : True - # WRITEBACKIFCOPY : False - # print(type(geometry)) - # print(geometry.dtype) - # print(geometry.flags) - - result = from_sedona_func(geometry) - - result2 = to_sedona_func(result) - -# ensure the input has object dtype, to avoid numpy inferring it as a -# fixed-length string dtype (which removes trailing null bytes upon access -# of array elements) - # - # def from_sedona_func(arr): - # try: - # from . 
import sedonaserde_vectorized_udf_module - # print(sedonaserde_vectorized_udf_module.from_sedona_func_3(arr)) - # except Exception as e: - # print("Cannot import sedonaserde_vectorized_udf_module:") - # print(e) - # # print() - # return None -# -# def from_wkb(geometry, on_invalid="raise", **kwargs): -# r"""Create geometries from the Well-Known Binary (WKB) representation. -# -# The Well-Known Binary format is defined in the `OGC Simple Features -# Specification for SQL `__. -# -# Parameters -# ---------- -# geometry : str or array_like -# The WKB byte object(s) to convert. -# on_invalid : {"raise", "warn", "ignore", "fix"}, default "raise" -# Indicates what to do when an invalid WKB is encountered. Note that the -# validations involved are very basic, e.g. the minimum number of points -# for the geometry type. For a thorough check, use :func:`is_valid` after -# conversion to geometries. Valid options are: -# -# - raise: an exception will be raised if any input geometry is invalid. -# - warn: a warning will be raised and invalid WKT geometries will be -# returned as ``None``. -# - ignore: invalid geometries will be returned as ``None`` without a -# warning. -# - fix: an effort is made to fix invalid input geometries (currently just -# unclosed rings). If this is not possible, they are returned as -# ``None`` without a warning. Requires GEOS >= 3.11. -# -# .. versionadded:: 2.1.0 -# **kwargs -# See :ref:`NumPy ufunc docs ` for other keyword arguments. -# -# Examples -# -------- -# >>> import shapely -# >>> shapely.from_wkb(b'\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?') -# -# -# """ # noqa: E501 -# if not np.isscalar(on_invalid): -# raise TypeError("on_invalid only accepts scalar values") -# -# invalid_handler = np.uint8(DecodingErrorOptions.get_value(on_invalid)) -# -# # ensure the input has object dtype, to avoid numpy inferring it as a -# # fixed-length string dtype (which removes trailing null bytes upon access -# # of array elements) -# geometry = np.asarray(geometry, dtype=object) -# return lib.from_wkb(geometry, invalid_handler, **kwargs) + class TestSedonaDBArrowFunction(TestBase): def test_vectorized_udf(self): - @sedona_db_vectorized_udf(return_type=GeometryType(), input_types=[GeometryType(), IntegerType()]) + @sedona_db_vectorized_udf( + return_type=GeometryType(), input_types=[ByteType(), IntegerType()] + ) def my_own_function(geom, distance): geom_wkb = pa.array(geom.storage.to_array()) + geometry_array = np.asarray(geom_wkb, dtype=object) distance = pa.array(distance.to_array()) - geom = shapely.from_wkb(geom_wkb) + geom = from_sedona(geometry_array) result_shapely = shapely.centroid(geom) - return pa.array(shapely.to_wkb(result_shapely)) + return pa.array(to_sedona(result_shapely)) df = self.spark.createDataFrame( [ @@ -125,56 +54,61 @@ def my_own_function(geom, distance): df.select(ST_X(my_own_function(df.wkt, lit(100)).alias("geom"))).show() def test_geometry_to_double(self): - @sedona_db_vectorized_udf(return_type=DoubleType(), input_types=[GeometryType()]) + @sedona_db_vectorized_udf(return_type=DoubleType(), input_types=[ByteType()]) def geometry_to_non_geometry_udf(geom): geom_wkb = pa.array(geom.storage.to_array()) - geom = shapely.from_wkb(geom_wkb) + geometry_array = np.asarray(geom_wkb, dtype=object) + geom = from_sedona(geometry_array) result_shapely = shapely.get_x(shapely.centroid(geom)) - return pa.array(result_shapely, pa.float64()) + return pa.array(result_shapely) df = self.spark.createDataFrame( [(1, "POINT (1 1)"), (2, 
"POINT (2 2)"), (3, "POINT (3 3)")], ["id", "wkt"], ).withColumn("wkt", expr("ST_GeomFromWKT(wkt)")) - values = df.select(geometry_to_non_geometry_udf(df.wkt).alias("x_coord")) \ - .collect() + values = df.select( + geometry_to_non_geometry_udf(df.wkt).alias("x_coord") + ).collect() values_list = [row["x_coord"] for row in values] assert values_list == [1.0, 2.0, 3.0] def test_geometry_to_int(self): - @sedona_db_vectorized_udf(return_type=IntegerType(), input_types=[GeometryType()]) + @sedona_db_vectorized_udf(return_type=IntegerType(), input_types=[ByteType()]) def geometry_to_int(geom): geom_wkb = pa.array(geom.storage.to_array()) - geom = shapely.from_wkb(geom_wkb) + geometry_array = np.asarray(geom_wkb, dtype=object) + + geom = from_sedona(geometry_array) result_shapely = shapely.get_num_points(geom) - return pa.array(result_shapely, pa.int32()) + return pa.array(result_shapely) df = self.spark.createDataFrame( [(1, "POINT (1 1)"), (2, "POINT (2 2)"), (3, "POINT (3 3)")], ["id", "wkt"], ).withColumn("wkt", expr("ST_GeomFromWKT(wkt)")) - values = df.select(geometry_to_int(df.wkt)) \ - .collect() + values = df.select(geometry_to_int(df.wkt)).collect() values_list = [row[0] for row in values] assert values_list == [0, 0, 0] def test_geometry_crs_preservation(self): - @sedona_db_vectorized_udf(return_type=GeometryType(), input_types=[GeometryType()]) + @sedona_db_vectorized_udf(return_type=GeometryType(), input_types=[ByteType()]) def return_same_geometry(geom): geom_wkb = pa.array(geom.storage.to_array()) - geom = shapely.from_wkb(geom_wkb) + geometry_array = np.asarray(geom_wkb, dtype=object) + + geom = from_sedona(geometry_array) - return pa.array(shapely.to_wkb(geom)) + return pa.array(to_sedona(geom)) df = self.spark.createDataFrame( [(1, "POINT (1 1)"), (2, "POINT (2 2)"), (3, "POINT (3 3)")], @@ -183,70 +117,8 @@ def return_same_geometry(geom): result_df = df.select(return_same_geometry(df.wkt).alias("geom")) - crs_list = result_df.selectExpr("ST_SRID(geom)").rdd.flatMap(lambda x: x).collect() + crs_list = ( + result_df.selectExpr("ST_SRID(geom)").rdd.flatMap(lambda x: x).collect() + ) assert crs_list == [3857, 3857, 3857] - - def test_geometry_to_geometry(self): - @sedona_db_vectorized_udf(return_type=GeometryType(), input_types=[ByteType()]) - def buffer_geometry(geom): - geom_wkb = pa.array(geom.storage.to_array()) - geometry_array = np.asarray(geom_wkb, dtype=object) - geom = from_sedona_func(geometry_array) - - result_shapely = shapely.buffer(geom, 10) - - return pa.array(to_sedona_func(result_shapely)) - - df = self.spark.read.\ - format("geoparquet").\ - load("/Users/pawelkocinski/Desktop/projects/sedona-production/apache-sedona-book/book/source_data/transportation_barcelona_dupl_l1") - - # 1 045 770 - # print(df.count()) - - # df.unionAll(df).unionAll(df).unionAll(df).unionAll(df).unionAll(df).\ - # unionAll(df).unionAll(df).unionAll(df).unionAll(df).unionAll(df).\ - # write.format("geoparquet").mode("overwrite").save("/Users/pawelkocinski/Desktop/projects/sedona-production/apache-sedona-book/book/source_data/transportation_barcelona_dupl_l2") - # 18 24 - # df.union(df).union(df).union(df).union(df).union(df).union(df).\ - # write.format("geoparquet").mode("overwrite").save("/Users/pawelkocinski/Desktop/projects/sedona-production/apache-sedona-book/data/warehouse/buildings_large_3") - - values = df.select(buffer_geometry(df.geometry).alias("geometry")).\ - selectExpr("ST_Area(geometry) as area").\ - selectExpr("Sum(area) as total_area") - - values.show() - - # for _ in 
range(4): - # start_time = time.time() - # values.show() - # end_time = time.time() - # print(f"Execution time: {end_time - start_time} seconds") - - def test_geometry_to_geometry_normal_udf(self): - from pyspark.sql.functions import udf - - def create_buffer(geom): - return geom.buffer(10) - - create_buffer_udf = udf(create_buffer, GeometryType()) - - df = self.spark.read. \ - format("geoparquet"). \ - load("/Users/pawelkocinski/Desktop/projects/sedona-production/apache-sedona-book/book/source_data/transportation_barcelona_dupl_l2") - - # print(df.count()) - # df.limit(10).collect() - values = df.select(create_buffer_udf(df.geometry).alias("geometry")). \ - selectExpr("ST_Area(geometry) as area"). \ - selectExpr("Sum(area) as total_area") - - values.show() - - # for _ in range(4): - # start_time = time.time() - # values.show() - # end_time = time.time() - # print(f"Execution time: {end_time - start_time} seconds") -# 1 045 770 diff --git a/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/StrategySuite.scala b/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/StrategySuite.scala index 4fe4acfb121..94ce194c654 100644 --- a/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/StrategySuite.scala +++ b/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/StrategySuite.scala @@ -73,21 +73,11 @@ class StrategySuite extends TestBaseScala with Matchers { .withColumn("geometry", expr("ST_SetSRID(geom, '4326')")) .select(sedonaDBGeometryToGeometryFunction(col("geometry"), lit(100)).alias("geom")) - dfVectorized.selectExpr("ST_X(ST_Centroid(geom)) AS x") + dfVectorized + .selectExpr("ST_X(ST_Centroid(geom)) AS x") .selectExpr("sum(x)") .as[Double] - .collect().head shouldEqual 101 -// -// val dfCopied = sparkSession.read -// .format("geoparquet") -// .load( -// "/Users/pawelkocinski/Desktop/projects/sedona-production/apache-sedona-book/book/source_data/transportation_barcelona/barcelona.geoparquet") -// -// val values = dfCopied -// .select(sedonaDBGeometryToGeometryFunction(col("geometry"), lit(10)).alias("geom")) -// .selectExpr("ST_Area(geom) as area") -// .selectExpr("Sum(area) as total_area") -// -// values.show() + .collect() + .head shouldEqual 101 } } From 0fd81806a87ebc8cdaf50d75dccbd1a6bf7e16e3 Mon Sep 17 00:00:00 2001 From: pawelkocinski Date: Wed, 14 Jan 2026 16:17:48 +0100 Subject: [PATCH 07/11] add sedonadb sedona udf worker example --- python/pyproject.toml | 49 +++++++++++-------- python/sedona/spark/utils/geometry_serde.py | 6 ++- python/sedona/spark/worker/daemon.py | 17 ------- python/src/geom_buf.c | 2 - python/src/geomserde.c | 1 - python/src/geomserde_speedup_module.c | 33 ++++++++++++- python/tests/test_base.py | 2 +- .../apache/sedona/spark/SedonaContext.scala | 1 + .../python/SedonaPythonArrowInput.scala | 30 ------------ .../python/SedonaPythonArrowOutput.scala | 5 -- .../sedona/sql/SQLSyntaxTestScala.scala | 8 +-- .../org/apache/sedona/sql/TestBaseScala.scala | 22 +++------ .../spark/sql/udf/TestScalarPandasUDF.scala | 3 -- 13 files changed, 77 insertions(+), 102 deletions(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index 37159cf83b9..8b2a06a5d37 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -26,7 +26,7 @@ description = "Apache Sedona is a cluster computing system for processing large- readme = "README.md" license = { text = "Apache-2.0" } authors = [ { name = "Apache Sedona", email = "dev@sedona.apache.org" } ] -requires-python = ">=3.12" +requires-python = ">=3.8" classifiers = [ "Programming Language :: Python :: 
3", "License :: OSI Approved :: Apache Software License", @@ -50,16 +50,38 @@ kepler-map = ["geopandas", "keplergl==0.3.2"] flink = ["apache-flink>=1.19.0"] db = ["sedonadb[geopandas]; python_version >= '3.9'"] all = [ -# "pyspark>=3.4.0,<4.1.0", -# "geopandas", -# "pydeck==0.8.0", -# "keplergl==0.3.2", -# "rasterio>=1.2.10", + "pyspark>=3.4.0,<4.1.0", + "geopandas", + "pydeck==0.8.0", + "keplergl==0.3.2", + "rasterio>=1.2.10", ] [dependency-groups] dev = [ - "pytest>=9.0.2", + "pytest", + "pytest-cov", + "notebook==6.4.12", + "jupyter", + "mkdocs", + "scikit-learn", + "esda", + "libpysal", + "matplotlib", # implicit dependency of esda + # prevent incompatibility with pysal 4.7.0, which is what is resolved to when shapely >2 is specified + "scipy<=1.10.0", + "pandas>=2.0.0", + "numpy<2", + "geopandas", + # https://stackoverflow.com/questions/78949093/how-to-resolve-attributeerror-module-fiona-has-no-attribute-path + # cannot set geopandas>=0.14.4 since it doesn't support python 3.8, so we pin fiona to <1.10.0 + "fiona<1.10.0", + "pyarrow", + "pyspark>=3.4.0,<4.1.0", + "keplergl==0.3.2", + "pydeck==0.8.0", + "pystac==1.5.0", + "rasterio>=1.2.10", ] [project.urls] @@ -80,20 +102,7 @@ exclude = ["*.tests", "*.tests.*", "tests", "tests.*"] name = "sedona.spark.utils.geomserde_speedup" sources = [ "src/geomserde_speedup_module.c", - "src/sedonaserde_vectorized_udf_module.c", "src/geomserde.c", "src/geom_buf.c", "src/geos_c_dyn.c", ] - -[[tool.setuptools.ext-modules]] -name = "sedona.spark.utils.sedonaserde_vectorized_udf_module" -sources = [ - "src/sedonaserde_vectorized_udf_module.c", - "src/geomserde.c", - "src/geom_buf.c", - "src/geos_c_dyn.c", -] - -[tool.uv.sources] -sedonadb = { path = "../../../sedona-db/target/wheels/sedonadb-0.3.0-cp312-cp312-macosx_11_0_arm64.whl" } diff --git a/python/sedona/spark/utils/geometry_serde.py b/python/sedona/spark/utils/geometry_serde.py index 0ef3d4ed5cb..103eb498173 100644 --- a/python/sedona/spark/utils/geometry_serde.py +++ b/python/sedona/spark/utils/geometry_serde.py @@ -25,6 +25,9 @@ speedup_enabled = False + +# Use geomserde_speedup when available, otherwise fallback to general pure +# python implementation. try: from . import geomserde_speedup @@ -57,9 +60,8 @@ def find_geos_c_dll(): def deserialize(buf: bytearray) -> Optional[BaseGeometry]: if buf is None: return None - return geomserde_speedup.deserialize_2(buf) + return geomserde_speedup.deserialize(buf) - # Export the from_sedona_func for use with numpy ufuncs speedup_enabled = True elif shapely.__version__.startswith("1."): diff --git a/python/sedona/spark/worker/daemon.py b/python/sedona/spark/worker/daemon.py index ce75e376eac..266baf76d5d 100644 --- a/python/sedona/spark/worker/daemon.py +++ b/python/sedona/spark/worker/daemon.py @@ -40,23 +40,7 @@ def compute_real_exit_code(exit_code): else: return 1 - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - -file_handler = logging.FileHandler( - "/Users/pawelkocinski/Desktop/projects/sedonaworker/sedonaworker/logs/worker_daemon_main.log", - delay=False, -) -file_handler.flush = file_handler.stream.flush - -logger.addHandler(file_handler) - - def worker(sock, authenticated): - logger.info( - "Starting worker process with pid =" + str(os.getpid()) + " socket " + str(sock) - ) """ Called by a worker process after the fork(). 
""" @@ -207,7 +191,6 @@ def handle_sigterm(*args): authenticated = False while True: code = worker(sock, authenticated) - logger.info("Worker exited with code %d", code) if code == 0: authenticated = True if not reuse or code: diff --git a/python/src/geom_buf.c b/python/src/geom_buf.c index d6a51bb3d0e..5239de5ae09 100644 --- a/python/src/geom_buf.c +++ b/python/src/geom_buf.c @@ -208,8 +208,6 @@ SedonaErrorCode geom_buf_alloc(GeomBuffer *geom_buf, return SEDONA_SUCCESS; } -#include - SedonaErrorCode read_geom_buf_header(const char *buf, int buf_size, GeomBuffer *geom_buf, CoordinateSequenceInfo *cs_info, diff --git a/python/src/geomserde.c b/python/src/geomserde.c index 81dafe216f0..c1f74277383 100644 --- a/python/src/geomserde.c +++ b/python/src/geomserde.c @@ -718,7 +718,6 @@ static SedonaErrorCode deserialize_geom_buf(GEOSContextHandle_t handle, return SEDONA_SUCCESS; } -#include SedonaErrorCode sedona_deserialize_geom(GEOSContextHandle_t handle, const char *buf, int buf_size, GEOSGeometry **p_geom, diff --git a/python/src/geomserde_speedup_module.c b/python/src/geomserde_speedup_module.c index 1d7aefcd772..610c4d1b05b 100644 --- a/python/src/geomserde_speedup_module.c +++ b/python/src/geomserde_speedup_module.c @@ -287,9 +287,24 @@ static PyObject *to_sedona_func(PyObject *self, PyObject *args) { PyObject *obj = objs[i]; GEOSGeometry *geos_geom = NULL; char success = PyGEOS_GetGEOSGeometry(obj, &geos_geom); + if (!success || geos_geom == NULL) { + PyErr_SetString(PyExc_TypeError, "Invalid GEOS geometry"); + Py_DECREF(out); + return NULL; + } PyObject *serialized = do_serialize(geos_geom); - PyArray_SETITEM(out, PyArray_GETPTR1(out, i), serialized); + if (!serialized) { + Py_DECREF(out); + return NULL; + } + + if (PyArray_SETITEM(out, PyArray_GETPTR1(out, i), serialized) < 0) { + Py_DECREF(serialized); + Py_DECREF(out); + return NULL; + } + Py_DECREF(serialized); } return out; @@ -318,6 +333,8 @@ static PyObject *from_sedona_func(PyObject *self, PyObject *args) { PyObject *obj = objs[i]; if (!PyBytes_Check(obj)) { PyErr_SetString(PyExc_TypeError, "Expected bytes"); + Py_DECREF(out); + return NULL; } @@ -331,11 +348,23 @@ static PyObject *from_sedona_func(PyObject *self, PyObject *args) { sedona_deserialize_geom(handle, buf, len, &geom, &p_bytes_read); if (err != SEDONA_SUCCESS) { handle_geomserde_error(err); + Py_DECREF(out); return NULL; } + PyObject *pygeom = PyGEOS_CreateGeometry(geom, handle); + if (!pygeom) { + Py_DECREF(out); + return NULL; + } + + if (PyArray_SETITEM(out, PyArray_GETPTR1(out, i), pygeom) < 0) { + Py_DECREF(pygeom); + Py_DECREF(out); + return NULL; + } - PyArray_SETITEM(out, PyArray_GETPTR1(out, i), pygeom); + Py_DECREF(pygeom); } return out; diff --git a/python/tests/test_base.py b/python/tests/test_base.py index 300d937d278..39749302072 100644 --- a/python/tests/test_base.py +++ b/python/tests/test_base.py @@ -77,7 +77,7 @@ def spark(self): "sedona.spark.worker.daemon", ) .config( - "sedona.python.worker.daemon.enabled", "false" + "sedona.python.worker.daemon.enabled", "true" ) # Pandas on PySpark doesn't work with ANSI mode, which is enabled by default # in Spark 4 .config("spark.sql.ansi.enabled", "false") diff --git a/spark/common/src/main/scala/org/apache/sedona/spark/SedonaContext.scala b/spark/common/src/main/scala/org/apache/sedona/spark/SedonaContext.scala index add3caf225a..c9e8497f7ed 100644 --- a/spark/common/src/main/scala/org/apache/sedona/spark/SedonaContext.scala +++ b/spark/common/src/main/scala/org/apache/sedona/spark/SedonaContext.scala @@ 
-41,6 +41,7 @@ class InternalApi( extends StaticAnnotation object SedonaContext { + private def customOptimizationsWithSession(sparkSession: SparkSession) = Seq( new TransformNestedUDTParquet(sparkSession), diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowInput.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowInput.scala index 2544e63a979..66029673519 100644 --- a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowInput.scala +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowInput.scala @@ -18,23 +18,6 @@ */ package org.apache.spark.sql.execution.python -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - import org.apache.arrow.vector.VectorSchemaRoot import org.apache.arrow.vector.ipc.ArrowStreamWriter import org.apache.spark.sql.catalyst.InternalRow @@ -87,21 +70,8 @@ private[python] trait SedonaPythonArrowInput[IN] extends PythonArrowInput[IN] { writeIteratorToArrowStream(root, writer, dataOut, inputIterator) - // end writes footer to the output stream and doesn't clean any resources. - // It could throw exception if the output stream is closed, so it should be - // in the try block. writer.end() } { - // If we close root and allocator in TaskCompletionListener, there could be a race - // condition where the writer thread keeps writing to the VectorSchemaRoot while - // it's being closed by the TaskCompletion listener. - // Closing root and allocator here is cleaner because root and allocator is owned - // by the writer thread and is only visible to the writer thread. - // - // If the writer thread is interrupted by TaskCompletionListener, it should either - // (1) in the try block, in which case it will get an InterruptedException when - // performing io, and goes into the finally block or (2) in the finally block, - // in which case it will ignore the interruption and close the resources. 
root.close() allocator.close() } diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala index 27764c2a54f..8940a376a22 100644 --- a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala @@ -99,11 +99,6 @@ private[python] trait SedonaPythonArrowOutput[OUT <: AnyRef] { self: BasePythonR } eos = true } -// def handleEndOfDataSectionSedona(): Unit = { -// if (stream.readInt() == SpecialLengths.END_OF_STREAM) {} -// -// eos = true -// } protected override def handleEndOfDataSection(): Unit = { handleEndOfDataSectionSedona() diff --git a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/SQLSyntaxTestScala.scala b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/SQLSyntaxTestScala.scala index 72a27461f69..6f873d0a087 100644 --- a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/SQLSyntaxTestScala.scala +++ b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/SQLSyntaxTestScala.scala @@ -47,11 +47,11 @@ class SQLSyntaxTestScala extends TestBaseScala with TableDrivenPropertyChecks { try { sparkSession.sql("CREATE TABLE T_TEST_EXPLICIT_GEOMETRY (GEO_COL GEOMETRY)") sparkSession.catalog.tableExists("T_TEST_EXPLICIT_GEOMETRY") should be(true) -// sparkSession.sparkContext.getConf.get(keyParserExtension) should be("true") + sparkSession.sparkContext.getConf.get(keyParserExtension) should be("true") } catch { case ex: Exception => ex.getClass.getName.endsWith("ParseException") should be(true) -// sparkSession.sparkContext.getConf.get(keyParserExtension) should be("false") + sparkSession.sparkContext.getConf.get(keyParserExtension) should be("false") } } @@ -61,11 +61,11 @@ class SQLSyntaxTestScala extends TestBaseScala with TableDrivenPropertyChecks { sparkSession.sql( "CREATE TABLE T_TEST_EXPLICIT_GEOMETRY_2 (INT_COL INT, GEO_COL GEOMETRY)") sparkSession.catalog.tableExists("T_TEST_EXPLICIT_GEOMETRY_2") should be(true) -// sparkSession.sparkContext.getConf.get(keyParserExtension) should be("true") + sparkSession.sparkContext.getConf.get(keyParserExtension) should be("true") } catch { case ex: Exception => ex.getClass.getName.endsWith("ParseException") should be(true) -// sparkSession.sparkContext.getConf.get(keyParserExtension) should be("false") + sparkSession.sparkContext.getConf.get(keyParserExtension) should be("false") } } } diff --git a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala index c9b4d6ac289..50d751f4842 100644 --- a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala +++ b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala @@ -30,13 +30,13 @@ import java.io.FileInputStream import java.util.concurrent.ThreadLocalRandom trait TestBaseScala extends FunSpec with BeforeAndAfterAll { -// Logger.getRootLogger().setLevel(Level.WARN) -// Logger.getLogger("org.apache").setLevel(Level.WARN) -// Logger.getLogger("com").setLevel(Level.WARN) -// Logger.getLogger("akka").setLevel(Level.WARN) -// Logger.getLogger("org.apache.sedona.core").setLevel(Level.WARN) + Logger.getRootLogger().setLevel(Level.WARN) + Logger.getLogger("org.apache").setLevel(Level.WARN) + Logger.getLogger("com").setLevel(Level.WARN) + 
Logger.getLogger("akka").setLevel(Level.WARN) + Logger.getLogger("org.apache.sedona.core").setLevel(Level.WARN) -// val keyParserExtension = "spark.sedona.enableParserExtension" + val keyParserExtension = "spark.sedona.enableParserExtension" val warehouseLocation = System.getProperty("user.dir") + "/target/" val sparkSession = SedonaContext .builder() @@ -49,17 +49,9 @@ trait TestBaseScala extends FunSpec with BeforeAndAfterAll { .config("sedona.python.worker.udf.module", "sedona.spark.worker.worker") .config("sedona.python.worker.udf.daemon.module", "sedonaworker.daemon") .config("sedona.python.worker.daemon.enabled", "false") -// .config(keyParserExtension, ThreadLocalRandom.current().nextBoolean()) + .config(keyParserExtension, ThreadLocalRandom.current().nextBoolean()) .getOrCreate() -// private val useDaemon: Boolean = -// SparkEnv.get.conf.getBoolean("sedona.python.worker.daemon.enabled", false) -// -// private val sedonaUDFWorkerModule = -// SparkEnv.get.conf.get("sedona.python.worker.udf.module", "sedona.spark.worker.worker") -// -// private val sedonaDaemonModule = -// SparkEnv.get.conf.get("sedona.python.worker.udf.daemon.module", "sedona.spark.worker.daemon") val sparkSessionMinio = SedonaContext .builder() .master("local[*]") diff --git a/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/TestScalarPandasUDF.scala b/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/TestScalarPandasUDF.scala index 23aac14bbe5..d2c0d71c703 100644 --- a/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/TestScalarPandasUDF.scala +++ b/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/TestScalarPandasUDF.scala @@ -45,9 +45,6 @@ object ScalarUDF { } } - SparkEnv.get.conf.set(PYTHON_USE_DAEMON, false) - SparkEnv.get.conf.set(PYTHON_WORKER_MODULE, "sedonaworker.work") - private[spark] lazy val pythonPath = sys.env.getOrElse("PYTHONPATH", "") protected lazy val sparkHome: String = { sys.props.getOrElse("spark.test.home", sys.env("SPARK_HOME")) From 04d5cfc269dc3513efcf70602eea4c15b1703cd3 Mon Sep 17 00:00:00 2001 From: pawelkocinski Date: Wed, 14 Jan 2026 20:26:32 +0100 Subject: [PATCH 08/11] fix pre commit --- python/sedona/spark/worker/daemon.py | 6 ++-- python/sedona/spark/worker/udf_info.py | 12 ++++---- python/sedona/spark/worker/worker.py | 2 +- python/src/geomserde_speedup_module.c | 28 +++++++++---------- .../python/SedonaPythonArrowOutput.scala | 10 +------ 5 files changed, 25 insertions(+), 33 deletions(-) diff --git a/python/sedona/spark/worker/daemon.py b/python/sedona/spark/worker/daemon.py index 266baf76d5d..0c03dde5b8a 100644 --- a/python/sedona/spark/worker/daemon.py +++ b/python/sedona/spark/worker/daemon.py @@ -15,7 +15,6 @@ # specific language governing permissions and limitations # under the License. -import logging import numbers import os import signal @@ -40,6 +39,7 @@ def compute_real_exit_code(exit_code): else: return 1 + def worker(sock, authenticated): """ Called by a worker process after the fork(). 
@@ -77,7 +77,7 @@ def worker(sock, authenticated): finally: try: outfile.flush() - except Exception: + except Exception: # nosec pass return exit_code @@ -198,7 +198,7 @@ def handle_sigterm(*args): try: while sock.recv(1024): pass - except Exception: + except Exception: # nosec pass break gc.collect() diff --git a/python/sedona/spark/worker/udf_info.py b/python/sedona/spark/worker/udf_info.py index eb278a1511c..32a0833f51f 100644 --- a/python/sedona/spark/worker/udf_info.py +++ b/python/sedona/spark/worker/udf_info.py @@ -32,11 +32,9 @@ def get_function_call_sql(self, table_name: str, cast_to_wkb: bool = False) -> s arg_offset_str = ", ".join([f"_{el}" for el in self.arg_offsets]) function_expr = f"{self.name}({arg_offset_str})" if isinstance(self.return_type, GeometryType) and cast_to_wkb: - return ( - f"SELECT ST_GeomToSedonaSpark({function_expr}) AS _0 FROM {table_name}" - ) + return f"SELECT ST_GeomToSedonaSpark({function_expr}) AS _0 FROM {table_name}" # nosec - return f"SELECT {function_expr} AS _0 FROM {table_name}" + return f"SELECT {function_expr} AS _0 FROM {table_name}" # nosec def sedona_db_transformation_expr( self, table_name: str, cast_to_wkb: bool = False @@ -45,10 +43,12 @@ def sedona_db_transformation_expr( for arg in self.arg_offsets: if arg in self.geom_offsets and cast_to_wkb: crs = self.geom_offsets[arg] - fields.append(f"ST_GeomFromSedonaSpark(_{arg}, 'EPSG:{crs}') AS _{arg}") + fields.append( + f"ST_GeomFromSedonaSpark(_{arg}, 'EPSG:{crs}') AS _{arg}" + ) # nosec continue fields.append(f"_{arg}") fields_expr = ", ".join(fields) - return f"SELECT {fields_expr} FROM {table_name}" + return f"SELECT {fields_expr} FROM {table_name}" # nosec diff --git a/python/sedona/spark/worker/worker.py b/python/sedona/spark/worker/worker.py index 02fedf00581..e31d8e76d1f 100644 --- a/python/sedona/spark/worker/worker.py +++ b/python/sedona/spark/worker/worker.py @@ -117,7 +117,7 @@ def assign_task_context(utf_serde: UTF8Deserializer, infile): addresses.append(utf_serde.loads(infile)) task_context._resources[key] = ResourceInformation(name, addresses) - task_context._localProperties = dict() + task_context._localProperties = {} for i in range(read_int(infile)): k = utf_serde.loads(infile) v = utf_serde.loads(infile) diff --git a/python/src/geomserde_speedup_module.c b/python/src/geomserde_speedup_module.c index 610c4d1b05b..99358d6fd76 100644 --- a/python/src/geomserde_speedup_module.c +++ b/python/src/geomserde_speedup_module.c @@ -288,21 +288,21 @@ static PyObject *to_sedona_func(PyObject *self, PyObject *args) { GEOSGeometry *geos_geom = NULL; char success = PyGEOS_GetGEOSGeometry(obj, &geos_geom); if (!success || geos_geom == NULL) { - PyErr_SetString(PyExc_TypeError, "Invalid GEOS geometry"); - Py_DECREF(out); - return NULL; + PyErr_SetString(PyExc_TypeError, "Invalid GEOS geometry"); + Py_DECREF(out); + return NULL; } PyObject *serialized = do_serialize(geos_geom); if (!serialized) { - Py_DECREF(out); - return NULL; + Py_DECREF(out); + return NULL; } if (PyArray_SETITEM(out, PyArray_GETPTR1(out, i), serialized) < 0) { - Py_DECREF(serialized); - Py_DECREF(out); - return NULL; + Py_DECREF(serialized); + Py_DECREF(out); + return NULL; } Py_DECREF(serialized); } @@ -354,14 +354,14 @@ static PyObject *from_sedona_func(PyObject *self, PyObject *args) { PyObject *pygeom = PyGEOS_CreateGeometry(geom, handle); if (!pygeom) { - Py_DECREF(out); - return NULL; + Py_DECREF(out); + return NULL; } - if (PyArray_SETITEM(out, PyArray_GETPTR1(out, i), pygeom) < 0) { - Py_DECREF(pygeom); - 
Py_DECREF(out); - return NULL; + if (PyArray_SETITEM(out, PyArray_GETPTR1(out, i), pygeom) < 0) { + Py_DECREF(pygeom); + Py_DECREF(out); + return NULL; } Py_DECREF(pygeom); diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala index 8940a376a22..d0316052601 100644 --- a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala @@ -83,15 +83,7 @@ private[python] trait SedonaPythonArrowOutput[OUT <: AnyRef] { self: BasePythonR private var batchLoaded = true protected def handleEndOfDataSectionSedona(): Unit = { - // We've finished the data section of the output, but we can still - // read some accumulator updates: -// val numAccumulatorUpdates = stream.readInt() -// (1 to numAccumulatorUpdates).foreach { _ => -// val updateLen = stream.readInt() -// val update = new Array[Byte](updateLen) -// stream.readFully(update) -// } - // Check whether the worker is ready to be re-used. + // Check whether the worker is ready to be reused. if (stream.readInt() == SpecialLengths.END_OF_STREAM) { if (reuseWorker && releasedOrClosed.compareAndSet(false, true)) { WorkerContext.releasePythonWorker(pythonExec, envVars.asScala.toMap, worker) From 84f75dcb9e39d875d9773675daebbc94cb73872a Mon Sep 17 00:00:00 2001 From: pawelkocinski Date: Wed, 14 Jan 2026 20:30:43 +0100 Subject: [PATCH 09/11] fix pre commit --- pom.xml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 613e310983f..071f5233d85 100644 --- a/pom.xml +++ b/pom.xml @@ -631,7 +631,8 @@ org.apache.maven.plugins maven-javadoc-plugin - 3.12.0 + + 2.10.4 attach-javadocs From e4cab0c51eea081fdf7d51ae48307bdb2953400e Mon Sep 17 00:00:00 2001 From: pawelkocinski Date: Wed, 14 Jan 2026 23:57:53 +0100 Subject: [PATCH 10/11] fix pre commit --- python/pyproject.toml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index 8b2a06a5d37..7795f73962d 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -26,20 +26,17 @@ description = "Apache Sedona is a cluster computing system for processing large- readme = "README.md" license = { text = "Apache-2.0" } authors = [ { name = "Apache Sedona", email = "dev@sedona.apache.org" } ] -requires-python = ">=3.8" +requires-python = ">=3.9" classifiers = [ "Programming Language :: Python :: 3", "License :: OSI Approved :: Apache Software License", ] dependencies = [ "attrs", - "geoarrow-c>=0.3.1", - "geoarrow-pyarrow>=0.2.0", - "geopandas>=1.1.2", - "numpy>=2.1.3", "pyarrow>=16.1.0", "pyspark==3.5.4", "sedonadb", + "setuptools==80.9.0", "shapely>=1.7.0", ] From b415c53c31163535b305cf04dd17df8160473b99 Mon Sep 17 00:00:00 2001 From: pawelkocinski Date: Thu, 15 Jan 2026 00:00:08 +0100 Subject: [PATCH 11/11] fix pre commit --- .github/workflows/python.yml | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 05782bbae9e..9f1159e57cf 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -85,10 +85,10 @@ jobs: scala: '2.12.8' java: '11' python: '3.9' - - spark: '3.5.0' - scala: '2.12.8' - java: '11' - python: '3.8' +# - spark: '3.5.0' +# scala: '2.12.8' +# java: '11' +# python: '3.8' 
- spark: '3.4.0' scala: '2.12.8' java: '11' @@ -101,15 +101,15 @@ jobs: scala: '2.12.8' java: '11' python: '3.9' - - spark: '3.4.0' - scala: '2.12.8' - java: '11' - python: '3.8' - - spark: '3.4.0' - scala: '2.12.8' - java: '11' - python: '3.8' - shapely: '1' +# - spark: '3.4.0' +# scala: '2.12.8' +# java: '11' +# python: '3.8' +# - spark: '3.4.0' +# scala: '2.12.8' +# java: '11' +# python: '3.8' +# shapely: '1' steps: - uses: actions/checkout@v6
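
Note (illustrative, not part of the patch): the vectorized-UDF tests in this series show the end-to-end pattern for the new SedonaDB worker. The sketch below is a minimal usage example assembled from those tests; the function name `centroid_x`, the sample DataFrame, and the `spark` session variable are illustrative assumptions, while `sedona_db_vectorized_udf`, `from_sedona`, and `to_sedona` are the APIs exercised by the tests above.

    import numpy as np
    import pyarrow as pa
    import shapely
    from pyspark.sql.functions import expr
    from pyspark.sql.types import ByteType, DoubleType

    from sedona.spark.sql.functions import sedona_db_vectorized_udf
    from sedona.spark.utils.udf import from_sedona, to_sedona

    # Geometry inputs arrive as Arrow arrays of Sedona-serialized bytes,
    # declared as ByteType() in input_types (as in the tests in this patch).
    @sedona_db_vectorized_udf(return_type=DoubleType(), input_types=[ByteType()])
    def centroid_x(geom):
        # Arrow storage -> object-dtype numpy array of serialized geometries.
        geom_bytes = pa.array(geom.storage.to_array())
        geometry_array = np.asarray(geom_bytes, dtype=object)
        geoms = from_sedona(geometry_array)  # Sedona bytes -> shapely geometries
        # Return a plain Arrow array; use to_sedona(...) instead when the
        # return_type is GeometryType().
        return pa.array(shapely.get_x(shapely.centroid(geoms)))

    # `spark` is assumed to be a Sedona-enabled SparkSession.
    df = (
        spark.createDataFrame([(1, "POINT (1 1)"), (2, "POINT (2 2)")], ["id", "wkt"])
        .withColumn("wkt", expr("ST_GeomFromWKT(wkt)"))
    )
    df.select(centroid_x(df.wkt).alias("x")).show()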
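
The test setup in this series drives the new Python worker through three Spark configuration keys. A hedged illustration of wiring them up from PySpark follows; the keys and module names come from the patch, whereas the builder pattern and the choice to enable the daemon are assumptions, not defaults established here.

    from sedona.spark import SedonaContext

    config = (
        SedonaContext.builder()
        # Module that runs the SedonaDB vectorized-UDF worker.
        .config("sedona.python.worker.udf.module", "sedona.spark.worker.worker")
        # Daemon module used when worker processes are forked from a daemon.
        .config("sedona.python.worker.udf.daemon.module", "sedona.spark.worker.daemon")
        # Toggle daemon-based worker reuse (the tests exercise both settings).
        .config("sedona.python.worker.daemon.enabled", "true")
        .getOrCreate()
    )
    spark = SedonaContext.create(config)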