From 1ca3a5941450e56951b2a361679063a0b3a6753d Mon Sep 17 00:00:00 2001 From: pawelkocinski Date: Fri, 19 Dec 2025 23:13:31 +0100 Subject: [PATCH 01/11] SEDONA-738 Add sedonadb worker --- pom.xml | 2 +- python/pyproject.toml | 52 ++- python/sedona/spark/sql/functions.py | 64 +++- python/sedona/spark/worker/__init__.py | 0 python/sedona/spark/worker/serde.py | 82 +++++ python/sedona/spark/worker/udf_info.py | 34 ++ python/sedona/spark/worker/worker.py | 295 ++++++++++++++++++ python/tests/test_base.py | 2 +- .../utils/test_sedona_db_vectorized_udf.py | 94 ++++++ .../apache/sedona/spark/SedonaContext.scala | 3 +- .../sedona/sql/UDF/PythonEvalType.scala | 7 + .../python/SedonaArrowPythonRunner.scala | 58 ++++ .../python/SedonaArrowStrategy.scala | 159 ++++++++++ .../python/SedonaBasePythonRunner.scala | 121 +++++++ .../python/SedonaDBWorkerFactory.scala | 118 +++++++ .../python/SedonaPythonArrowInput.scala | 135 ++++++++ .../python/SedonaPythonArrowOutput.scala | 171 ++++++++++ .../sql/execution/python/WorkerContext.scala | 52 +++ .../spark/sql/udf/ExtractSedonaUDFRule.scala | 13 +- .../spark/sql/udf/SedonaArrowStrategy.scala | 89 ------ .../org/apache/sedona/sql/TestBaseScala.scala | 1 + .../apache/spark/sql/udf/StrategySuite.scala | 32 +- .../spark/sql/udf/TestScalarPandasUDF.scala | 162 ++++++++-- 23 files changed, 1586 insertions(+), 160 deletions(-) create mode 100644 python/sedona/spark/worker/__init__.py create mode 100644 python/sedona/spark/worker/serde.py create mode 100644 python/sedona/spark/worker/udf_info.py create mode 100644 python/sedona/spark/worker/worker.py create mode 100644 python/tests/utils/test_sedona_db_vectorized_udf.py create mode 100644 spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaArrowPythonRunner.scala create mode 100644 spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaArrowStrategy.scala create mode 100644 spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaBasePythonRunner.scala create mode 100644 spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaDBWorkerFactory.scala create mode 100644 spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowInput.scala create mode 100644 spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala create mode 100644 spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/WorkerContext.scala delete mode 100644 spark/spark-3.5/src/main/scala/org/apache/spark/sql/udf/SedonaArrowStrategy.scala diff --git a/pom.xml b/pom.xml index d6e4e81319d..613e310983f 100644 --- a/pom.xml +++ b/pom.xml @@ -631,7 +631,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 2.10.4 + 3.12.0 attach-javadocs diff --git a/python/pyproject.toml b/python/pyproject.toml index b988966c4fc..76169261c34 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -16,7 +16,7 @@ # under the License. 
[build-system] -requires = ["setuptools>=69", "wheel"] +requires = ["setuptools>=80.9.0", "wheel"] build-backend = "setuptools.build_meta" [project] @@ -26,13 +26,19 @@ description = "Apache Sedona is a cluster computing system for processing large- readme = "README.md" license = { text = "Apache-2.0" } authors = [ { name = "Apache Sedona", email = "dev@sedona.apache.org" } ] -requires-python = ">=3.8" +requires-python = ">=3.12" classifiers = [ "Programming Language :: Python :: 3", "License :: OSI Approved :: Apache Software License", ] dependencies = [ "attrs", + "geoarrow-c>=0.3.1", + "geoarrow-pyarrow>=0.2.0", + "geopandas>=1.1.2", + "pyarrow>=16.1.0", + "pyspark==3.5.4", + "sedonadb", "shapely>=1.7.0", ] @@ -43,38 +49,16 @@ kepler-map = ["geopandas", "keplergl==0.3.2"] flink = ["apache-flink>=1.19.0"] db = ["sedonadb[geopandas]; python_version >= '3.9'"] all = [ - "pyspark>=3.4.0,<4.1.0", - "geopandas", - "pydeck==0.8.0", - "keplergl==0.3.2", - "rasterio>=1.2.10", +# "pyspark>=3.4.0,<4.1.0", +# "geopandas", +# "pydeck==0.8.0", +# "keplergl==0.3.2", +# "rasterio>=1.2.10", ] [dependency-groups] dev = [ - "pytest", - "pytest-cov", - "notebook==6.4.12", - "jupyter", - "mkdocs", - "scikit-learn", - "esda", - "libpysal", - "matplotlib", # implicit dependency of esda - # prevent incompatibility with pysal 4.7.0, which is what is resolved to when shapely >2 is specified - "scipy<=1.10.0", - "pandas>=2.0.0", - "numpy<2", - "geopandas", - # https://stackoverflow.com/questions/78949093/how-to-resolve-attributeerror-module-fiona-has-no-attribute-path - # cannot set geopandas>=0.14.4 since it doesn't support python 3.8, so we pin fiona to <1.10.0 - "fiona<1.10.0", - "pyarrow", - "pyspark>=3.4.0,<4.1.0", - "keplergl==0.3.2", - "pydeck==0.8.0", - "pystac==1.5.0", - "rasterio>=1.2.10", + "pytest>=9.0.2", ] [project.urls] @@ -99,3 +83,11 @@ sources = [ "src/geom_buf.c", "src/geos_c_dyn.c", ] + +[tool.uv] +dev-dependencies = [ + "pytest>=9.0.2", +] + +[tool.uv.sources] +sedonadb = { path = "../../../sedona-db/target/wheels/sedonadb-0.3.0-cp312-cp312-macosx_11_0_arm64.whl" } diff --git a/python/sedona/spark/sql/functions.py b/python/sedona/spark/sql/functions.py index 2420301d52d..7c480e1700d 100644 --- a/python/sedona/spark/sql/functions.py +++ b/python/sedona/spark/sql/functions.py @@ -21,11 +21,14 @@ import pandas as pd -from sedona.spark.sql.types import GeometryType from sedona.spark.utils import geometry_serde -from pyspark.sql.udf import UserDefinedFunction -from pyspark.sql.types import DataType from shapely.geometry.base import BaseGeometry +from pyspark.sql.udf import UserDefinedFunction +import pyarrow as pa +import geoarrow.pyarrow as ga +from sedonadb import udf as sedona_udf_module +from sedona.spark.sql.types import GeometryType +from pyspark.sql.types import DataType, FloatType, DoubleType, IntegerType, StringType SEDONA_SCALAR_EVAL_TYPE = 5200 @@ -142,3 +145,58 @@ def serialize_to_geometry_if_geom(data, return_type: DataType): return geometry_serde.serialize(data) return data + + +def infer_pa_type(spark_type: DataType): + if isinstance(spark_type, GeometryType): + return ga.wkb() + elif isinstance(spark_type, FloatType): + return pa.float32() + elif isinstance(spark_type, DoubleType): + return pa.float64() + elif isinstance(spark_type, IntegerType): + return pa.int32() + elif isinstance(spark_type, StringType): + return pa.string() + else: + raise NotImplementedError(f"Type {spark_type} is not supported yet.") + +def infer_input_type(spark_type: DataType): + if isinstance(spark_type, 
GeometryType):
+        return sedona_udf_module.GEOMETRY
+    elif isinstance(spark_type, (FloatType, DoubleType, IntegerType)):
+        return sedona_udf_module.NUMERIC
+    elif isinstance(spark_type, StringType):
+        return sedona_udf_module.STRING
+    else:
+        raise NotImplementedError(f"Type {spark_type} is not supported yet.")
+
+def infer_input_types(spark_types: list[DataType]):
+    pa_types = []
+    for spark_type in spark_types:
+        pa_type = infer_input_type(spark_type)
+        pa_types.append(pa_type)
+
+    return pa_types
+
+
+def sedona_db_vectorized_udf(
+    return_type: DataType,
+    input_types: list[DataType]
+):
+    def apply_fn(fn):
+        out_type = infer_pa_type(return_type)
+        input_types_sedona_db = infer_input_types(input_types)
+
+        @sedona_udf_module.arrow_udf(out_type, input_types=input_types_sedona_db)
+        def shapely_udf(*args, **kwargs):
+            return fn(*args, **kwargs)
+
+        # Wrap the sedonadb arrow UDF in a zero-argument callable; the SedonaDB worker
+        # calls it to obtain the UDF. evalType 6200 matches SQL_SCALAR_SEDONA_DB_UDF on the JVM side.
+        udf = UserDefinedFunction(
+            lambda: shapely_udf, return_type, "SedonaPandasArrowUDF", evalType=6200
+        )
+
+        return udf
+
+    return apply_fn
diff --git a/python/sedona/spark/worker/__init__.py b/python/sedona/spark/worker/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/python/sedona/spark/worker/serde.py b/python/sedona/spark/worker/serde.py
new file mode 100644
index 00000000000..31038b7fcdd
--- /dev/null
+++ b/python/sedona/spark/worker/serde.py
@@ -0,0 +1,82 @@
+import socket
+
+from pyspark.serializers import write_int, SpecialLengths
+from pyspark.sql.pandas.serializers import ArrowStreamPandasSerializer
+
+from sedona.spark.worker.udf_info import UDFInfo
+
+
+def read_available(buf, chunk=4096):
+    # buf.raw._sock.settimeout(0.01)  # non-blocking-ish
+    data = bytearray()
+    index = 0
+    while True:
+        index += 1
+        try:
+            chunk_bytes = buf.read(chunk)
+        except socket.timeout:
+            break
+
+        if not chunk_bytes and index > 10:
+            break
+
+        data.extend(chunk_bytes)
+
+    return bytes(data)
+
+class SedonaDBSerializer(ArrowStreamPandasSerializer):
+    def __init__(self, timezone, safecheck, db, udf_info: UDFInfo):
+        super(SedonaDBSerializer, self).__init__(timezone, safecheck)
+        self.db = db
+        self.udf_info = udf_info
+
+    def load_stream(self, stream):
+        import pyarrow as pa
+
+        # Bypass ArrowStreamPandasSerializer.load_stream and read raw Arrow record
+        # batches via the grandparent ArrowStreamSerializer implementation.
+        batches = super(ArrowStreamPandasSerializer, self).load_stream(stream)
+        index = 0
+        for batch in batches:
+            table = pa.Table.from_batches(batches=[batch])
+            df = self.db.create_data_frame(table)
+            table_name = f"my_table_{index}"
+
+            df.to_view(table_name)
+
+            sql_expression = self.udf_info.sedona_db_transformation_expr(table_name)
+
+            index += 1
+
+            yield self.db.sql(sql_expression)
+
+    def arrow_dump_stream(self, iterator, stream):
+        import pyarrow as pa
+
+        writer = None
+        try:
+            for batch in iterator:
+                if writer is None:
+                    writer = pa.RecordBatchStreamWriter(stream, batch.schema)
+                writer.write_batch(batch)
+                # stream.flush()
+        finally:
+            if writer is not None:
+                writer.close()
+
+    def dump_stream(self, iterator, stream):
+        """
+        Override because Pandas UDFs require a START_ARROW_STREAM before the Arrow stream is sent.
+        This should be sent after creating the first record batch so in case of an error, it can
+        be sent back to the JVM before the Arrow stream starts.
+ """ + + def init_stream_yield_batches(): + should_write_start_length = True + for batch in iterator: + if should_write_start_length: + write_int(SpecialLengths.START_ARROW_STREAM, stream) + should_write_start_length = False + + yield batch + + return self.arrow_dump_stream(init_stream_yield_batches(), stream) diff --git a/python/sedona/spark/worker/udf_info.py b/python/sedona/spark/worker/udf_info.py new file mode 100644 index 00000000000..d354bcea7ed --- /dev/null +++ b/python/sedona/spark/worker/udf_info.py @@ -0,0 +1,34 @@ +from dataclasses import dataclass + +from sedona.spark import GeometryType + + +@dataclass +class UDFInfo: + arg_offsets: list + geom_offsets: dict + function: object + return_type: object + name: str + + def get_function_call_sql(self, table_name: str) -> str: + arg_offset_str = ", ".join([f"_{el}" for el in self.arg_offsets]) + function_expr = f"{self.name}({arg_offset_str})" + if isinstance(self.return_type, GeometryType): + return f"SELECT ST_GeomToSedonaSpark({function_expr}) AS _0 FROM {table_name}" + + return f"SELECT {function_expr} AS _0 FROM {table_name}" + + def sedona_db_transformation_expr(self, table_name: str) -> str: + fields = [] + for arg in self.arg_offsets: + if arg in self.geom_offsets: + crs = self.geom_offsets[arg] + fields.append(f"ST_GeomFromSedonaSpark(_{arg}, 'EPSG:{crs}') AS _{arg}") + continue + + fields.append(f"_{arg}") + + + fields_expr = ", ".join(fields) + return f"SELECT {fields_expr} FROM {table_name}" diff --git a/python/sedona/spark/worker/worker.py b/python/sedona/spark/worker/worker.py new file mode 100644 index 00000000000..74a61b02ee8 --- /dev/null +++ b/python/sedona/spark/worker/worker.py @@ -0,0 +1,295 @@ +import importlib +import os +import sys +import time + +import sedonadb +from pyspark import TaskContext, shuffle, SparkFiles +from pyspark.errors import PySparkRuntimeError +from pyspark.java_gateway import local_connect_and_auth +from pyspark.resource import ResourceInformation +from pyspark.serializers import read_int, UTF8Deserializer, read_bool, read_long, CPickleSerializer, write_int, \ + write_long, SpecialLengths + +from sedona.spark.worker.serde import SedonaDBSerializer +from sedona.spark.worker.udf_info import UDFInfo + + +def apply_iterator(db, iterator, udf_info: UDFInfo): + i = 0 + for df in iterator: + i+=1 + table_name = f"output_table_{i}" + df.to_view(table_name) + + function_call_sql = udf_info.get_function_call_sql(table_name) + + df_out = db.sql(function_call_sql) + df_out.to_view(f"view_{i}") + at = df_out.to_arrow_table() + batches = at.combine_chunks().to_batches() + + for batch in batches: + yield batch + + +def check_python_version(utf_serde: UTF8Deserializer, infile) -> str: + version = utf_serde.loads(infile) + + python_major, python_minor = sys.version_info[:2] + + if version != f"{python_major}.{python_minor}": + raise PySparkRuntimeError( + error_class="PYTHON_VERSION_MISMATCH", + message_parameters={ + "worker_version": str(sys.version_info[:2]), + "driver_version": str(version), + }, + ) + + return version + +def check_barrier_flag(infile): + is_barrier = read_bool(infile) + bound_port = read_int(infile) + secret = UTF8Deserializer().loads(infile) + + if is_barrier: + raise PySparkRuntimeError( + error_class="BARRIER_MODE_NOT_SUPPORTED", + message_parameters={ + "worker_version": str(sys.version_info[:2]), + "message": "Barrier mode is not supported by SedonaDB vectorized functions.", + }, + ) + + return is_barrier + +def assign_task_context(utf_serde: UTF8Deserializer, infile): + 
stage_id = read_int(infile)
+    partition_id = read_int(infile)
+    # BasePythonRunner writes attemptNumber as an int and taskAttemptId as a long
+    attempt_number = read_int(infile)
+    task_attempt_id = read_long(infile)
+    cpus = read_int(infile)
+
+    task_context = TaskContext._getOrCreate()
+    task_context._stage_id = stage_id
+    task_context._partition_id = partition_id
+    task_context._attempt_number = attempt_number
+    task_context._task_attempt_id = task_attempt_id
+    task_context._cpus = cpus
+
+    task_context._resources = {}
+    for r in range(read_int(infile)):
+        key = utf_serde.loads(infile)
+        name = utf_serde.loads(infile)
+        addresses = []
+        for a in range(read_int(infile)):
+            addresses.append(utf_serde.loads(infile))
+        task_context._resources[key] = ResourceInformation(name, addresses)
+
+    task_context._localProperties = dict()
+    for i in range(read_int(infile)):
+        k = utf_serde.loads(infile)
+        v = utf_serde.loads(infile)
+        task_context._localProperties[k] = v
+
+    return task_context
+
+def resolve_python_path(utf_serde: UTF8Deserializer, infile):
+    def add_path(path: str):
+        # worker can be used, so do not add path multiple times
+        if path not in sys.path:
+            # overwrite system packages
+            sys.path.insert(1, path)
+
+    spark_files_dir = utf_serde.loads(infile)
+    # _accumulatorRegistry.clear()
+
+    SparkFiles._root_directory = spark_files_dir
+    SparkFiles._is_running_on_worker = True
+
+    add_path(spark_files_dir)  # *.py files that were added will be copied here
+    num_python_includes = read_int(infile)
+    for _ in range(num_python_includes):
+        filename = utf_serde.loads(infile)
+        add_path(os.path.join(spark_files_dir, filename))
+
+    importlib.invalidate_caches()
+
+
+def check_broadcast_variables(infile):
+    needs_broadcast_decryption_server = read_bool(infile)
+    num_broadcast_variables = read_int(infile)
+
+    if needs_broadcast_decryption_server or num_broadcast_variables > 0:
+        raise PySparkRuntimeError(
+            error_class="BROADCAST_VARS_NOT_SUPPORTED",
+            message_parameters={
+                "worker_version": str(sys.version_info[:2]),
+                "message": "Broadcast variables are not supported by SedonaDB vectorized functions.",
+            },
+        )
+
+def get_runner_conf(utf_serde: UTF8Deserializer, infile):
+    runner_conf = {}
+    num_conf = read_int(infile)
+    for i in range(num_conf):
+        k = utf_serde.loads(infile)
+        v = utf_serde.loads(infile)
+        runner_conf[k] = v
+    return runner_conf
+
+
+def read_command(serializer, infile):
+    command = serializer._read_with_length(infile)
+    return command
+
+def read_udf(infile, pickle_ser) -> UDFInfo:
+    num_arg = read_int(infile)
+    arg_offsets = [read_int(infile) for i in range(num_arg)]
+
+    function = None
+    return_type = None
+
+    for i in range(read_int(infile)):
+        function, return_type = read_command(pickle_ser, infile)
+
+    sedona_db_udf_expression = function()
+
+    return UDFInfo(
+        arg_offsets=arg_offsets,
+        function=sedona_db_udf_expression,
+        return_type=return_type,
+        name=sedona_db_udf_expression._name,
+        geom_offsets={}
+    )
+
+# def read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index):
+#     num_arg = read_int(infile)
+#     arg_offsets = [read_int(infile) for i in range(num_arg)]
+#     chained_func = None
+#     for i in range(read_int(infile)):
+#         f, return_type = read_command(pickleSer, infile)
+#         if chained_func is None:
+#             chained_func = f
+#         else:
+#             chained_func = chain(chained_func, f)
+#
+#     func = chained_func
+#
+#     # the last returnType will be the return type of UDF
+#     if eval_type == PythonEvalType.SQL_SCALAR_PANDAS_UDF:
+#         return arg_offsets, func, return_type
+#     else:
+#         raise ValueError("Unknown eval type: {}".format(eval_type))
+#
+
+def 
register_sedona_db_udf(infile, pickle_ser) -> UDFInfo: + num_udfs = read_int(infile) + + udf = None + for _ in range(num_udfs): + udf = read_udf(infile, pickle_ser) + # Here we would register the UDF with SedonaDB's internal context + + + return udf + + +def report_times(outfile, boot, init, finish): + write_int(SpecialLengths.TIMING_DATA, outfile) + write_long(int(1000 * boot), outfile) + write_long(int(1000 * init), outfile) + write_long(int(1000 * finish), outfile) + + +def write_statistics(infile, outfile, boot_time, init_time) -> None: + TaskContext._setTaskContext(None) + finish_time = time.time() + report_times(outfile, boot_time, init_time, finish_time) + write_long(shuffle.MemoryBytesSpilled, outfile) + write_long(shuffle.DiskBytesSpilled, outfile) + + # Mark the beginning of the accumulators section of the output + write_int(SpecialLengths.END_OF_DATA_SECTION, outfile) + # write_int(len(_accumulatorRegistry), outfile) + # for (aid, accum) in _accumulatorRegistry.items(): + # pickleSer._write_with_length((aid, accum._value), outfile) + + if read_int(infile) == SpecialLengths.END_OF_STREAM: + write_int(SpecialLengths.END_OF_STREAM, outfile) + outfile.flush() + else: + write_int(SpecialLengths.END_OF_DATA_SECTION, outfile) + outfile.flush() + sys.exit(-1) + + +def main(infile, outfile): + boot_time = time.time() + sedona_db = sedonadb.connect() + # + utf8_deserializer = UTF8Deserializer() + pickle_ser = CPickleSerializer() + + split_index = read_int(infile) + # + check_python_version(utf8_deserializer, infile) + # + check_barrier_flag(infile) + + task_context = assign_task_context(utf_serde=utf8_deserializer, infile=infile) + shuffle.MemoryBytesSpilled = 0 + shuffle.DiskBytesSpilled = 0 + + resolve_python_path(utf8_deserializer, infile) + # + check_broadcast_variables(infile) + + eval_type = read_int(infile) + + runner_conf = get_runner_conf(utf8_deserializer, infile) + + udf = register_sedona_db_udf(infile, pickle_ser) + + sedona_db.register_udf(udf.function) + init_time = time.time() + + serde = SedonaDBSerializer( + timezone=runner_conf.get("spark.sql.session.timeZone", "UTC"), + safecheck=False, + db=sedona_db, + udf_info=udf + ) + + number_of_geometries = read_int(infile) + geom_offsets = {} + for i in range(number_of_geometries): + geom_index = read_int(infile) + geom_srid = read_int(infile) + + geom_offsets[geom_index] = geom_srid + + udf.geom_offsets = geom_offsets + + iterator = serde.load_stream(infile) + out_iterator = apply_iterator(db=sedona_db, iterator=iterator, udf_info=udf) + + serde.dump_stream(out_iterator, outfile) + + write_statistics( + infile, outfile, boot_time=boot_time, init_time=init_time + ) + + +if __name__ == "__main__": + # add file handler + auth_secret = os.environ["PYTHON_WORKER_FACTORY_SECRET"] + java_port = int(os.environ["PYTHON_WORKER_FACTORY_PORT"]) + (sock_file, sc) = local_connect_and_auth(java_port, auth_secret) + + write_int(os.getpid(), sock_file) + sock_file.flush() + + main(sock_file, sock_file) diff --git a/python/tests/test_base.py b/python/tests/test_base.py index cc2b09e422a..a6dbae65975 100644 --- a/python/tests/test_base.py +++ b/python/tests/test_base.py @@ -22,7 +22,7 @@ import pyspark from pyspark.sql import DataFrame -from sedona.spark import * +from sedona.spark import SedonaContext from sedona.spark.utils.decorators import classproperty SPARK_REMOTE = os.getenv("SPARK_REMOTE") diff --git a/python/tests/utils/test_sedona_db_vectorized_udf.py b/python/tests/utils/test_sedona_db_vectorized_udf.py new file mode 100644 index 
00000000000..749d45420ef --- /dev/null +++ b/python/tests/utils/test_sedona_db_vectorized_udf.py @@ -0,0 +1,94 @@ +from sedona.spark.sql.functions import sedona_db_vectorized_udf +from tests.test_base import TestBase +import pyarrow as pa +import shapely +from sedona.sql import GeometryType +from pyspark.sql.functions import expr, lit +from pyspark.sql.types import DoubleType, IntegerType + + +class TestSedonaDBArrowFunction(TestBase): + def test_vectorized_udf(self): + @sedona_db_vectorized_udf(return_type=GeometryType(), input_types=[GeometryType(), IntegerType()]) + def my_own_function(geom, distance): + geom_wkb = pa.array(geom.storage.to_array()) + distance = pa.array(distance.to_array()) + geom = shapely.from_wkb(geom_wkb) + + result_shapely = shapely.centroid(geom) + + return pa.array(shapely.to_wkb(result_shapely)) + + df = self.spark.createDataFrame( + [ + (1, "POINT (1 1)"), + (2, "POINT (2 2)"), + (3, "POINT (3 3)"), + ], + ["id", "wkt"], + ).withColumn("wkt", expr("ST_GeomFromWKT(wkt)")) + + df.select(my_own_function(df.wkt, lit(100)).alias("geom")).show() + + def test_geometry_to_double(self): + @sedona_db_vectorized_udf(return_type=DoubleType(), input_types=[GeometryType()]) + def geometry_to_non_geometry_udf(geom): + geom_wkb = pa.array(geom.storage.to_array()) + geom = shapely.from_wkb(geom_wkb) + + result_shapely = shapely.get_x(shapely.centroid(geom)) + + return pa.array(result_shapely, pa.float64()) + + df = self.spark.createDataFrame( + [(1, "POINT (1 1)"), (2, "POINT (2 2)"), (3, "POINT (3 3)")], + ["id", "wkt"], + ).withColumn("wkt", expr("ST_GeomFromWKT(wkt)")) + + values = df.select(geometry_to_non_geometry_udf(df.wkt).alias("x_coord")) \ + .collect() + + values_list = [row["x_coord"] for row in values] + + assert values_list == [1.0, 2.0, 3.0] + + def test_geometry_to_int(self): + @sedona_db_vectorized_udf(return_type=IntegerType(), input_types=[GeometryType()]) + def geometry_to_int(geom): + geom_wkb = pa.array(geom.storage.to_array()) + geom = shapely.from_wkb(geom_wkb) + + result_shapely = shapely.get_num_points(geom) + + return pa.array(result_shapely, pa.int32()) + + df = self.spark.createDataFrame( + [(1, "POINT (1 1)"), (2, "POINT (2 2)"), (3, "POINT (3 3)")], + ["id", "wkt"], + ).withColumn("wkt", expr("ST_GeomFromWKT(wkt)")) + + values = df.select(geometry_to_int(df.wkt)) \ + .collect() + + values_list = [row[0] for row in values] + + assert values_list == [0, 0, 0] + + def test_geometry_crs_preservation(self): + @sedona_db_vectorized_udf(return_type=GeometryType(), input_types=[GeometryType()]) + def return_same_geometry(geom): + geom_wkb = pa.array(geom.storage.to_array()) + geom = shapely.from_wkb(geom_wkb) + + return pa.array(shapely.to_wkb(geom)) + + df = self.spark.createDataFrame( + [(1, "POINT (1 1)"), (2, "POINT (2 2)"), (3, "POINT (3 3)")], + ["id", "wkt"], + ).withColumn("wkt", expr("ST_SetSRID(ST_GeomFromWKT(wkt), 3857)")) + + result_df = df.select(return_same_geometry(df.wkt).alias("geom")) + + crs_list = result_df.selectExpr("ST_SRID(geom)").rdd.flatMap(lambda x: x).collect() + + assert crs_list == [3857, 3857, 3857] diff --git a/spark/common/src/main/scala/org/apache/sedona/spark/SedonaContext.scala b/spark/common/src/main/scala/org/apache/sedona/spark/SedonaContext.scala index b0e46cf6e9e..add3caf225a 100644 --- a/spark/common/src/main/scala/org/apache/sedona/spark/SedonaContext.scala +++ b/spark/common/src/main/scala/org/apache/sedona/spark/SedonaContext.scala @@ -41,7 +41,6 @@ class InternalApi( extends StaticAnnotation object 
SedonaContext { - private def customOptimizationsWithSession(sparkSession: SparkSession) = Seq( new TransformNestedUDTParquet(sparkSession), @@ -72,7 +71,7 @@ object SedonaContext { val sedonaArrowStrategy = Try( Class - .forName("org.apache.spark.sql.udf.SedonaArrowStrategy") + .forName("org.apache.spark.sql.execution.python.SedonaArrowStrategy") .getDeclaredConstructor() .newInstance() .asInstanceOf[SparkStrategy]) diff --git a/spark/common/src/main/scala/org/apache/sedona/sql/UDF/PythonEvalType.scala b/spark/common/src/main/scala/org/apache/sedona/sql/UDF/PythonEvalType.scala index aece26267d9..11263dd7f68 100644 --- a/spark/common/src/main/scala/org/apache/sedona/sql/UDF/PythonEvalType.scala +++ b/spark/common/src/main/scala/org/apache/sedona/sql/UDF/PythonEvalType.scala @@ -23,7 +23,14 @@ object PythonEvalType { val SQL_SCALAR_SEDONA_UDF = 5200 val SEDONA_UDF_TYPE_CONSTANT = 5000 + // sedona db eval types + val SQL_SCALAR_SEDONA_DB_UDF = 6200 + val SEDONA_DB_UDF_TYPE_CONSTANT = 6000 + def toString(pythonEvalType: Int): String = pythonEvalType match { case SQL_SCALAR_SEDONA_UDF => "SQL_SCALAR_GEO_UDF" + case SQL_SCALAR_SEDONA_DB_UDF => "SQL_SCALAR_SEDONA_DB_UDF" } + + def evals(): Set[Int] = Set(SQL_SCALAR_SEDONA_UDF, SQL_SCALAR_SEDONA_DB_UDF) } diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaArrowPythonRunner.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaArrowPythonRunner.scala new file mode 100644 index 00000000000..0d3960d2d81 --- /dev/null +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaArrowPythonRunner.scala @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.spark.sql.execution.python + +import org.apache.spark.api.python._ +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.execution.metric.SQLMetric +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types._ +import org.apache.spark.sql.vectorized.ColumnarBatch + +/** + * Similar to `PythonUDFRunner`, but exchange data with Python worker via Arrow stream. 
+ */ +class SedonaArrowPythonRunner( + funcs: Seq[ChainedPythonFunctions], + evalType: Int, + argOffsets: Array[Array[Int]], + protected override val schema: StructType, + protected override val timeZoneId: String, + protected override val largeVarTypes: Boolean, + protected override val workerConf: Map[String, String], + val pythonMetrics: Map[String, SQLMetric], + jobArtifactUUID: Option[String], + geometryFields: Seq[(Int, Int)]) + extends SedonaBasePythonRunner[Iterator[InternalRow], ColumnarBatch]( + funcs, + evalType, + argOffsets, + jobArtifactUUID, + geometryFields) + with SedonaBasicPythonArrowInput + with SedonaBasicPythonArrowOutput { + + override val errorOnDuplicatedFieldNames: Boolean = true + + override val bufferSize: Int = SQLConf.get.pandasUDFBufferSize + require( + bufferSize >= 4, + "Pandas execution requires more than 4 bytes. Please set higher buffer. " + + s"Please change '${SQLConf.PANDAS_UDF_BUFFER_SIZE.key}'.") +} diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaArrowStrategy.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaArrowStrategy.scala new file mode 100644 index 00000000000..bb897931b63 --- /dev/null +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaArrowStrategy.scala @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+package org.apache.spark.sql.execution.python
+
+import org.apache.sedona.sql.UDF.PythonEvalType
+import org.apache.sedona.sql.UDF.PythonEvalType.{SQL_SCALAR_SEDONA_DB_UDF, SQL_SCALAR_SEDONA_UDF}
+import org.apache.spark.api.python.ChainedPythonFunctions
+import org.apache.spark.sql.Strategy
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.{Attribute, GenericInternalRow, PythonUDF}
+import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.execution.SparkPlan
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.udf.SedonaArrowEvalPython
+import org.apache.spark.{JobArtifactSet, TaskContext}
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.sedona_sql.UDT.GeometryUDT
+
+import scala.collection.JavaConverters.asScalaIteratorConverter
+
+// We use a custom Strategy to avoid Apache Spark's assert on types; we
+// can consider extending this to support other engines working with
+// Arrow data.
+class SedonaArrowStrategy extends Strategy {
+  override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
+    case SedonaArrowEvalPython(udfs, output, child, evalType) =>
+      SedonaArrowEvalPythonExec(udfs, output, planLater(child), evalType) :: Nil
+    case _ => Nil
+  }
+}
+
+// This is a modification of Apache Spark's ArrowEvalPythonExec: the type checks are removed so that
+// geometry types are allowed. It is an initial version enabling vectorized UDFs for Sedona geometry
+// types; we can consider extending this to support other engines working with Arrow data.
+case class SedonaArrowEvalPythonExec(
+    udfs: Seq[PythonUDF],
+    resultAttrs: Seq[Attribute],
+    child: SparkPlan,
+    evalType: Int)
+    extends EvalPythonExec
+    with PythonSQLMetrics {
+
+  private val batchSize = conf.arrowMaxRecordsPerBatch
+  private val sessionLocalTimeZone = conf.sessionLocalTimeZone
+  private val largeVarTypes = conf.arrowUseLargeVarTypes
+  private val pythonRunnerConf =
+    Map[String, String](SQLConf.SESSION_LOCAL_TIMEZONE.key -> conf.sessionLocalTimeZone)
+  private[this] val jobArtifactUUID = JobArtifactSet.getCurrentJobArtifactState.map(_.uuid)
+
+  // Peeks at the first row of the partition to infer the SRID of each geometry column.
+  private def inferCRS(iterator: Iterator[InternalRow], schema: StructType): Seq[(Int, Int)] = {
+    // this triggers the iterator
+    if (!iterator.hasNext) {
+      return Seq.empty
+    }
+
+    val row = iterator.next()
+
+    val rowMatched = row match {
+      case generic: GenericInternalRow =>
+        Some(generic)
+      case _ => None
+    }
+
+    // Keep the indices relative to the full schema so they line up with the UDF argument
+    // offsets sent to the Python worker.
+    schema.zipWithIndex
+      .filter { case (field, _) =>
+        field.dataType == GeometryUDT
+      }
+      .map { case (_, index) =>
+        if (rowMatched.isEmpty || rowMatched.get.values(index) == null) (index, 0)
+        else {
+          val geom = rowMatched.get.get(index, GeometryUDT).asInstanceOf[Array[Byte]]
+          val preambleByte = geom(0) & 0xff
+          val hasSrid = (preambleByte & 0x01) != 0
+
+          var srid = 0
+          if (hasSrid) {
+            val srid2 = (geom(1) & 0xff) << 16
+            val srid1 = (geom(2) & 0xff) << 8
+            val srid0 = geom(3) & 0xff
+            srid = srid2 | srid1 | srid0
+          }
+
+          (index, srid)
+        }
+      }
+  }
+
+  protected override def evaluate(
+      funcs: Seq[ChainedPythonFunctions],
+      argOffsets: Array[Array[Int]],
+      iter: Iterator[InternalRow],
+      schema: StructType,
+      context: TaskContext): Iterator[InternalRow] = {
+    val (probe, full) = iter.duplicate
+
+    val geometryFields = inferCRS(probe, schema)
+
+    val batchIter = if (batchSize > 0) new BatchIterator(full, batchSize) else Iterator(full)
+
+    evalType match {
+      case SQL_SCALAR_SEDONA_DB_UDF =>
+        val columnarBatchIter = new SedonaArrowPythonRunner(
+ funcs, + evalType - PythonEvalType.SEDONA_DB_UDF_TYPE_CONSTANT, + argOffsets, + schema, + sessionLocalTimeZone, + largeVarTypes, + pythonRunnerConf, + pythonMetrics, + jobArtifactUUID, + geometryFields).compute(batchIter, context.partitionId(), context) + + val result = columnarBatchIter.flatMap { batch => + batch.rowIterator.asScala + } + + result + + case SQL_SCALAR_SEDONA_UDF => + val columnarBatchIter = new ArrowPythonRunner( + funcs, + evalType - PythonEvalType.SEDONA_UDF_TYPE_CONSTANT, + argOffsets, + schema, + sessionLocalTimeZone, + largeVarTypes, + pythonRunnerConf, + pythonMetrics, + jobArtifactUUID).compute(batchIter, context.partitionId(), context) + + val iter = columnarBatchIter.flatMap { batch => + batch.rowIterator.asScala + } + + iter + } + } + + override protected def withNewChildInternal(newChild: SparkPlan): SparkPlan = + copy(child = newChild) +} diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaBasePythonRunner.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaBasePythonRunner.scala new file mode 100644 index 00000000000..8ecc110e39d --- /dev/null +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaBasePythonRunner.scala @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.spark.sql.execution.python + +import java.net._ +import java.util.concurrent.atomic.AtomicBoolean +import scala.collection.JavaConverters._ +import org.apache.spark._ +import org.apache.spark.api.python.{BasePythonRunner, ChainedPythonFunctions} +import org.apache.spark.internal.Logging +import org.apache.spark.internal.config.EXECUTOR_CORES +import org.apache.spark.internal.config.Python._ +import org.apache.spark.resource.ResourceProfile.{EXECUTOR_CORES_LOCAL_PROPERTY, PYSPARK_MEMORY_LOCAL_PROPERTY} +import org.apache.spark.util._ + +private object SedonaBasePythonRunner { + + private lazy val faultHandlerLogDir = Utils.createTempDir(namePrefix = "faulthandler") +} + +private[spark] abstract class SedonaBasePythonRunner[IN, OUT]( + funcs: Seq[ChainedPythonFunctions], + evalType: Int, + argOffsets: Array[Array[Int]], + jobArtifactUUID: Option[String], + val geometryFields: Seq[(Int, Int)] = Seq.empty) + extends BasePythonRunner[IN, OUT](funcs, evalType, argOffsets, jobArtifactUUID) + with Logging { + + require(funcs.length == argOffsets.length, "argOffsets should have the same length as funcs") + + private val conf = SparkEnv.get.conf + private val faultHandlerEnabled = conf.get(PYTHON_WORKER_FAULTHANLDER_ENABLED) + + private def getWorkerMemoryMb(mem: Option[Long], cores: Int): Option[Long] = { + mem.map(_ / cores) + } + + import java.io._ + + override def compute( + inputIterator: Iterator[IN], + partitionIndex: Int, + context: TaskContext): Iterator[OUT] = { + val startTime = System.currentTimeMillis + val env = SparkEnv.get + + val execCoresProp = Option(context.getLocalProperty(EXECUTOR_CORES_LOCAL_PROPERTY)) + val memoryMb = Option(context.getLocalProperty(PYSPARK_MEMORY_LOCAL_PROPERTY)).map(_.toLong) + + if (simplifiedTraceback) { + envVars.put("SPARK_SIMPLIFIED_TRACEBACK", "1") + } + // SPARK-30299 this could be wrong with standalone mode when executor + // cores might not be correct because it defaults to all cores on the box. 
+ val execCores = execCoresProp.map(_.toInt).getOrElse(conf.get(EXECUTOR_CORES)) + val workerMemoryMb = getWorkerMemoryMb(memoryMb, execCores) + if (workerMemoryMb.isDefined) { + envVars.put("PYSPARK_EXECUTOR_MEMORY_MB", workerMemoryMb.get.toString) + } + envVars.put("SPARK_AUTH_SOCKET_TIMEOUT", authSocketTimeout.toString) + envVars.put("SPARK_BUFFER_SIZE", bufferSize.toString) + if (faultHandlerEnabled) { + envVars.put("PYTHON_FAULTHANDLER_DIR", SedonaBasePythonRunner.faultHandlerLogDir.toString) + } + + envVars.put("SPARK_JOB_ARTIFACT_UUID", jobArtifactUUID.getOrElse("default")) + + val (worker: Socket, pid: Option[Int]) = { + WorkerContext.createPythonWorker(pythonExec, envVars.asScala.toMap) + } + + val releasedOrClosed = new AtomicBoolean(false) + + // Start a thread to feed the process input from our parent's iterator + val writerThread = newWriterThread(env, worker, inputIterator, partitionIndex, context) + + context.addTaskCompletionListener[Unit] { _ => + writerThread.shutdownOnTaskCompletion() + if (releasedOrClosed.compareAndSet(false, true)) { + try { + worker.close() + } catch { + case e: Exception => + logWarning("Failed to close worker socket", e) + } + } + } + + writerThread.start() + + val stream = new DataInputStream(new BufferedInputStream(worker.getInputStream, bufferSize)) + + val stdoutIterator = newReaderIterator( + stream, + writerThread, + startTime, + env, + worker, + pid, + releasedOrClosed, + context) + new InterruptibleIterator(context, stdoutIterator) + } +} diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaDBWorkerFactory.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaDBWorkerFactory.scala new file mode 100644 index 00000000000..add09a7cb22 --- /dev/null +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaDBWorkerFactory.scala @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.spark.sql.execution.python + +import org.apache.spark.{SparkException, SparkFiles} +import org.apache.spark.api.python.{PythonUtils, PythonWorkerFactory} +import org.apache.spark.util.Utils + +import java.io.{DataInputStream, File} +import java.net.{InetAddress, ServerSocket, Socket} +import java.util.Arrays +import java.io.InputStream +import scala.collection.JavaConverters._ +import scala.collection.mutable +import org.apache.spark._ +import org.apache.spark.security.SocketAuthHelper +import org.apache.spark.util.RedirectThread + +class SedonaDBWorkerFactory(pythonExec: String, envVars: Map[String, String]) + extends PythonWorkerFactory(pythonExec, envVars) { + self => + + private val simpleWorkers = new mutable.WeakHashMap[Socket, Process]() + private val authHelper = new SocketAuthHelper(SparkEnv.get.conf) + + private val sedonaUDFWorkerModule = + SparkEnv.get.conf.get("sedona.python.worker.udf.module", "sedona.spark.worker.worker") + + private val pythonPath = PythonUtils.mergePythonPaths( + PythonUtils.sparkPythonPath, + envVars.getOrElse("PYTHONPATH", ""), + sys.env.getOrElse("PYTHONPATH", "")) + + override def create(): (Socket, Option[Int]) = { + createSimpleWorker(sedonaUDFWorkerModule) + } + + private def createSimpleWorker(workerModule: String): (Socket, Option[Int]) = { + var serverSocket: ServerSocket = null + try { + serverSocket = new ServerSocket(0, 1, InetAddress.getLoopbackAddress()) + + // Create and start the worker + val pb = new ProcessBuilder(Arrays.asList(pythonExec, "-m", workerModule)) + val jobArtifactUUID = envVars.getOrElse("SPARK_JOB_ARTIFACT_UUID", "default") + if (jobArtifactUUID != "default") { + val f = new File(SparkFiles.getRootDirectory(), jobArtifactUUID) + f.mkdir() + pb.directory(f) + } + val workerEnv = pb.environment() + workerEnv.putAll(envVars.asJava) + workerEnv.put("PYTHONPATH", pythonPath) + // This is equivalent to setting the -u flag; we use it because ipython doesn't support -u: + workerEnv.put("PYTHONUNBUFFERED", "YES") + workerEnv.put("PYTHON_WORKER_FACTORY_PORT", serverSocket.getLocalPort.toString) + workerEnv.put("PYTHON_WORKER_FACTORY_SECRET", authHelper.secret) + if (Utils.preferIPv6) { + workerEnv.put("SPARK_PREFER_IPV6", "True") + } + val worker = pb.start() + + // Redirect worker stdout and stderr + redirectStreamsToStderr(worker.getInputStream, worker.getErrorStream) + + // Wait for it to connect to our socket, and validate the auth secret. 
+ serverSocket.setSoTimeout(10000) + + try { + val socket = serverSocket.accept() + authHelper.authClient(socket) + // TODO: When we drop JDK 8, we can just use worker.pid() + val pid = new DataInputStream(socket.getInputStream).readInt() + if (pid < 0) { + throw new IllegalStateException("Python failed to launch worker with code " + pid) + } + self.synchronized { + simpleWorkers.put(socket, worker) + } + + (socket, Some(pid)) + } catch { + case e: Exception => + throw new SparkException("Python worker failed to connect back.", e) + } + } finally { + if (serverSocket != null) { + serverSocket.close() + } + } + } + + private def redirectStreamsToStderr(stdout: InputStream, stderr: InputStream): Unit = { + try { + new RedirectThread(stdout, System.err, "stdout reader for " + pythonExec).start() + new RedirectThread(stderr, System.err, "stderr reader for " + pythonExec).start() + } catch { + case e: Exception => + logError("Exception in redirecting streams", e) + } + } +} diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowInput.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowInput.scala new file mode 100644 index 00000000000..18db42ae0df --- /dev/null +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowInput.scala @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.spark.sql.execution.python + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.arrow.vector.VectorSchemaRoot +import org.apache.arrow.vector.ipc.ArrowStreamWriter +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.execution.arrow.ArrowWriter +import org.apache.spark.sql.util.ArrowUtils +import org.apache.spark.util.Utils +import org.apache.spark.{SparkEnv, TaskContext} + +import java.io.DataOutputStream +import java.net.Socket + +private[python] trait SedonaPythonArrowInput[IN] extends PythonArrowInput[IN] { + self: SedonaBasePythonRunner[IN, _] => + protected override def newWriterThread( + env: SparkEnv, + worker: Socket, + inputIterator: Iterator[IN], + partitionIndex: Int, + context: TaskContext): WriterThread = { + new WriterThread(env, worker, inputIterator, partitionIndex, context) { + + protected override def writeCommand(dataOut: DataOutputStream): Unit = { + handleMetadataBeforeExec(dataOut) + writeUDF(dataOut, funcs, argOffsets) + + // write + dataOut.writeInt(self.geometryFields.length) + // write geometry field indices and their SRIDs + geometryFields.foreach { case (index, srid) => + dataOut.writeInt(index) + dataOut.writeInt(srid) + } + } + + protected override def writeIteratorToStream(dataOut: DataOutputStream): Unit = { + val arrowSchema = + ArrowUtils.toArrowSchema(schema, timeZoneId, errorOnDuplicatedFieldNames, largeVarTypes) + val allocator = ArrowUtils.rootAllocator.newChildAllocator( + s"stdout writer for $pythonExec", + 0, + Long.MaxValue) + val root = VectorSchemaRoot.create(arrowSchema, allocator) + + Utils.tryWithSafeFinally { + val writer = new ArrowStreamWriter(root, null, dataOut) + writer.start() + + writeIteratorToArrowStream(root, writer, dataOut, inputIterator) + + // end writes footer to the output stream and doesn't clean any resources. + // It could throw exception if the output stream is closed, so it should be + // in the try block. + writer.end() + } { + // If we close root and allocator in TaskCompletionListener, there could be a race + // condition where the writer thread keeps writing to the VectorSchemaRoot while + // it's being closed by the TaskCompletion listener. + // Closing root and allocator here is cleaner because root and allocator is owned + // by the writer thread and is only visible to the writer thread. + // + // If the writer thread is interrupted by TaskCompletionListener, it should either + // (1) in the try block, in which case it will get an InterruptedException when + // performing io, and goes into the finally block or (2) in the finally block, + // in which case it will ignore the interruption and close the resources. 
+ root.close() + allocator.close() + } + } + } + } +} + +private[python] trait SedonaBasicPythonArrowInput + extends SedonaPythonArrowInput[Iterator[InternalRow]] { + self: SedonaBasePythonRunner[Iterator[InternalRow], _] => + + protected def writeIteratorToArrowStream( + root: VectorSchemaRoot, + writer: ArrowStreamWriter, + dataOut: DataOutputStream, + inputIterator: Iterator[Iterator[InternalRow]]): Unit = { + val arrowWriter = ArrowWriter.create(root) + while (inputIterator.hasNext) { + val startData = dataOut.size() + val nextBatch = inputIterator.next() + + while (nextBatch.hasNext) { + arrowWriter.write(nextBatch.next()) + } + + arrowWriter.finish() + writer.writeBatch() + arrowWriter.reset() + val deltaData = dataOut.size() - startData + pythonMetrics("pythonDataSent") += deltaData + } + } +} diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala new file mode 100644 index 00000000000..a9421df0af8 --- /dev/null +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala @@ -0,0 +1,171 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.spark.sql.execution.python + +import java.io.DataInputStream +import java.net.Socket +import java.util.concurrent.atomic.AtomicBoolean +import scala.collection.JavaConverters._ +import org.apache.arrow.vector.VectorSchemaRoot +import org.apache.arrow.vector.ipc.ArrowStreamReader +import org.apache.spark.{SparkEnv, TaskContext} +import org.apache.spark.api.python.{BasePythonRunner, SpecialLengths} +import org.apache.spark.sql.execution.metric.SQLMetric +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.util.ArrowUtils +import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnVector, ColumnarBatch} + +private[python] trait SedonaPythonArrowOutput[OUT <: AnyRef] { self: BasePythonRunner[_, OUT] => + + protected def pythonMetrics: Map[String, SQLMetric] + + protected def deserializeColumnarBatch(batch: ColumnarBatch, schema: StructType): OUT + + protected def newReaderIterator( + stream: DataInputStream, + writerThread: WriterThread, + startTime: Long, + env: SparkEnv, + worker: Socket, + pid: Option[Int], + releasedOrClosed: AtomicBoolean, + context: TaskContext): Iterator[OUT] = { + + new ReaderIterator( + stream, + writerThread, + startTime, + env, + worker, + pid, + releasedOrClosed, + context) { + + private val allocator = ArrowUtils.rootAllocator.newChildAllocator( + s"stdin reader for $pythonExec", + 0, + Long.MaxValue) + + private var reader: ArrowStreamReader = _ + private var root: VectorSchemaRoot = _ + private var schema: StructType = _ + private var vectors: Array[ColumnVector] = _ + private var eos = false + private var nextObj: OUT = _ + + context.addTaskCompletionListener[Unit] { _ => + if (reader != null) { + reader.close(false) + } + allocator.close() + } + + private var batchLoaded = true + + def handleEndOfDataSectionSedona(): Unit = { + if (stream.readInt() == SpecialLengths.END_OF_STREAM) {} + + eos = true + } + + protected override def handleEndOfDataSection(): Unit = { + handleEndOfDataSectionSedona() + } + + override def hasNext: Boolean = nextObj != null || { + if (!eos) { + nextObj = read() + hasNext + } else { + false + } + } + + override def next(): OUT = { + if (hasNext) { + val obj = nextObj + nextObj = null.asInstanceOf[OUT] + obj + } else { + Iterator.empty.next() + } + } + + protected override def read(): OUT = { + if (writerThread.exception.isDefined) { + throw writerThread.exception.get + } + try { + if (reader != null && batchLoaded) { + val bytesReadStart = reader.bytesRead() + batchLoaded = reader.loadNextBatch() + if (batchLoaded) { + val batch = new ColumnarBatch(vectors) + val rowCount = root.getRowCount + batch.setNumRows(root.getRowCount) + val bytesReadEnd = reader.bytesRead() + pythonMetrics("pythonNumRowsReceived") += rowCount + pythonMetrics("pythonDataReceived") += bytesReadEnd - bytesReadStart + deserializeColumnarBatch(batch, schema) + } else { + reader.close(false) + allocator.close() + read() + } + } else { + val specialSign = stream.readInt() + + specialSign match { + case SpecialLengths.START_ARROW_STREAM => + reader = new ArrowStreamReader(stream, allocator) + root = reader.getVectorSchemaRoot() + schema = ArrowUtils.fromArrowSchema(root.getSchema()) + vectors = root + .getFieldVectors() + .asScala + .map { vector => + new ArrowColumnVector(vector) + } + .toArray[ColumnVector] + + read() + case SpecialLengths.TIMING_DATA => + handleTimingData() + read() + case SpecialLengths.PYTHON_EXCEPTION_THROWN => + throw handlePythonException() + case SpecialLengths.END_OF_DATA_SECTION => + 
handleEndOfDataSection() + null.asInstanceOf[OUT] + } + } + } catch handleException + } + } + } +} + +private[python] trait SedonaBasicPythonArrowOutput + extends SedonaPythonArrowOutput[ColumnarBatch] { + self: BasePythonRunner[_, ColumnarBatch] => + + protected def deserializeColumnarBatch( + batch: ColumnarBatch, + schema: StructType): ColumnarBatch = batch +} diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/WorkerContext.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/WorkerContext.scala new file mode 100644 index 00000000000..dbad8358d6f --- /dev/null +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/WorkerContext.scala @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.spark.sql.execution.python + +import java.net.Socket +import scala.collection.mutable + +object WorkerContext { + + def createPythonWorker( + pythonExec: String, + envVars: Map[String, String]): (java.net.Socket, Option[Int]) = { + synchronized { + val key = (pythonExec, envVars) + pythonWorkers.getOrElseUpdate(key, new SedonaDBWorkerFactory(pythonExec, envVars)).create() + } + } + + private[spark] def destroyPythonWorker( + pythonExec: String, + envVars: Map[String, String], + worker: Socket): Unit = { + synchronized { + val key = (pythonExec, envVars) + pythonWorkers + .get(key) + .foreach(workerFactory => { + workerFactory.stopWorker(worker) + }) + } + } + + private val pythonWorkers = + mutable.HashMap[(String, Map[String, String]), SedonaDBWorkerFactory]() + +} diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/udf/ExtractSedonaUDFRule.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/udf/ExtractSedonaUDFRule.scala index 3d3301580cc..ebb5a568e14 100644 --- a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/udf/ExtractSedonaUDFRule.scala +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/udf/ExtractSedonaUDFRule.scala @@ -44,9 +44,7 @@ class ExtractSedonaUDFRule extends Rule[LogicalPlan] with Logging { } def isScalarPythonUDF(e: Expression): Boolean = { - e.isInstanceOf[PythonUDF] && e - .asInstanceOf[PythonUDF] - .evalType == PythonEvalType.SQL_SCALAR_SEDONA_UDF + e.isInstanceOf[PythonUDF] && PythonEvalType.evals.contains(e.asInstanceOf[PythonUDF].evalType) } private def collectEvaluableUDFsFromExpressions( @@ -168,13 +166,12 @@ class ExtractSedonaUDFRule extends Rule[LogicalPlan] with Logging { evalTypes.mkString(",")) } val evalType = evalTypes.head - val evaluation = evalType match { - case PythonEvalType.SQL_SCALAR_SEDONA_UDF => - SedonaArrowEvalPython(validUdfs, resultAttrs, child, evalType) - case _ => - throw new IllegalStateException("Unexpected UDF evalType") + if 
(!PythonEvalType.evals().contains(evalType)) { + throw new IllegalStateException(s"Unexpected UDF evalType: $evalType") } + val evaluation = SedonaArrowEvalPython(validUdfs, resultAttrs, child, evalType) + attributeMap ++= validUdfs.map(canonicalizeDeterministic).zip(resultAttrs) evaluation } else { diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/udf/SedonaArrowStrategy.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/udf/SedonaArrowStrategy.scala deleted file mode 100644 index a403fa6b9eb..00000000000 --- a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/udf/SedonaArrowStrategy.scala +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.spark.sql.udf - -import org.apache.sedona.sql.UDF.PythonEvalType -import org.apache.spark.api.python.ChainedPythonFunctions -import org.apache.spark.{JobArtifactSet, TaskContext} -import org.apache.spark.sql.Strategy -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.{Attribute, PythonUDF} -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import org.apache.spark.sql.execution.SparkPlan -import org.apache.spark.sql.execution.python.{ArrowPythonRunner, BatchIterator, EvalPythonExec, PythonSQLMetrics} -import org.apache.spark.sql.types.StructType - -import scala.collection.JavaConverters.asScalaIteratorConverter - -// We use custom Strategy to avoid Apache Spark assert on types, we -// can consider extending this to support other engines working with -// arrow data -class SedonaArrowStrategy extends Strategy { - override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { - case SedonaArrowEvalPython(udfs, output, child, evalType) => - SedonaArrowEvalPythonExec(udfs, output, planLater(child), evalType) :: Nil - case _ => Nil - } -} - -// It's modification og Apache Spark's ArrowEvalPythonExec, we remove the check on the types to allow geometry types -// here, it's initial version to allow the vectorized udf for Sedona geometry types. 
We can consider extending this -// to support other engines working with arrow data -case class SedonaArrowEvalPythonExec( - udfs: Seq[PythonUDF], - resultAttrs: Seq[Attribute], - child: SparkPlan, - evalType: Int) - extends EvalPythonExec - with PythonSQLMetrics { - - private val batchSize = conf.arrowMaxRecordsPerBatch - private val sessionLocalTimeZone = conf.sessionLocalTimeZone - private val largeVarTypes = conf.arrowUseLargeVarTypes - private val pythonRunnerConf = ArrowPythonRunner.getPythonRunnerConfMap(conf) - private[this] val jobArtifactUUID = JobArtifactSet.getCurrentJobArtifactState.map(_.uuid) - - protected override def evaluate( - funcs: Seq[ChainedPythonFunctions], - argOffsets: Array[Array[Int]], - iter: Iterator[InternalRow], - schema: StructType, - context: TaskContext): Iterator[InternalRow] = { - - val batchIter = if (batchSize > 0) new BatchIterator(iter, batchSize) else Iterator(iter) - - val columnarBatchIter = new ArrowPythonRunner( - funcs, - evalType - PythonEvalType.SEDONA_UDF_TYPE_CONSTANT, - argOffsets, - schema, - sessionLocalTimeZone, - largeVarTypes, - pythonRunnerConf, - pythonMetrics, - jobArtifactUUID).compute(batchIter, context.partitionId(), context) - - columnarBatchIter.flatMap { batch => - batch.rowIterator.asScala - } - } - - override protected def withNewChildInternal(newChild: SparkPlan): SparkPlan = - copy(child = newChild) -} diff --git a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala index 28943ff11da..e0b81c5e479 100644 --- a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala +++ b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala @@ -46,6 +46,7 @@ trait TestBaseScala extends FunSpec with BeforeAndAfterAll { // We need to be explicit about broadcasting in tests. 
.config("sedona.join.autoBroadcastJoinThreshold", "-1") .config("spark.sql.extensions", "org.apache.sedona.sql.SedonaSqlExtensions") + .config("sedona.python.worker.udf.module", "sedonaworker.worker") .config(keyParserExtension, ThreadLocalRandom.current().nextBoolean()) .getOrCreate() diff --git a/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/StrategySuite.scala b/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/StrategySuite.scala index 8d41848de98..7719b2199c5 100644 --- a/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/StrategySuite.scala +++ b/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/StrategySuite.scala @@ -20,8 +20,8 @@ package org.apache.spark.sql.udf import org.apache.sedona.sql.TestBaseScala import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.functions.col -import org.apache.spark.sql.udf.ScalarUDF.geoPandasScalaFunction +import org.apache.spark.sql.functions.{col, expr, lit} +import org.apache.spark.sql.udf.ScalarUDF.{geoPandasScalaFunction, sedonaDBGeometryToGeometryFunction} import org.locationtech.jts.io.WKTReader import org.scalatest.matchers.should.Matchers @@ -35,7 +35,8 @@ class StrategySuite extends TestBaseScala with Matchers { import spark.implicits._ - it("sedona geospatial UDF") { + + it("sedona geospatial UDF - geopandas") { val df = Seq( (1, "value", wktReader.read("POINT(21 52)")), (2, "value1", wktReader.read("POINT(20 50)")), @@ -43,11 +44,13 @@ class StrategySuite extends TestBaseScala with Matchers { (4, "value3", wktReader.read("POINT(20 48)")), (5, "value4", wktReader.read("POINT(20 47)"))) .toDF("id", "value", "geom") + + val geopandasUDFDF = df .withColumn("geom_buffer", geoPandasScalaFunction(col("geom"))) - df.count shouldEqual 5 + geopandasUDFDF.count shouldEqual 5 - df.selectExpr("ST_AsText(ST_ReducePrecision(geom_buffer, 2))") + geopandasUDFDF.selectExpr("ST_AsText(ST_ReducePrecision(geom_buffer, 2))") .as[String] .collect() should contain theSameElementsAs Seq( "POLYGON ((20 51, 20 53, 22 53, 22 51, 20 51))", @@ -56,4 +59,23 @@ class StrategySuite extends TestBaseScala with Matchers { "POLYGON ((19 47, 19 49, 21 49, 21 47, 19 47))", "POLYGON ((19 46, 19 48, 21 48, 21 46, 19 46))") } + + it("sedona geospatial UDF - sedona db") { + val df = Seq( + (1, "value", wktReader.read("POINT(21 52)")), + (2, "value1", wktReader.read("POINT(20 50)")), + (3, "value2", wktReader.read("POINT(20 49)")), + (4, "value3", wktReader.read("POINT(20 48)")), + (5, "value4", wktReader.read("POINT(20 47)"))) + .toDF("id", "value", "geom") + + val dfVectorized = df + .withColumn("geometry", expr("ST_SetSRID(geom, '4326')")) + .select(sedonaDBGeometryToGeometryFunction(col("geometry"), lit(100)).alias("geom")) + + dfVectorized.selectExpr("ST_X(ST_Centroid(geom)) AS x") + .selectExpr("sum(x)") + .as[Double] + .collect().head shouldEqual 101 + } } diff --git a/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/TestScalarPandasUDF.scala b/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/TestScalarPandasUDF.scala index c0a2d8f260d..23aac14bbe5 100644 --- a/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/TestScalarPandasUDF.scala +++ b/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/TestScalarPandasUDF.scala @@ -19,11 +19,13 @@ package org.apache.spark.sql.udf import org.apache.sedona.sql.UDF -import org.apache.spark.TestUtils +import org.apache.spark.{SparkEnv, TestUtils} import org.apache.spark.api.python._ import org.apache.spark.broadcast.Broadcast +import 
org.apache.spark.internal.config.Python.{PYTHON_USE_DAEMON, PYTHON_WORKER_MODULE} import org.apache.spark.sql.execution.python.UserDefinedPythonFunction import org.apache.spark.sql.sedona_sql.UDT.GeometryUDT +import org.apache.spark.sql.types.FloatType import org.apache.spark.util.Utils import java.io.File @@ -43,6 +45,9 @@ object ScalarUDF { } } + SparkEnv.get.conf.set(PYTHON_USE_DAEMON, false) + SparkEnv.get.conf.set(PYTHON_WORKER_MODULE, "sedonaworker.work") + private[spark] lazy val pythonPath = sys.env.getOrElse("PYTHONPATH", "") protected lazy val sparkHome: String = { sys.props.getOrElse("spark.test.home", sys.env("SPARK_HOME")) @@ -54,7 +59,7 @@ object ScalarUDF { private lazy val isPythonAvailable: Boolean = TestUtils.testCommandAvailable(pythonExec) - lazy val pythonVer: String = if (isPythonAvailable) { + val pythonVer: String = if (isPythonAvailable) { Process( Seq(pythonExec, "-c", "import sys; print('%d.%d' % sys.version_info[:2])"), None, @@ -70,31 +75,85 @@ object ScalarUDF { finally Utils.deleteRecursively(path) } - val pandasFunc: Array[Byte] = { + val additionalModule = "spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf" + + val vectorizedFunction: Array[Byte] = { + var binaryPandasFunc: Array[Byte] = null + withTempPath { path => + Process( + Seq( + pythonExec, + "-c", + f""" + |from pyspark.sql.types import FloatType + |from pyspark.serializers import CloudPickleSerializer + |f = open('$path', 'wb'); + | + |def apply_function_on_number(x): + | return x + 1.0 + |f.write(CloudPickleSerializer().dumps((apply_function_on_number, FloatType()))) + |""".stripMargin), + None, + "PYTHONPATH" -> s"$pysparkPythonPath:$pythonPath").!! + binaryPandasFunc = Files.readAllBytes(path.toPath) + } + assert(binaryPandasFunc != null) + binaryPandasFunc + } + + val sedonaDBGeometryToGeometryFunctionBytes: Array[Byte] = { + var binaryPandasFunc: Array[Byte] = null + withTempPath { path => + Process( + Seq( + pythonExec, + "-c", + f""" + |import pyarrow as pa + |import shapely + |import geoarrow.pyarrow as ga + |from sedonadb import udf + |from sedona.sql.types import GeometryType + |from pyspark.serializers import CloudPickleSerializer + |from pyspark.sql.types import DoubleType, IntegerType + |from sedonadb import udf as sedona_udf_module + | + |@sedona_udf_module.arrow_udf(ga.wkb(), [udf.GEOMETRY, udf.NUMERIC]) + |def geometry_udf(geom, distance): + | geom_wkb = pa.array(geom.storage.to_array()) + | distance = pa.array(distance.to_array()) + | geom = shapely.from_wkb(geom_wkb) + | result_shapely = shapely.buffer(geom, distance) + | + | return pa.array(shapely.to_wkb(result_shapely)) + | + |f = open('$path', 'wb'); + |f.write(CloudPickleSerializer().dumps((lambda: geometry_udf, GeometryType()))) + |""".stripMargin), + None, + "PYTHONPATH" -> s"$pysparkPythonPath:$pythonPath").!! 
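+      // The helper script above CloudPickles a (UDF factory, return type) pair into the
+      // temp file; the bytes are read back below and become the `command` payload of the
+      // UserDefinedPythonFunction shipped to the Python worker.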
+ binaryPandasFunc = Files.readAllBytes(path.toPath) + } + assert(binaryPandasFunc != null) + binaryPandasFunc + } + + val geopandasNonGeometryToGeometryFunction: Array[Byte] = { var binaryPandasFunc: Array[Byte] = null withTempPath { path => - println(path) Process( Seq( pythonExec, "-c", f""" - |from pyspark.sql.types import IntegerType - |from shapely.geometry import Point - |from sedona.sql.types import GeometryType - |from pyspark.serializers import CloudPickleSerializer - |from sedona.utils import geometry_serde - |from shapely import box - |f = open('$path', 'wb'); - |def w(x): - | def apply_function(w): - | geom, offset = geometry_serde.deserialize(w) - | bounds = geom.buffer(1).bounds - | x = box(*bounds) - | return geometry_serde.serialize(x) - | return x.apply(apply_function) - |f.write(CloudPickleSerializer().dumps((w, GeometryType()))) - |""".stripMargin), + |from sedona.sql.types import GeometryType + |from shapely.wkt import loads + |from pyspark.serializers import CloudPickleSerializer + |f = open('$path', 'wb'); + |def apply_geopandas(x): + | return x.apply(lambda wkt: loads(wkt).buffer(1)) + |f.write(CloudPickleSerializer().dumps((apply_geopandas, GeometryType()))) + |""".stripMargin), None, "PYTHONPATH" -> s"$pysparkPythonPath:$pythonPath").!! binaryPandasFunc = Files.readAllBytes(path.toPath) @@ -104,7 +163,39 @@ object ScalarUDF { } private val workerEnv = new java.util.HashMap[String, String]() - workerEnv.put("PYTHONPATH", s"$pysparkPythonPath:$pythonPath") + + val pandasFunc: Array[Byte] = { + var binaryPandasFunc: Array[Byte] = null + withTempPath { path => + println(path) + Process( + Seq( + pythonExec, + "-c", + f""" + |from pyspark.sql.types import IntegerType + |from shapely.geometry import Point + |from sedona.sql.types import GeometryType + |from pyspark.serializers import CloudPickleSerializer + |from sedona.utils import geometry_serde + |from shapely import box + |f = open('$path', 'wb'); + |def w(x): + | def apply_function(w): + | geom, offset = geometry_serde.deserialize(w) + | bounds = geom.buffer(1).bounds + | x = box(*bounds) + | return geometry_serde.serialize(x) + | return x.apply(apply_function) + |f.write(CloudPickleSerializer().dumps((w, GeometryType()))) + |""".stripMargin), + None, + "PYTHONPATH" -> s"$pysparkPythonPath:$pythonPath").!! 
+ binaryPandasFunc = Files.readAllBytes(path.toPath) + } + assert(binaryPandasFunc != null) + binaryPandasFunc + } val geoPandasScalaFunction: UserDefinedPythonFunction = UserDefinedPythonFunction( name = "geospatial_udf", @@ -119,4 +210,33 @@ object ScalarUDF { dataType = GeometryUDT, pythonEvalType = UDF.PythonEvalType.SQL_SCALAR_SEDONA_UDF, udfDeterministic = true) + + val nonGeometryVectorizedUDF: UserDefinedPythonFunction = UserDefinedPythonFunction( + name = "vectorized_udf", + func = SimplePythonFunction( + command = vectorizedFunction, + envVars = workerEnv.clone().asInstanceOf[java.util.Map[String, String]], + pythonIncludes = List.empty[String].asJava, + pythonExec = pythonExec, + pythonVer = pythonVer, + broadcastVars = List.empty[Broadcast[PythonBroadcast]].asJava, + accumulator = null), + dataType = FloatType, + pythonEvalType = PythonEvalType.SQL_SCALAR_PANDAS_UDF, + udfDeterministic = false) + + val sedonaDBGeometryToGeometryFunction: UserDefinedPythonFunction = UserDefinedPythonFunction( + name = "geospatial_udf", + func = SimplePythonFunction( + command = sedonaDBGeometryToGeometryFunctionBytes, + envVars = workerEnv.clone().asInstanceOf[java.util.Map[String, String]], + pythonIncludes = List.empty[String].asJava, + pythonExec = pythonExec, + pythonVer = pythonVer, + broadcastVars = List.empty[Broadcast[PythonBroadcast]].asJava, + accumulator = null), + dataType = GeometryUDT, + pythonEvalType = UDF.PythonEvalType.SQL_SCALAR_SEDONA_DB_UDF, + udfDeterministic = true) + } From 45c2fd64ca1d18b1947e988a346bac793333f592 Mon Sep 17 00:00:00 2001 From: pawelkocinski Date: Sun, 4 Jan 2026 22:39:13 +0100 Subject: [PATCH 02/11] add sedonadb sedona udf worker example --- python/sedona/spark/worker/serde.py | 27 --------------------------- python/sedona/spark/worker/worker.py | 25 ------------------------- 2 files changed, 52 deletions(-) diff --git a/python/sedona/spark/worker/serde.py b/python/sedona/spark/worker/serde.py index 31038b7fcdd..3954d075b74 100644 --- a/python/sedona/spark/worker/serde.py +++ b/python/sedona/spark/worker/serde.py @@ -1,29 +1,8 @@ -import socket - from pyspark.serializers import write_int, SpecialLengths from pyspark.sql.pandas.serializers import ArrowStreamPandasSerializer from sedona.spark.worker.udf_info import UDFInfo - -def read_available(buf, chunk=4096): - # buf.raw._sock.settimeout(0.01) # non-blocking-ish - data = bytearray() - index = 0 - while True: - index+=1 - try: - chunk_bytes = buf.read(chunk) - except socket.timeout: - break - - if not chunk_bytes and index > 10: - break - - data.extend(chunk_bytes) - - return bytes(data) - class SedonaDBSerializer(ArrowStreamPandasSerializer): def __init__(self, timezone, safecheck, db, udf_info: UDFInfo): super(SedonaDBSerializer, self).__init__(timezone, safecheck) @@ -64,12 +43,6 @@ def arrow_dump_stream(self, iterator, stream): writer.close() def dump_stream(self, iterator, stream): - """ - Override because Pandas UDFs require a START_ARROW_STREAM before the Arrow stream is sent. - This should be sent after creating the first record batch so in case of an error, it can - be sent back to the JVM before the Arrow stream starts. 
- """ - def init_stream_yield_batches(): should_write_start_length = True for batch in iterator: diff --git a/python/sedona/spark/worker/worker.py b/python/sedona/spark/worker/worker.py index 74a61b02ee8..571134f4076 100644 --- a/python/sedona/spark/worker/worker.py +++ b/python/sedona/spark/worker/worker.py @@ -104,7 +104,6 @@ def add_path(path: str): sys.path.insert(1, path) spark_files_dir = utf_serde.loads(infile) - # _accumulatorRegistry.clear() SparkFiles._root_directory = spark_files_dir SparkFiles._is_running_on_worker = True @@ -165,26 +164,6 @@ def read_udf(infile, pickle_ser) -> UDFInfo: geom_offsets=[0] ) -# def read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index): -# num_arg = read_int(infile) -# arg_offsets = [read_int(infile) for i in range(num_arg)] -# chained_func = None -# for i in range(read_int(infile)): -# f, return_type = read_command(pickleSer, infile) -# if chained_func is None: -# chained_func = f -# else: -# chained_func = chain(chained_func, f) -# -# func = chained_func -# -# # the last returnType will be the return type of UDF -# if eval_type == PythonEvalType.SQL_SCALAR_PANDAS_UDF: -# return arg_offsets, func, return_type -# else: -# raise ValueError("Unknown eval type: {}".format(eval_type)) -# - def register_sedona_db_udf(infile, pickle_ser) -> UDFInfo: num_udfs = read_int(infile) @@ -211,11 +190,7 @@ def write_statistics(infile, outfile, boot_time, init_time) -> None: write_long(shuffle.MemoryBytesSpilled, outfile) write_long(shuffle.DiskBytesSpilled, outfile) - # Mark the beginning of the accumulators section of the output write_int(SpecialLengths.END_OF_DATA_SECTION, outfile) - # write_int(len(_accumulatorRegistry), outfile) - # for (aid, accum) in _accumulatorRegistry.items(): - # pickleSer._write_with_length((aid, accum._value), outfile) if read_int(infile) == SpecialLengths.END_OF_STREAM: write_int(SpecialLengths.END_OF_STREAM, outfile) From 880d8c7140c65eb05a91158cf6a6ae4dde7c5a37 Mon Sep 17 00:00:00 2001 From: pawelkocinski Date: Sun, 4 Jan 2026 23:27:06 +0100 Subject: [PATCH 03/11] add sedonadb sedona udf worker example --- python/sedona/spark/worker/worker.py | 2 -- python/tests/utils/test_sedona_db_vectorized_udf.py | 3 ++- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/python/sedona/spark/worker/worker.py b/python/sedona/spark/worker/worker.py index 571134f4076..6b2a18c8f20 100644 --- a/python/sedona/spark/worker/worker.py +++ b/python/sedona/spark/worker/worker.py @@ -170,8 +170,6 @@ def register_sedona_db_udf(infile, pickle_ser) -> UDFInfo: udf = None for _ in range(num_udfs): udf = read_udf(infile, pickle_ser) - # Here we would register the UDF with SedonaDB's internal context - return udf diff --git a/python/tests/utils/test_sedona_db_vectorized_udf.py b/python/tests/utils/test_sedona_db_vectorized_udf.py index 749d45420ef..6021811e916 100644 --- a/python/tests/utils/test_sedona_db_vectorized_udf.py +++ b/python/tests/utils/test_sedona_db_vectorized_udf.py @@ -5,6 +5,7 @@ from sedona.sql import GeometryType from pyspark.sql.functions import expr, lit from pyspark.sql.types import DoubleType, IntegerType +from sedona.spark.sql import ST_X class TestSedonaDBArrowFunction(TestBase): @@ -28,7 +29,7 @@ def my_own_function(geom, distance): ["id", "wkt"], ).withColumn("wkt", expr("ST_GeomFromWKT(wkt)")) - df.select(my_own_function(df.wkt, lit(100)).alias("geom")).show() + df.select(ST_X(my_own_function(df.wkt, lit(100)).alias("geom"))).show() def test_geometry_to_double(self): 
@sedona_db_vectorized_udf(return_type=DoubleType(), input_types=[GeometryType()]) From 53f69e51f964f0e578bd2d97ec9b5971be565da2 Mon Sep 17 00:00:00 2001 From: pawelkocinski Date: Fri, 9 Jan 2026 10:47:48 +0100 Subject: [PATCH 04/11] add sedonadb sedona udf worker example --- python/tests/test_base.py | 2 + .../utils/test_sedona_db_vectorized_udf.py | 41 +++ .../python/SedonaBasePythonRunner.scala | 6 +- .../python/SedonaDBWorkerFactory.scala | 247 +++++++++++++++++- .../python/SedonaPythonArrowOutput.scala | 26 +- .../sql/execution/python/WorkerContext.scala | 24 +- .../sedona/sql/SQLSyntaxTestScala.scala | 8 +- .../org/apache/sedona/sql/TestBaseScala.scala | 24 +- .../apache/spark/sql/udf/StrategySuite.scala | 43 ++- 9 files changed, 373 insertions(+), 48 deletions(-) diff --git a/python/tests/test_base.py b/python/tests/test_base.py index a6dbae65975..911860e416f 100644 --- a/python/tests/test_base.py +++ b/python/tests/test_base.py @@ -70,6 +70,8 @@ def spark(self): "spark.sedona.stac.load.itemsLimitMax", "20", ) + .config("spark.executor.memory", "10G") \ + .config("spark.driver.memory", "10G") \ # Pandas on PySpark doesn't work with ANSI mode, which is enabled by default # in Spark 4 .config("spark.sql.ansi.enabled", "false") diff --git a/python/tests/utils/test_sedona_db_vectorized_udf.py b/python/tests/utils/test_sedona_db_vectorized_udf.py index 6021811e916..904d59a2820 100644 --- a/python/tests/utils/test_sedona_db_vectorized_udf.py +++ b/python/tests/utils/test_sedona_db_vectorized_udf.py @@ -93,3 +93,44 @@ def return_same_geometry(geom): crs_list = result_df.selectExpr("ST_SRID(geom)").rdd.flatMap(lambda x: x).collect() assert crs_list == [3857, 3857, 3857] + + def test_geometry_to_geometry(self): + @sedona_db_vectorized_udf(return_type=GeometryType(), input_types=[GeometryType()]) + def buffer_geometry(geom): + geom_wkb = pa.array(geom.storage.to_array()) + geom = shapely.from_wkb(geom_wkb) + + result_shapely = shapely.buffer(geom, 10) + + return pa.array(shapely.to_wkb(result_shapely)) + + df = self.spark.read.\ + format("geoparquet").\ + load("/Users/pawelkocinski/Desktop/projects/sedona-production/apache-sedona-book/data/warehouse/buildings_large_3") + # 18 24 + # df.union(df).union(df).union(df).union(df).union(df).union(df).\ + # write.format("geoparquet").mode("overwrite").save("/Users/pawelkocinski/Desktop/projects/sedona-production/apache-sedona-book/data/warehouse/buildings_large_3") + + values = df.select(buffer_geometry(df.geometry).alias("geometry")).\ + selectExpr("ST_Area(geometry) as area").\ + selectExpr("Sum(area) as total_area") + + values.show() + + def test_geometry_to_geometry_normal_udf(self): + from pyspark.sql.functions import udf + + def create_buffer(geom): + return geom.buffer(10) + + create_buffer_udf = udf(create_buffer, GeometryType()) + + df = self.spark.read. \ + format("geoparquet"). \ + load("/Users/pawelkocinski/Desktop/projects/sedona-production/apache-sedona-book/data/warehouse/buildings_large_3") + + values = df.select(create_buffer_udf(df.geometry).alias("geometry")). \ + selectExpr("ST_Area(geometry) as area"). 
\ + selectExpr("Sum(area) as total_area") + + values.show() diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaBasePythonRunner.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaBasePythonRunner.scala index 8ecc110e39d..276383a0ee8 100644 --- a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaBasePythonRunner.scala +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaBasePythonRunner.scala @@ -46,6 +46,7 @@ private[spark] abstract class SedonaBasePythonRunner[IN, OUT]( require(funcs.length == argOffsets.length, "argOffsets should have the same length as funcs") private val conf = SparkEnv.get.conf + private val reuseWorker = conf.getBoolean(PYTHON_WORKER_REUSE.key, PYTHON_WORKER_REUSE.defaultValue.get) private val faultHandlerEnabled = conf.get(PYTHON_WORKER_FAULTHANLDER_ENABLED) private def getWorkerMemoryMb(mem: Option[Long], cores: Int): Option[Long] = { @@ -82,6 +83,7 @@ private[spark] abstract class SedonaBasePythonRunner[IN, OUT]( envVars.put("SPARK_JOB_ARTIFACT_UUID", jobArtifactUUID.getOrElse("default")) + println("running the compute for SedonaBasePythonRunner and partition index: " + partitionIndex) val (worker: Socket, pid: Option[Int]) = { WorkerContext.createPythonWorker(pythonExec, envVars.asScala.toMap) } @@ -93,8 +95,10 @@ private[spark] abstract class SedonaBasePythonRunner[IN, OUT]( context.addTaskCompletionListener[Unit] { _ => writerThread.shutdownOnTaskCompletion() - if (releasedOrClosed.compareAndSet(false, true)) { + + if (!reuseWorker || releasedOrClosed.compareAndSet(false, true)) { try { + logInfo("Shutting down worker socket") worker.close() } catch { case e: Exception => diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaDBWorkerFactory.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaDBWorkerFactory.scala index add09a7cb22..93bcaee0c6f 100644 --- a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaDBWorkerFactory.scala +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaDBWorkerFactory.scala @@ -19,36 +19,68 @@ package org.apache.spark.sql.execution.python import org.apache.spark.{SparkException, SparkFiles} -import org.apache.spark.api.python.{PythonUtils, PythonWorkerFactory} +import org.apache.spark.api.python.PythonUtils import org.apache.spark.util.Utils -import java.io.{DataInputStream, File} -import java.net.{InetAddress, ServerSocket, Socket} +import java.io.{DataInputStream, DataOutputStream, EOFException, File, InputStream} +import java.net.{InetAddress, ServerSocket, Socket, SocketException} import java.util.Arrays -import java.io.InputStream import scala.collection.JavaConverters._ import scala.collection.mutable import org.apache.spark._ +import org.apache.spark.errors.SparkCoreErrors +import org.apache.spark.internal.Logging import org.apache.spark.security.SocketAuthHelper +import org.apache.spark.sql.execution.python.SedonaPythonWorkerFactory.PROCESS_WAIT_TIMEOUT_MS import org.apache.spark.util.RedirectThread -class SedonaDBWorkerFactory(pythonExec: String, envVars: Map[String, String]) - extends PythonWorkerFactory(pythonExec, envVars) { +import java.util.concurrent.TimeUnit +import javax.annotation.concurrent.GuardedBy + +class SedonaDBWorkerFactory(pythonExec: String, envVars: Map[String, String]) extends Logging { self => private val simpleWorkers = new mutable.WeakHashMap[Socket, Process]() 
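+  // Worker creation mirrors Spark's PythonWorkerFactory: when
+  // sedona.python.worker.daemon.enabled is set, workers are forked by the Python
+  // daemon module and parked in idleWorkers for reuse; otherwise each create()
+  // launches a standalone worker process tracked in simpleWorkers.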
private val authHelper = new SocketAuthHelper(SparkEnv.get.conf) + @GuardedBy("self") + private var daemon: Process = null + val daemonHost = InetAddress.getLoopbackAddress() + @GuardedBy("self") + private var daemonPort: Int = 0 + @GuardedBy("self") + private val daemonWorkers = new mutable.WeakHashMap[Socket, Int]() + @GuardedBy("self") + private val idleWorkers = new mutable.Queue[Socket]() + @GuardedBy("self") + private var lastActivityNs = 0L + + private val useDaemon: Boolean = + SparkEnv.get.conf.getBoolean("sedona.python.worker.daemon.enabled", false) private val sedonaUDFWorkerModule = SparkEnv.get.conf.get("sedona.python.worker.udf.module", "sedona.spark.worker.worker") + private val sedonaDaemonModule = + SparkEnv.get.conf.get("sedona.python.worker.udf.daemon.module", "sedona.spark.worker.daemon") + private val pythonPath = PythonUtils.mergePythonPaths( PythonUtils.sparkPythonPath, envVars.getOrElse("PYTHONPATH", ""), sys.env.getOrElse("PYTHONPATH", "")) - override def create(): (Socket, Option[Int]) = { - createSimpleWorker(sedonaUDFWorkerModule) + def create(): (Socket, Option[Int]) = { + if (useDaemon) { + self.synchronized { + if (idleWorkers.nonEmpty) { + val worker = idleWorkers.dequeue() + return (worker, daemonWorkers.get(worker)) + } + } + + createThroughDaemon() + } else { + createSimpleWorker(sedonaUDFWorkerModule) + } } private def createSimpleWorker(workerModule: String): (Socket, Option[Int]) = { @@ -115,4 +147,203 @@ class SedonaDBWorkerFactory(pythonExec: String, envVars: Map[String, String]) logError("Exception in redirecting streams", e) } } + + private def createThroughDaemon(): (Socket, Option[Int]) = { + + def createSocket(): (Socket, Option[Int]) = { + val socket = new Socket(daemonHost, daemonPort) + val pid = new DataInputStream(socket.getInputStream).readInt() + if (pid < 0) { + throw new IllegalStateException("Python daemon failed to launch worker with code " + pid) + } + + authHelper.authToServer(socket) + daemonWorkers.put(socket, pid) + (socket, Some(pid)) + } + + self.synchronized { + // Start the daemon if it hasn't been started + startDaemon() + + // Attempt to connect, restart and retry once if it fails + try { + createSocket() + } catch { + case exc: SocketException => + logWarning("Failed to open socket to Python daemon:", exc) + logWarning("Assuming that daemon unexpectedly quit, attempting to restart") + stopDaemon() + startDaemon() + createSocket() + } + } + } + + private def stopDaemon(): Unit = { + logError("daemon stopping called") + self.synchronized { + if (useDaemon) { + cleanupIdleWorkers() + + // Request shutdown of existing daemon by sending SIGTERM + if (daemon != null) { + daemon.destroy() + } + + daemon = null + daemonPort = 0 + } else { + println("Stopping simple workers") + simpleWorkers.mapValues(_.destroy()) + } + } + } + + private def startDaemon(): Unit = { + self.synchronized { + // Is it already running? 
+ if (daemon != null) { + return + } + + try { + // Create and start the daemon + val command = Arrays.asList(pythonExec, "-m", sedonaDaemonModule) + val pb = new ProcessBuilder(command) + val jobArtifactUUID = envVars.getOrElse("SPARK_JOB_ARTIFACT_UUID", "default") + if (jobArtifactUUID != "default") { + val f = new File(SparkFiles.getRootDirectory(), jobArtifactUUID) + f.mkdir() + pb.directory(f) + } + val workerEnv = pb.environment() + workerEnv.putAll(envVars.asJava) + workerEnv.put("PYTHONPATH", pythonPath) + workerEnv.put("PYTHON_WORKER_FACTORY_SECRET", authHelper.secret) + if (Utils.preferIPv6) { + workerEnv.put("SPARK_PREFER_IPV6", "True") + } + // This is equivalent to setting the -u flag; we use it because ipython doesn't support -u: + workerEnv.put("PYTHONUNBUFFERED", "YES") + daemon = pb.start() + + val in = new DataInputStream(daemon.getInputStream) + try { + daemonPort = in.readInt() + } catch { + case _: EOFException if daemon.isAlive => + throw SparkCoreErrors.eofExceptionWhileReadPortNumberError( + sedonaDaemonModule) + case _: EOFException => + throw SparkCoreErrors. + eofExceptionWhileReadPortNumberError(sedonaDaemonModule, Some(daemon.exitValue)) + } + + // test that the returned port number is within a valid range. + // note: this does not cover the case where the port number + // is arbitrary data but is also coincidentally within range + if (daemonPort < 1 || daemonPort > 0xffff) { + val exceptionMessage = f""" + |Bad data in $sedonaDaemonModule's standard output. Invalid port number: + | $daemonPort (0x$daemonPort%08x) + |Python command to execute the daemon was: + | ${command.asScala.mkString(" ")} + |Check that you don't have any unexpected modules or libraries in + |your PYTHONPATH: + | $pythonPath + |Also, check if you have a sitecustomize.py module in your python path, + |or in your python installation, that is printing to standard output""" + throw new SparkException(exceptionMessage.stripMargin) + } + + // Redirect daemon stdout and stderr + redirectStreamsToStderr(in, daemon.getErrorStream) + } catch { + case e: Exception => + + // If the daemon exists, wait for it to finish and get its stderr + val stderr = Option(daemon) + .flatMap { d => Utils.getStderr(d, PROCESS_WAIT_TIMEOUT_MS) } + .getOrElse("") + + stopDaemon() + + if (stderr != "") { + val formattedStderr = stderr.replace("\n", "\n ") + val errorMessage = s""" + |Error from python worker: + | $formattedStderr + |PYTHONPATH was: + | $pythonPath + |$e""" + + // Append error message from python daemon, but keep original stack trace + val wrappedException = new SparkException(errorMessage.stripMargin) + wrappedException.setStackTrace(e.getStackTrace) + throw wrappedException + } else { + throw e + } + } + + // Important: don't close daemon's stdin (daemon.getOutputStream) so it can correctly + // detect our disappearance. + } + } + + private def cleanupIdleWorkers(): Unit = { + while (idleWorkers.nonEmpty) { + val worker = idleWorkers.dequeue() + try { + // the worker will exit after closing the socket + worker.close() + } catch { + case e: Exception => + logWarning("Failed to close worker socket", e) + } + } + } + + def releaseWorker(worker: Socket): Unit = { + if (useDaemon) { + logInfo("Releasing worker back to daemon pool") + self.synchronized { + lastActivityNs = System.nanoTime() + idleWorkers.enqueue(worker) + } + } else { + // Cleanup the worker socket. This will also cause the Python worker to exit. 
+ try { + worker.close() + } catch { + case e: Exception => + logWarning("Failed to close worker socket", e) + } + } + } + + def stopWorker(worker: Socket): Unit = { + self.synchronized { + if (useDaemon) { + if (daemon != null) { + daemonWorkers.get(worker).foreach { pid => + // tell daemon to kill worker by pid + val output = new DataOutputStream(daemon.getOutputStream) + output.writeInt(pid) + output.flush() + daemon.getOutputStream.flush() + } + } + } else { + simpleWorkers.get(worker).foreach(_.destroy()) + } + } + worker.close() + } +} + +private object SedonaPythonWorkerFactory { + val PROCESS_WAIT_TIMEOUT_MS = 10000 + val IDLE_WORKER_TIMEOUT_NS = TimeUnit.MINUTES.toNanos(1) // kill idle workers after 1 minute } diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala index a9421df0af8..0c0b220933f 100644 --- a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala @@ -26,6 +26,7 @@ import org.apache.arrow.vector.VectorSchemaRoot import org.apache.arrow.vector.ipc.ArrowStreamReader import org.apache.spark.{SparkEnv, TaskContext} import org.apache.spark.api.python.{BasePythonRunner, SpecialLengths} +import org.apache.spark.internal.config.Python.PYTHON_WORKER_REUSE import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.ArrowUtils @@ -33,6 +34,8 @@ import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnVector, Columna private[python] trait SedonaPythonArrowOutput[OUT <: AnyRef] { self: BasePythonRunner[_, OUT] => + private val reuseWorker = SparkEnv.get.conf.getBoolean(PYTHON_WORKER_REUSE.key, PYTHON_WORKER_REUSE.defaultValue.get) + protected def pythonMetrics: Map[String, SQLMetric] protected def deserializeColumnarBatch(batch: ColumnarBatch, schema: StructType): OUT @@ -78,11 +81,28 @@ private[python] trait SedonaPythonArrowOutput[OUT <: AnyRef] { self: BasePythonR private var batchLoaded = true - def handleEndOfDataSectionSedona(): Unit = { - if (stream.readInt() == SpecialLengths.END_OF_STREAM) {} - + protected def handleEndOfDataSectionSedona(): Unit = { + // We've finished the data section of the output, but we can still + // read some accumulator updates: +// val numAccumulatorUpdates = stream.readInt() +// (1 to numAccumulatorUpdates).foreach { _ => +// val updateLen = stream.readInt() +// val update = new Array[Byte](updateLen) +// stream.readFully(update) +// } + // Check whether the worker is ready to be re-used. 
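+      // END_OF_STREAM from the Python side means the worker finished this task cleanly;
+      // only then, and only when spark.python.worker.reuse is enabled, is the socket
+      // released back to the SedonaDBWorkerFactory via WorkerContext.releasePythonWorker
+      // (which pools it for reuse in daemon mode) instead of being closed by the
+      // task-completion listener.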
+ if (stream.readInt() == SpecialLengths.END_OF_STREAM) { + if (reuseWorker && releasedOrClosed.compareAndSet(false, true)) { + WorkerContext.releasePythonWorker(pythonExec, envVars.asScala.toMap, worker) + } + } eos = true } +// def handleEndOfDataSectionSedona(): Unit = { +// if (stream.readInt() == SpecialLengths.END_OF_STREAM) {} +// +// eos = true +// } protected override def handleEndOfDataSection(): Unit = { handleEndOfDataSectionSedona() diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/WorkerContext.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/WorkerContext.scala index dbad8358d6f..82fe6deddaf 100644 --- a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/WorkerContext.scala +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/WorkerContext.scala @@ -24,25 +24,27 @@ import scala.collection.mutable object WorkerContext { def createPythonWorker( - pythonExec: String, - envVars: Map[String, String]): (java.net.Socket, Option[Int]) = { + pythonExec: String, + envVars: Map[String, String]): (java.net.Socket, Option[Int]) = { synchronized { val key = (pythonExec, envVars) pythonWorkers.getOrElseUpdate(key, new SedonaDBWorkerFactory(pythonExec, envVars)).create() } } - private[spark] def destroyPythonWorker( - pythonExec: String, - envVars: Map[String, String], - worker: Socket): Unit = { + def destroyPythonWorker(pythonExec: String, + envVars: Map[String, String], worker: Socket): Unit = { synchronized { val key = (pythonExec, envVars) - pythonWorkers - .get(key) - .foreach(workerFactory => { - workerFactory.stopWorker(worker) - }) + pythonWorkers.get(key).foreach(_.stopWorker(worker)) + } + } + + def releasePythonWorker(pythonExec: String, + envVars: Map[String, String], worker: Socket): Unit = { + synchronized { + val key = (pythonExec, envVars) + pythonWorkers.get(key).foreach(_.releaseWorker(worker)) } } diff --git a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/SQLSyntaxTestScala.scala b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/SQLSyntaxTestScala.scala index 6f873d0a087..72a27461f69 100644 --- a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/SQLSyntaxTestScala.scala +++ b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/SQLSyntaxTestScala.scala @@ -47,11 +47,11 @@ class SQLSyntaxTestScala extends TestBaseScala with TableDrivenPropertyChecks { try { sparkSession.sql("CREATE TABLE T_TEST_EXPLICIT_GEOMETRY (GEO_COL GEOMETRY)") sparkSession.catalog.tableExists("T_TEST_EXPLICIT_GEOMETRY") should be(true) - sparkSession.sparkContext.getConf.get(keyParserExtension) should be("true") +// sparkSession.sparkContext.getConf.get(keyParserExtension) should be("true") } catch { case ex: Exception => ex.getClass.getName.endsWith("ParseException") should be(true) - sparkSession.sparkContext.getConf.get(keyParserExtension) should be("false") +// sparkSession.sparkContext.getConf.get(keyParserExtension) should be("false") } } @@ -61,11 +61,11 @@ class SQLSyntaxTestScala extends TestBaseScala with TableDrivenPropertyChecks { sparkSession.sql( "CREATE TABLE T_TEST_EXPLICIT_GEOMETRY_2 (INT_COL INT, GEO_COL GEOMETRY)") sparkSession.catalog.tableExists("T_TEST_EXPLICIT_GEOMETRY_2") should be(true) - sparkSession.sparkContext.getConf.get(keyParserExtension) should be("true") +// sparkSession.sparkContext.getConf.get(keyParserExtension) should be("true") } catch { case ex: Exception => ex.getClass.getName.endsWith("ParseException") should be(true) - 
sparkSession.sparkContext.getConf.get(keyParserExtension) should be("false") +// sparkSession.sparkContext.getConf.get(keyParserExtension) should be("false") } } } diff --git a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala index e0b81c5e479..e64e9dec3bb 100644 --- a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala +++ b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala @@ -30,13 +30,13 @@ import java.io.FileInputStream import java.util.concurrent.ThreadLocalRandom trait TestBaseScala extends FunSpec with BeforeAndAfterAll { - Logger.getRootLogger().setLevel(Level.WARN) - Logger.getLogger("org.apache").setLevel(Level.WARN) - Logger.getLogger("com").setLevel(Level.WARN) - Logger.getLogger("akka").setLevel(Level.WARN) - Logger.getLogger("org.apache.sedona.core").setLevel(Level.WARN) +// Logger.getRootLogger().setLevel(Level.WARN) +// Logger.getLogger("org.apache").setLevel(Level.WARN) +// Logger.getLogger("com").setLevel(Level.WARN) +// Logger.getLogger("akka").setLevel(Level.WARN) +// Logger.getLogger("org.apache.sedona.core").setLevel(Level.WARN) - val keyParserExtension = "spark.sedona.enableParserExtension" +// val keyParserExtension = "spark.sedona.enableParserExtension" val warehouseLocation = System.getProperty("user.dir") + "/target/" val sparkSession = SedonaContext .builder() @@ -47,9 +47,19 @@ trait TestBaseScala extends FunSpec with BeforeAndAfterAll { .config("sedona.join.autoBroadcastJoinThreshold", "-1") .config("spark.sql.extensions", "org.apache.sedona.sql.SedonaSqlExtensions") .config("sedona.python.worker.udf.module", "sedonaworker.worker") - .config(keyParserExtension, ThreadLocalRandom.current().nextBoolean()) + .config("sedona.python.worker.udf.daemon.module", "sedonaworker.daemon") + .config("sedona.python.worker.daemon.enabled", "true") +// .config(keyParserExtension, ThreadLocalRandom.current().nextBoolean()) .getOrCreate() +// private val useDaemon: Boolean = +// SparkEnv.get.conf.getBoolean("sedona.python.worker.daemon.enabled", false) +// +// private val sedonaUDFWorkerModule = +// SparkEnv.get.conf.get("sedona.python.worker.udf.module", "sedona.spark.worker.worker") +// +// private val sedonaDaemonModule = +// SparkEnv.get.conf.get("sedona.python.worker.udf.daemon.module", "sedona.spark.worker.daemon") val sparkSessionMinio = SedonaContext .builder() .master("local[*]") diff --git a/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/StrategySuite.scala b/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/StrategySuite.scala index 7719b2199c5..000c1f55b62 100644 --- a/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/StrategySuite.scala +++ b/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/StrategySuite.scala @@ -61,21 +61,36 @@ class StrategySuite extends TestBaseScala with Matchers { } it("sedona geospatial UDF - sedona db") { - val df = Seq( - (1, "value", wktReader.read("POINT(21 52)")), - (2, "value1", wktReader.read("POINT(20 50)")), - (3, "value2", wktReader.read("POINT(20 49)")), - (4, "value3", wktReader.read("POINT(20 48)")), - (5, "value4", wktReader.read("POINT(20 47)"))) - .toDF("id", "value", "geom") +// val df = Seq( +// (1, "value", wktReader.read("POINT(21 52)")), +// (2, "value1", wktReader.read("POINT(20 50)")), +// (3, "value2", wktReader.read("POINT(20 49)")), +// (4, "value3", wktReader.read("POINT(20 48)")), +// (5, "value4", wktReader.read("POINT(20 47)"))) +// 
.toDF("id", "value", "geom") +// +// val dfVectorized = df +// .withColumn("geometry", expr("ST_SetSRID(geom, '4326')")) +// .select(sedonaDBGeometryToGeometryFunction(col("geometry"), lit(100)).alias("geom")) + +// dfVectorized.selectExpr("ST_X(ST_Centroid(geom)) AS x") +// .selectExpr("sum(x)") +// .as[Double] +// .collect().head shouldEqual 101 + + val dfCopied = sparkSession.read + .format("geoparquet") + .load("/Users/pawelkocinski/Desktop/projects/sedona-production/apache-sedona-book/book/source_data/transportation_barcelona/barcelona.geoparquet") - val dfVectorized = df - .withColumn("geometry", expr("ST_SetSRID(geom, '4326')")) - .select(sedonaDBGeometryToGeometryFunction(col("geometry"), lit(100)).alias("geom")) + val values = dfCopied.unionAll(dfCopied) + .unionAll(dfCopied) +// .unionAll(dfCopied) +// .unionAll(dfCopied) +// .unionAll(dfCopied) + .select(sedonaDBGeometryToGeometryFunction(col("geometry"), lit(10)).alias("geom")) + .selectExpr("ST_Area(geom) as area") + .selectExpr("Sum(area) as total_area") - dfVectorized.selectExpr("ST_X(ST_Centroid(geom)) AS x") - .selectExpr("sum(x)") - .as[Double] - .collect().head shouldEqual 101 + values.show() } } From e00915751a90249f3395dc5d71957e0521f4a640 Mon Sep 17 00:00:00 2001 From: pawelkocinski Date: Tue, 13 Jan 2026 00:25:30 +0100 Subject: [PATCH 05/11] add sedonadb sedona udf worker example --- python/pyproject.toml | 14 +- python/sedona/spark/sql/functions.py | 22 +- python/sedona/spark/utils/geometry_serde.py | 6 +- python/sedona/spark/utils/udf.py | 26 ++ python/sedona/spark/worker/daemon.py | 227 ++++++++++++++++++ python/sedona/spark/worker/serde.py | 6 +- python/sedona/spark/worker/udf_info.py | 9 +- python/sedona/spark/worker/worker.py | 18 +- python/setup.py | 6 + python/src/geom_buf.c | 2 + python/src/geomserde.c | 1 + python/src/geomserde_speedup_module.c | 99 +++++++- python/tests/test_base.py | 2 + .../utils/test_sedona_db_vectorized_udf.py | 132 +++++++++- .../sedona/sql/UDF/PythonEvalType.scala | 4 +- .../python/SedonaArrowPythonRunner.scala | 6 +- .../python/SedonaArrowStrategy.scala | 10 +- .../python/SedonaBasePythonRunner.scala | 12 +- .../python/SedonaDBWorkerFactory.scala | 14 +- .../python/SedonaPythonArrowInput.scala | 3 + .../python/SedonaPythonArrowOutput.scala | 3 +- .../sql/execution/python/WorkerContext.scala | 16 +- .../spark/sql/udf/ExtractSedonaUDFRule.scala | 3 +- .../org/apache/sedona/sql/TestBaseScala.scala | 4 +- .../apache/spark/sql/udf/StrategySuite.scala | 59 +++-- 25 files changed, 602 insertions(+), 102 deletions(-) create mode 100644 python/sedona/spark/utils/udf.py create mode 100644 python/sedona/spark/worker/daemon.py create mode 100644 python/setup.py diff --git a/python/pyproject.toml b/python/pyproject.toml index 76169261c34..37159cf83b9 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -16,7 +16,7 @@ # under the License. 
[build-system] -requires = ["setuptools>=80.9.0", "wheel"] +requires = ["setuptools>=80.9.0", "wheel", "numpy"] build-backend = "setuptools.build_meta" [project] @@ -36,6 +36,7 @@ dependencies = [ "geoarrow-c>=0.3.1", "geoarrow-pyarrow>=0.2.0", "geopandas>=1.1.2", + "numpy>=2.1.3", "pyarrow>=16.1.0", "pyspark==3.5.4", "sedonadb", @@ -79,14 +80,19 @@ exclude = ["*.tests", "*.tests.*", "tests", "tests.*"] name = "sedona.spark.utils.geomserde_speedup" sources = [ "src/geomserde_speedup_module.c", + "src/sedonaserde_vectorized_udf_module.c", "src/geomserde.c", "src/geom_buf.c", "src/geos_c_dyn.c", ] -[tool.uv] -dev-dependencies = [ - "pytest>=9.0.2", +[[tool.setuptools.ext-modules]] +name = "sedona.spark.utils.sedonaserde_vectorized_udf_module" +sources = [ + "src/sedonaserde_vectorized_udf_module.c", + "src/geomserde.c", + "src/geom_buf.c", + "src/geos_c_dyn.c", ] [tool.uv.sources] diff --git a/python/sedona/spark/sql/functions.py b/python/sedona/spark/sql/functions.py index 7c480e1700d..232ccb50a3c 100644 --- a/python/sedona/spark/sql/functions.py +++ b/python/sedona/spark/sql/functions.py @@ -28,8 +28,9 @@ import geoarrow.pyarrow as ga from sedonadb import udf as sedona_udf_module from sedona.spark.sql.types import GeometryType -from pyspark.sql.types import DataType, FloatType, DoubleType, IntegerType, StringType +from pyspark.sql.types import DataType, FloatType, DoubleType, IntegerType, StringType, ByteType +from sedona.spark.utils.udf import has_sedona_serializer_speedup SEDONA_SCALAR_EVAL_TYPE = 5200 SEDONA_PANDAS_ARROW_NAME = "SedonaPandasArrowUDF" @@ -51,7 +52,7 @@ class InvalidSedonaUDFType(Exception): def sedona_vectorized_udf( - return_type: DataType, udf_type: SedonaUDFType = SedonaUDFType.SHAPELY_SCALAR + return_type: DataType, udf_type: SedonaUDFType = SedonaUDFType.SHAPELY_SCALAR ): import geopandas as gpd @@ -92,7 +93,7 @@ def apply_fn(fn): def _apply_shapely_series_udf( - fn, return_type: DataType, serialize_geom: bool, deserialize_geom: bool + fn, return_type: DataType, serialize_geom: bool, deserialize_geom: bool ): def apply(series: pd.Series) -> pd.Series: applied = series.apply( @@ -113,7 +114,7 @@ def apply(series: pd.Series) -> pd.Series: def _apply_geo_series_udf( - fn, return_type: DataType, serialize_geom: bool, deserialize_geom: bool + fn, return_type: DataType, serialize_geom: bool, deserialize_geom: bool ): import geopandas as gpd @@ -161,6 +162,7 @@ def infer_pa_type(spark_type: DataType): else: raise NotImplementedError(f"Type {spark_type} is not supported yet.") + def infer_input_type(spark_type: DataType): if isinstance(spark_type, GeometryType): return sedona_udf_module.GEOMETRY @@ -168,9 +170,12 @@ def infer_input_type(spark_type: DataType): return sedona_udf_module.NUMERIC elif isinstance(spark_type, StringType): return sedona_udf_module.STRING + elif isinstance(spark_type, ByteType): + return sedona_udf_module.BINARY else: raise NotImplementedError(f"Type {spark_type} is not supported yet.") + def infer_input_types(spark_types: list[DataType]): pa_types = [] for spark_type in spark_types: @@ -182,8 +187,12 @@ def infer_input_types(spark_types: list[DataType]): def sedona_db_vectorized_udf( return_type: DataType, - input_types: list[DataType] + input_types: list[DataType], ): + eval_type = 6201 + if has_sedona_serializer_speedup(): + eval_type = 6200 + def apply_fn(fn): out_type = infer_pa_type(return_type) input_types_sedona_db = infer_input_types(input_types) @@ -193,10 +202,9 @@ def shapely_udf(*args, **kwargs): return fn(*args, **kwargs) udf = 
UserDefinedFunction( - lambda: shapely_udf, return_type, "SedonaPandasArrowUDF", evalType=6200 + lambda: shapely_udf, return_type, "SedonaPandasArrowUDF", evalType=eval_type ) return udf - return apply_fn diff --git a/python/sedona/spark/utils/geometry_serde.py b/python/sedona/spark/utils/geometry_serde.py index 103eb498173..0ef3d4ed5cb 100644 --- a/python/sedona/spark/utils/geometry_serde.py +++ b/python/sedona/spark/utils/geometry_serde.py @@ -25,9 +25,6 @@ speedup_enabled = False - -# Use geomserde_speedup when available, otherwise fallback to general pure -# python implementation. try: from . import geomserde_speedup @@ -60,8 +57,9 @@ def find_geos_c_dll(): def deserialize(buf: bytearray) -> Optional[BaseGeometry]: if buf is None: return None - return geomserde_speedup.deserialize(buf) + return geomserde_speedup.deserialize_2(buf) + # Export the from_sedona_func for use with numpy ufuncs speedup_enabled = True elif shapely.__version__.startswith("1."): diff --git a/python/sedona/spark/utils/udf.py b/python/sedona/spark/utils/udf.py new file mode 100644 index 00000000000..01a38a675a9 --- /dev/null +++ b/python/sedona/spark/utils/udf.py @@ -0,0 +1,26 @@ +import shapely + + +def has_sedona_serializer_speedup(): + try: + from . import geomserde_speedup + except ImportError: + return False + return True + +def to_sedona_func(arr): + try: + from . import geomserde_speedup + except ImportError: + return shapely.to_wkb(arr) + + return geomserde_speedup.to_sedona_func(arr) + + +def from_sedona_func(arr): + try: + from . import geomserde_speedup + except ImportError: + return shapely.from_wkb(arr) + + return geomserde_speedup.from_sedona_func(arr) diff --git a/python/sedona/spark/worker/daemon.py b/python/sedona/spark/worker/daemon.py new file mode 100644 index 00000000000..0d64a543c56 --- /dev/null +++ b/python/sedona/spark/worker/daemon.py @@ -0,0 +1,227 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import logging +import numbers +import os +import signal +import select +import socket +import sys +import traceback +import time +import gc +from errno import EINTR, EAGAIN +from socket import AF_INET, AF_INET6, SOCK_STREAM, SOMAXCONN +from signal import SIGHUP, SIGTERM, SIGCHLD, SIG_DFL, SIG_IGN, SIGINT + +from sedona.spark.worker.worker import main as worker_main +from pyspark.serializers import read_int, write_int, write_with_length, UTF8Deserializer + + +def compute_real_exit_code(exit_code): + # SystemExit's code can be integer or string, but os._exit only accepts integers + if isinstance(exit_code, numbers.Integral): + return exit_code + else: + return 1 + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +file_handler = logging.FileHandler("/Users/pawelkocinski/Desktop/projects/sedonaworker/sedonaworker/logs/worker_daemon_main.log", delay=False) +file_handler.flush = file_handler.stream.flush + +logger.addHandler(file_handler) + +def worker(sock, authenticated): + logger.info("Starting worker process with pid =" + str(os.getpid()) + " socket " + str(sock)) + """ + Called by a worker process after the fork(). + """ + signal.signal(SIGHUP, SIG_DFL) + signal.signal(SIGCHLD, SIG_DFL) + signal.signal(SIGTERM, SIG_DFL) + # restore the handler for SIGINT, + # it's useful for debugging (show the stacktrace before exit) + signal.signal(SIGINT, signal.default_int_handler) + + # Read the socket using fdopen instead of socket.makefile() because the latter + # seems to be very slow; note that we need to dup() the file descriptor because + # otherwise writes also cause a seek that makes us miss data on the read side. + buffer_size = int(os.environ.get("SPARK_BUFFER_SIZE", 65536)) + infile = os.fdopen(os.dup(sock.fileno()), "rb", buffer_size) + outfile = os.fdopen(os.dup(sock.fileno()), "wb", buffer_size) + + if not authenticated: + client_secret = UTF8Deserializer().loads(infile) + if os.environ["PYTHON_WORKER_FACTORY_SECRET"] == client_secret: + write_with_length("ok".encode("utf-8"), outfile) + outfile.flush() + else: + write_with_length("err".encode("utf-8"), outfile) + outfile.flush() + sock.close() + return 1 + + exit_code = 0 + try: + worker_main(infile, outfile) + except SystemExit as exc: + exit_code = compute_real_exit_code(exc.code) + finally: + try: + outfile.flush() + except Exception: + pass + return exit_code + + +def manager(): + # Create a new process group to corral our children + os.setpgid(0, 0) + + # Create a listening socket on the loopback interface + if os.environ.get("SPARK_PREFER_IPV6", "false").lower() == "true": + listen_sock = socket.socket(AF_INET6, SOCK_STREAM) + listen_sock.bind(("::1", 0, 0, 0)) + listen_sock.listen(max(1024, SOMAXCONN)) + listen_host, listen_port, _, _ = listen_sock.getsockname() + else: + listen_sock = socket.socket(AF_INET, SOCK_STREAM) + listen_sock.bind(("127.0.0.1", 0)) + listen_sock.listen(max(1024, SOMAXCONN)) + listen_host, listen_port = listen_sock.getsockname() + + # re-open stdin/stdout in 'wb' mode + stdin_bin = os.fdopen(sys.stdin.fileno(), "rb", 4) + stdout_bin = os.fdopen(sys.stdout.fileno(), "wb", 4) + write_int(listen_port, stdout_bin) + stdout_bin.flush() + + def shutdown(code): + signal.signal(SIGTERM, SIG_DFL) + # Send SIGHUP to notify workers of shutdown + os.kill(0, SIGHUP) + sys.exit(code) + + def handle_sigterm(*args): + shutdown(1) + + signal.signal(SIGTERM, handle_sigterm) # Gracefully exit on SIGTERM + signal.signal(SIGHUP, SIG_IGN) # Don't die on SIGHUP + signal.signal(SIGCHLD, SIG_IGN) + + 
reuse = os.environ.get("SPARK_REUSE_WORKER") + + # Initialization complete + try: + while True: + try: + ready_fds = select.select([0, listen_sock], [], [], 1)[0] + except select.error as ex: + if ex[0] == EINTR: + continue + else: + raise + + if 0 in ready_fds: + try: + worker_pid = read_int(stdin_bin) + except EOFError: + # Spark told us to exit by closing stdin + shutdown(0) + try: + os.kill(worker_pid, signal.SIGKILL) + except OSError: + pass # process already died + + if listen_sock in ready_fds: + try: + sock, _ = listen_sock.accept() + except OSError as e: + if e.errno == EINTR: + continue + raise + + # Launch a worker process + try: + pid = os.fork() + except OSError as e: + if e.errno in (EAGAIN, EINTR): + time.sleep(1) + pid = os.fork() # error here will shutdown daemon + else: + outfile = sock.makefile(mode="wb") + write_int(e.errno, outfile) # Signal that the fork failed + outfile.flush() + outfile.close() + sock.close() + continue + + if pid == 0: + # in child process + listen_sock.close() + + # It should close the standard input in the child process so that + # Python native function executions stay intact. + # + # Note that if we just close the standard input (file descriptor 0), + # the lowest file descriptor (file descriptor 0) will be allocated, + # later when other file descriptors should happen to open. + # + # Therefore, here we redirects it to '/dev/null' by duplicating + # another file descriptor for '/dev/null' to the standard input (0). + # See SPARK-26175. + devnull = open(os.devnull, "r") + os.dup2(devnull.fileno(), 0) + devnull.close() + + try: + # Acknowledge that the fork was successful + outfile = sock.makefile(mode="wb") + write_int(os.getpid(), outfile) + outfile.flush() + outfile.close() + authenticated = False + while True: + code = worker(sock, authenticated) + logger.info("Worker exited with code %d", code) + if code == 0: + authenticated = True + if not reuse or code: + # wait for closing + try: + while sock.recv(1024): + pass + except Exception: + pass + break + gc.collect() + except BaseException: + traceback.print_exc() + os._exit(1) + else: + os._exit(0) + else: + sock.close() + + finally: + shutdown(1) + + +if __name__ == "__main__": + manager() diff --git a/python/sedona/spark/worker/serde.py b/python/sedona/spark/worker/serde.py index 3954d075b74..5a33a266105 100644 --- a/python/sedona/spark/worker/serde.py +++ b/python/sedona/spark/worker/serde.py @@ -4,10 +4,11 @@ from sedona.spark.worker.udf_info import UDFInfo class SedonaDBSerializer(ArrowStreamPandasSerializer): - def __init__(self, timezone, safecheck, db, udf_info: UDFInfo): + def __init__(self, timezone, safecheck, db, udf_info: UDFInfo, cast_to_wkb=False): super(SedonaDBSerializer, self).__init__(timezone, safecheck) self.db = db self.udf_info = udf_info + self.cast_to_wkb = cast_to_wkb def load_stream(self, stream): import pyarrow as pa @@ -22,7 +23,7 @@ def load_stream(self, stream): df.to_view(table_name) - sql_expression = self.udf_info.sedona_db_transformation_expr(table_name) + sql_expression = self.udf_info.sedona_db_transformation_expr(table_name, self.cast_to_wkb) index += 1 @@ -37,7 +38,6 @@ def arrow_dump_stream(self, iterator, stream): if writer is None: writer = pa.RecordBatchStreamWriter(stream, batch.schema) writer.write_batch(batch) - # stream.flush() finally: if writer is not None: writer.close() diff --git a/python/sedona/spark/worker/udf_info.py b/python/sedona/spark/worker/udf_info.py index d354bcea7ed..7853133e77f 100644 --- a/python/sedona/spark/worker/udf_info.py 
+++ b/python/sedona/spark/worker/udf_info.py @@ -11,24 +11,23 @@ class UDFInfo: return_type: object name: str - def get_function_call_sql(self, table_name: str) -> str: + def get_function_call_sql(self, table_name: str, cast_to_wkb: bool = False) -> str: arg_offset_str = ", ".join([f"_{el}" for el in self.arg_offsets]) function_expr = f"{self.name}({arg_offset_str})" - if isinstance(self.return_type, GeometryType): + if isinstance(self.return_type, GeometryType) and cast_to_wkb: return f"SELECT ST_GeomToSedonaSpark({function_expr}) AS _0 FROM {table_name}" return f"SELECT {function_expr} AS _0 FROM {table_name}" - def sedona_db_transformation_expr(self, table_name: str) -> str: + def sedona_db_transformation_expr(self, table_name: str, cast_to_wkb: bool = False) -> str: fields = [] for arg in self.arg_offsets: - if arg in self.geom_offsets: + if arg in self.geom_offsets and cast_to_wkb: crs = self.geom_offsets[arg] fields.append(f"ST_GeomFromSedonaSpark(_{arg}, 'EPSG:{crs}') AS _{arg}") continue fields.append(f"_{arg}") - fields_expr = ", ".join(fields) return f"SELECT {fields_expr} FROM {table_name}" diff --git a/python/sedona/spark/worker/worker.py b/python/sedona/spark/worker/worker.py index 6b2a18c8f20..17dae02e637 100644 --- a/python/sedona/spark/worker/worker.py +++ b/python/sedona/spark/worker/worker.py @@ -15,16 +15,17 @@ from sedona.spark.worker.udf_info import UDFInfo -def apply_iterator(db, iterator, udf_info: UDFInfo): +def apply_iterator(db, iterator, udf_info: UDFInfo, cast_to_wkb: bool = False): i = 0 for df in iterator: i+=1 table_name = f"output_table_{i}" df.to_view(table_name) - function_call_sql = udf_info.get_function_call_sql(table_name) + function_call_sql = udf_info.get_function_call_sql(table_name, cast_to_wkb=cast_to_wkb) df_out = db.sql(function_call_sql) + df_out.to_view(f"view_{i}") at = df_out.to_arrow_table() batches = at.combine_chunks().to_batches() @@ -207,9 +208,9 @@ def main(infile, outfile): pickle_ser = CPickleSerializer() split_index = read_int(infile) - # + check_python_version(utf8_deserializer, infile) - # + check_barrier_flag(infile) task_context = assign_task_context(utf_serde=utf8_deserializer, infile=infile) @@ -217,7 +218,7 @@ def main(infile, outfile): shuffle.DiskBytesSpilled = 0 resolve_python_path(utf8_deserializer, infile) - # + check_broadcast_variables(infile) eval_type = read_int(infile) @@ -229,11 +230,14 @@ def main(infile, outfile): sedona_db.register_udf(udf.function) init_time = time.time() + cast_to_wkb = read_bool(infile) + serde = SedonaDBSerializer( timezone=runner_conf.get("spark.sql.session.timeZone", "UTC"), safecheck=False, db=sedona_db, - udf_info=udf + udf_info=udf, + cast_to_wkb=cast_to_wkb ) number_of_geometries = read_int(infile) @@ -247,7 +251,7 @@ def main(infile, outfile): udf.geom_offsets = geom_offsets iterator = serde.load_stream(infile) - out_iterator = apply_iterator(db=sedona_db, iterator=iterator, udf_info=udf) + out_iterator = apply_iterator(db=sedona_db, iterator=iterator, udf_info=udf, cast_to_wkb=cast_to_wkb) serde.dump_stream(out_iterator, outfile) diff --git a/python/setup.py b/python/setup.py new file mode 100644 index 00000000000..66ab74701b0 --- /dev/null +++ b/python/setup.py @@ -0,0 +1,6 @@ +from setuptools import setup +import numpy + +setup( + include_dirs=[numpy.get_include()], +) diff --git a/python/src/geom_buf.c b/python/src/geom_buf.c index 5239de5ae09..d6a51bb3d0e 100644 --- a/python/src/geom_buf.c +++ b/python/src/geom_buf.c @@ -208,6 +208,8 @@ SedonaErrorCode geom_buf_alloc(GeomBuffer 
*geom_buf, return SEDONA_SUCCESS; } +#include + SedonaErrorCode read_geom_buf_header(const char *buf, int buf_size, GeomBuffer *geom_buf, CoordinateSequenceInfo *cs_info, diff --git a/python/src/geomserde.c b/python/src/geomserde.c index c1f74277383..81dafe216f0 100644 --- a/python/src/geomserde.c +++ b/python/src/geomserde.c @@ -718,6 +718,7 @@ static SedonaErrorCode deserialize_geom_buf(GEOSContextHandle_t handle, return SEDONA_SUCCESS; } +#include SedonaErrorCode sedona_deserialize_geom(GEOSContextHandle_t handle, const char *buf, int buf_size, GEOSGeometry **p_geom, diff --git a/python/src/geomserde_speedup_module.c b/python/src/geomserde_speedup_module.c index a95ced29e57..621f956cd03 100644 --- a/python/src/geomserde_speedup_module.c +++ b/python/src/geomserde_speedup_module.c @@ -20,10 +20,15 @@ #define PY_SSIZE_T_CLEAN #include #include +// +//#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION #include "geomserde.h" #include "geos_c_dyn.h" #include "pygeos/c_api.h" +#include +#include +#include PyDoc_STRVAR(module_doc, "Geometry serialization/deserialization module."); @@ -225,7 +230,7 @@ static PyObject *serialize(PyObject *self, PyObject *args) { return do_serialize(geos_geom); } -static PyObject *deserialize(PyObject *self, PyObject *args) { +static PyObject *deserialize_2(PyObject *self, PyObject *args) { GEOSContextHandle_t handle = NULL; int length = 0; GEOSGeometry *geom = do_deserialize(args, &handle, &length); @@ -262,16 +267,106 @@ static PyObject *deserialize_1(PyObject *self, PyObject *args) { return Py_BuildValue("(Kibi)", geom, geom_type_id, has_z, length); } +static PyObject *to_sedona_func(PyObject *self, PyObject *args) { + import_array(); + PyObject *input_obj = NULL; + if (!PyArg_ParseTuple(args, "O", &input_obj)){ + return NULL; + }; + + PyArrayObject *array = (PyArrayObject *)input_obj; + PyObject **objs = (PyObject **)PyArray_DATA(array); + + GEOSContextHandle_t handle = get_geos_context_handle(); + if (handle == NULL) { + return NULL; + } + + npy_intp n = PyArray_SIZE(input_obj); + npy_intp dims[1] = {n}; + PyArrayObject *out = (PyArrayObject *)PyArray_SimpleNew(1, dims, NPY_OBJECT); + for (npy_intp i = 0; i < PyArray_SIZE(array); i++) { + PyObject *obj = objs[i]; + GEOSGeometry *geos_geom = NULL; + char success = PyGEOS_GetGEOSGeometry(obj, &geos_geom); + + PyObject *serialized = do_serialize(geos_geom); + PyArray_SETITEM(out, PyArray_GETPTR1(out, i), serialized); + } + + return out; +} /* Module definition for Shapely 2.x */ +static PyObject *from_sedona_func(PyObject *self, PyObject *args) { + import_array(); + PyObject *input_obj = NULL; + if (!PyArg_ParseTuple(args, "O", &input_obj)){ + return NULL; + }; + + GEOSContextHandle_t handle = get_geos_context_handle(); + + PyArrayObject *array = (PyArrayObject *)input_obj; + PyObject **objs = (PyObject **)PyArray_DATA(array); + + int p_bytes_read = 0; + + npy_intp n = PyArray_SIZE(input_obj); + + npy_intp dims[1] = {n}; + PyArrayObject *out = (PyArrayObject *)PyArray_SimpleNew(1, dims, NPY_OBJECT); + + for (npy_intp i = 0; i < PyArray_SIZE(array); i++) { + PyObject *obj = objs[i]; + if (!PyBytes_Check(obj)) { + PyErr_SetString(PyExc_TypeError, "Expected bytes"); + return NULL; + } + + char *buf = PyBytes_AS_STRING(obj); + + Py_ssize_t len = PyBytes_GET_SIZE(obj); + + GEOSGeometry *geom = NULL; + + SedonaErrorCode err = sedona_deserialize_geom(handle, buf, len, &geom, &p_bytes_read); + if (err != SEDONA_SUCCESS) { + handle_geomserde_error(err); + return NULL; + } + PyObject *pygeom = PyGEOS_CreateGeometry(geom, 
handle); + + PyArray_SETITEM(out, PyArray_GETPTR1(out, i), pygeom); + } + + return out; +} + static PyMethodDef geomserde_methods_shapely_2[] = { {"load_libgeos_c", load_libgeos_c, METH_VARARGS, "Load libgeos_c."}, {"serialize", serialize, METH_VARARGS, "Serialize geometry object as bytearray."}, - {"deserialize", deserialize, METH_VARARGS, + {"deserialize_2", deserialize_2, METH_VARARGS, + "Deserialize bytes-like object to geometry object."}, + {"from_sedona_func", from_sedona_func, METH_VARARGS, + "Deserialize bytes-like object to geometry object."}, + {"to_sedona_func", to_sedona_func, METH_VARARGS, "Deserialize bytes-like object to geometry object."}, {NULL, NULL, 0, NULL}, /* Sentinel */ }; +// +//static int add_from_sedona_func_to_module(PyObject *m) { +// PyObject *capsule = PyCapsule_New((void *)from_sedona_func, "from_sedona_func", NULL); +// if (capsule == NULL) { +// return -1; +// } +// if (PyModule_AddObject(m, "from_sedona_func", capsule) < 0) { +// Py_DECREF(capsule); +// return -1; +// } +// return 0; +//} static struct PyModuleDef geomserde_module_shapely_2 = { PyModuleDef_HEAD_INIT, "geomserde_speedup", module_doc, 0, diff --git a/python/tests/test_base.py b/python/tests/test_base.py index 911860e416f..e240a09758c 100644 --- a/python/tests/test_base.py +++ b/python/tests/test_base.py @@ -72,6 +72,8 @@ def spark(self): ) .config("spark.executor.memory", "10G") \ .config("spark.driver.memory", "10G") \ + .config("sedona.python.worker.udf.daemon.module", "sedona.spark.worker.daemon") \ + .config("sedona.python.worker.daemon.enabled", "false") \ # Pandas on PySpark doesn't work with ANSI mode, which is enabled by default # in Spark 4 .config("spark.sql.ansi.enabled", "false") diff --git a/python/tests/utils/test_sedona_db_vectorized_udf.py b/python/tests/utils/test_sedona_db_vectorized_udf.py index 904d59a2820..4b266384fa9 100644 --- a/python/tests/utils/test_sedona_db_vectorized_udf.py +++ b/python/tests/utils/test_sedona_db_vectorized_udf.py @@ -1,12 +1,106 @@ +import time + +import numpy as np + from sedona.spark.sql.functions import sedona_db_vectorized_udf +from sedona.spark.utils.udf import to_sedona_func, from_sedona_func from tests.test_base import TestBase import pyarrow as pa import shapely from sedona.sql import GeometryType from pyspark.sql.functions import expr, lit -from pyspark.sql.types import DoubleType, IntegerType +from pyspark.sql.types import DoubleType, IntegerType, ByteType from sedona.spark.sql import ST_X - +from shapely._enum import ParamEnum + +def test_m(): + on_invalid="raise" + wkb = b'\x12\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?' + geometry = np.asarray([wkb, wkb], dtype=object) + + DecodingErrorOptions = ParamEnum( + "DecodingErrorOptions", {"ignore": 0, "warn": 1, "raise": 2, "fix": 3} + ) + + # print("sss") + + + # + # object + # C_CONTIGUOUS : True + # F_CONTIGUOUS : True + # OWNDATA : False + # WRITEABLE : True + # ALIGNED : True + # WRITEBACKIFCOPY : False + # print(type(geometry)) + # print(geometry.dtype) + # print(geometry.flags) + + result = from_sedona_func(geometry) + + result2 = to_sedona_func(result) + +# ensure the input has object dtype, to avoid numpy inferring it as a +# fixed-length string dtype (which removes trailing null bytes upon access +# of array elements) + # + # def from_sedona_func(arr): + # try: + # from . 
import sedonaserde_vectorized_udf_module + # print(sedonaserde_vectorized_udf_module.from_sedona_func_3(arr)) + # except Exception as e: + # print("Cannot import sedonaserde_vectorized_udf_module:") + # print(e) + # # print() + # return None +# +# def from_wkb(geometry, on_invalid="raise", **kwargs): +# r"""Create geometries from the Well-Known Binary (WKB) representation. +# +# The Well-Known Binary format is defined in the `OGC Simple Features +# Specification for SQL `__. +# +# Parameters +# ---------- +# geometry : str or array_like +# The WKB byte object(s) to convert. +# on_invalid : {"raise", "warn", "ignore", "fix"}, default "raise" +# Indicates what to do when an invalid WKB is encountered. Note that the +# validations involved are very basic, e.g. the minimum number of points +# for the geometry type. For a thorough check, use :func:`is_valid` after +# conversion to geometries. Valid options are: +# +# - raise: an exception will be raised if any input geometry is invalid. +# - warn: a warning will be raised and invalid WKT geometries will be +# returned as ``None``. +# - ignore: invalid geometries will be returned as ``None`` without a +# warning. +# - fix: an effort is made to fix invalid input geometries (currently just +# unclosed rings). If this is not possible, they are returned as +# ``None`` without a warning. Requires GEOS >= 3.11. +# +# .. versionadded:: 2.1.0 +# **kwargs +# See :ref:`NumPy ufunc docs ` for other keyword arguments. +# +# Examples +# -------- +# >>> import shapely +# >>> shapely.from_wkb(b'\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?') +# +# +# """ # noqa: E501 +# if not np.isscalar(on_invalid): +# raise TypeError("on_invalid only accepts scalar values") +# +# invalid_handler = np.uint8(DecodingErrorOptions.get_value(on_invalid)) +# +# # ensure the input has object dtype, to avoid numpy inferring it as a +# # fixed-length string dtype (which removes trailing null bytes upon access +# # of array elements) +# geometry = np.asarray(geometry, dtype=object) +# return lib.from_wkb(geometry, invalid_handler, **kwargs) class TestSedonaDBArrowFunction(TestBase): def test_vectorized_udf(self): @@ -15,7 +109,6 @@ def my_own_function(geom, distance): geom_wkb = pa.array(geom.storage.to_array()) distance = pa.array(distance.to_array()) geom = shapely.from_wkb(geom_wkb) - result_shapely = shapely.centroid(geom) return pa.array(shapely.to_wkb(result_shapely)) @@ -95,18 +188,26 @@ def return_same_geometry(geom): assert crs_list == [3857, 3857, 3857] def test_geometry_to_geometry(self): - @sedona_db_vectorized_udf(return_type=GeometryType(), input_types=[GeometryType()]) + @sedona_db_vectorized_udf(return_type=GeometryType(), input_types=[ByteType()]) def buffer_geometry(geom): geom_wkb = pa.array(geom.storage.to_array()) - geom = shapely.from_wkb(geom_wkb) + geometry_array = np.asarray(geom_wkb, dtype=object) + geom = from_sedona_func(geometry_array) result_shapely = shapely.buffer(geom, 10) - return pa.array(shapely.to_wkb(result_shapely)) + return pa.array(to_sedona_func(result_shapely)) df = self.spark.read.\ format("geoparquet").\ - load("/Users/pawelkocinski/Desktop/projects/sedona-production/apache-sedona-book/data/warehouse/buildings_large_3") + load("/Users/pawelkocinski/Desktop/projects/sedona-production/apache-sedona-book/book/source_data/transportation_barcelona_dupl_l1") + + # 1 045 770 + # print(df.count()) + + # df.unionAll(df).unionAll(df).unionAll(df).unionAll(df).unionAll(df).\ + # 
unionAll(df).unionAll(df).unionAll(df).unionAll(df).unionAll(df).\ + # write.format("geoparquet").mode("overwrite").save("/Users/pawelkocinski/Desktop/projects/sedona-production/apache-sedona-book/book/source_data/transportation_barcelona_dupl_l2") # 18 24 # df.union(df).union(df).union(df).union(df).union(df).union(df).\ # write.format("geoparquet").mode("overwrite").save("/Users/pawelkocinski/Desktop/projects/sedona-production/apache-sedona-book/data/warehouse/buildings_large_3") @@ -117,6 +218,12 @@ def buffer_geometry(geom): values.show() + # for _ in range(4): + # start_time = time.time() + # values.show() + # end_time = time.time() + # print(f"Execution time: {end_time - start_time} seconds") + def test_geometry_to_geometry_normal_udf(self): from pyspark.sql.functions import udf @@ -127,10 +234,19 @@ def create_buffer(geom): df = self.spark.read. \ format("geoparquet"). \ - load("/Users/pawelkocinski/Desktop/projects/sedona-production/apache-sedona-book/data/warehouse/buildings_large_3") + load("/Users/pawelkocinski/Desktop/projects/sedona-production/apache-sedona-book/book/source_data/transportation_barcelona_dupl_l2") + # print(df.count()) + # df.limit(10).collect() values = df.select(create_buffer_udf(df.geometry).alias("geometry")). \ selectExpr("ST_Area(geometry) as area"). \ selectExpr("Sum(area) as total_area") values.show() + + # for _ in range(4): + # start_time = time.time() + # values.show() + # end_time = time.time() + # print(f"Execution time: {end_time - start_time} seconds") +# 1 045 770 diff --git a/spark/common/src/main/scala/org/apache/sedona/sql/UDF/PythonEvalType.scala b/spark/common/src/main/scala/org/apache/sedona/sql/UDF/PythonEvalType.scala index 11263dd7f68..0f1a5fe0a01 100644 --- a/spark/common/src/main/scala/org/apache/sedona/sql/UDF/PythonEvalType.scala +++ b/spark/common/src/main/scala/org/apache/sedona/sql/UDF/PythonEvalType.scala @@ -25,6 +25,7 @@ object PythonEvalType { // sedona db eval types val SQL_SCALAR_SEDONA_DB_UDF = 6200 + val SQL_SCALAR_SEDONA_DB_NO_SPEEDUP_UDF = 6201 val SEDONA_DB_UDF_TYPE_CONSTANT = 6000 def toString(pythonEvalType: Int): String = pythonEvalType match { @@ -32,5 +33,6 @@ object PythonEvalType { case SQL_SCALAR_SEDONA_DB_UDF => "SQL_SCALAR_SEDONA_DB_UDF" } - def evals(): Set[Int] = Set(SQL_SCALAR_SEDONA_UDF, SQL_SCALAR_SEDONA_DB_UDF) + def evals(): Set[Int] = + Set(SQL_SCALAR_SEDONA_UDF, SQL_SCALAR_SEDONA_DB_UDF, SQL_SCALAR_SEDONA_DB_NO_SPEEDUP_UDF) } diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaArrowPythonRunner.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaArrowPythonRunner.scala index 0d3960d2d81..3055e768b98 100644 --- a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaArrowPythonRunner.scala +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaArrowPythonRunner.scala @@ -38,13 +38,15 @@ class SedonaArrowPythonRunner( protected override val workerConf: Map[String, String], val pythonMetrics: Map[String, SQLMetric], jobArtifactUUID: Option[String], - geometryFields: Seq[(Int, Int)]) + geometryFields: Seq[(Int, Int)], + castGeometryToWKB: Boolean = false) extends SedonaBasePythonRunner[Iterator[InternalRow], ColumnarBatch]( funcs, evalType, argOffsets, jobArtifactUUID, - geometryFields) + geometryFields, + castGeometryToWKB) with SedonaBasicPythonArrowInput with SedonaBasicPythonArrowOutput { diff --git 
a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaArrowStrategy.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaArrowStrategy.scala index bb897931b63..228ddc2cbca 100644 --- a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaArrowStrategy.scala +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaArrowStrategy.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.python import org.apache.sedona.sql.UDF.PythonEvalType -import org.apache.sedona.sql.UDF.PythonEvalType.{SQL_SCALAR_SEDONA_DB_UDF, SQL_SCALAR_SEDONA_UDF} +import org.apache.sedona.sql.UDF.PythonEvalType.{SQL_SCALAR_SEDONA_DB_NO_SPEEDUP_UDF, SQL_SCALAR_SEDONA_DB_UDF, SQL_SCALAR_SEDONA_UDF} import org.apache.spark.api.python.ChainedPythonFunctions import org.apache.spark.sql.Strategy import org.apache.spark.sql.catalyst.InternalRow @@ -115,10 +115,10 @@ case class SedonaArrowEvalPythonExec( val batchIter = if (batchSize > 0) new BatchIterator(full, batchSize) else Iterator(full) evalType match { - case SQL_SCALAR_SEDONA_DB_UDF => + case SQL_SCALAR_SEDONA_DB_UDF | SQL_SCALAR_SEDONA_DB_NO_SPEEDUP_UDF => val columnarBatchIter = new SedonaArrowPythonRunner( funcs, - evalType - PythonEvalType.SEDONA_DB_UDF_TYPE_CONSTANT, + 200, argOffsets, schema, sessionLocalTimeZone, @@ -126,7 +126,9 @@ case class SedonaArrowEvalPythonExec( pythonRunnerConf, pythonMetrics, jobArtifactUUID, - geometryFields).compute(batchIter, context.partitionId(), context) + geometryFields, + evalType == SQL_SCALAR_SEDONA_DB_UDF) + .compute(batchIter, context.partitionId(), context) val result = columnarBatchIter.flatMap { batch => batch.rowIterator.asScala diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaBasePythonRunner.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaBasePythonRunner.scala index 276383a0ee8..055d5db15f1 100644 --- a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaBasePythonRunner.scala +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaBasePythonRunner.scala @@ -39,14 +39,16 @@ private[spark] abstract class SedonaBasePythonRunner[IN, OUT]( evalType: Int, argOffsets: Array[Array[Int]], jobArtifactUUID: Option[String], - val geometryFields: Seq[(Int, Int)] = Seq.empty) + val geometryFields: Seq[(Int, Int)] = Seq.empty, + val castGeometryToWKB: Boolean = false) extends BasePythonRunner[IN, OUT](funcs, evalType, argOffsets, jobArtifactUUID) with Logging { require(funcs.length == argOffsets.length, "argOffsets should have the same length as funcs") private val conf = SparkEnv.get.conf - private val reuseWorker = conf.getBoolean(PYTHON_WORKER_REUSE.key, PYTHON_WORKER_REUSE.defaultValue.get) + private val reuseWorker = + conf.getBoolean(PYTHON_WORKER_REUSE.key, PYTHON_WORKER_REUSE.defaultValue.get) private val faultHandlerEnabled = conf.get(PYTHON_WORKER_FAULTHANLDER_ENABLED) private def getWorkerMemoryMb(mem: Option[Long], cores: Int): Option[Long] = { @@ -81,9 +83,12 @@ private[spark] abstract class SedonaBasePythonRunner[IN, OUT]( envVars.put("PYTHON_FAULTHANDLER_DIR", SedonaBasePythonRunner.faultHandlerLogDir.toString) } + if (reuseWorker) { + envVars.put("SPARK_REUSE_WORKER", "1") + } + envVars.put("SPARK_JOB_ARTIFACT_UUID", jobArtifactUUID.getOrElse("default")) - println("running the compute for SedonaBasePythonRunner and partition index: " + partitionIndex) val (worker: Socket, pid: Option[Int]) = 
{ WorkerContext.createPythonWorker(pythonExec, envVars.asScala.toMap) } @@ -98,7 +103,6 @@ private[spark] abstract class SedonaBasePythonRunner[IN, OUT]( if (!reuseWorker || releasedOrClosed.compareAndSet(false, true)) { try { - logInfo("Shutting down worker socket") worker.close() } catch { case e: Exception => diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaDBWorkerFactory.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaDBWorkerFactory.scala index 93bcaee0c6f..459388856b0 100644 --- a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaDBWorkerFactory.scala +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaDBWorkerFactory.scala @@ -34,7 +34,6 @@ import org.apache.spark.security.SocketAuthHelper import org.apache.spark.sql.execution.python.SedonaPythonWorkerFactory.PROCESS_WAIT_TIMEOUT_MS import org.apache.spark.util.RedirectThread -import java.util.concurrent.TimeUnit import javax.annotation.concurrent.GuardedBy class SedonaDBWorkerFactory(pythonExec: String, envVars: Map[String, String]) extends Logging { @@ -181,7 +180,6 @@ class SedonaDBWorkerFactory(pythonExec: String, envVars: Map[String, String]) ex } private def stopDaemon(): Unit = { - logError("daemon stopping called") self.synchronized { if (useDaemon) { cleanupIdleWorkers() @@ -194,7 +192,6 @@ class SedonaDBWorkerFactory(pythonExec: String, envVars: Map[String, String]) ex daemon = null daemonPort = 0 } else { - println("Stopping simple workers") simpleWorkers.mapValues(_.destroy()) } } @@ -233,11 +230,11 @@ class SedonaDBWorkerFactory(pythonExec: String, envVars: Map[String, String]) ex daemonPort = in.readInt() } catch { case _: EOFException if daemon.isAlive => - throw SparkCoreErrors.eofExceptionWhileReadPortNumberError( - sedonaDaemonModule) + throw SparkCoreErrors.eofExceptionWhileReadPortNumberError(sedonaDaemonModule) case _: EOFException => - throw SparkCoreErrors. - eofExceptionWhileReadPortNumberError(sedonaDaemonModule, Some(daemon.exitValue)) + throw SparkCoreErrors.eofExceptionWhileReadPortNumberError( + sedonaDaemonModule, + Some(daemon.exitValue)) } // test that the returned port number is within a valid range. 
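For context, not part of the patch: the factory above expects the Python daemon to report its listening port on stdout as a 4-byte big-endian integer, which `in.readInt()` consumes; the EOF branches fire when the daemon exits before writing it. A minimal sketch of that handshake on the Python side, assuming the same convention as PySpark's daemon module (the backlog size and variable names are illustrative):

    import socket
    import struct
    import sys

    # Bind an ephemeral loopback port that forked workers will accept connections on.
    listen_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    listen_sock.bind(("127.0.0.1", 0))
    listen_sock.listen(128)  # backlog size is illustrative

    # Report the chosen port to the JVM parent as a big-endian int on stdout,
    # matching what DataInputStream.readInt() expects on the Scala side.
    port = listen_sock.getsockname()[1]
    sys.stdout.buffer.write(struct.pack(">i", port))
    sys.stdout.buffer.flush()
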
@@ -261,7 +258,6 @@ class SedonaDBWorkerFactory(pythonExec: String, envVars: Map[String, String]) ex redirectStreamsToStderr(in, daemon.getErrorStream) } catch { case e: Exception => - // If the daemon exists, wait for it to finish and get its stderr val stderr = Option(daemon) .flatMap { d => Utils.getStderr(d, PROCESS_WAIT_TIMEOUT_MS) } @@ -307,7 +303,6 @@ class SedonaDBWorkerFactory(pythonExec: String, envVars: Map[String, String]) ex def releaseWorker(worker: Socket): Unit = { if (useDaemon) { - logInfo("Releasing worker back to daemon pool") self.synchronized { lastActivityNs = System.nanoTime() idleWorkers.enqueue(worker) @@ -345,5 +340,4 @@ class SedonaDBWorkerFactory(pythonExec: String, envVars: Map[String, String]) ex private object SedonaPythonWorkerFactory { val PROCESS_WAIT_TIMEOUT_MS = 10000 - val IDLE_WORKER_TIMEOUT_NS = TimeUnit.MINUTES.toNanos(1) // kill idle workers after 1 minute } diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowInput.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowInput.scala index 18db42ae0df..2544e63a979 100644 --- a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowInput.scala +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowInput.scala @@ -60,6 +60,9 @@ private[python] trait SedonaPythonArrowInput[IN] extends PythonArrowInput[IN] { handleMetadataBeforeExec(dataOut) writeUDF(dataOut, funcs, argOffsets) + // if speedup is not available and we need to use casting + dataOut.writeBoolean(self.castGeometryToWKB) + // write dataOut.writeInt(self.geometryFields.length) // write geometry field indices and their SRIDs diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala index 0c0b220933f..27764c2a54f 100644 --- a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala @@ -34,7 +34,8 @@ import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnVector, Columna private[python] trait SedonaPythonArrowOutput[OUT <: AnyRef] { self: BasePythonRunner[_, OUT] => - private val reuseWorker = SparkEnv.get.conf.getBoolean(PYTHON_WORKER_REUSE.key, PYTHON_WORKER_REUSE.defaultValue.get) + private val reuseWorker = + SparkEnv.get.conf.getBoolean(PYTHON_WORKER_REUSE.key, PYTHON_WORKER_REUSE.defaultValue.get) protected def pythonMetrics: Map[String, SQLMetric] diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/WorkerContext.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/WorkerContext.scala index 82fe6deddaf..6411bec97e4 100644 --- a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/WorkerContext.scala +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/WorkerContext.scala @@ -24,24 +24,28 @@ import scala.collection.mutable object WorkerContext { def createPythonWorker( - pythonExec: String, - envVars: Map[String, String]): (java.net.Socket, Option[Int]) = { + pythonExec: String, + envVars: Map[String, String]): (java.net.Socket, Option[Int]) = { synchronized { val key = (pythonExec, envVars) pythonWorkers.getOrElseUpdate(key, new SedonaDBWorkerFactory(pythonExec, envVars)).create() } } - def 
destroyPythonWorker(pythonExec: String, - envVars: Map[String, String], worker: Socket): Unit = { + def destroyPythonWorker( + pythonExec: String, + envVars: Map[String, String], + worker: Socket): Unit = { synchronized { val key = (pythonExec, envVars) pythonWorkers.get(key).foreach(_.stopWorker(worker)) } } - def releasePythonWorker(pythonExec: String, - envVars: Map[String, String], worker: Socket): Unit = { + def releasePythonWorker( + pythonExec: String, + envVars: Map[String, String], + worker: Socket): Unit = { synchronized { val key = (pythonExec, envVars) pythonWorkers.get(key).foreach(_.releaseWorker(worker)) diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/udf/ExtractSedonaUDFRule.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/udf/ExtractSedonaUDFRule.scala index ebb5a568e14..3584cb01bd7 100644 --- a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/udf/ExtractSedonaUDFRule.scala +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/udf/ExtractSedonaUDFRule.scala @@ -44,7 +44,8 @@ class ExtractSedonaUDFRule extends Rule[LogicalPlan] with Logging { } def isScalarPythonUDF(e: Expression): Boolean = { - e.isInstanceOf[PythonUDF] && PythonEvalType.evals.contains(e.asInstanceOf[PythonUDF].evalType) + e.isInstanceOf[PythonUDF] && + PythonEvalType.evals.contains(e.asInstanceOf[PythonUDF].evalType) } private def collectEvaluableUDFsFromExpressions( diff --git a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala index e64e9dec3bb..c9b4d6ac289 100644 --- a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala +++ b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala @@ -46,9 +46,9 @@ trait TestBaseScala extends FunSpec with BeforeAndAfterAll { // We need to be explicit about broadcasting in tests. 
.config("sedona.join.autoBroadcastJoinThreshold", "-1") .config("spark.sql.extensions", "org.apache.sedona.sql.SedonaSqlExtensions") - .config("sedona.python.worker.udf.module", "sedonaworker.worker") + .config("sedona.python.worker.udf.module", "sedona.spark.worker.worker") .config("sedona.python.worker.udf.daemon.module", "sedonaworker.daemon") - .config("sedona.python.worker.daemon.enabled", "true") + .config("sedona.python.worker.daemon.enabled", "false") // .config(keyParserExtension, ThreadLocalRandom.current().nextBoolean()) .getOrCreate() diff --git a/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/StrategySuite.scala b/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/StrategySuite.scala index 000c1f55b62..4fe4acfb121 100644 --- a/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/StrategySuite.scala +++ b/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/StrategySuite.scala @@ -35,7 +35,6 @@ class StrategySuite extends TestBaseScala with Matchers { import spark.implicits._ - it("sedona geospatial UDF - geopandas") { val df = Seq( (1, "value", wktReader.read("POINT(21 52)")), @@ -50,7 +49,8 @@ class StrategySuite extends TestBaseScala with Matchers { geopandasUDFDF.count shouldEqual 5 - geopandasUDFDF.selectExpr("ST_AsText(ST_ReducePrecision(geom_buffer, 2))") + geopandasUDFDF + .selectExpr("ST_AsText(ST_ReducePrecision(geom_buffer, 2))") .as[String] .collect() should contain theSameElementsAs Seq( "POLYGON ((20 51, 20 53, 22 53, 22 51, 20 51))", @@ -61,36 +61,33 @@ class StrategySuite extends TestBaseScala with Matchers { } it("sedona geospatial UDF - sedona db") { -// val df = Seq( -// (1, "value", wktReader.read("POINT(21 52)")), -// (2, "value1", wktReader.read("POINT(20 50)")), -// (3, "value2", wktReader.read("POINT(20 49)")), -// (4, "value3", wktReader.read("POINT(20 48)")), -// (5, "value4", wktReader.read("POINT(20 47)"))) -// .toDF("id", "value", "geom") -// -// val dfVectorized = df -// .withColumn("geometry", expr("ST_SetSRID(geom, '4326')")) -// .select(sedonaDBGeometryToGeometryFunction(col("geometry"), lit(100)).alias("geom")) - -// dfVectorized.selectExpr("ST_X(ST_Centroid(geom)) AS x") -// .selectExpr("sum(x)") -// .as[Double] -// .collect().head shouldEqual 101 - - val dfCopied = sparkSession.read - .format("geoparquet") - .load("/Users/pawelkocinski/Desktop/projects/sedona-production/apache-sedona-book/book/source_data/transportation_barcelona/barcelona.geoparquet") + val df = Seq( + (1, "value", wktReader.read("POINT(21 52)")), + (2, "value1", wktReader.read("POINT(20 50)")), + (3, "value2", wktReader.read("POINT(20 49)")), + (4, "value3", wktReader.read("POINT(20 48)")), + (5, "value4", wktReader.read("POINT(20 47)"))) + .toDF("id", "value", "geom") - val values = dfCopied.unionAll(dfCopied) - .unionAll(dfCopied) -// .unionAll(dfCopied) -// .unionAll(dfCopied) -// .unionAll(dfCopied) - .select(sedonaDBGeometryToGeometryFunction(col("geometry"), lit(10)).alias("geom")) - .selectExpr("ST_Area(geom) as area") - .selectExpr("Sum(area) as total_area") + val dfVectorized = df + .withColumn("geometry", expr("ST_SetSRID(geom, '4326')")) + .select(sedonaDBGeometryToGeometryFunction(col("geometry"), lit(100)).alias("geom")) - values.show() + dfVectorized.selectExpr("ST_X(ST_Centroid(geom)) AS x") + .selectExpr("sum(x)") + .as[Double] + .collect().head shouldEqual 101 +// +// val dfCopied = sparkSession.read +// .format("geoparquet") +// .load( +// 
"/Users/pawelkocinski/Desktop/projects/sedona-production/apache-sedona-book/book/source_data/transportation_barcelona/barcelona.geoparquet") +// +// val values = dfCopied +// .select(sedonaDBGeometryToGeometryFunction(col("geometry"), lit(10)).alias("geom")) +// .selectExpr("ST_Area(geom) as area") +// .selectExpr("Sum(area) as total_area") +// +// values.show() } } From 42229e5f66bb0cf032348799b358991c77e3e63a Mon Sep 17 00:00:00 2001 From: pawelkocinski Date: Tue, 13 Jan 2026 23:23:05 +0100 Subject: [PATCH 06/11] add sedonadb sedona udf worker example --- python/sedona/spark/sql/functions.py | 29 ++- python/sedona/spark/utils/udf.py | 22 +- python/sedona/spark/worker/__init__.py | 16 ++ python/sedona/spark/worker/daemon.py | 48 ++-- python/sedona/spark/worker/serde.py | 25 +- python/sedona/spark/worker/udf_info.py | 25 +- python/sedona/spark/worker/worker.py | 56 ++++- python/setup.py | 17 ++ python/src/geomserde_speedup_module.c | 138 +++++------ python/tests/test_base.py | 14 +- .../utils/test_sedona_db_vectorized_udf.py | 218 ++++-------------- .../apache/spark/sql/udf/StrategySuite.scala | 18 +- 12 files changed, 311 insertions(+), 315 deletions(-) diff --git a/python/sedona/spark/sql/functions.py b/python/sedona/spark/sql/functions.py index 232ccb50a3c..d8bf73c1522 100644 --- a/python/sedona/spark/sql/functions.py +++ b/python/sedona/spark/sql/functions.py @@ -28,7 +28,14 @@ import geoarrow.pyarrow as ga from sedonadb import udf as sedona_udf_module from sedona.spark.sql.types import GeometryType -from pyspark.sql.types import DataType, FloatType, DoubleType, IntegerType, StringType, ByteType +from pyspark.sql.types import ( + DataType, + FloatType, + DoubleType, + IntegerType, + StringType, + ByteType, +) from sedona.spark.utils.udf import has_sedona_serializer_speedup @@ -52,7 +59,7 @@ class InvalidSedonaUDFType(Exception): def sedona_vectorized_udf( - return_type: DataType, udf_type: SedonaUDFType = SedonaUDFType.SHAPELY_SCALAR + return_type: DataType, udf_type: SedonaUDFType = SedonaUDFType.SHAPELY_SCALAR ): import geopandas as gpd @@ -93,7 +100,7 @@ def apply_fn(fn): def _apply_shapely_series_udf( - fn, return_type: DataType, serialize_geom: bool, deserialize_geom: bool + fn, return_type: DataType, serialize_geom: bool, deserialize_geom: bool ): def apply(series: pd.Series) -> pd.Series: applied = series.apply( @@ -114,7 +121,7 @@ def apply(series: pd.Series) -> pd.Series: def _apply_geo_series_udf( - fn, return_type: DataType, serialize_geom: bool, deserialize_geom: bool + fn, return_type: DataType, serialize_geom: bool, deserialize_geom: bool ): import geopandas as gpd @@ -166,7 +173,11 @@ def infer_pa_type(spark_type: DataType): def infer_input_type(spark_type: DataType): if isinstance(spark_type, GeometryType): return sedona_udf_module.GEOMETRY - elif isinstance(spark_type, FloatType) or isinstance(spark_type, DoubleType) or isinstance(spark_type, IntegerType): + elif ( + isinstance(spark_type, FloatType) + or isinstance(spark_type, DoubleType) + or isinstance(spark_type, IntegerType) + ): return sedona_udf_module.NUMERIC elif isinstance(spark_type, StringType): return sedona_udf_module.STRING @@ -186,12 +197,12 @@ def infer_input_types(spark_types: list[DataType]): def sedona_db_vectorized_udf( - return_type: DataType, - input_types: list[DataType], + return_type: DataType, + input_types: list[DataType], ): - eval_type = 6201 + eval_type = 6200 if has_sedona_serializer_speedup(): - eval_type = 6200 + eval_type = 6201 def apply_fn(fn): out_type = infer_pa_type(return_type) 
diff --git a/python/sedona/spark/utils/udf.py b/python/sedona/spark/utils/udf.py index 01a38a675a9..0f88ef07f29 100644 --- a/python/sedona/spark/utils/udf.py +++ b/python/sedona/spark/utils/udf.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import shapely @@ -8,7 +25,8 @@ def has_sedona_serializer_speedup(): return False return True -def to_sedona_func(arr): + +def to_sedona(arr): try: from . import geomserde_speedup except ImportError: @@ -17,7 +35,7 @@ def to_sedona_func(arr): return geomserde_speedup.to_sedona_func(arr) -def from_sedona_func(arr): +def from_sedona(arr): try: from . import geomserde_speedup except ImportError: diff --git a/python/sedona/spark/worker/__init__.py b/python/sedona/spark/worker/__init__.py index e69de29bb2d..13a83393a91 100644 --- a/python/sedona/spark/worker/__init__.py +++ b/python/sedona/spark/worker/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/python/sedona/spark/worker/daemon.py b/python/sedona/spark/worker/daemon.py index 0d64a543c56..ce75e376eac 100644 --- a/python/sedona/spark/worker/daemon.py +++ b/python/sedona/spark/worker/daemon.py @@ -1,19 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at # -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# http://www.apache.org/licenses/LICENSE-2.0 # +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import logging import numbers import os @@ -39,16 +40,23 @@ def compute_real_exit_code(exit_code): else: return 1 + logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) -file_handler = logging.FileHandler("/Users/pawelkocinski/Desktop/projects/sedonaworker/sedonaworker/logs/worker_daemon_main.log", delay=False) +file_handler = logging.FileHandler( + "/Users/pawelkocinski/Desktop/projects/sedonaworker/sedonaworker/logs/worker_daemon_main.log", + delay=False, +) file_handler.flush = file_handler.stream.flush logger.addHandler(file_handler) + def worker(sock, authenticated): - logger.info("Starting worker process with pid =" + str(os.getpid()) + " socket " + str(sock)) + logger.info( + "Starting worker process with pid =" + str(os.getpid()) + " socket " + str(sock) + ) """ Called by a worker process after the fork(). """ @@ -69,10 +77,10 @@ def worker(sock, authenticated): if not authenticated: client_secret = UTF8Deserializer().loads(infile) if os.environ["PYTHON_WORKER_FACTORY_SECRET"] == client_secret: - write_with_length("ok".encode("utf-8"), outfile) + write_with_length(b"ok", outfile) outfile.flush() else: - write_with_length("err".encode("utf-8"), outfile) + write_with_length(b"err", outfile) outfile.flush() sock.close() return 1 @@ -132,7 +140,7 @@ def handle_sigterm(*args): while True: try: ready_fds = select.select([0, listen_sock], [], [], 1)[0] - except select.error as ex: + except OSError as ex: if ex[0] == EINTR: continue else: @@ -186,7 +194,7 @@ def handle_sigterm(*args): # Therefore, here we redirects it to '/dev/null' by duplicating # another file descriptor for '/dev/null' to the standard input (0). # See SPARK-26175. - devnull = open(os.devnull, "r") + devnull = open(os.devnull) os.dup2(devnull.fileno(), 0) devnull.close() diff --git a/python/sedona/spark/worker/serde.py b/python/sedona/spark/worker/serde.py index 5a33a266105..52e7b663a58 100644 --- a/python/sedona/spark/worker/serde.py +++ b/python/sedona/spark/worker/serde.py @@ -1,11 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + from pyspark.serializers import write_int, SpecialLengths from pyspark.sql.pandas.serializers import ArrowStreamPandasSerializer from sedona.spark.worker.udf_info import UDFInfo + class SedonaDBSerializer(ArrowStreamPandasSerializer): def __init__(self, timezone, safecheck, db, udf_info: UDFInfo, cast_to_wkb=False): - super(SedonaDBSerializer, self).__init__(timezone, safecheck) + super().__init__(timezone, safecheck) self.db = db self.udf_info = udf_info self.cast_to_wkb = cast_to_wkb @@ -18,12 +36,15 @@ def load_stream(self, stream): for batch in batches: table = pa.Table.from_batches(batches=[batch]) import pyarrow as pa + df = self.db.create_data_frame(table) table_name = f"my_table_{index}" df.to_view(table_name) - sql_expression = self.udf_info.sedona_db_transformation_expr(table_name, self.cast_to_wkb) + sql_expression = self.udf_info.sedona_db_transformation_expr( + table_name, self.cast_to_wkb + ) index += 1 diff --git a/python/sedona/spark/worker/udf_info.py b/python/sedona/spark/worker/udf_info.py index 7853133e77f..eb278a1511c 100644 --- a/python/sedona/spark/worker/udf_info.py +++ b/python/sedona/spark/worker/udf_info.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from dataclasses import dataclass from sedona.spark import GeometryType @@ -15,11 +32,15 @@ def get_function_call_sql(self, table_name: str, cast_to_wkb: bool = False) -> s arg_offset_str = ", ".join([f"_{el}" for el in self.arg_offsets]) function_expr = f"{self.name}({arg_offset_str})" if isinstance(self.return_type, GeometryType) and cast_to_wkb: - return f"SELECT ST_GeomToSedonaSpark({function_expr}) AS _0 FROM {table_name}" + return ( + f"SELECT ST_GeomToSedonaSpark({function_expr}) AS _0 FROM {table_name}" + ) return f"SELECT {function_expr} AS _0 FROM {table_name}" - def sedona_db_transformation_expr(self, table_name: str, cast_to_wkb: bool = False) -> str: + def sedona_db_transformation_expr( + self, table_name: str, cast_to_wkb: bool = False + ) -> str: fields = [] for arg in self.arg_offsets: if arg in self.geom_offsets and cast_to_wkb: diff --git a/python/sedona/spark/worker/worker.py b/python/sedona/spark/worker/worker.py index 17dae02e637..02fedf00581 100644 --- a/python/sedona/spark/worker/worker.py +++ b/python/sedona/spark/worker/worker.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import importlib import os import sys @@ -8,8 +25,16 @@ from pyspark.errors import PySparkRuntimeError from pyspark.java_gateway import local_connect_and_auth from pyspark.resource import ResourceInformation -from pyspark.serializers import read_int, UTF8Deserializer, read_bool, read_long, CPickleSerializer, write_int, \ - write_long, SpecialLengths +from pyspark.serializers import ( + read_int, + UTF8Deserializer, + read_bool, + read_long, + CPickleSerializer, + write_int, + write_long, + SpecialLengths, +) from sedona.spark.worker.serde import SedonaDBSerializer from sedona.spark.worker.udf_info import UDFInfo @@ -18,11 +43,13 @@ def apply_iterator(db, iterator, udf_info: UDFInfo, cast_to_wkb: bool = False): i = 0 for df in iterator: - i+=1 + i += 1 table_name = f"output_table_{i}" df.to_view(table_name) - function_call_sql = udf_info.get_function_call_sql(table_name, cast_to_wkb=cast_to_wkb) + function_call_sql = udf_info.get_function_call_sql( + table_name, cast_to_wkb=cast_to_wkb + ) df_out = db.sql(function_call_sql) @@ -30,8 +57,7 @@ def apply_iterator(db, iterator, udf_info: UDFInfo, cast_to_wkb: bool = False): at = df_out.to_arrow_table() batches = at.combine_chunks().to_batches() - for batch in batches: - yield batch + yield from batches def check_python_version(utf_serde: UTF8Deserializer, infile) -> str: @@ -50,6 +76,7 @@ def check_python_version(utf_serde: UTF8Deserializer, infile) -> str: return version + def check_barrier_flag(infile): is_barrier = read_bool(infile) bound_port = read_int(infile) @@ -66,6 +93,7 @@ def check_barrier_flag(infile): return is_barrier + def assign_task_context(utf_serde: UTF8Deserializer, infile): stage_id = read_int(infile) partition_id = read_int(infile) @@ -97,6 +125,7 @@ def assign_task_context(utf_serde: UTF8Deserializer, infile): return task_context + def resolve_python_path(utf_serde: UTF8Deserializer, infile): def add_path(path: str): # worker can be used, so do not add path multiple times @@ -131,6 +160,7 @@ def check_broadcast_variables(infile): }, ) + def get_runner_conf(utf_serde: UTF8Deserializer, infile): runner_conf = {} num_conf = read_int(infile) @@ -145,6 +175,7 @@ def read_command(serializer, infile): command = serializer._read_with_length(infile) return command + def read_udf(infile, pickle_ser) -> UDFInfo: num_arg = read_int(infile) arg_offsets = [read_int(infile) for i in range(num_arg)] @@ -162,9 +193,10 @@ def read_udf(infile, pickle_ser) -> UDFInfo: function=sedona_db_udf_expression, return_type=return_type, name=sedona_db_udf_expression._name, - geom_offsets=[0] + geom_offsets=[0], ) + def register_sedona_db_udf(infile, pickle_ser) -> UDFInfo: num_udfs = read_int(infile) @@ -237,7 +269,7 @@ def main(infile, outfile): safecheck=False, db=sedona_db, udf_info=udf, - cast_to_wkb=cast_to_wkb + cast_to_wkb=cast_to_wkb, ) number_of_geometries = read_int(infile) @@ -251,13 +283,13 @@ def main(infile, outfile): udf.geom_offsets = geom_offsets iterator = serde.load_stream(infile) - out_iterator = apply_iterator(db=sedona_db, iterator=iterator, udf_info=udf, cast_to_wkb=cast_to_wkb) + out_iterator = apply_iterator( + 
db=sedona_db, iterator=iterator, udf_info=udf, cast_to_wkb=cast_to_wkb + ) serde.dump_stream(out_iterator, outfile) - write_statistics( - infile, outfile, boot_time=boot_time, init_time=init_time - ) + write_statistics(infile, outfile, boot_time=boot_time, init_time=init_time) if __name__ == "__main__": diff --git a/python/setup.py b/python/setup.py index 66ab74701b0..ae5e7bf1746 100644 --- a/python/setup.py +++ b/python/setup.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from setuptools import setup import numpy diff --git a/python/src/geomserde_speedup_module.c b/python/src/geomserde_speedup_module.c index 621f956cd03..1d7aefcd772 100644 --- a/python/src/geomserde_speedup_module.c +++ b/python/src/geomserde_speedup_module.c @@ -19,16 +19,14 @@ #define PY_SSIZE_T_CLEAN #include +#include +#include +#include #include -// -//#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION #include "geomserde.h" #include "geos_c_dyn.h" #include "pygeos/c_api.h" -#include -#include -#include PyDoc_STRVAR(module_doc, "Geometry serialization/deserialization module."); @@ -230,7 +228,7 @@ static PyObject *serialize(PyObject *self, PyObject *args) { return do_serialize(geos_geom); } -static PyObject *deserialize_2(PyObject *self, PyObject *args) { +static PyObject *deserialize(PyObject *self, PyObject *args) { GEOSContextHandle_t handle = NULL; int length = 0; GEOSGeometry *geom = do_deserialize(args, &handle, &length); @@ -268,86 +266,86 @@ static PyObject *deserialize_1(PyObject *self, PyObject *args) { } static PyObject *to_sedona_func(PyObject *self, PyObject *args) { - import_array(); - PyObject *input_obj = NULL; - if (!PyArg_ParseTuple(args, "O", &input_obj)){ - return NULL; - }; - - PyArrayObject *array = (PyArrayObject *)input_obj; - PyObject **objs = (PyObject **)PyArray_DATA(array); - - GEOSContextHandle_t handle = get_geos_context_handle(); - if (handle == NULL) { - return NULL; - } - - npy_intp n = PyArray_SIZE(input_obj); - npy_intp dims[1] = {n}; - PyArrayObject *out = (PyArrayObject *)PyArray_SimpleNew(1, dims, NPY_OBJECT); - for (npy_intp i = 0; i < PyArray_SIZE(array); i++) { - PyObject *obj = objs[i]; - GEOSGeometry *geos_geom = NULL; - char success = PyGEOS_GetGEOSGeometry(obj, &geos_geom); - - PyObject *serialized = do_serialize(geos_geom); - PyArray_SETITEM(out, PyArray_GETPTR1(out, i), serialized); - } + import_array(); + PyObject *input_obj = NULL; + if (!PyArg_ParseTuple(args, "O", &input_obj)) { + return NULL; + }; + + PyArrayObject *array = (PyArrayObject *)input_obj; + PyObject **objs = (PyObject **)PyArray_DATA(array); - return out; + GEOSContextHandle_t handle = get_geos_context_handle(); + if (handle == NULL) { + return NULL; + } + + npy_intp n = PyArray_SIZE(input_obj); + npy_intp dims[1] = {n}; + PyArrayObject *out = 
(PyArrayObject *)PyArray_SimpleNew(1, dims, NPY_OBJECT); + for (npy_intp i = 0; i < PyArray_SIZE(array); i++) { + PyObject *obj = objs[i]; + GEOSGeometry *geos_geom = NULL; + char success = PyGEOS_GetGEOSGeometry(obj, &geos_geom); + + PyObject *serialized = do_serialize(geos_geom); + PyArray_SETITEM(out, PyArray_GETPTR1(out, i), serialized); + } + + return out; } /* Module definition for Shapely 2.x */ static PyObject *from_sedona_func(PyObject *self, PyObject *args) { - import_array(); - PyObject *input_obj = NULL; - if (!PyArg_ParseTuple(args, "O", &input_obj)){ - return NULL; - }; + import_array(); + PyObject *input_obj = NULL; + if (!PyArg_ParseTuple(args, "O", &input_obj)) { + return NULL; + }; - GEOSContextHandle_t handle = get_geos_context_handle(); + GEOSContextHandle_t handle = get_geos_context_handle(); - PyArrayObject *array = (PyArrayObject *)input_obj; - PyObject **objs = (PyObject **)PyArray_DATA(array); + PyArrayObject *array = (PyArrayObject *)input_obj; + PyObject **objs = (PyObject **)PyArray_DATA(array); - int p_bytes_read = 0; + int p_bytes_read = 0; - npy_intp n = PyArray_SIZE(input_obj); + npy_intp n = PyArray_SIZE(input_obj); - npy_intp dims[1] = {n}; - PyArrayObject *out = (PyArrayObject *)PyArray_SimpleNew(1, dims, NPY_OBJECT); + npy_intp dims[1] = {n}; + PyArrayObject *out = (PyArrayObject *)PyArray_SimpleNew(1, dims, NPY_OBJECT); - for (npy_intp i = 0; i < PyArray_SIZE(array); i++) { - PyObject *obj = objs[i]; - if (!PyBytes_Check(obj)) { - PyErr_SetString(PyExc_TypeError, "Expected bytes"); - return NULL; - } + for (npy_intp i = 0; i < PyArray_SIZE(array); i++) { + PyObject *obj = objs[i]; + if (!PyBytes_Check(obj)) { + PyErr_SetString(PyExc_TypeError, "Expected bytes"); + return NULL; + } - char *buf = PyBytes_AS_STRING(obj); + char *buf = PyBytes_AS_STRING(obj); - Py_ssize_t len = PyBytes_GET_SIZE(obj); + Py_ssize_t len = PyBytes_GET_SIZE(obj); - GEOSGeometry *geom = NULL; + GEOSGeometry *geom = NULL; - SedonaErrorCode err = sedona_deserialize_geom(handle, buf, len, &geom, &p_bytes_read); - if (err != SEDONA_SUCCESS) { - handle_geomserde_error(err); - return NULL; - } - PyObject *pygeom = PyGEOS_CreateGeometry(geom, handle); + SedonaErrorCode err = + sedona_deserialize_geom(handle, buf, len, &geom, &p_bytes_read); + if (err != SEDONA_SUCCESS) { + handle_geomserde_error(err); + return NULL; + } + PyObject *pygeom = PyGEOS_CreateGeometry(geom, handle); - PyArray_SETITEM(out, PyArray_GETPTR1(out, i), pygeom); - } + PyArray_SETITEM(out, PyArray_GETPTR1(out, i), pygeom); + } - return out; + return out; } - static PyMethodDef geomserde_methods_shapely_2[] = { {"load_libgeos_c", load_libgeos_c, METH_VARARGS, "Load libgeos_c."}, {"serialize", serialize, METH_VARARGS, "Serialize geometry object as bytearray."}, - {"deserialize_2", deserialize_2, METH_VARARGS, + {"deserialize", deserialize, METH_VARARGS, "Deserialize bytes-like object to geometry object."}, {"from_sedona_func", from_sedona_func, METH_VARARGS, "Deserialize bytes-like object to geometry object."}, @@ -355,18 +353,6 @@ static PyMethodDef geomserde_methods_shapely_2[] = { "Deserialize bytes-like object to geometry object."}, {NULL, NULL, 0, NULL}, /* Sentinel */ }; -// -//static int add_from_sedona_func_to_module(PyObject *m) { -// PyObject *capsule = PyCapsule_New((void *)from_sedona_func, "from_sedona_func", NULL); -// if (capsule == NULL) { -// return -1; -// } -// if (PyModule_AddObject(m, "from_sedona_func", capsule) < 0) { -// Py_DECREF(capsule); -// return -1; -// } -// return 0; -//} static struct 
PyModuleDef geomserde_module_shapely_2 = { PyModuleDef_HEAD_INIT, "geomserde_speedup", module_doc, 0, diff --git a/python/tests/test_base.py b/python/tests/test_base.py index e240a09758c..300d937d278 100644 --- a/python/tests/test_base.py +++ b/python/tests/test_base.py @@ -70,11 +70,15 @@ def spark(self): "spark.sedona.stac.load.itemsLimitMax", "20", ) - .config("spark.executor.memory", "10G") \ - .config("spark.driver.memory", "10G") \ - .config("sedona.python.worker.udf.daemon.module", "sedona.spark.worker.daemon") \ - .config("sedona.python.worker.daemon.enabled", "false") \ - # Pandas on PySpark doesn't work with ANSI mode, which is enabled by default + .config("spark.executor.memory", "10G") + .config("spark.driver.memory", "10G") + .config( + "sedona.python.worker.udf.daemon.module", + "sedona.spark.worker.daemon", + ) + .config( + "sedona.python.worker.daemon.enabled", "false" + ) # Pandas on PySpark doesn't work with ANSI mode, which is enabled by default # in Spark 4 .config("spark.sql.ansi.enabled", "false") ) diff --git a/python/tests/utils/test_sedona_db_vectorized_udf.py b/python/tests/utils/test_sedona_db_vectorized_udf.py index 4b266384fa9..eea84eec91f 100644 --- a/python/tests/utils/test_sedona_db_vectorized_udf.py +++ b/python/tests/utils/test_sedona_db_vectorized_udf.py @@ -1,9 +1,24 @@ -import time +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. import numpy as np from sedona.spark.sql.functions import sedona_db_vectorized_udf -from sedona.spark.utils.udf import to_sedona_func, from_sedona_func +from sedona.spark.utils.udf import to_sedona, from_sedona from tests.test_base import TestBase import pyarrow as pa import shapely @@ -11,107 +26,21 @@ from pyspark.sql.functions import expr, lit from pyspark.sql.types import DoubleType, IntegerType, ByteType from sedona.spark.sql import ST_X -from shapely._enum import ParamEnum - -def test_m(): - on_invalid="raise" - wkb = b'\x12\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?' - geometry = np.asarray([wkb, wkb], dtype=object) - - DecodingErrorOptions = ParamEnum( - "DecodingErrorOptions", {"ignore": 0, "warn": 1, "raise": 2, "fix": 3} - ) - - # print("sss") - - - # - # object - # C_CONTIGUOUS : True - # F_CONTIGUOUS : True - # OWNDATA : False - # WRITEABLE : True - # ALIGNED : True - # WRITEBACKIFCOPY : False - # print(type(geometry)) - # print(geometry.dtype) - # print(geometry.flags) - - result = from_sedona_func(geometry) - - result2 = to_sedona_func(result) - -# ensure the input has object dtype, to avoid numpy inferring it as a -# fixed-length string dtype (which removes trailing null bytes upon access -# of array elements) - # - # def from_sedona_func(arr): - # try: - # from . 
import sedonaserde_vectorized_udf_module - # print(sedonaserde_vectorized_udf_module.from_sedona_func_3(arr)) - # except Exception as e: - # print("Cannot import sedonaserde_vectorized_udf_module:") - # print(e) - # # print() - # return None -# -# def from_wkb(geometry, on_invalid="raise", **kwargs): -# r"""Create geometries from the Well-Known Binary (WKB) representation. -# -# The Well-Known Binary format is defined in the `OGC Simple Features -# Specification for SQL `__. -# -# Parameters -# ---------- -# geometry : str or array_like -# The WKB byte object(s) to convert. -# on_invalid : {"raise", "warn", "ignore", "fix"}, default "raise" -# Indicates what to do when an invalid WKB is encountered. Note that the -# validations involved are very basic, e.g. the minimum number of points -# for the geometry type. For a thorough check, use :func:`is_valid` after -# conversion to geometries. Valid options are: -# -# - raise: an exception will be raised if any input geometry is invalid. -# - warn: a warning will be raised and invalid WKT geometries will be -# returned as ``None``. -# - ignore: invalid geometries will be returned as ``None`` without a -# warning. -# - fix: an effort is made to fix invalid input geometries (currently just -# unclosed rings). If this is not possible, they are returned as -# ``None`` without a warning. Requires GEOS >= 3.11. -# -# .. versionadded:: 2.1.0 -# **kwargs -# See :ref:`NumPy ufunc docs ` for other keyword arguments. -# -# Examples -# -------- -# >>> import shapely -# >>> shapely.from_wkb(b'\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?') -# -# -# """ # noqa: E501 -# if not np.isscalar(on_invalid): -# raise TypeError("on_invalid only accepts scalar values") -# -# invalid_handler = np.uint8(DecodingErrorOptions.get_value(on_invalid)) -# -# # ensure the input has object dtype, to avoid numpy inferring it as a -# # fixed-length string dtype (which removes trailing null bytes upon access -# # of array elements) -# geometry = np.asarray(geometry, dtype=object) -# return lib.from_wkb(geometry, invalid_handler, **kwargs) + class TestSedonaDBArrowFunction(TestBase): def test_vectorized_udf(self): - @sedona_db_vectorized_udf(return_type=GeometryType(), input_types=[GeometryType(), IntegerType()]) + @sedona_db_vectorized_udf( + return_type=GeometryType(), input_types=[ByteType(), IntegerType()] + ) def my_own_function(geom, distance): geom_wkb = pa.array(geom.storage.to_array()) + geometry_array = np.asarray(geom_wkb, dtype=object) distance = pa.array(distance.to_array()) - geom = shapely.from_wkb(geom_wkb) + geom = from_sedona(geometry_array) result_shapely = shapely.centroid(geom) - return pa.array(shapely.to_wkb(result_shapely)) + return pa.array(to_sedona(result_shapely)) df = self.spark.createDataFrame( [ @@ -125,56 +54,61 @@ def my_own_function(geom, distance): df.select(ST_X(my_own_function(df.wkt, lit(100)).alias("geom"))).show() def test_geometry_to_double(self): - @sedona_db_vectorized_udf(return_type=DoubleType(), input_types=[GeometryType()]) + @sedona_db_vectorized_udf(return_type=DoubleType(), input_types=[ByteType()]) def geometry_to_non_geometry_udf(geom): geom_wkb = pa.array(geom.storage.to_array()) - geom = shapely.from_wkb(geom_wkb) + geometry_array = np.asarray(geom_wkb, dtype=object) + geom = from_sedona(geometry_array) result_shapely = shapely.get_x(shapely.centroid(geom)) - return pa.array(result_shapely, pa.float64()) + return pa.array(result_shapely) df = self.spark.createDataFrame( [(1, "POINT (1 1)"), (2, 
"POINT (2 2)"), (3, "POINT (3 3)")], ["id", "wkt"], ).withColumn("wkt", expr("ST_GeomFromWKT(wkt)")) - values = df.select(geometry_to_non_geometry_udf(df.wkt).alias("x_coord")) \ - .collect() + values = df.select( + geometry_to_non_geometry_udf(df.wkt).alias("x_coord") + ).collect() values_list = [row["x_coord"] for row in values] assert values_list == [1.0, 2.0, 3.0] def test_geometry_to_int(self): - @sedona_db_vectorized_udf(return_type=IntegerType(), input_types=[GeometryType()]) + @sedona_db_vectorized_udf(return_type=IntegerType(), input_types=[ByteType()]) def geometry_to_int(geom): geom_wkb = pa.array(geom.storage.to_array()) - geom = shapely.from_wkb(geom_wkb) + geometry_array = np.asarray(geom_wkb, dtype=object) + + geom = from_sedona(geometry_array) result_shapely = shapely.get_num_points(geom) - return pa.array(result_shapely, pa.int32()) + return pa.array(result_shapely) df = self.spark.createDataFrame( [(1, "POINT (1 1)"), (2, "POINT (2 2)"), (3, "POINT (3 3)")], ["id", "wkt"], ).withColumn("wkt", expr("ST_GeomFromWKT(wkt)")) - values = df.select(geometry_to_int(df.wkt)) \ - .collect() + values = df.select(geometry_to_int(df.wkt)).collect() values_list = [row[0] for row in values] assert values_list == [0, 0, 0] def test_geometry_crs_preservation(self): - @sedona_db_vectorized_udf(return_type=GeometryType(), input_types=[GeometryType()]) + @sedona_db_vectorized_udf(return_type=GeometryType(), input_types=[ByteType()]) def return_same_geometry(geom): geom_wkb = pa.array(geom.storage.to_array()) - geom = shapely.from_wkb(geom_wkb) + geometry_array = np.asarray(geom_wkb, dtype=object) + + geom = from_sedona(geometry_array) - return pa.array(shapely.to_wkb(geom)) + return pa.array(to_sedona(geom)) df = self.spark.createDataFrame( [(1, "POINT (1 1)"), (2, "POINT (2 2)"), (3, "POINT (3 3)")], @@ -183,70 +117,8 @@ def return_same_geometry(geom): result_df = df.select(return_same_geometry(df.wkt).alias("geom")) - crs_list = result_df.selectExpr("ST_SRID(geom)").rdd.flatMap(lambda x: x).collect() + crs_list = ( + result_df.selectExpr("ST_SRID(geom)").rdd.flatMap(lambda x: x).collect() + ) assert crs_list == [3857, 3857, 3857] - - def test_geometry_to_geometry(self): - @sedona_db_vectorized_udf(return_type=GeometryType(), input_types=[ByteType()]) - def buffer_geometry(geom): - geom_wkb = pa.array(geom.storage.to_array()) - geometry_array = np.asarray(geom_wkb, dtype=object) - geom = from_sedona_func(geometry_array) - - result_shapely = shapely.buffer(geom, 10) - - return pa.array(to_sedona_func(result_shapely)) - - df = self.spark.read.\ - format("geoparquet").\ - load("/Users/pawelkocinski/Desktop/projects/sedona-production/apache-sedona-book/book/source_data/transportation_barcelona_dupl_l1") - - # 1 045 770 - # print(df.count()) - - # df.unionAll(df).unionAll(df).unionAll(df).unionAll(df).unionAll(df).\ - # unionAll(df).unionAll(df).unionAll(df).unionAll(df).unionAll(df).\ - # write.format("geoparquet").mode("overwrite").save("/Users/pawelkocinski/Desktop/projects/sedona-production/apache-sedona-book/book/source_data/transportation_barcelona_dupl_l2") - # 18 24 - # df.union(df).union(df).union(df).union(df).union(df).union(df).\ - # write.format("geoparquet").mode("overwrite").save("/Users/pawelkocinski/Desktop/projects/sedona-production/apache-sedona-book/data/warehouse/buildings_large_3") - - values = df.select(buffer_geometry(df.geometry).alias("geometry")).\ - selectExpr("ST_Area(geometry) as area").\ - selectExpr("Sum(area) as total_area") - - values.show() - - # for _ in 
range(4): - # start_time = time.time() - # values.show() - # end_time = time.time() - # print(f"Execution time: {end_time - start_time} seconds") - - def test_geometry_to_geometry_normal_udf(self): - from pyspark.sql.functions import udf - - def create_buffer(geom): - return geom.buffer(10) - - create_buffer_udf = udf(create_buffer, GeometryType()) - - df = self.spark.read. \ - format("geoparquet"). \ - load("/Users/pawelkocinski/Desktop/projects/sedona-production/apache-sedona-book/book/source_data/transportation_barcelona_dupl_l2") - - # print(df.count()) - # df.limit(10).collect() - values = df.select(create_buffer_udf(df.geometry).alias("geometry")). \ - selectExpr("ST_Area(geometry) as area"). \ - selectExpr("Sum(area) as total_area") - - values.show() - - # for _ in range(4): - # start_time = time.time() - # values.show() - # end_time = time.time() - # print(f"Execution time: {end_time - start_time} seconds") -# 1 045 770 diff --git a/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/StrategySuite.scala b/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/StrategySuite.scala index 4fe4acfb121..94ce194c654 100644 --- a/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/StrategySuite.scala +++ b/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/StrategySuite.scala @@ -73,21 +73,11 @@ class StrategySuite extends TestBaseScala with Matchers { .withColumn("geometry", expr("ST_SetSRID(geom, '4326')")) .select(sedonaDBGeometryToGeometryFunction(col("geometry"), lit(100)).alias("geom")) - dfVectorized.selectExpr("ST_X(ST_Centroid(geom)) AS x") + dfVectorized + .selectExpr("ST_X(ST_Centroid(geom)) AS x") .selectExpr("sum(x)") .as[Double] - .collect().head shouldEqual 101 -// -// val dfCopied = sparkSession.read -// .format("geoparquet") -// .load( -// "/Users/pawelkocinski/Desktop/projects/sedona-production/apache-sedona-book/book/source_data/transportation_barcelona/barcelona.geoparquet") -// -// val values = dfCopied -// .select(sedonaDBGeometryToGeometryFunction(col("geometry"), lit(10)).alias("geom")) -// .selectExpr("ST_Area(geom) as area") -// .selectExpr("Sum(area) as total_area") -// -// values.show() + .collect() + .head shouldEqual 101 } } From 0fd81806a87ebc8cdaf50d75dccbd1a6bf7e16e3 Mon Sep 17 00:00:00 2001 From: pawelkocinski Date: Wed, 14 Jan 2026 16:17:48 +0100 Subject: [PATCH 07/11] add sedonadb sedona udf worker example --- python/pyproject.toml | 49 +++++++++++-------- python/sedona/spark/utils/geometry_serde.py | 6 ++- python/sedona/spark/worker/daemon.py | 17 ------- python/src/geom_buf.c | 2 - python/src/geomserde.c | 1 - python/src/geomserde_speedup_module.c | 33 ++++++++++++- python/tests/test_base.py | 2 +- .../apache/sedona/spark/SedonaContext.scala | 1 + .../python/SedonaPythonArrowInput.scala | 30 ------------ .../python/SedonaPythonArrowOutput.scala | 5 -- .../sedona/sql/SQLSyntaxTestScala.scala | 8 +-- .../org/apache/sedona/sql/TestBaseScala.scala | 22 +++------ .../spark/sql/udf/TestScalarPandasUDF.scala | 3 -- 13 files changed, 77 insertions(+), 102 deletions(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index 37159cf83b9..8b2a06a5d37 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -26,7 +26,7 @@ description = "Apache Sedona is a cluster computing system for processing large- readme = "README.md" license = { text = "Apache-2.0" } authors = [ { name = "Apache Sedona", email = "dev@sedona.apache.org" } ] -requires-python = ">=3.12" +requires-python = ">=3.8" classifiers = [ "Programming Language :: Python :: 
3", "License :: OSI Approved :: Apache Software License", @@ -50,16 +50,38 @@ kepler-map = ["geopandas", "keplergl==0.3.2"] flink = ["apache-flink>=1.19.0"] db = ["sedonadb[geopandas]; python_version >= '3.9'"] all = [ -# "pyspark>=3.4.0,<4.1.0", -# "geopandas", -# "pydeck==0.8.0", -# "keplergl==0.3.2", -# "rasterio>=1.2.10", + "pyspark>=3.4.0,<4.1.0", + "geopandas", + "pydeck==0.8.0", + "keplergl==0.3.2", + "rasterio>=1.2.10", ] [dependency-groups] dev = [ - "pytest>=9.0.2", + "pytest", + "pytest-cov", + "notebook==6.4.12", + "jupyter", + "mkdocs", + "scikit-learn", + "esda", + "libpysal", + "matplotlib", # implicit dependency of esda + # prevent incompatibility with pysal 4.7.0, which is what is resolved to when shapely >2 is specified + "scipy<=1.10.0", + "pandas>=2.0.0", + "numpy<2", + "geopandas", + # https://stackoverflow.com/questions/78949093/how-to-resolve-attributeerror-module-fiona-has-no-attribute-path + # cannot set geopandas>=0.14.4 since it doesn't support python 3.8, so we pin fiona to <1.10.0 + "fiona<1.10.0", + "pyarrow", + "pyspark>=3.4.0,<4.1.0", + "keplergl==0.3.2", + "pydeck==0.8.0", + "pystac==1.5.0", + "rasterio>=1.2.10", ] [project.urls] @@ -80,20 +102,7 @@ exclude = ["*.tests", "*.tests.*", "tests", "tests.*"] name = "sedona.spark.utils.geomserde_speedup" sources = [ "src/geomserde_speedup_module.c", - "src/sedonaserde_vectorized_udf_module.c", "src/geomserde.c", "src/geom_buf.c", "src/geos_c_dyn.c", ] - -[[tool.setuptools.ext-modules]] -name = "sedona.spark.utils.sedonaserde_vectorized_udf_module" -sources = [ - "src/sedonaserde_vectorized_udf_module.c", - "src/geomserde.c", - "src/geom_buf.c", - "src/geos_c_dyn.c", -] - -[tool.uv.sources] -sedonadb = { path = "../../../sedona-db/target/wheels/sedonadb-0.3.0-cp312-cp312-macosx_11_0_arm64.whl" } diff --git a/python/sedona/spark/utils/geometry_serde.py b/python/sedona/spark/utils/geometry_serde.py index 0ef3d4ed5cb..103eb498173 100644 --- a/python/sedona/spark/utils/geometry_serde.py +++ b/python/sedona/spark/utils/geometry_serde.py @@ -25,6 +25,9 @@ speedup_enabled = False + +# Use geomserde_speedup when available, otherwise fallback to general pure +# python implementation. try: from . import geomserde_speedup @@ -57,9 +60,8 @@ def find_geos_c_dll(): def deserialize(buf: bytearray) -> Optional[BaseGeometry]: if buf is None: return None - return geomserde_speedup.deserialize_2(buf) + return geomserde_speedup.deserialize(buf) - # Export the from_sedona_func for use with numpy ufuncs speedup_enabled = True elif shapely.__version__.startswith("1."): diff --git a/python/sedona/spark/worker/daemon.py b/python/sedona/spark/worker/daemon.py index ce75e376eac..266baf76d5d 100644 --- a/python/sedona/spark/worker/daemon.py +++ b/python/sedona/spark/worker/daemon.py @@ -40,23 +40,7 @@ def compute_real_exit_code(exit_code): else: return 1 - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - -file_handler = logging.FileHandler( - "/Users/pawelkocinski/Desktop/projects/sedonaworker/sedonaworker/logs/worker_daemon_main.log", - delay=False, -) -file_handler.flush = file_handler.stream.flush - -logger.addHandler(file_handler) - - def worker(sock, authenticated): - logger.info( - "Starting worker process with pid =" + str(os.getpid()) + " socket " + str(sock) - ) """ Called by a worker process after the fork(). 
""" @@ -207,7 +191,6 @@ def handle_sigterm(*args): authenticated = False while True: code = worker(sock, authenticated) - logger.info("Worker exited with code %d", code) if code == 0: authenticated = True if not reuse or code: diff --git a/python/src/geom_buf.c b/python/src/geom_buf.c index d6a51bb3d0e..5239de5ae09 100644 --- a/python/src/geom_buf.c +++ b/python/src/geom_buf.c @@ -208,8 +208,6 @@ SedonaErrorCode geom_buf_alloc(GeomBuffer *geom_buf, return SEDONA_SUCCESS; } -#include - SedonaErrorCode read_geom_buf_header(const char *buf, int buf_size, GeomBuffer *geom_buf, CoordinateSequenceInfo *cs_info, diff --git a/python/src/geomserde.c b/python/src/geomserde.c index 81dafe216f0..c1f74277383 100644 --- a/python/src/geomserde.c +++ b/python/src/geomserde.c @@ -718,7 +718,6 @@ static SedonaErrorCode deserialize_geom_buf(GEOSContextHandle_t handle, return SEDONA_SUCCESS; } -#include SedonaErrorCode sedona_deserialize_geom(GEOSContextHandle_t handle, const char *buf, int buf_size, GEOSGeometry **p_geom, diff --git a/python/src/geomserde_speedup_module.c b/python/src/geomserde_speedup_module.c index 1d7aefcd772..610c4d1b05b 100644 --- a/python/src/geomserde_speedup_module.c +++ b/python/src/geomserde_speedup_module.c @@ -287,9 +287,24 @@ static PyObject *to_sedona_func(PyObject *self, PyObject *args) { PyObject *obj = objs[i]; GEOSGeometry *geos_geom = NULL; char success = PyGEOS_GetGEOSGeometry(obj, &geos_geom); + if (!success || geos_geom == NULL) { + PyErr_SetString(PyExc_TypeError, "Invalid GEOS geometry"); + Py_DECREF(out); + return NULL; + } PyObject *serialized = do_serialize(geos_geom); - PyArray_SETITEM(out, PyArray_GETPTR1(out, i), serialized); + if (!serialized) { + Py_DECREF(out); + return NULL; + } + + if (PyArray_SETITEM(out, PyArray_GETPTR1(out, i), serialized) < 0) { + Py_DECREF(serialized); + Py_DECREF(out); + return NULL; + } + Py_DECREF(serialized); } return out; @@ -318,6 +333,8 @@ static PyObject *from_sedona_func(PyObject *self, PyObject *args) { PyObject *obj = objs[i]; if (!PyBytes_Check(obj)) { PyErr_SetString(PyExc_TypeError, "Expected bytes"); + Py_DECREF(out); + return NULL; } @@ -331,11 +348,23 @@ static PyObject *from_sedona_func(PyObject *self, PyObject *args) { sedona_deserialize_geom(handle, buf, len, &geom, &p_bytes_read); if (err != SEDONA_SUCCESS) { handle_geomserde_error(err); + Py_DECREF(out); return NULL; } + PyObject *pygeom = PyGEOS_CreateGeometry(geom, handle); + if (!pygeom) { + Py_DECREF(out); + return NULL; + } + + if (PyArray_SETITEM(out, PyArray_GETPTR1(out, i), pygeom) < 0) { + Py_DECREF(pygeom); + Py_DECREF(out); + return NULL; + } - PyArray_SETITEM(out, PyArray_GETPTR1(out, i), pygeom); + Py_DECREF(pygeom); } return out; diff --git a/python/tests/test_base.py b/python/tests/test_base.py index 300d937d278..39749302072 100644 --- a/python/tests/test_base.py +++ b/python/tests/test_base.py @@ -77,7 +77,7 @@ def spark(self): "sedona.spark.worker.daemon", ) .config( - "sedona.python.worker.daemon.enabled", "false" + "sedona.python.worker.daemon.enabled", "true" ) # Pandas on PySpark doesn't work with ANSI mode, which is enabled by default # in Spark 4 .config("spark.sql.ansi.enabled", "false") diff --git a/spark/common/src/main/scala/org/apache/sedona/spark/SedonaContext.scala b/spark/common/src/main/scala/org/apache/sedona/spark/SedonaContext.scala index add3caf225a..c9e8497f7ed 100644 --- a/spark/common/src/main/scala/org/apache/sedona/spark/SedonaContext.scala +++ b/spark/common/src/main/scala/org/apache/sedona/spark/SedonaContext.scala @@ 
-41,6 +41,7 @@ class InternalApi( extends StaticAnnotation object SedonaContext { + private def customOptimizationsWithSession(sparkSession: SparkSession) = Seq( new TransformNestedUDTParquet(sparkSession), diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowInput.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowInput.scala index 2544e63a979..66029673519 100644 --- a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowInput.scala +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowInput.scala @@ -18,23 +18,6 @@ */ package org.apache.spark.sql.execution.python -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - import org.apache.arrow.vector.VectorSchemaRoot import org.apache.arrow.vector.ipc.ArrowStreamWriter import org.apache.spark.sql.catalyst.InternalRow @@ -87,21 +70,8 @@ private[python] trait SedonaPythonArrowInput[IN] extends PythonArrowInput[IN] { writeIteratorToArrowStream(root, writer, dataOut, inputIterator) - // end writes footer to the output stream and doesn't clean any resources. - // It could throw exception if the output stream is closed, so it should be - // in the try block. writer.end() } { - // If we close root and allocator in TaskCompletionListener, there could be a race - // condition where the writer thread keeps writing to the VectorSchemaRoot while - // it's being closed by the TaskCompletion listener. - // Closing root and allocator here is cleaner because root and allocator is owned - // by the writer thread and is only visible to the writer thread. - // - // If the writer thread is interrupted by TaskCompletionListener, it should either - // (1) in the try block, in which case it will get an InterruptedException when - // performing io, and goes into the finally block or (2) in the finally block, - // in which case it will ignore the interruption and close the resources. 
root.close() allocator.close() } diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala index 27764c2a54f..8940a376a22 100644 --- a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala @@ -99,11 +99,6 @@ private[python] trait SedonaPythonArrowOutput[OUT <: AnyRef] { self: BasePythonR } eos = true } -// def handleEndOfDataSectionSedona(): Unit = { -// if (stream.readInt() == SpecialLengths.END_OF_STREAM) {} -// -// eos = true -// } protected override def handleEndOfDataSection(): Unit = { handleEndOfDataSectionSedona() diff --git a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/SQLSyntaxTestScala.scala b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/SQLSyntaxTestScala.scala index 72a27461f69..6f873d0a087 100644 --- a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/SQLSyntaxTestScala.scala +++ b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/SQLSyntaxTestScala.scala @@ -47,11 +47,11 @@ class SQLSyntaxTestScala extends TestBaseScala with TableDrivenPropertyChecks { try { sparkSession.sql("CREATE TABLE T_TEST_EXPLICIT_GEOMETRY (GEO_COL GEOMETRY)") sparkSession.catalog.tableExists("T_TEST_EXPLICIT_GEOMETRY") should be(true) -// sparkSession.sparkContext.getConf.get(keyParserExtension) should be("true") + sparkSession.sparkContext.getConf.get(keyParserExtension) should be("true") } catch { case ex: Exception => ex.getClass.getName.endsWith("ParseException") should be(true) -// sparkSession.sparkContext.getConf.get(keyParserExtension) should be("false") + sparkSession.sparkContext.getConf.get(keyParserExtension) should be("false") } } @@ -61,11 +61,11 @@ class SQLSyntaxTestScala extends TestBaseScala with TableDrivenPropertyChecks { sparkSession.sql( "CREATE TABLE T_TEST_EXPLICIT_GEOMETRY_2 (INT_COL INT, GEO_COL GEOMETRY)") sparkSession.catalog.tableExists("T_TEST_EXPLICIT_GEOMETRY_2") should be(true) -// sparkSession.sparkContext.getConf.get(keyParserExtension) should be("true") + sparkSession.sparkContext.getConf.get(keyParserExtension) should be("true") } catch { case ex: Exception => ex.getClass.getName.endsWith("ParseException") should be(true) -// sparkSession.sparkContext.getConf.get(keyParserExtension) should be("false") + sparkSession.sparkContext.getConf.get(keyParserExtension) should be("false") } } } diff --git a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala index c9b4d6ac289..50d751f4842 100644 --- a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala +++ b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala @@ -30,13 +30,13 @@ import java.io.FileInputStream import java.util.concurrent.ThreadLocalRandom trait TestBaseScala extends FunSpec with BeforeAndAfterAll { -// Logger.getRootLogger().setLevel(Level.WARN) -// Logger.getLogger("org.apache").setLevel(Level.WARN) -// Logger.getLogger("com").setLevel(Level.WARN) -// Logger.getLogger("akka").setLevel(Level.WARN) -// Logger.getLogger("org.apache.sedona.core").setLevel(Level.WARN) + Logger.getRootLogger().setLevel(Level.WARN) + Logger.getLogger("org.apache").setLevel(Level.WARN) + Logger.getLogger("com").setLevel(Level.WARN) + 
Logger.getLogger("akka").setLevel(Level.WARN) + Logger.getLogger("org.apache.sedona.core").setLevel(Level.WARN) -// val keyParserExtension = "spark.sedona.enableParserExtension" + val keyParserExtension = "spark.sedona.enableParserExtension" val warehouseLocation = System.getProperty("user.dir") + "/target/" val sparkSession = SedonaContext .builder() @@ -49,17 +49,9 @@ trait TestBaseScala extends FunSpec with BeforeAndAfterAll { .config("sedona.python.worker.udf.module", "sedona.spark.worker.worker") .config("sedona.python.worker.udf.daemon.module", "sedonaworker.daemon") .config("sedona.python.worker.daemon.enabled", "false") -// .config(keyParserExtension, ThreadLocalRandom.current().nextBoolean()) + .config(keyParserExtension, ThreadLocalRandom.current().nextBoolean()) .getOrCreate() -// private val useDaemon: Boolean = -// SparkEnv.get.conf.getBoolean("sedona.python.worker.daemon.enabled", false) -// -// private val sedonaUDFWorkerModule = -// SparkEnv.get.conf.get("sedona.python.worker.udf.module", "sedona.spark.worker.worker") -// -// private val sedonaDaemonModule = -// SparkEnv.get.conf.get("sedona.python.worker.udf.daemon.module", "sedona.spark.worker.daemon") val sparkSessionMinio = SedonaContext .builder() .master("local[*]") diff --git a/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/TestScalarPandasUDF.scala b/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/TestScalarPandasUDF.scala index 23aac14bbe5..d2c0d71c703 100644 --- a/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/TestScalarPandasUDF.scala +++ b/spark/spark-3.5/src/test/scala/org/apache/spark/sql/udf/TestScalarPandasUDF.scala @@ -45,9 +45,6 @@ object ScalarUDF { } } - SparkEnv.get.conf.set(PYTHON_USE_DAEMON, false) - SparkEnv.get.conf.set(PYTHON_WORKER_MODULE, "sedonaworker.work") - private[spark] lazy val pythonPath = sys.env.getOrElse("PYTHONPATH", "") protected lazy val sparkHome: String = { sys.props.getOrElse("spark.test.home", sys.env("SPARK_HOME")) From 04d5cfc269dc3513efcf70602eea4c15b1703cd3 Mon Sep 17 00:00:00 2001 From: pawelkocinski Date: Wed, 14 Jan 2026 20:26:32 +0100 Subject: [PATCH 08/11] fix pre commit --- python/sedona/spark/worker/daemon.py | 6 ++-- python/sedona/spark/worker/udf_info.py | 12 ++++---- python/sedona/spark/worker/worker.py | 2 +- python/src/geomserde_speedup_module.c | 28 +++++++++---------- .../python/SedonaPythonArrowOutput.scala | 10 +------ 5 files changed, 25 insertions(+), 33 deletions(-) diff --git a/python/sedona/spark/worker/daemon.py b/python/sedona/spark/worker/daemon.py index 266baf76d5d..0c03dde5b8a 100644 --- a/python/sedona/spark/worker/daemon.py +++ b/python/sedona/spark/worker/daemon.py @@ -15,7 +15,6 @@ # specific language governing permissions and limitations # under the License. -import logging import numbers import os import signal @@ -40,6 +39,7 @@ def compute_real_exit_code(exit_code): else: return 1 + def worker(sock, authenticated): """ Called by a worker process after the fork(). 
@@ -77,7 +77,7 @@ def worker(sock, authenticated): finally: try: outfile.flush() - except Exception: + except Exception: # nosec pass return exit_code @@ -198,7 +198,7 @@ def handle_sigterm(*args): try: while sock.recv(1024): pass - except Exception: + except Exception: # nosec pass break gc.collect() diff --git a/python/sedona/spark/worker/udf_info.py b/python/sedona/spark/worker/udf_info.py index eb278a1511c..32a0833f51f 100644 --- a/python/sedona/spark/worker/udf_info.py +++ b/python/sedona/spark/worker/udf_info.py @@ -32,11 +32,9 @@ def get_function_call_sql(self, table_name: str, cast_to_wkb: bool = False) -> s arg_offset_str = ", ".join([f"_{el}" for el in self.arg_offsets]) function_expr = f"{self.name}({arg_offset_str})" if isinstance(self.return_type, GeometryType) and cast_to_wkb: - return ( - f"SELECT ST_GeomToSedonaSpark({function_expr}) AS _0 FROM {table_name}" - ) + return f"SELECT ST_GeomToSedonaSpark({function_expr}) AS _0 FROM {table_name}" # nosec - return f"SELECT {function_expr} AS _0 FROM {table_name}" + return f"SELECT {function_expr} AS _0 FROM {table_name}" # nosec def sedona_db_transformation_expr( self, table_name: str, cast_to_wkb: bool = False @@ -45,10 +43,12 @@ def sedona_db_transformation_expr( for arg in self.arg_offsets: if arg in self.geom_offsets and cast_to_wkb: crs = self.geom_offsets[arg] - fields.append(f"ST_GeomFromSedonaSpark(_{arg}, 'EPSG:{crs}') AS _{arg}") + fields.append( + f"ST_GeomFromSedonaSpark(_{arg}, 'EPSG:{crs}') AS _{arg}" + ) # nosec continue fields.append(f"_{arg}") fields_expr = ", ".join(fields) - return f"SELECT {fields_expr} FROM {table_name}" + return f"SELECT {fields_expr} FROM {table_name}" # nosec diff --git a/python/sedona/spark/worker/worker.py b/python/sedona/spark/worker/worker.py index 02fedf00581..e31d8e76d1f 100644 --- a/python/sedona/spark/worker/worker.py +++ b/python/sedona/spark/worker/worker.py @@ -117,7 +117,7 @@ def assign_task_context(utf_serde: UTF8Deserializer, infile): addresses.append(utf_serde.loads(infile)) task_context._resources[key] = ResourceInformation(name, addresses) - task_context._localProperties = dict() + task_context._localProperties = {} for i in range(read_int(infile)): k = utf_serde.loads(infile) v = utf_serde.loads(infile) diff --git a/python/src/geomserde_speedup_module.c b/python/src/geomserde_speedup_module.c index 610c4d1b05b..99358d6fd76 100644 --- a/python/src/geomserde_speedup_module.c +++ b/python/src/geomserde_speedup_module.c @@ -288,21 +288,21 @@ static PyObject *to_sedona_func(PyObject *self, PyObject *args) { GEOSGeometry *geos_geom = NULL; char success = PyGEOS_GetGEOSGeometry(obj, &geos_geom); if (!success || geos_geom == NULL) { - PyErr_SetString(PyExc_TypeError, "Invalid GEOS geometry"); - Py_DECREF(out); - return NULL; + PyErr_SetString(PyExc_TypeError, "Invalid GEOS geometry"); + Py_DECREF(out); + return NULL; } PyObject *serialized = do_serialize(geos_geom); if (!serialized) { - Py_DECREF(out); - return NULL; + Py_DECREF(out); + return NULL; } if (PyArray_SETITEM(out, PyArray_GETPTR1(out, i), serialized) < 0) { - Py_DECREF(serialized); - Py_DECREF(out); - return NULL; + Py_DECREF(serialized); + Py_DECREF(out); + return NULL; } Py_DECREF(serialized); } @@ -354,14 +354,14 @@ static PyObject *from_sedona_func(PyObject *self, PyObject *args) { PyObject *pygeom = PyGEOS_CreateGeometry(geom, handle); if (!pygeom) { - Py_DECREF(out); - return NULL; + Py_DECREF(out); + return NULL; } - if (PyArray_SETITEM(out, PyArray_GETPTR1(out, i), pygeom) < 0) { - Py_DECREF(pygeom); - 
Py_DECREF(out); - return NULL; + if (PyArray_SETITEM(out, PyArray_GETPTR1(out, i), pygeom) < 0) { + Py_DECREF(pygeom); + Py_DECREF(out); + return NULL; } Py_DECREF(pygeom); diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala index 8940a376a22..d0316052601 100644 --- a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/python/SedonaPythonArrowOutput.scala @@ -83,15 +83,7 @@ private[python] trait SedonaPythonArrowOutput[OUT <: AnyRef] { self: BasePythonR private var batchLoaded = true protected def handleEndOfDataSectionSedona(): Unit = { - // We've finished the data section of the output, but we can still - // read some accumulator updates: -// val numAccumulatorUpdates = stream.readInt() -// (1 to numAccumulatorUpdates).foreach { _ => -// val updateLen = stream.readInt() -// val update = new Array[Byte](updateLen) -// stream.readFully(update) -// } - // Check whether the worker is ready to be re-used. + // Check whether the worker is ready to be reused. if (stream.readInt() == SpecialLengths.END_OF_STREAM) { if (reuseWorker && releasedOrClosed.compareAndSet(false, true)) { WorkerContext.releasePythonWorker(pythonExec, envVars.asScala.toMap, worker) From 84f75dcb9e39d875d9773675daebbc94cb73872a Mon Sep 17 00:00:00 2001 From: pawelkocinski Date: Wed, 14 Jan 2026 20:30:43 +0100 Subject: [PATCH 09/11] fix pre commit --- pom.xml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 613e310983f..071f5233d85 100644 --- a/pom.xml +++ b/pom.xml @@ -631,7 +631,8 @@ org.apache.maven.plugins maven-javadoc-plugin - 3.12.0 + + 2.10.4 attach-javadocs From e4cab0c51eea081fdf7d51ae48307bdb2953400e Mon Sep 17 00:00:00 2001 From: pawelkocinski Date: Wed, 14 Jan 2026 23:57:53 +0100 Subject: [PATCH 10/11] fix pre commit --- python/pyproject.toml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index 8b2a06a5d37..7795f73962d 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -26,20 +26,17 @@ description = "Apache Sedona is a cluster computing system for processing large- readme = "README.md" license = { text = "Apache-2.0" } authors = [ { name = "Apache Sedona", email = "dev@sedona.apache.org" } ] -requires-python = ">=3.8" +requires-python = ">=3.9" classifiers = [ "Programming Language :: Python :: 3", "License :: OSI Approved :: Apache Software License", ] dependencies = [ "attrs", - "geoarrow-c>=0.3.1", - "geoarrow-pyarrow>=0.2.0", - "geopandas>=1.1.2", - "numpy>=2.1.3", "pyarrow>=16.1.0", "pyspark==3.5.4", "sedonadb", + "setuptools==80.9.0", "shapely>=1.7.0", ] From b415c53c31163535b305cf04dd17df8160473b99 Mon Sep 17 00:00:00 2001 From: pawelkocinski Date: Thu, 15 Jan 2026 00:00:08 +0100 Subject: [PATCH 11/11] fix pre commit --- .github/workflows/python.yml | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 05782bbae9e..9f1159e57cf 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -85,10 +85,10 @@ jobs: scala: '2.12.8' java: '11' python: '3.9' - - spark: '3.5.0' - scala: '2.12.8' - java: '11' - python: '3.8' +# - spark: '3.5.0' +# scala: '2.12.8' +# java: '11' +# python: '3.8' 
- spark: '3.4.0' scala: '2.12.8' java: '11' @@ -101,15 +101,15 @@ jobs: scala: '2.12.8' java: '11' python: '3.9' - - spark: '3.4.0' - scala: '2.12.8' - java: '11' - python: '3.8' - - spark: '3.4.0' - scala: '2.12.8' - java: '11' - python: '3.8' - shapely: '1' +# - spark: '3.4.0' +# scala: '2.12.8' +# java: '11' +# python: '3.8' +# - spark: '3.4.0' +# scala: '2.12.8' +# java: '11' +# python: '3.8' +# shapely: '1' steps: - uses: actions/checkout@v6
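
Note (illustrative, not part of the patch): the vectorized-UDF tests in this series show the end-to-end pattern for the new SedonaDB worker. The sketch below is a minimal usage example assembled from those tests; the function name `centroid_x`, the sample DataFrame, and the `spark` session variable are illustrative assumptions, while `sedona_db_vectorized_udf`, `from_sedona`, and `to_sedona` are the APIs exercised by the tests above.

    import numpy as np
    import pyarrow as pa
    import shapely
    from pyspark.sql.functions import expr
    from pyspark.sql.types import ByteType, DoubleType

    from sedona.spark.sql.functions import sedona_db_vectorized_udf
    from sedona.spark.utils.udf import from_sedona, to_sedona

    # Geometry inputs arrive as Arrow arrays of Sedona-serialized bytes,
    # declared as ByteType() in input_types (as in the tests in this patch).
    @sedona_db_vectorized_udf(return_type=DoubleType(), input_types=[ByteType()])
    def centroid_x(geom):
        # Arrow storage -> object-dtype numpy array of serialized geometries.
        geom_bytes = pa.array(geom.storage.to_array())
        geometry_array = np.asarray(geom_bytes, dtype=object)
        geoms = from_sedona(geometry_array)  # Sedona bytes -> shapely geometries
        # Return a plain Arrow array; use to_sedona(...) instead when the
        # return_type is GeometryType().
        return pa.array(shapely.get_x(shapely.centroid(geoms)))

    # `spark` is assumed to be a Sedona-enabled SparkSession.
    df = (
        spark.createDataFrame([(1, "POINT (1 1)"), (2, "POINT (2 2)")], ["id", "wkt"])
        .withColumn("wkt", expr("ST_GeomFromWKT(wkt)"))
    )
    df.select(centroid_x(df.wkt).alias("x")).show()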
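
The test setup in this series drives the new Python worker through three Spark configuration keys. A hedged illustration of wiring them up from PySpark follows; the keys and module names come from the patch, whereas the builder pattern and the choice to enable the daemon are assumptions, not defaults established here.

    from sedona.spark import SedonaContext

    config = (
        SedonaContext.builder()
        # Module that runs the SedonaDB vectorized-UDF worker.
        .config("sedona.python.worker.udf.module", "sedona.spark.worker.worker")
        # Daemon module used when worker processes are forked from a daemon.
        .config("sedona.python.worker.udf.daemon.module", "sedona.spark.worker.daemon")
        # Toggle daemon-based worker reuse (the tests exercise both settings).
        .config("sedona.python.worker.daemon.enabled", "true")
        .getOrCreate()
    )
    spark = SedonaContext.create(config)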