From f6d9436fb8f0098335e5eed36257b76d4a8cf599 Mon Sep 17 00:00:00 2001 From: Ivor Bosloper Date: Mon, 8 Dec 2025 11:16:48 +0100 Subject: [PATCH 01/12] Implement DuckDB-based converter --- fiboa_cli/conversion/duckdb.py | 111 +++++++++++++++++++++++++++++++++ fiboa_cli/datasets/jp.py | 21 ++----- 2 files changed, 115 insertions(+), 17 deletions(-) create mode 100644 fiboa_cli/conversion/duckdb.py diff --git a/fiboa_cli/conversion/duckdb.py b/fiboa_cli/conversion/duckdb.py new file mode 100644 index 0000000..932cb42 --- /dev/null +++ b/fiboa_cli/conversion/duckdb.py @@ -0,0 +1,111 @@ +import json +import os + +import duckdb +from vecorel_cli.encoding.geojson import VecorelJSONEncoder + +from .fiboa_converter import FiboaBaseConverter + + +class FiboaDuckDBBaseConverter(FiboaBaseConverter): + def convert( + self, + output_file, + cache=None, + input_files=None, + variant=None, + compression=None, + geoparquet_version=None, + original_geometries=False, + **kwargs, + ) -> str: + self.variant = variant + cid = self.id.strip() + if self.bbox is not None and len(self.bbox) != 4: + raise ValueError("If provided, the bounding box must consist of 4 numbers") + + # Create output folder if it doesn't exist + directory = os.path.dirname(output_file) + if directory: + os.makedirs(directory, exist_ok=True) + + if input_files is not None and isinstance(input_files, dict) and len(input_files) > 0: + self.warning("Using user provided input file(s) instead of the pre-defined file(s)") + urls = input_files + else: + urls = self.get_urls() + if urls is None: + raise ValueError("No input files provided") + + selections = [] + geom_column = None + for k, v in self.columns.items(): + if k in self.column_migrations: + selections.append(f'{self.column_migrations.get(k)} as "{v}"') + else: + selections.append(f'"{k}" as "{v}"') + if v == "geometry": + geom_column = k + + filters = [] + where = "" + if self.bbox is not None: + filters.append( + f"ST_Intersects(geometry, ST_MakeEnvelope({self.bbox[0]}, {self.bbox[1]}, {self.bbox[2]}, {self.bbox[3]}))" + ) + for k, v in self.column_filters.items(): + filters.append(v) + if len(filters) > 0: + where = f"WHERE {' AND '.join(filters)}" + + selection = ", ".join(selections) + if isinstance(urls, str): + sources = f'"{urls}"' + else: + sources = "[" + ",".join([f'"{url}"' for url in urls]) + "]" + + _collection = self.create_collection(cid) + _collection.update(self.column_additions) + collection = json.dumps(_collection, cls=VecorelJSONEncoder).encode("utf-8") + + # TODO how to get metadata ARROW:schema ? + # from vecorel_cli.parquet.types import get_pyarrow_field + # schemas = _collection.merge_schemas({}) + # props = schemas.get("properties", {}) + # pq_fields = [] + # for column in self.columns.values(): + # schema = props.get(column, {}) + # dtype = schema.get("type") + # if dtype is None: + # self.warning(f"{column}: No mapping") + # continue + # try: + # field = get_pyarrow_field(column, schema=schema) + # pq_fields.append(field) + # except Exception as e: + # self.warning(f"{column}: Skipped - {e}") + # + # pq_schema = pa.schema(pq_fields) + # pq_schema = pq_schema.with_metadata({"collection": collection}) + + con = duckdb.connect() + con.install_extension("spatial") + con.load_extension("spatial") + con.execute( + f""" + COPY ( + SELECT {selection} FROM read_parquet({sources}) + {where} + ORDER BY ST_Hilbert({geom_column}) + ) TO ? ( + FORMAT parquet, + compression 'brotli', + KV_METADATA {{ + collection: ?, + }} + ) + """, + [output_file, collection], + ) + + return output_file diff --git a/fiboa_cli/datasets/jp.py b/fiboa_cli/datasets/jp.py index 57c31a2..1799c7f 100644 --- a/fiboa_cli/datasets/jp.py +++ b/fiboa_cli/datasets/jp.py @@ -1,10 +1,9 @@ -import pandas as pd +from fiboa_cli.conversion.duckdb import FiboaDuckDBBaseConverter -from ..conversion.fiboa_converter import FiboaBaseConverter - -class JPConverter(FiboaBaseConverter): +class JPConverter(FiboaDuckDBBaseConverter): variants = { + "test": "./tests/data-files/convert/jp/jp_field_polygons_2024.parquet", "2024": "https://data.source.coop/pacificspatial/field-polygon-jp/parquet/jp_field_polygons_2024.parquet", "2023": "https://data.source.coop/pacificspatial/field-polygon-jp/parquet/jp_field_polygons_2023.parquet", "2022": "https://data.source.coop/pacificspatial/field-polygon-jp/parquet/jp_field_polygons_2022.parquet", @@ -30,23 +29,11 @@ class JPConverter(FiboaBaseConverter): "polygon_uuid": "id", "land_type_en": "land_type_en", "local_government_cd": "admin_local_code", - "issue_year": "determination:datetime", - } - column_migrations = { - "issue_year": lambda col: pd.to_datetime(col, format="%Y"), } - + column_additions = {"determination:datetime": "2024-01-01T00:00:00Z"} missing_schemas = { "properties": { "land_type_en": {"type": "string"}, "admin_local_code": {"type": "string"}, } } - - def convert(self, *args, **kwargs): - # Open only these columns to limit memory usage - super().convert( - *args, - columns=["GEOM", "polygon_uuid", "land_type_en", "local_government_cd", "issue_year"], - **kwargs, - ) From 6f52d9732e935cacbac9edb4c6979ba79e9eb03c Mon Sep 17 00:00:00 2001 From: Ivor Bosloper Date: Mon, 8 Dec 2025 14:13:14 +0100 Subject: [PATCH 02/12] Not sure if this is right --- fiboa_cli/conversion/duckdb.py | 40 ++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/fiboa_cli/conversion/duckdb.py b/fiboa_cli/conversion/duckdb.py index 932cb42..a43b7ff 100644 --- a/fiboa_cli/conversion/duckdb.py +++ b/fiboa_cli/conversion/duckdb.py @@ -2,7 +2,9 @@ import os import duckdb +import pyarrow as pa from vecorel_cli.encoding.geojson import VecorelJSONEncoder +from vecorel_cli.parquet.types import get_pyarrow_field from .fiboa_converter import FiboaBaseConverter @@ -68,24 +70,23 @@ def convert( _collection.update(self.column_additions) collection = json.dumps(_collection, cls=VecorelJSONEncoder).encode("utf-8") - # TODO how to get metadata ARROW:schema ? - # from vecorel_cli.parquet.types import get_pyarrow_field - # schemas = _collection.merge_schemas({}) - # props = schemas.get("properties", {}) - # pq_fields = [] - # for column in self.columns.values(): - # schema = props.get(column, {}) - # dtype = schema.get("type") - # if dtype is None: - # self.warning(f"{column}: No mapping") - # continue - # try: - # field = get_pyarrow_field(column, schema=schema) - # pq_fields.append(field) - # except Exception as e: - # self.warning(f"{column}: Skipped - {e}") - # - # pq_schema = pa.schema(pq_fields) + schemas = _collection.merge_schemas({}) + props = schemas.get("properties", {}) + pq_fields = [] + for column in self.columns.values(): + schema = props.get(column, {}) + dtype = schema.get("type") + if dtype is None: + self.warning(f"{column}: No mapping") + continue + try: + field = get_pyarrow_field(column, schema=schema) + pq_fields.append(field) + except Exception as e: + self.warning(f"{column}: Skipped - {e}") + + pq_schema = pa.schema(pq_fields) + schema_bytes = pq_schema.serialize().to_pybytes() # pq_schema = pq_schema.with_metadata({"collection": collection}) con = duckdb.connect() @@ -102,10 +103,11 @@ def convert( compression 'brotli', KV_METADATA {{ collection: ?, + "PYARROW:schema": ? }} ) """, - [output_file, collection], + [output_file, collection, schema_bytes], ) return output_file From 0a854ad05de581e6c53464e25fb8062327c0cf0a Mon Sep 17 00:00:00 2001 From: Ivor Bosloper Date: Mon, 8 Dec 2025 21:42:18 +0100 Subject: [PATCH 03/12] Update project --- CHANGELOG.md | 1 + pixi.lock | 67 +++++++++++++++++++++++++++++++++++++++++++++++++- pyproject.toml | 1 + 3 files changed, 68 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a4960eb..8621520 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] +- Add DuckDB BaseConverter for efficiently transforming large datasets - Extend create_stac, include include fiboa data - Publish command; skip hidden files, generate better texts - Fix to vecorel: converter.license and provider should be string diff --git a/pixi.lock b/pixi.lock index f7ff81e..fc4ca0a 100644 --- a/pixi.lock +++ b/pixi.lock @@ -141,6 +141,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb8e6e7a_2.conda - pypi: https://files.pythonhosted.org/packages/f8/ed/e97229a566617f2ae958a6b13e7cc0f585470eac730a73e9e82c32a3cdd2/arrow-1.3.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/cb/8c/2b30c12155ad8de0cf641d76a8b396a16d2c36bc6d50b621a62b7c4567c1/build-1.3.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/82/29/153d1b4fc14c68e6766d7712d35a7ab6272a801c52160126ac7df681f758/duckdb-1.4.2-cp313-cp313-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl - pypi: https://files.pythonhosted.org/packages/3e/0d/424de6e5612f1399ff69bf86500d6a62ff0a4843979701ae97f120c7f1fe/flatdict-4.0.1.tar.gz - pypi: https://files.pythonhosted.org/packages/cf/58/8acf1b3e91c58313ce5cb67df61001fc9dcd21be4fadb76c1a2d540e09ed/fqdn-1.5.1-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/0b/70/d5cd0696eff08e62fdbdebe5b46527facb4e7220eabe0ac6225efab50168/geopandas-1.1.1-py3-none-any.whl @@ -306,6 +307,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-64/zstd-1.5.7-h8210216_2.conda - pypi: https://files.pythonhosted.org/packages/f8/ed/e97229a566617f2ae958a6b13e7cc0f585470eac730a73e9e82c32a3cdd2/arrow-1.3.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/cb/8c/2b30c12155ad8de0cf641d76a8b396a16d2c36bc6d50b621a62b7c4567c1/build-1.3.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/2c/b8/97f4f07d9459f5d262751cccfb2f4256debb8fe5ca92370cebe21aab1ee2/duckdb-1.4.2-cp313-cp313-macosx_10_13_x86_64.whl - pypi: https://files.pythonhosted.org/packages/3e/0d/424de6e5612f1399ff69bf86500d6a62ff0a4843979701ae97f120c7f1fe/flatdict-4.0.1.tar.gz - pypi: https://files.pythonhosted.org/packages/cf/58/8acf1b3e91c58313ce5cb67df61001fc9dcd21be4fadb76c1a2d540e09ed/fqdn-1.5.1-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/0b/70/d5cd0696eff08e62fdbdebe5b46527facb4e7220eabe0ac6225efab50168/geopandas-1.1.1-py3-none-any.whl @@ -472,6 +474,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-arm64/zstd-1.5.7-h6491c7d_2.conda - pypi: https://files.pythonhosted.org/packages/f8/ed/e97229a566617f2ae958a6b13e7cc0f585470eac730a73e9e82c32a3cdd2/arrow-1.3.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/cb/8c/2b30c12155ad8de0cf641d76a8b396a16d2c36bc6d50b621a62b7c4567c1/build-1.3.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/a4/ea/112f33ace03682bafd4aaf0a3336da689b9834663e7032b3d678fd2902c9/duckdb-1.4.2-cp313-cp313-macosx_11_0_arm64.whl - pypi: https://files.pythonhosted.org/packages/3e/0d/424de6e5612f1399ff69bf86500d6a62ff0a4843979701ae97f120c7f1fe/flatdict-4.0.1.tar.gz - pypi: https://files.pythonhosted.org/packages/cf/58/8acf1b3e91c58313ce5cb67df61001fc9dcd21be4fadb76c1a2d540e09ed/fqdn-1.5.1-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/0b/70/d5cd0696eff08e62fdbdebe5b46527facb4e7220eabe0ac6225efab50168/geopandas-1.1.1-py3-none-any.whl @@ -636,6 +639,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/win-64/zstd-1.5.7-hbeecb71_2.conda - pypi: https://files.pythonhosted.org/packages/f8/ed/e97229a566617f2ae958a6b13e7cc0f585470eac730a73e9e82c32a3cdd2/arrow-1.3.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/cb/8c/2b30c12155ad8de0cf641d76a8b396a16d2c36bc6d50b621a62b7c4567c1/build-1.3.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/58/b7/8d3a58b5ebfb9e79ed4030a0f2fbd7e404c52602e977b1e7ab51651816c7/duckdb-1.4.2-cp313-cp313-win_amd64.whl - pypi: https://files.pythonhosted.org/packages/3e/0d/424de6e5612f1399ff69bf86500d6a62ff0a4843979701ae97f120c7f1fe/flatdict-4.0.1.tar.gz - pypi: https://files.pythonhosted.org/packages/cf/58/8acf1b3e91c58313ce5cb67df61001fc9dcd21be4fadb76c1a2d540e09ed/fqdn-1.5.1-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/0b/70/d5cd0696eff08e62fdbdebe5b46527facb4e7220eabe0ac6225efab50168/geopandas-1.1.1-py3-none-any.whl @@ -793,6 +797,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/zstandard-0.25.0-py313h54dd161_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb8e6e7a_2.conda - pypi: https://files.pythonhosted.org/packages/f8/ed/e97229a566617f2ae958a6b13e7cc0f585470eac730a73e9e82c32a3cdd2/arrow-1.3.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/82/29/153d1b4fc14c68e6766d7712d35a7ab6272a801c52160126ac7df681f758/duckdb-1.4.2-cp313-cp313-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl - pypi: https://files.pythonhosted.org/packages/3e/0d/424de6e5612f1399ff69bf86500d6a62ff0a4843979701ae97f120c7f1fe/flatdict-4.0.1.tar.gz - pypi: https://files.pythonhosted.org/packages/cf/58/8acf1b3e91c58313ce5cb67df61001fc9dcd21be4fadb76c1a2d540e09ed/fqdn-1.5.1-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/0b/70/d5cd0696eff08e62fdbdebe5b46527facb4e7220eabe0ac6225efab50168/geopandas-1.1.1-py3-none-any.whl @@ -933,6 +938,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-64/zstandard-0.25.0-py313hcb05632_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/zstd-1.5.7-h8210216_2.conda - pypi: https://files.pythonhosted.org/packages/f8/ed/e97229a566617f2ae958a6b13e7cc0f585470eac730a73e9e82c32a3cdd2/arrow-1.3.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/2c/b8/97f4f07d9459f5d262751cccfb2f4256debb8fe5ca92370cebe21aab1ee2/duckdb-1.4.2-cp313-cp313-macosx_10_13_x86_64.whl - pypi: https://files.pythonhosted.org/packages/3e/0d/424de6e5612f1399ff69bf86500d6a62ff0a4843979701ae97f120c7f1fe/flatdict-4.0.1.tar.gz - pypi: https://files.pythonhosted.org/packages/cf/58/8acf1b3e91c58313ce5cb67df61001fc9dcd21be4fadb76c1a2d540e09ed/fqdn-1.5.1-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/0b/70/d5cd0696eff08e62fdbdebe5b46527facb4e7220eabe0ac6225efab50168/geopandas-1.1.1-py3-none-any.whl @@ -1074,6 +1080,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-arm64/zstandard-0.25.0-py313h9734d34_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/zstd-1.5.7-h6491c7d_2.conda - pypi: https://files.pythonhosted.org/packages/f8/ed/e97229a566617f2ae958a6b13e7cc0f585470eac730a73e9e82c32a3cdd2/arrow-1.3.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/a4/ea/112f33ace03682bafd4aaf0a3336da689b9834663e7032b3d678fd2902c9/duckdb-1.4.2-cp313-cp313-macosx_11_0_arm64.whl - pypi: https://files.pythonhosted.org/packages/3e/0d/424de6e5612f1399ff69bf86500d6a62ff0a4843979701ae97f120c7f1fe/flatdict-4.0.1.tar.gz - pypi: https://files.pythonhosted.org/packages/cf/58/8acf1b3e91c58313ce5cb67df61001fc9dcd21be4fadb76c1a2d540e09ed/fqdn-1.5.1-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/0b/70/d5cd0696eff08e62fdbdebe5b46527facb4e7220eabe0ac6225efab50168/geopandas-1.1.1-py3-none-any.whl @@ -1214,6 +1221,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/win-64/zstandard-0.25.0-py313h5fd188c_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/zstd-1.5.7-hbeecb71_2.conda - pypi: https://files.pythonhosted.org/packages/f8/ed/e97229a566617f2ae958a6b13e7cc0f585470eac730a73e9e82c32a3cdd2/arrow-1.3.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/58/b7/8d3a58b5ebfb9e79ed4030a0f2fbd7e404c52602e977b1e7ab51651816c7/duckdb-1.4.2-cp313-cp313-win_amd64.whl - pypi: https://files.pythonhosted.org/packages/3e/0d/424de6e5612f1399ff69bf86500d6a62ff0a4843979701ae97f120c7f1fe/flatdict-4.0.1.tar.gz - pypi: https://files.pythonhosted.org/packages/cf/58/8acf1b3e91c58313ce5cb67df61001fc9dcd21be4fadb76c1a2d540e09ed/fqdn-1.5.1-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/0b/70/d5cd0696eff08e62fdbdebe5b46527facb4e7220eabe0ac6225efab50168/geopandas-1.1.1-py3-none-any.whl @@ -1333,6 +1341,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/zstandard-0.25.0-py313h54dd161_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb8e6e7a_2.conda - pypi: https://files.pythonhosted.org/packages/f8/ed/e97229a566617f2ae958a6b13e7cc0f585470eac730a73e9e82c32a3cdd2/arrow-1.3.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/82/29/153d1b4fc14c68e6766d7712d35a7ab6272a801c52160126ac7df681f758/duckdb-1.4.2-cp313-cp313-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl - pypi: https://files.pythonhosted.org/packages/3e/0d/424de6e5612f1399ff69bf86500d6a62ff0a4843979701ae97f120c7f1fe/flatdict-4.0.1.tar.gz - pypi: https://files.pythonhosted.org/packages/cf/58/8acf1b3e91c58313ce5cb67df61001fc9dcd21be4fadb76c1a2d540e09ed/fqdn-1.5.1-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/0b/70/d5cd0696eff08e62fdbdebe5b46527facb4e7220eabe0ac6225efab50168/geopandas-1.1.1-py3-none-any.whl @@ -1436,6 +1445,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-64/zstandard-0.25.0-py313hcb05632_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/zstd-1.5.7-h8210216_2.conda - pypi: https://files.pythonhosted.org/packages/f8/ed/e97229a566617f2ae958a6b13e7cc0f585470eac730a73e9e82c32a3cdd2/arrow-1.3.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/2c/b8/97f4f07d9459f5d262751cccfb2f4256debb8fe5ca92370cebe21aab1ee2/duckdb-1.4.2-cp313-cp313-macosx_10_13_x86_64.whl - pypi: https://files.pythonhosted.org/packages/3e/0d/424de6e5612f1399ff69bf86500d6a62ff0a4843979701ae97f120c7f1fe/flatdict-4.0.1.tar.gz - pypi: https://files.pythonhosted.org/packages/cf/58/8acf1b3e91c58313ce5cb67df61001fc9dcd21be4fadb76c1a2d540e09ed/fqdn-1.5.1-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/0b/70/d5cd0696eff08e62fdbdebe5b46527facb4e7220eabe0ac6225efab50168/geopandas-1.1.1-py3-none-any.whl @@ -1540,6 +1550,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-arm64/zstandard-0.25.0-py313h9734d34_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/zstd-1.5.7-h6491c7d_2.conda - pypi: https://files.pythonhosted.org/packages/f8/ed/e97229a566617f2ae958a6b13e7cc0f585470eac730a73e9e82c32a3cdd2/arrow-1.3.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/a4/ea/112f33ace03682bafd4aaf0a3336da689b9834663e7032b3d678fd2902c9/duckdb-1.4.2-cp313-cp313-macosx_11_0_arm64.whl - pypi: https://files.pythonhosted.org/packages/3e/0d/424de6e5612f1399ff69bf86500d6a62ff0a4843979701ae97f120c7f1fe/flatdict-4.0.1.tar.gz - pypi: https://files.pythonhosted.org/packages/cf/58/8acf1b3e91c58313ce5cb67df61001fc9dcd21be4fadb76c1a2d540e09ed/fqdn-1.5.1-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/0b/70/d5cd0696eff08e62fdbdebe5b46527facb4e7220eabe0ac6225efab50168/geopandas-1.1.1-py3-none-any.whl @@ -1641,6 +1652,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/win-64/zstandard-0.25.0-py313h5fd188c_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/zstd-1.5.7-hbeecb71_2.conda - pypi: https://files.pythonhosted.org/packages/f8/ed/e97229a566617f2ae958a6b13e7cc0f585470eac730a73e9e82c32a3cdd2/arrow-1.3.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/58/b7/8d3a58b5ebfb9e79ed4030a0f2fbd7e404c52602e977b1e7ab51651816c7/duckdb-1.4.2-cp313-cp313-win_amd64.whl - pypi: https://files.pythonhosted.org/packages/3e/0d/424de6e5612f1399ff69bf86500d6a62ff0a4843979701ae97f120c7f1fe/flatdict-4.0.1.tar.gz - pypi: https://files.pythonhosted.org/packages/cf/58/8acf1b3e91c58313ce5cb67df61001fc9dcd21be4fadb76c1a2d540e09ed/fqdn-1.5.1-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/0b/70/d5cd0696eff08e62fdbdebe5b46527facb4e7220eabe0ac6225efab50168/geopandas-1.1.1-py3-none-any.whl @@ -1784,6 +1796,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb8e6e7a_2.conda - pypi: https://files.pythonhosted.org/packages/f8/ed/e97229a566617f2ae958a6b13e7cc0f585470eac730a73e9e82c32a3cdd2/arrow-1.3.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/cb/8c/2b30c12155ad8de0cf641d76a8b396a16d2c36bc6d50b621a62b7c4567c1/build-1.3.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/82/29/153d1b4fc14c68e6766d7712d35a7ab6272a801c52160126ac7df681f758/duckdb-1.4.2-cp313-cp313-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl - pypi: https://files.pythonhosted.org/packages/3e/0d/424de6e5612f1399ff69bf86500d6a62ff0a4843979701ae97f120c7f1fe/flatdict-4.0.1.tar.gz - pypi: https://files.pythonhosted.org/packages/cf/58/8acf1b3e91c58313ce5cb67df61001fc9dcd21be4fadb76c1a2d540e09ed/fqdn-1.5.1-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/0b/70/d5cd0696eff08e62fdbdebe5b46527facb4e7220eabe0ac6225efab50168/geopandas-1.1.1-py3-none-any.whl @@ -1912,6 +1925,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-64/zstd-1.5.7-h8210216_2.conda - pypi: https://files.pythonhosted.org/packages/f8/ed/e97229a566617f2ae958a6b13e7cc0f585470eac730a73e9e82c32a3cdd2/arrow-1.3.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/cb/8c/2b30c12155ad8de0cf641d76a8b396a16d2c36bc6d50b621a62b7c4567c1/build-1.3.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/2c/b8/97f4f07d9459f5d262751cccfb2f4256debb8fe5ca92370cebe21aab1ee2/duckdb-1.4.2-cp313-cp313-macosx_10_13_x86_64.whl - pypi: https://files.pythonhosted.org/packages/3e/0d/424de6e5612f1399ff69bf86500d6a62ff0a4843979701ae97f120c7f1fe/flatdict-4.0.1.tar.gz - pypi: https://files.pythonhosted.org/packages/cf/58/8acf1b3e91c58313ce5cb67df61001fc9dcd21be4fadb76c1a2d540e09ed/fqdn-1.5.1-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/0b/70/d5cd0696eff08e62fdbdebe5b46527facb4e7220eabe0ac6225efab50168/geopandas-1.1.1-py3-none-any.whl @@ -2041,6 +2055,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-arm64/zstd-1.5.7-h6491c7d_2.conda - pypi: https://files.pythonhosted.org/packages/f8/ed/e97229a566617f2ae958a6b13e7cc0f585470eac730a73e9e82c32a3cdd2/arrow-1.3.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/cb/8c/2b30c12155ad8de0cf641d76a8b396a16d2c36bc6d50b621a62b7c4567c1/build-1.3.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/a4/ea/112f33ace03682bafd4aaf0a3336da689b9834663e7032b3d678fd2902c9/duckdb-1.4.2-cp313-cp313-macosx_11_0_arm64.whl - pypi: https://files.pythonhosted.org/packages/3e/0d/424de6e5612f1399ff69bf86500d6a62ff0a4843979701ae97f120c7f1fe/flatdict-4.0.1.tar.gz - pypi: https://files.pythonhosted.org/packages/cf/58/8acf1b3e91c58313ce5cb67df61001fc9dcd21be4fadb76c1a2d540e09ed/fqdn-1.5.1-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/0b/70/d5cd0696eff08e62fdbdebe5b46527facb4e7220eabe0ac6225efab50168/geopandas-1.1.1-py3-none-any.whl @@ -2166,6 +2181,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/win-64/zstd-1.5.7-hbeecb71_2.conda - pypi: https://files.pythonhosted.org/packages/f8/ed/e97229a566617f2ae958a6b13e7cc0f585470eac730a73e9e82c32a3cdd2/arrow-1.3.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/cb/8c/2b30c12155ad8de0cf641d76a8b396a16d2c36bc6d50b621a62b7c4567c1/build-1.3.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/58/b7/8d3a58b5ebfb9e79ed4030a0f2fbd7e404c52602e977b1e7ab51651816c7/duckdb-1.4.2-cp313-cp313-win_amd64.whl - pypi: https://files.pythonhosted.org/packages/3e/0d/424de6e5612f1399ff69bf86500d6a62ff0a4843979701ae97f120c7f1fe/flatdict-4.0.1.tar.gz - pypi: https://files.pythonhosted.org/packages/cf/58/8acf1b3e91c58313ce5cb67df61001fc9dcd21be4fadb76c1a2d540e09ed/fqdn-1.5.1-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/0b/70/d5cd0696eff08e62fdbdebe5b46527facb4e7220eabe0ac6225efab50168/geopandas-1.1.1-py3-none-any.whl @@ -2903,6 +2919,54 @@ packages: - pkg:pypi/distlib?source=hash-mapping size: 275642 timestamp: 1752823081585 +- pypi: https://files.pythonhosted.org/packages/2c/b8/97f4f07d9459f5d262751cccfb2f4256debb8fe5ca92370cebe21aab1ee2/duckdb-1.4.2-cp313-cp313-macosx_10_13_x86_64.whl + name: duckdb + version: 1.4.2 + sha256: f1fac31babda2045d4cdefe6d0fd2ebdd8d4c2a333fbcc11607cfeaec202d18d + requires_dist: + - ipython ; extra == 'all' + - fsspec ; extra == 'all' + - numpy ; extra == 'all' + - pandas ; extra == 'all' + - pyarrow ; extra == 'all' + - adbc-driver-manager ; extra == 'all' + requires_python: '>=3.9.0' +- pypi: https://files.pythonhosted.org/packages/58/b7/8d3a58b5ebfb9e79ed4030a0f2fbd7e404c52602e977b1e7ab51651816c7/duckdb-1.4.2-cp313-cp313-win_amd64.whl + name: duckdb + version: 1.4.2 + sha256: 2f7c61617d2b1da3da5d7e215be616ad45aa3221c4b9e2c4d1c28ed09bc3c1c4 + requires_dist: + - ipython ; extra == 'all' + - fsspec ; extra == 'all' + - numpy ; extra == 'all' + - pandas ; extra == 'all' + - pyarrow ; extra == 'all' + - adbc-driver-manager ; extra == 'all' + requires_python: '>=3.9.0' +- pypi: https://files.pythonhosted.org/packages/82/29/153d1b4fc14c68e6766d7712d35a7ab6272a801c52160126ac7df681f758/duckdb-1.4.2-cp313-cp313-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl + name: duckdb + version: 1.4.2 + sha256: a456adbc3459c9dcd99052fad20bd5f8ef642be5b04d09590376b2eb3eb84f5c + requires_dist: + - ipython ; extra == 'all' + - fsspec ; extra == 'all' + - numpy ; extra == 'all' + - pandas ; extra == 'all' + - pyarrow ; extra == 'all' + - adbc-driver-manager ; extra == 'all' + requires_python: '>=3.9.0' +- pypi: https://files.pythonhosted.org/packages/a4/ea/112f33ace03682bafd4aaf0a3336da689b9834663e7032b3d678fd2902c9/duckdb-1.4.2-cp313-cp313-macosx_11_0_arm64.whl + name: duckdb + version: 1.4.2 + sha256: 43ac632f40ab1aede9b4ce3c09ea043f26f3db97b83c07c632c84ebd7f7c0f4a + requires_dist: + - ipython ; extra == 'all' + - fsspec ; extra == 'all' + - numpy ; extra == 'all' + - pandas ; extra == 'all' + - pyarrow ; extra == 'all' + - adbc-driver-manager ; extra == 'all' + requires_python: '>=3.9.0' - conda: https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.3.0-pyhd8ed1ab_0.conda sha256: ce61f4f99401a4bd455b89909153b40b9c823276aefcbb06f2044618696009ca md5: 72e42d28960d875c7654614f8b50939a @@ -2917,10 +2981,11 @@ packages: - pypi: ./ name: fiboa-cli version: 0.20.3 - sha256: 0b3743a9a5f590df1aa36c60901b73456e1f2f7dbc25163ba4597134928c1132 + sha256: f2957643698e34ef54cbf2f15bac13fdc0b228e6d7c3034f57a139b8d92cdcb5 requires_dist: - vecorel-cli==0.2.11 - spdx-license-list==3.27.0 + - duckdb==1.4.2 requires_python: '>=3.10,<3.14' editable: true - conda: https://conda.anaconda.org/conda-forge/noarch/filelock-3.20.0-pyhd8ed1ab_0.conda diff --git a/pyproject.toml b/pyproject.toml index cb36db9..46e7076 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,7 @@ requires-python = ">=3.10,<3.14" dependencies = [ "vecorel-cli==0.2.11", "spdx-license-list==3.27.0", + "duckdb==1.4.2", ] [project.scripts] From 996cc380cc8a535dcf585496fd9d1b7a8208c4b2 Mon Sep 17 00:00:00 2001 From: Ivor Bosloper Date: Mon, 8 Dec 2025 23:32:51 +0100 Subject: [PATCH 04/12] Fix tests --- fiboa_cli/conversion/duckdb.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fiboa_cli/conversion/duckdb.py b/fiboa_cli/conversion/duckdb.py index a43b7ff..746b1e3 100644 --- a/fiboa_cli/conversion/duckdb.py +++ b/fiboa_cli/conversion/duckdb.py @@ -1,5 +1,6 @@ import json import os +from pathlib import Path import duckdb import pyarrow as pa @@ -85,6 +86,9 @@ def convert( except Exception as e: self.warning(f"{column}: Skipped - {e}") + if isinstance(output_file, Path): + output_file = str(output_file) + pq_schema = pa.schema(pq_fields) schema_bytes = pq_schema.serialize().to_pybytes() # pq_schema = pq_schema.with_metadata({"collection": collection}) From 51bea87f8fbca4f41728508161a8313d912b8a25 Mon Sep 17 00:00:00 2001 From: Ivor Bosloper Date: Tue, 9 Dec 2025 11:46:20 +0100 Subject: [PATCH 05/12] Add collection to metadata --- fiboa_cli/conversion/duckdb.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fiboa_cli/conversion/duckdb.py b/fiboa_cli/conversion/duckdb.py index 746b1e3..3f6ae21 100644 --- a/fiboa_cli/conversion/duckdb.py +++ b/fiboa_cli/conversion/duckdb.py @@ -69,6 +69,7 @@ def convert( _collection = self.create_collection(cid) _collection.update(self.column_additions) + _collection["collection"] = self.id collection = json.dumps(_collection, cls=VecorelJSONEncoder).encode("utf-8") schemas = _collection.merge_schemas({}) From 2d3fc50718f12715ea07a9fa9fe4d33a415644d2 Mon Sep 17 00:00:00 2001 From: Ivor Bosloper Date: Tue, 9 Dec 2025 11:49:38 +0100 Subject: [PATCH 06/12] Add required to arrow definition --- fiboa_cli/conversion/duckdb.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fiboa_cli/conversion/duckdb.py b/fiboa_cli/conversion/duckdb.py index 3f6ae21..ef8a646 100644 --- a/fiboa_cli/conversion/duckdb.py +++ b/fiboa_cli/conversion/duckdb.py @@ -49,6 +49,7 @@ def convert( selections.append(f'"{k}" as "{v}"') if v == "geometry": geom_column = k + selection = ", ".join(selections) filters = [] where = "" @@ -61,7 +62,6 @@ def convert( if len(filters) > 0: where = f"WHERE {' AND '.join(filters)}" - selection = ", ".join(selections) if isinstance(urls, str): sources = f'"{urls}"' else: @@ -74,6 +74,7 @@ def convert( schemas = _collection.merge_schemas({}) props = schemas.get("properties", {}) + required = schemas.get("required", []) pq_fields = [] for column in self.columns.values(): schema = props.get(column, {}) @@ -82,7 +83,7 @@ def convert( self.warning(f"{column}: No mapping") continue try: - field = get_pyarrow_field(column, schema=schema) + field = get_pyarrow_field(column, schema=schema, required=column in required) pq_fields.append(field) except Exception as e: self.warning(f"{column}: Skipped - {e}") From 852fb6aa102d6583f52e182d46d8fa8000286994 Mon Sep 17 00:00:00 2001 From: Ivor Bosloper Date: Wed, 10 Dec 2025 10:12:55 +0100 Subject: [PATCH 07/12] Support for sources with different schemas --- fiboa_cli/conversion/duckdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fiboa_cli/conversion/duckdb.py b/fiboa_cli/conversion/duckdb.py index ef8a646..2ee6fd1 100644 --- a/fiboa_cli/conversion/duckdb.py +++ b/fiboa_cli/conversion/duckdb.py @@ -101,7 +101,7 @@ def convert( con.execute( f""" COPY ( - SELECT {selection} FROM read_parquet({sources}) + SELECT {selection} FROM read_parquet({sources}, union_by_name=true) {where} ORDER BY ST_Hilbert({geom_column}) ) TO ? ( From 3cf3b8d1f3eb0030e0b0ee24f297beeb23bdc755 Mon Sep 17 00:00:00 2001 From: Matthias Mohr Date: Wed, 10 Dec 2025 11:26:29 +0100 Subject: [PATCH 08/12] Export collection and set compression --- fiboa_cli/conversion/duckdb.py | 35 ++++++++-------------------------- 1 file changed, 8 insertions(+), 27 deletions(-) diff --git a/fiboa_cli/conversion/duckdb.py b/fiboa_cli/conversion/duckdb.py index 2ee6fd1..1541e53 100644 --- a/fiboa_cli/conversion/duckdb.py +++ b/fiboa_cli/conversion/duckdb.py @@ -67,33 +67,14 @@ def convert( else: sources = "[" + ",".join([f'"{url}"' for url in urls]) + "]" - _collection = self.create_collection(cid) - _collection.update(self.column_additions) - _collection["collection"] = self.id - collection = json.dumps(_collection, cls=VecorelJSONEncoder).encode("utf-8") - - schemas = _collection.merge_schemas({}) - props = schemas.get("properties", {}) - required = schemas.get("required", []) - pq_fields = [] - for column in self.columns.values(): - schema = props.get(column, {}) - dtype = schema.get("type") - if dtype is None: - self.warning(f"{column}: No mapping") - continue - try: - field = get_pyarrow_field(column, schema=schema, required=column in required) - pq_fields.append(field) - except Exception as e: - self.warning(f"{column}: Skipped - {e}") + collection = self.create_collection(cid) + collection.update(self.column_additions) + collection["collection"] = self.id if isinstance(output_file, Path): output_file = str(output_file) - pq_schema = pa.schema(pq_fields) - schema_bytes = pq_schema.serialize().to_pybytes() - # pq_schema = pq_schema.with_metadata({"collection": collection}) + collection_json = json.dumps(collection, cls=VecorelJSONEncoder).encode("utf-8") con = duckdb.connect() con.install_extension("spatial") @@ -101,19 +82,19 @@ def convert( con.execute( f""" COPY ( - SELECT {selection} FROM read_parquet({sources}, union_by_name=true) + SELECT {selection} + FROM read_parquet({sources}, union_by_name=true) {where} ORDER BY ST_Hilbert({geom_column}) ) TO ? ( FORMAT parquet, - compression 'brotli', + compression '{compression}', KV_METADATA {{ collection: ?, - "PYARROW:schema": ? }} ) """, - [output_file, collection, schema_bytes], + [output_file, collection_json], ) return output_file From c5241424dbc49ddee77c525c2d3c1a00da5611cb Mon Sep 17 00:00:00 2001 From: Matthias Mohr Date: Wed, 10 Dec 2025 11:53:45 +0100 Subject: [PATCH 09/12] Add caching and warnings --- fiboa_cli/conversion/duckdb.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/fiboa_cli/conversion/duckdb.py b/fiboa_cli/conversion/duckdb.py index 1541e53..45579d7 100644 --- a/fiboa_cli/conversion/duckdb.py +++ b/fiboa_cli/conversion/duckdb.py @@ -3,9 +3,7 @@ from pathlib import Path import duckdb -import pyarrow as pa from vecorel_cli.encoding.geojson import VecorelJSONEncoder -from vecorel_cli.parquet.types import get_pyarrow_field from .fiboa_converter import FiboaBaseConverter @@ -22,6 +20,11 @@ def convert( original_geometries=False, **kwargs, ) -> str: + if geoparquet_version is not None: + self.warning("geoparquet_version is not supported for DuckDB-based converters and will always write GeoParquet v1.0") + if not original_geometries: + self.warning("original_geometries is not supported for DuckDB-based converters and will always write original geometries") + self.variant = variant cid = self.id.strip() if self.bbox is not None and len(self.bbox) != 4: @@ -40,6 +43,15 @@ def convert( if urls is None: raise ValueError("No input files provided") + self.info("Getting file(s) if not cached yet") + if cache: + request_args = {} + if self.avoid_range_request: + request_args["block_size"] = 0 + urls = self.download_files(urls, cache, **request_args) + elif self.avoid_range_request: + self.warning("avoid_range_request is set, but cache is not used, so this setting has no effect") + selections = [] geom_column = None for k, v in self.columns.items(): From 0d5a9f91adadb7167ccff00a155f25e77792b547 Mon Sep 17 00:00:00 2001 From: Matthias Mohr Date: Wed, 10 Dec 2025 12:10:16 +0100 Subject: [PATCH 10/12] Add todos --- fiboa_cli/conversion/duckdb.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fiboa_cli/conversion/duckdb.py b/fiboa_cli/conversion/duckdb.py index 45579d7..32f1317 100644 --- a/fiboa_cli/conversion/duckdb.py +++ b/fiboa_cli/conversion/duckdb.py @@ -109,4 +109,10 @@ def convert( [output_file, collection_json], ) + # todo: write the file again to do the following: + # - update geoparquet version to 1.1 + # - add bounding box + metadata + # - add the non-nullability to the respective columns + # Ideally do this in improve... + return output_file From 07d7aac02e3067261f31d41b039acd005fae3321 Mon Sep 17 00:00:00 2001 From: Matthias Mohr Date: Wed, 10 Dec 2025 12:33:12 +0100 Subject: [PATCH 11/12] Fix tests --- fiboa_cli/conversion/duckdb.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fiboa_cli/conversion/duckdb.py b/fiboa_cli/conversion/duckdb.py index 32f1317..20e5035 100644 --- a/fiboa_cli/conversion/duckdb.py +++ b/fiboa_cli/conversion/duckdb.py @@ -77,7 +77,13 @@ def convert( if isinstance(urls, str): sources = f'"{urls}"' else: - sources = "[" + ",".join([f'"{url}"' for url in urls]) + "]" + paths = [] + for url in urls: + if isinstance(url, tuple): + paths.append(f'"{url[0]}"') + else: + paths.append(f'"{url}"') + sources = "[" + ",".join(paths) + "]" collection = self.create_collection(cid) collection.update(self.column_additions) @@ -100,13 +106,13 @@ def convert( ORDER BY ST_Hilbert({geom_column}) ) TO ? ( FORMAT parquet, - compression '{compression}', + compression ?, KV_METADATA {{ collection: ?, }} ) """, - [output_file, collection_json], + [output_file, compression or 'brotli', collection_json], ) # todo: write the file again to do the following: From 78fb00d7eb85765953a0da58e779e7b401a4b95f Mon Sep 17 00:00:00 2001 From: Ivor Bosloper Date: Wed, 10 Dec 2025 21:31:04 +0100 Subject: [PATCH 12/12] Implement rewrite to correct GeoParquet --- fiboa_cli/conversion/duckdb.py | 114 ++++++++++++++++++++++++++++++--- 1 file changed, 104 insertions(+), 10 deletions(-) diff --git a/fiboa_cli/conversion/duckdb.py b/fiboa_cli/conversion/duckdb.py index 20e5035..6da3ac1 100644 --- a/fiboa_cli/conversion/duckdb.py +++ b/fiboa_cli/conversion/duckdb.py @@ -1,8 +1,14 @@ import json import os from pathlib import Path +from tempfile import NamedTemporaryFile import duckdb +import numpy as np +import pyarrow as pa +import pyarrow.parquet as pq +from geopandas.array import from_wkb +from pyarrow.lib import StructArray from vecorel_cli.encoding.geojson import VecorelJSONEncoder from .fiboa_converter import FiboaBaseConverter @@ -20,10 +26,13 @@ def convert( original_geometries=False, **kwargs, ) -> str: - if geoparquet_version is not None: - self.warning("geoparquet_version is not supported for DuckDB-based converters and will always write GeoParquet v1.0") if not original_geometries: - self.warning("original_geometries is not supported for DuckDB-based converters and will always write original geometries") + self.warning( + "original_geometries is not supported for DuckDB-based converters and will always write original geometries" + ) + + geoparquet_version = geoparquet_version or "1.1.0" + compression = compression or "brotli" self.variant = variant cid = self.id.strip() @@ -50,7 +59,9 @@ def convert( request_args["block_size"] = 0 urls = self.download_files(urls, cache, **request_args) elif self.avoid_range_request: - self.warning("avoid_range_request is set, but cache is not used, so this setting has no effect") + self.warning( + "avoid_range_request is set, but cache is not used, so this setting has no effect" + ) selections = [] geom_column = None @@ -112,13 +123,96 @@ def convert( }} ) """, - [output_file, compression or 'brotli', collection_json], + [output_file, compression, collection_json], ) - # todo: write the file again to do the following: - # - update geoparquet version to 1.1 - # - add bounding box + metadata - # - add the non-nullability to the respective columns - # Ideally do this in improve... + # Post-process the written Parquet to proper GeoParquet v1.1 with bbox and nullability + try: + pq_file = pq.ParquetFile(output_file) + + existing_schema = pq_file.schema_arrow + col_names = existing_schema.names + assert "geometry" in col_names, "Missing geometry column in output parquet file" + + schemas = collection.merge_schemas({}) + collection_only = {k for k, v in schemas.get("collection", {}).items() if v} + required_columns = {"geometry"} | { + r + for r in schemas.get("required", []) + if r in col_names and r not in collection_only + } + if "id" in col_names: + required_columns.add("id") + + # Update for version 1.1.0 + metadata = existing_schema.metadata + if geoparquet_version > "1.0.0": + geo_meta = json.loads(existing_schema.metadata[b"geo"]) + geo_meta["version"] = geoparquet_version + metadata[b"geo"] = json.dumps(geo_meta).encode("utf-8") + + # Build a new Arrow schema with adjusted nullability + new_fields = [] + for field in existing_schema: + if field.name in required_columns and field.nullable: + new_fields.append( + pa.field(field.name, field.type, nullable=False, metadata=field.metadata) + ) + else: + new_fields.append(field) + + add_bbox = geoparquet_version > "1.0.0" and "bbox" not in col_names + if add_bbox: + new_fields.append( + pa.field( + "bbox", + pa.struct( + [ + ("xmin", pa.float64()), + ("ymin", pa.float64()), + ("xmax", pa.float64()), + ("ymax", pa.float64()), + ] + ), + ) + ) + new_schema = pa.schema(new_fields, metadata=metadata) + + # 7) Streamingly rewrite the file to a temp file and replace atomically + with NamedTemporaryFile( + "wb", delete=False, dir=os.path.dirname(output_file), suffix=".parquet" + ) as tmp: + tmp_path = tmp.name + + writer = pq.ParquetWriter( + tmp_path, + new_schema, + compression=compression, + use_dictionary=True, + write_statistics=True, + ) + try: + bbox_names = ["ymax", "xmax", "ymin", "xmin"] + for rg in range(pq_file.num_row_groups): + tbl = pq_file.read_row_group(rg) + if add_bbox: + # determine bounds, change to StructArray type + bounds = from_wkb(tbl["geometry"]).bounds + bbox_array = StructArray.from_arrays( + np.rot90(bounds), + names=bbox_names, + ) + tbl = tbl.append_column("bbox", bbox_array) + # Ensure table adheres to the new schema (mainly nullability); cast if needed + if tbl.schema != new_schema: + # Align field order/types; this does not materialize data beyond the batch + tbl = tbl.cast(new_schema, safe=False) + writer.write_table(tbl) + finally: + writer.close() + + os.replace(tmp_path, output_file) + except Exception as e: + self.warning(f"GeoParquet 1.1 post-processing failed: {e}") return output_file