From 6b90169eb0b5c21a374ee098cba1b5d326230486 Mon Sep 17 00:00:00 2001 From: Ivor Bosloper Date: Thu, 30 Oct 2025 14:22:41 +0100 Subject: [PATCH] Use zstd with compression 15 by default, fixes #12 --- CHANGELOG.md | 2 ++ vecorel_cli/cli/options.py | 10 +++++++++- vecorel_cli/conversion/base.py | 2 ++ vecorel_cli/convert.py | 9 ++++++++- vecorel_cli/encoding/geoparquet.py | 18 ++++++++---------- 5 files changed, 29 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 910cfdb..c07260a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] +- Change default compression to zstd +- Add option to set compression level, zstd defaults to 15 - Change default temporal property to datetime - Enable Converter.columns list and tuple types - Add BaseConverter get_columns hook to customize columns after reading the file diff --git a/vecorel_cli/cli/options.py b/vecorel_cli/cli/options.py index eb4d67c..085cb04 100644 --- a/vecorel_cli/cli/options.py +++ b/vecorel_cli/cli/options.py @@ -31,7 +31,15 @@ def CRS(default_value): type=click.Choice(COMPRESSION_METHODS), help="GeoParquet only: Compression method", show_default=True, - default="brotli", + default="zstd", +) + +GEOPARQUET_COMPRESSION_LEVEL = click.option( + "--compression_level", + "-pcl", + type=click.IntRange(min=0, max=255), + help="GeoParquet only: Compression level", + default=None, ) GEOPARQUET_VERSION = click.option( diff --git a/vecorel_cli/conversion/base.py b/vecorel_cli/conversion/base.py index 8185276..e427396 100644 --- a/vecorel_cli/conversion/base.py +++ b/vecorel_cli/conversion/base.py @@ -298,6 +298,7 @@ def convert( input_files=None, variant=None, compression=None, + compression_level=None, geoparquet_version=None, original_geometries=False, **kwargs, @@ -425,6 +426,7 @@ def convert( gdf, properties=columns, compression=compression, + compression_level=compression_level, geoparquet_version=geoparquet_version, ) diff --git a/vecorel_cli/convert.py b/vecorel_cli/convert.py index f627e8c..c9aa7e0 100644 --- a/vecorel_cli/convert.py +++ b/vecorel_cli/convert.py @@ -1,7 +1,13 @@ import click from .basecommand import BaseCommand, runnable -from .cli.options import GEOPARQUET_COMPRESSION, GEOPARQUET_VERSION, PY_PACKAGE, VECOREL_TARGET +from .cli.options import ( + GEOPARQUET_COMPRESSION, + GEOPARQUET_COMPRESSION_LEVEL, + GEOPARQUET_VERSION, + PY_PACKAGE, + VECOREL_TARGET, +) from .cli.util import parse_converter_input_files from .converters import Converters from .registry import Registry @@ -61,6 +67,7 @@ def get_cli_args(): default=None, ), "compression": GEOPARQUET_COMPRESSION, + "compression_level": GEOPARQUET_COMPRESSION_LEVEL, "geoparquet_version": GEOPARQUET_VERSION, "mapping_file": click.option( "--mapping-file", diff --git a/vecorel_cli/encoding/geoparquet.py b/vecorel_cli/encoding/geoparquet.py index 5567d95..251ecdc 100644 --- a/vecorel_cli/encoding/geoparquet.py +++ b/vecorel_cli/encoding/geoparquet.py @@ -131,22 +131,22 @@ def get_compression(self) -> Optional[str]: # geoparquet_version: bool, optional, default False # If True, writes the data in GeoParquet 1.0.0 format, # otherwise in GeoParquet 1.1.0 format. - # compression: str, optional, default "brotli" - # Compression algorithm to use, defaults to "brotli". - # Other options are "snappy", "gzip", "lz4", "zstd", etc. + # compression: str, optional, default "zstd" + # Compression algorithm to use, defaults to "zstd". + # Other options are "snappy", "gzip", "lz4", "brotli", etc. def write( self, data: GeoDataFrame, properties: Optional[list[str]] = None, schema_map: SchemaMapping = {}, dehydrate: bool = True, - compression: Optional[str] = None, + compression: Optional[str] = "zstd", + compression_level=None, # default level for compression geoparquet_version: Optional[str] = None, **kwargs, # capture unknown arguments ) -> bool: - if compression is None: - compression = "brotli" - + if compression == "zstd" and compression_level is None: + compression_level = 15 if geoparquet_version not in GEOPARQUET_VERSIONS: geoparquet_version = GEOPARQUET_DEFAULT_VERSION self.uri.parent.mkdir(parents=True, exist_ok=True) @@ -240,9 +240,6 @@ def write( } ) - if compression is None: - compression = "brotli" - # Write the data to the Parquet file to_parquet( data, @@ -254,6 +251,7 @@ def write( schema_version=geoparquet_version, row_group_size=self.row_group_size, write_covering_bbox=bool(geoparquet_version != "1.0.0"), + compression_level=compression_level, ) return True