From 53eb71281b8b37b234e73546264a54d66db7d714 Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Tue, 16 Sep 2025 17:21:34 -0600 Subject: [PATCH 01/27] initial commit --- src/chronify/time_configs.py | 13 ++ src/chronify/time_series_mapper_base.py | 22 +-- src/chronify/time_zone_converter.py | 176 ++++++++++++++++++++++++ tests/test_time_zone_converter.py | 151 ++++++++++++++++++++ 4 files changed, 351 insertions(+), 11 deletions(-) create mode 100644 src/chronify/time_zone_converter.py create mode 100644 tests/test_time_zone_converter.py diff --git a/src/chronify/time_configs.py b/src/chronify/time_configs.py index 9e115a0..db4f2da 100644 --- a/src/chronify/time_configs.py +++ b/src/chronify/time_configs.py @@ -4,6 +4,7 @@ from typing import Union, Literal, Optional from pydantic import Field, field_validator from typing_extensions import Annotated +from zoneinfo import ZoneInfo from chronify.base_models import ChronifyBaseModel from chronify.time import ( @@ -81,6 +82,18 @@ def list_time_columns(self) -> list[str]: def get_time_zone_column(self) -> None: return None + def convert_time_zone(self, tz: ZoneInfo | None) -> "DatetimeRange": + """Return a copy of the DatetimeRange with the start timestamp converted to time zone.""" + return self.model_copy(update={"start": self.start.astimezone(tz)}) + + def replace_time_zone(self, tz: ZoneInfo | None) -> "DatetimeRange": + """Return a copy of the DatetimeRange with the time zone replaced.""" + return self.model_copy(update={"start": self.start.replace(tzinfo=tz)}) + + +# TODO: +# class DatetimeRangeWithTZColumn(TimeBaseModel): + class AnnualTimeRange(TimeBaseModel): """Defines a time range that uses years as integers.""" diff --git a/src/chronify/time_series_mapper_base.py b/src/chronify/time_series_mapper_base.py index 073c526..d1c7bd4 100644 --- a/src/chronify/time_series_mapper_base.py +++ b/src/chronify/time_series_mapper_base.py @@ -113,6 +113,11 @@ def apply_mapping( ) metadata.reflect(engine, views=True) + # with engine.connect() as conn: + # import pandas as pd + # dfm = pd.read_sql(f"select * from {mapping_schema.name}", conn) + # df = pd.read_sql(f"select * from {from_schema.name}", conn) + # breakpoint() created_tmp_view = False try: _apply_mapping( @@ -211,18 +216,13 @@ def _apply_mapping( msg = f"Unsupported {resampling_operation=}" raise ValueError(msg) - keys = from_schema.time_config.list_time_columns() - # check time_zone - tz_col = from_schema.time_config.get_time_zone_column() - if tz_col is not None: - keys.append(tz_col) - assert tz_col in left_table_columns, f"{tz_col} not in table={from_schema.name}" - ftz_col = "from_" + tz_col - assert ( - ftz_col in right_table_columns - ), f"{ftz_col} not in mapping table={mapping_table_name}" - + from_keys = [x for x in right_table_columns if x.startswith("from_")] + keys = [x.removeprefix("from_") for x in from_keys] + assert set(keys).issubset( + set(left_table_columns) + ), f"Keys {keys} not in table={from_schema.name}" on_stmt = reduce(and_, (left_table.c[x] == right_table.c["from_" + x] for x in keys)) + query = select(*select_stmt).select_from(left_table).join(right_table, on_stmt) if resampling_operation: query = query.group_by(*groupby_stmt) diff --git a/src/chronify/time_zone_converter.py b/src/chronify/time_zone_converter.py new file mode 100644 index 0000000..a06493a --- /dev/null +++ b/src/chronify/time_zone_converter.py @@ -0,0 +1,176 @@ +import abc +from zoneinfo import ZoneInfo +from sqlalchemy import Engine, MetaData, Table, 
select +from typing import Optional +from pathlib import Path +import pandas as pd + +from chronify.models import TableSchema, MappingTableSchema +from chronify.time_configs import DatetimeRange, TimeBasedDataAdjustment +from chronify.exceptions import InvalidParameter +from chronify.time_series_mapper_base import apply_mapping +from chronify.time_series_mapper_datetime import MapperDatetimeToDatetime +from chronify.time_range_generator_factory import make_time_range_generator +from chronify.sqlalchemy.functions import read_database + + +class TimeZoneConverterBase(abc.ABC): + """Base class for time zone conversion of time series data.""" + + def __init__( + self, + engine: Engine, + metadata: MetaData, + from_schema: TableSchema, + ): + self._engine = engine + self._metadata = metadata + self._from_schema = from_schema + + def check_from_schema(self) -> None: + if not isinstance(self._from_schema.time_config, DatetimeRange): + msg = "Source schema does not have DatetimeRange time config." + raise InvalidParameter(msg) + + @abc.abstractmethod + def generate_to_schema(self) -> TableSchema: + """Generate to_schema based on from_schema""" + + @abc.abstractmethod + def convert_time_zone( + self, + scratch_dir: Optional[Path] = None, + output_file: Optional[Path] = None, + check_mapped_timestamps: bool = False, + ) -> None: + """Convert time zone of the from_schema""" + + +class TimeZoneConverter(TimeZoneConverterBase): + """Class for time zone conversion of time series data to a specified time zone.""" + + def __init__( + self, + engine: Engine, + metadata: MetaData, + from_schema: TableSchema, + to_time_zone: ZoneInfo | None, + ): + super().__init__(engine, metadata, from_schema) + self._to_time_zone = to_time_zone + self._to_schema = self.generate_to_schema() + + def generate_to_schema(self) -> TableSchema: + to_schema: TableSchema = self._from_schema.model_copy( + update={ + "name": f"{self._from_schema.name}_tz_converted", + "time_config": self._from_schema.time_config.convert_time_zone(self._to_time_zone), + } + ) + return to_schema + + def convert_time_zone( + self, + scratch_dir: Optional[Path] = None, + output_file: Optional[Path] = None, + check_mapped_timestamps: bool = False, + ) -> None: + self.check_from_schema() + MapperDatetimeToDatetime( + self._engine, + self._metadata, + self._from_schema, + self._to_schema, + ).map_time( + scratch_dir=scratch_dir, + output_file=output_file, + check_mapped_timestamps=check_mapped_timestamps, + ) + + +class TimeZoneConverterByGeography(TimeZoneConverterBase): + """Class for time zone conversion of time series data based on a geography-based time zone column.""" + + def __init__( + self, engine: Engine, metadata: MetaData, from_schema: TableSchema, time_zone_column: str + ): + super().__init__(engine, metadata, from_schema) + self._from_time_config: DatetimeRange = self._from_schema.time_config + self.time_zone_column = time_zone_column + self._to_schema = self.generate_to_schema() + + def generate_to_schema(self) -> TableSchema: + to_schema: TableSchema = self._from_schema.model_copy( + update={ + "name": f"{self._from_schema.name}_tz_converted", + "time_config": self._from_schema.time_config.replace_time_zone(None), + } + ) + return to_schema + + def convert_time_zone( + self, + scratch_dir: Optional[Path] = None, + output_file: Optional[Path] = None, + check_mapped_timestamps: bool = False, # will not be used + ) -> None: + self.check_from_schema() + df, mapping_schema = self._create_interm_map_with_time_zone() + + # Do not check mapped 
timestamps because they cannot be described by the mapped_schema time_config + apply_mapping( + df, + mapping_schema, + self._from_schema, + self._to_schema, + self._engine, + self._metadata, + TimeBasedDataAdjustment(), + scratch_dir=scratch_dir, + output_file=output_file, + ) + + def _create_interm_map_with_time_zone(self) -> tuple[pd.DataFrame, MappingTableSchema]: + """Create mapping dataframe for converting datetime to geography-based time zone""" + mapped_time_col = self._from_time_config.time_column + from_time_col = "from_" + mapped_time_col + from_time_data = make_time_range_generator(self._from_time_config).list_timestamps() + + from_tz_col = "from_" + self.time_zone_column + + with self._engine.connect() as conn: + table = Table(self._from_schema.name, self._metadata) + stmt = ( + select(table.c[self.time_zone_column]) + .distinct() + .where(table.c[self.time_zone_column].is_not(None)) + ) + time_zones = read_database(stmt, conn, self._from_time_config)[ + self.time_zone_column + ].to_list() + + # TODO may need to enforce tz is not None, clean up model_copy() + + from_time_config = self._from_time_config.model_copy(update={"time_column": from_time_col}) + to_time_config = self._from_time_config.replace_time_zone(None) + + df_tz = [] + for time_zone in time_zones: + tz = ZoneInfo(time_zone) if time_zone not in [None, "None"] else None + mapped_time_data = [x.tz_convert(tz).tz_localize(None) for x in from_time_data] + df_tz.append( + pd.DataFrame( + { + from_time_col: from_time_data, + from_tz_col: time_zone, + mapped_time_col: mapped_time_data, + } + ) + ) + df = pd.concat(df_tz, ignore_index=True) + + mapping_schema = MappingTableSchema( + name="mapping_table_gtz_conversion", + time_configs=[from_time_config, to_time_config], + ) + return df, mapping_schema diff --git a/tests/test_time_zone_converter.py b/tests/test_time_zone_converter.py new file mode 100644 index 0000000..b6d14f9 --- /dev/null +++ b/tests/test_time_zone_converter.py @@ -0,0 +1,151 @@ +from zoneinfo import ZoneInfo +from datetime import datetime, timedelta +import numpy as np + +import pandas as pd +from sqlalchemy import Engine, MetaData + +from chronify.sqlalchemy.functions import read_database, write_database +from chronify.time_zone_converter import TimeZoneConverter, TimeZoneConverterByGeography +from chronify.time_configs import DatetimeRange +from chronify.models import TableSchema +from chronify.time import TimeIntervalType +from chronify.datetime_range_generator import DatetimeRangeGenerator + + +def generate_datetime_data(time_config: DatetimeRange) -> pd.Series: # type: ignore + return pd.to_datetime(list(DatetimeRangeGenerator(time_config).iter_timestamps())) + + +def generate_datetime_dataframe(schema: TableSchema) -> pd.DataFrame: + df = pd.DataFrame({schema.time_config.time_column: generate_datetime_data(schema.time_config)}) + + for i, x in enumerate(schema.time_array_id_columns): + df[x] = i + df[schema.value_column] = np.random.rand(len(df)) + return df + + +def generate_dataframe_with_tz_col(schema: TableSchema) -> pd.DataFrame: + df = generate_datetime_dataframe(schema).drop(columns=["id"]) + time_zones = [ + ZoneInfo("US/Eastern"), + ZoneInfo("US/Central"), + ZoneInfo("US/Mountain"), + ] # , None] + time_zones = [tz.key if tz is not None else "None" for tz in time_zones] + dfo = pd.merge( + df, pd.DataFrame({"id": range(len(time_zones)), "time_zone": time_zones}), how="cross" + ) + dfo = ( + dfo.drop(columns=["time_zone_x"]) + .rename(columns={"time_zone_y": "time_zone"}) + .reset_index() + 
) + return dfo + + +def get_datetime_schema( + year: int, + tzinfo: ZoneInfo | None, + interval_type: TimeIntervalType, + name: str, + has_tz_col: bool = False, +) -> TableSchema: + start = datetime(year=year, month=1, day=1, tzinfo=tzinfo) + end = datetime(year=year, month=1, day=2, tzinfo=tzinfo) + resolution = timedelta(hours=1) + length = (end - start) / resolution + 1 + cols = ["id"] + cols += ["time_zone"] if has_tz_col else [] + schema = TableSchema( + name=name, + time_config=DatetimeRange( + start=start, + resolution=resolution, + length=length, + interval_type=interval_type, + time_column="timestamp", + ), + time_array_id_columns=cols, + value_column="value", + ) + return schema + + +def ingest_data( + engine: Engine, + metadata: MetaData, + df: pd.DataFrame, + schema: TableSchema, +) -> None: + with engine.begin() as conn: + write_database(df, conn, schema.name, [schema.time_config], if_table_exists="replace") + metadata.reflect(engine, views=True) + + +def get_mapped_dataframe( + engine: Engine, + table_name: str, + time_config: DatetimeRange, +) -> pd.DataFrame: + with engine.connect() as conn: + query = f"select * from {table_name}" + queried = read_database(query, conn, time_config) + queried = queried.sort_values(by=["id", "timestamp"]).reset_index(drop=True) + return queried + + +def run_conversion( + engine: Engine, + df: pd.DataFrame, + from_schema: TableSchema, + to_time_zone: ZoneInfo | None, +) -> None: + metadata = MetaData() + ingest_data(engine, metadata, df, from_schema) + TZC = TimeZoneConverter(engine, metadata, from_schema, to_time_zone) + TZC.convert_time_zone(check_mapped_timestamps=True) + dfo = get_mapped_dataframe(engine, TZC._to_schema.name, TZC._to_schema.time_config) + + assert (df["timestamp"] == dfo["timestamp"]).prod() == 1 # TODO: these will always be equal + + +def run_conversion_by_geography( + engine: Engine, + df: pd.DataFrame, + from_schema: TableSchema, +) -> None: + metadata = MetaData() + ingest_data(engine, metadata, df, from_schema) + TZC = TimeZoneConverterByGeography(engine, metadata, from_schema, "time_zone") + TZC.convert_time_zone() + dfo = get_mapped_dataframe(engine, TZC._to_schema.name, TZC._to_schema.time_config) + dfo = dfo[df.columns].sort_values(by="index").reset_index(drop=True) + + assert df["value"].equals(dfo["value"]) + for i in range(len(df)): + tz = ZoneInfo(df.loc[i, "time_zone"]) + ts = df.loc[i, "timestamp"].tz_convert(tz).replace(tzinfo=None) + assert dfo.loc[i, "timestamp"] == ts + + +def test_time_conversion(iter_engines: Engine) -> None: + from_schema = get_datetime_schema( + 2018, ZoneInfo("US/Mountain"), TimeIntervalType.PERIOD_BEGINNING, "base_table" + ) + df = generate_datetime_dataframe(from_schema) + to_time_zone = ZoneInfo("US/Central") + run_conversion(iter_engines, df, from_schema, to_time_zone) + + +def test_time_conversion_by_geography(iter_engines: Engine) -> None: + from_schema = get_datetime_schema( + 2018, + ZoneInfo("US/Mountain"), + TimeIntervalType.PERIOD_BEGINNING, + "base_table", + has_tz_col=True, + ) + df = generate_dataframe_with_tz_col(from_schema) + run_conversion_by_geography(iter_engines, df, from_schema) From ea1ce5bcda33dba2b9a63e696c93a280ae630568 Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Tue, 16 Sep 2025 17:29:58 -0600 Subject: [PATCH 02/27] clean up --- src/chronify/time_series_mapper_datetime.py | 3 +-- src/chronify/time_series_mapper_index_time.py | 15 +++++++-------- src/chronify/time_zone_converter.py | 15 +++++++-------- 
tests/test_time_zone_converter.py | 6 ++++-- 4 files changed, 19 insertions(+), 20 deletions(-) diff --git a/src/chronify/time_series_mapper_datetime.py b/src/chronify/time_series_mapper_datetime.py index d4dd392..2637fb3 100644 --- a/src/chronify/time_series_mapper_datetime.py +++ b/src/chronify/time_series_mapper_datetime.py @@ -131,8 +131,7 @@ def _create_mapping(self) -> tuple[pd.DataFrame, MappingTableSchema]: assert ( df[to_time_col].nunique() == self._to_time_config.length ), "to_time_col does not have the right number of timestamps" - from_time_config = self._from_time_config.model_copy() - from_time_config.time_column = from_time_col + from_time_config = self._from_time_config.model_copy(update={"time_column": from_time_col}) mapping_schema = MappingTableSchema( name="mapping_table", time_configs=[ diff --git a/src/chronify/time_series_mapper_index_time.py b/src/chronify/time_series_mapper_index_time.py index 4f664ef..0cafb0c 100644 --- a/src/chronify/time_series_mapper_index_time.py +++ b/src/chronify/time_series_mapper_index_time.py @@ -190,8 +190,7 @@ def _create_interm_map(self) -> tuple[pd.DataFrame, MappingTableSchema, TableSch from_time_col = "from_" + self._from_time_config.time_column from_time_data = make_time_range_generator(self._from_time_config).list_timestamps() - from_time_config = self._from_time_config.model_copy() - from_time_config.time_column = from_time_col + from_time_config = self._from_time_config.model_copy(update={"time_column": from_time_col}) mapping_schema = MappingTableSchema( name="mapping_table", time_configs=[from_time_config, mapped_schema.time_config], @@ -225,10 +224,10 @@ def _create_interm_map_with_time_zone( stmt = select(table.c[tz_col]).distinct().where(table.c[tz_col].is_not(None)) time_zones = read_database(stmt, conn, self._from_time_config)[tz_col].to_list() - from_time_config = self._from_time_config.model_copy() + from_time_config = self._from_time_config.model_copy( + update={"time_column": from_time_col, "time_zone_column": from_tz_col} + ) assert isinstance(from_time_config, IndexTimeRangeLocalTime) - from_time_config.time_column = from_time_col - from_time_config.time_zone_column = from_tz_col df_tz = [] to_tz = self._to_time_config.start.tzinfo @@ -295,10 +294,10 @@ def _create_interm_map_with_time_zone_and_dst_adjustment( stmt = select(table.c[tz_col]).distinct().where(table.c[tz_col].is_not(None)) time_zones = read_database(stmt, conn, self._from_time_config)[tz_col].to_list() - from_time_config = self._from_time_config.model_copy() + from_time_config = self._from_time_config.model_copy( + update={"time_column": from_time_col, "time_zone_column": from_tz_col} + ) assert isinstance(from_time_config, IndexTimeRangeLocalTime) - from_time_config.time_column = from_time_col - from_time_config.time_zone_column = from_tz_col df_tz = [] for time_zone in time_zones: diff --git a/src/chronify/time_zone_converter.py b/src/chronify/time_zone_converter.py index a06493a..4549033 100644 --- a/src/chronify/time_zone_converter.py +++ b/src/chronify/time_zone_converter.py @@ -95,7 +95,6 @@ def __init__( self, engine: Engine, metadata: MetaData, from_schema: TableSchema, time_zone_column: str ): super().__init__(engine, metadata, from_schema) - self._from_time_config: DatetimeRange = self._from_schema.time_config self.time_zone_column = time_zone_column self._to_schema = self.generate_to_schema() @@ -132,9 +131,9 @@ def convert_time_zone( def _create_interm_map_with_time_zone(self) -> tuple[pd.DataFrame, MappingTableSchema]: """Create mapping 
dataframe for converting datetime to geography-based time zone""" - mapped_time_col = self._from_time_config.time_column + mapped_time_col = self._from_schema.time_config.time_column from_time_col = "from_" + mapped_time_col - from_time_data = make_time_range_generator(self._from_time_config).list_timestamps() + from_time_data = make_time_range_generator(self._from_schema.time_config).list_timestamps() from_tz_col = "from_" + self.time_zone_column @@ -145,14 +144,14 @@ def _create_interm_map_with_time_zone(self) -> tuple[pd.DataFrame, MappingTableS .distinct() .where(table.c[self.time_zone_column].is_not(None)) ) - time_zones = read_database(stmt, conn, self._from_time_config)[ + time_zones = read_database(stmt, conn, self._from_schema.time_config)[ self.time_zone_column ].to_list() - # TODO may need to enforce tz is not None, clean up model_copy() - - from_time_config = self._from_time_config.model_copy(update={"time_column": from_time_col}) - to_time_config = self._from_time_config.replace_time_zone(None) + from_time_config = self._from_schema.time_config.model_copy( + update={"time_column": from_time_col} + ) + to_time_config = self._from_schema.time_config.replace_time_zone(None) df_tz = [] for time_zone in time_zones: diff --git a/tests/test_time_zone_converter.py b/tests/test_time_zone_converter.py index b6d14f9..0f71c7c 100644 --- a/tests/test_time_zone_converter.py +++ b/tests/test_time_zone_converter.py @@ -32,7 +32,8 @@ def generate_dataframe_with_tz_col(schema: TableSchema) -> pd.DataFrame: ZoneInfo("US/Eastern"), ZoneInfo("US/Central"), ZoneInfo("US/Mountain"), - ] # , None] + None, + ] time_zones = [tz.key if tz is not None else "None" for tz in time_zones] dfo = pd.merge( df, pd.DataFrame({"id": range(len(time_zones)), "time_zone": time_zones}), how="cross" @@ -125,7 +126,8 @@ def run_conversion_by_geography( assert df["value"].equals(dfo["value"]) for i in range(len(df)): - tz = ZoneInfo(df.loc[i, "time_zone"]) + tzn = df.loc[i, "time_zone"] + tz = ZoneInfo(tzn) if tzn != "None" else None ts = df.loc[i, "timestamp"].tz_convert(tz).replace(tzinfo=None) assert dfo.loc[i, "timestamp"] == ts From 4e04cd0bf743e8d649427548693cfa3902154938 Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Tue, 16 Sep 2025 23:13:48 -0600 Subject: [PATCH 03/27] Add time_wrap feature for tz col conversion --- src/chronify/__init__.py | 2 + src/chronify/time.py | 1 + src/chronify/time_configs.py | 49 +++++++++++++--- src/chronify/time_range_generator_factory.py | 4 +- src/chronify/time_zone_converter.py | 61 +++++++++++++++++--- tests/test_time_zone_converter.py | 27 ++++++--- 6 files changed, 117 insertions(+), 27 deletions(-) diff --git a/src/chronify/__init__.py b/src/chronify/__init__.py index 230bac8..29d5704 100644 --- a/src/chronify/__init__.py +++ b/src/chronify/__init__.py @@ -21,6 +21,7 @@ from chronify.time_configs import ( AnnualTimeRange, DatetimeRange, + DatetimeRangeWithTZColumn, IndexTimeRangeNTZ, IndexTimeRangeTZ, IndexTimeRangeLocalTime, @@ -37,6 +38,7 @@ "ConflictingInputsError", "CsvTableSchema", "DatetimeRange", + "DatetimeRangeWithTZColumn", "IndexTimeRangeLocalTime", "IndexTimeRangeNTZ", "IndexTimeRangeTZ", diff --git a/src/chronify/time.py b/src/chronify/time.py index 74a9164..1661457 100644 --- a/src/chronify/time.py +++ b/src/chronify/time.py @@ -11,6 +11,7 @@ class TimeType(StrEnum): """Defines the supported time formats in the load data.""" DATETIME = "datetime" + DATETIME_TZ_COL = "datetime_tz_col" ANNUAL = "annual" INDEX_NTZ = 
"index_ntz" INDEX_TZ = "index_tz" diff --git a/src/chronify/time_configs.py b/src/chronify/time_configs.py index db4f2da..f8f6e03 100644 --- a/src/chronify/time_configs.py +++ b/src/chronify/time_configs.py @@ -60,15 +60,10 @@ def get_time_zone_column(self) -> Optional[str]: """Return the column in the table that contains time zone or offset information.""" -class DatetimeRange(TimeBaseModel): - """Defines a time range that uses Python datetime instances.""" +class DatetimeRangeBase(TimeBaseModel): + """Defines a time range base class that uses Python datetime instances.""" time_column: str = Field(description="Column in the table that represents time.") - time_type: Literal[TimeType.DATETIME] = TimeType.DATETIME - start: datetime = Field( - description="Start time of the range. If it includes a time zone, the timestamps in " - "the data must be time zone-aware." - ) length: int resolution: timedelta @@ -79,6 +74,16 @@ def start_time_is_tz_naive(self) -> bool: def list_time_columns(self) -> list[str]: return [self.time_column] + +class DatetimeRange(DatetimeRangeBase): + """Defines a time range with a single time zone.""" + + time_type: Literal[TimeType.DATETIME] = TimeType.DATETIME + start: datetime = Field( + description="Start time of the range. If it includes a time zone, the timestamps in " + "the data must be time zone-aware." + ) + def get_time_zone_column(self) -> None: return None @@ -91,8 +96,33 @@ def replace_time_zone(self, tz: ZoneInfo | None) -> "DatetimeRange": return self.model_copy(update={"start": self.start.replace(tzinfo=tz)}) -# TODO: -# class DatetimeRangeWithTZColumn(TimeBaseModel): +class DatetimeRangeWithTZColumn(DatetimeRangeBase): + """Defines a time range that uses an external time zone column to interpret timestamps.""" + + time_type: Literal[TimeType.DATETIME_TZ_COL] = TimeType.DATETIME_TZ_COL + start: datetime = Field( + description="Start time of the range. The timestamps in the data must be tz-naive." + ) + time_zone_column: str = Field( + description="Column in the table that has time zone or offset information." 
+ ) + + @field_validator("start") + @classmethod + def check_start_timestamp(cls, start_timestamp: datetime) -> datetime: + if start_timestamp.tzinfo is not None: + msg = "start_timestamp must be tz-naive for DATETIME_TZ_COL" + raise ValueError(msg) + return start_timestamp + + def get_time_zone_column(self) -> str: + return self.time_zone_column + + +DateTimeRanges = Union[ + DatetimeRange, + DatetimeRangeWithTZColumn, +] class AnnualTimeRange(TimeBaseModel): @@ -363,6 +393,7 @@ def default_config(cls, length: int, year: int) -> "MonthDayHourTimeNTZ": Union[ AnnualTimeRange, DatetimeRange, + DatetimeRangeWithTZColumn, IndexTimeRangeNTZ, IndexTimeRangeTZ, IndexTimeRangeLocalTime, diff --git a/src/chronify/time_range_generator_factory.py b/src/chronify/time_range_generator_factory.py index bbdf4fb..b69b4ef 100644 --- a/src/chronify/time_range_generator_factory.py +++ b/src/chronify/time_range_generator_factory.py @@ -2,7 +2,7 @@ from chronify.time_configs import ( AnnualTimeRange, - DatetimeRange, + DatetimeRangeBase, IndexTimeRangeBase, RepresentativePeriodTimeBase, TimeBaseModel, @@ -22,7 +22,7 @@ def make_time_range_generator( leap_day_adjustment: Optional[LeapDayAdjustmentType] = None, ) -> TimeRangeGeneratorBase: match model: - case DatetimeRange(): + case DatetimeRangeBase(): return DatetimeRangeGenerator(model, leap_day_adjustment=leap_day_adjustment) case AnnualTimeRange(): return AnnualTimeRangeGenerator(model) diff --git a/src/chronify/time_zone_converter.py b/src/chronify/time_zone_converter.py index 4549033..4b6f52b 100644 --- a/src/chronify/time_zone_converter.py +++ b/src/chronify/time_zone_converter.py @@ -6,12 +6,19 @@ import pandas as pd from chronify.models import TableSchema, MappingTableSchema -from chronify.time_configs import DatetimeRange, TimeBasedDataAdjustment +from chronify.time_configs import ( + DatetimeRangeBase, + DatetimeRange, + DatetimeRangeWithTZColumn, + TimeBasedDataAdjustment, +) from chronify.exceptions import InvalidParameter from chronify.time_series_mapper_base import apply_mapping from chronify.time_series_mapper_datetime import MapperDatetimeToDatetime from chronify.time_range_generator_factory import make_time_range_generator from chronify.sqlalchemy.functions import read_database +from chronify.time import TimeType +from chronify.time_utils import wrap_timestamps class TimeZoneConverterBase(abc.ABC): @@ -28,8 +35,13 @@ def __init__( self._from_schema = from_schema def check_from_schema(self) -> None: + msg = "" if not isinstance(self._from_schema.time_config, DatetimeRange): - msg = "Source schema does not have DatetimeRange time config." + msg += "Source schema does not have DatetimeRange time config. " + if self._from_schema.time_config.start_time_is_tz_naive(): + msg += "Source schema start_time must be timezone-aware. " + msg += "To convert from timezone-naive to timezone-aware, use the TimeSeriesMapperDatetime.map_time() method instead. 
" + if msg != "": raise InvalidParameter(msg) @abc.abstractmethod @@ -92,17 +104,39 @@ class TimeZoneConverterByGeography(TimeZoneConverterBase): """Class for time zone conversion of time series data based on a geography-based time zone column.""" def __init__( - self, engine: Engine, metadata: MetaData, from_schema: TableSchema, time_zone_column: str + self, + engine: Engine, + metadata: MetaData, + from_schema: TableSchema, + time_zone_column: str, + wrap_time_allowed: Optional[bool] = False, ): super().__init__(engine, metadata, from_schema) self.time_zone_column = time_zone_column + self._wrap_time_allowed = wrap_time_allowed self._to_schema = self.generate_to_schema() + def generate_to_time_config(self) -> DatetimeRangeBase: + if self._wrap_time_allowed: + time_kwargs = self._from_schema.time_config.model_dump() + time_kwargs = dict( + filter( + lambda k_v: k_v[0] in DatetimeRangeWithTZColumn.model_fields, + time_kwargs.items(), + ) + ) + time_kwargs["time_type"] = TimeType.DATETIME_TZ_COL + time_kwargs["start"] = self._from_schema.time_config.start.replace(tzinfo=None) + time_kwargs["time_zone_column"] = self.time_zone_column + return DatetimeRangeWithTZColumn(**time_kwargs) + + return self._from_schema.time_config.replace_time_zone(None) + def generate_to_schema(self) -> TableSchema: to_schema: TableSchema = self._from_schema.model_copy( update={ "name": f"{self._from_schema.name}_tz_converted", - "time_config": self._from_schema.time_config.replace_time_zone(None), + "time_config": self.generate_to_time_config(), } ) return to_schema @@ -116,7 +150,8 @@ def convert_time_zone( self.check_from_schema() df, mapping_schema = self._create_interm_map_with_time_zone() - # Do not check mapped timestamps because they cannot be described by the mapped_schema time_config + # Do not check mapped timestamps when not wrap_time_allowed + # because they cannot be fully described by the to_schema time_config apply_mapping( df, mapping_schema, @@ -127,6 +162,7 @@ def convert_time_zone( TimeBasedDataAdjustment(), scratch_dir=scratch_dir, output_file=output_file, + check_mapped_timestamps=check_mapped_timestamps if self._wrap_time_allowed else False, ) def _create_interm_map_with_time_zone(self) -> tuple[pd.DataFrame, MappingTableSchema]: @@ -135,6 +171,9 @@ def _create_interm_map_with_time_zone(self) -> tuple[pd.DataFrame, MappingTableS from_time_col = "from_" + mapped_time_col from_time_data = make_time_range_generator(self._from_schema.time_config).list_timestamps() + if self._wrap_time_allowed: + to_time_data = make_time_range_generator(self._to_schema.time_config).list_timestamps() + from_tz_col = "from_" + self.time_zone_column with self._engine.connect() as conn: @@ -151,18 +190,24 @@ def _create_interm_map_with_time_zone(self) -> tuple[pd.DataFrame, MappingTableS from_time_config = self._from_schema.time_config.model_copy( update={"time_column": from_time_col} ) - to_time_config = self._from_schema.time_config.replace_time_zone(None) + to_time_config = self.generate_to_time_config() df_tz = [] for time_zone in time_zones: tz = ZoneInfo(time_zone) if time_zone not in [None, "None"] else None - mapped_time_data = [x.tz_convert(tz).tz_localize(None) for x in from_time_data] + converted_time_data = [x.tz_convert(tz).tz_localize(None) for x in from_time_data] + if self._wrap_time_allowed: + final_time_data = wrap_timestamps( + pd.Series(converted_time_data), pd.Series(to_time_data) + ) + else: + final_time_data = converted_time_data df_tz.append( pd.DataFrame( { from_time_col: from_time_data, 
from_tz_col: time_zone, - mapped_time_col: mapped_time_data, + mapped_time_col: final_time_data, } ) ) diff --git a/tests/test_time_zone_converter.py b/tests/test_time_zone_converter.py index 0f71c7c..599fe5a 100644 --- a/tests/test_time_zone_converter.py +++ b/tests/test_time_zone_converter.py @@ -1,6 +1,7 @@ from zoneinfo import ZoneInfo from datetime import datetime, timedelta import numpy as np +import pytest import pandas as pd from sqlalchemy import Engine, MetaData @@ -116,20 +117,29 @@ def run_conversion_by_geography( engine: Engine, df: pd.DataFrame, from_schema: TableSchema, + wrap_time_allowed: bool, ) -> None: metadata = MetaData() ingest_data(engine, metadata, df, from_schema) - TZC = TimeZoneConverterByGeography(engine, metadata, from_schema, "time_zone") + TZC = TimeZoneConverterByGeography( + engine, metadata, from_schema, "time_zone", wrap_time_allowed=wrap_time_allowed + ) TZC.convert_time_zone() dfo = get_mapped_dataframe(engine, TZC._to_schema.name, TZC._to_schema.time_config) dfo = dfo[df.columns].sort_values(by="index").reset_index(drop=True) + dfo["timestamp"] = pd.to_datetime(dfo["timestamp"]) # needed for engine 2, not sure why assert df["value"].equals(dfo["value"]) - for i in range(len(df)): - tzn = df.loc[i, "time_zone"] - tz = ZoneInfo(tzn) if tzn != "None" else None - ts = df.loc[i, "timestamp"].tz_convert(tz).replace(tzinfo=None) - assert dfo.loc[i, "timestamp"] == ts + if wrap_time_allowed: + assert set(dfo["timestamp"].value_counts()) == {4} + expected = [x.replace(tzinfo=None) for x in sorted(set(df["timestamp"]))] + assert set(dfo["timestamp"]) == set(expected) + else: + for i in range(len(df)): + tzn = df.loc[i, "time_zone"] + tz = ZoneInfo(tzn) if tzn != "None" else None + ts = df.loc[i, "timestamp"].tz_convert(tz).replace(tzinfo=None) + assert dfo.loc[i, "timestamp"] == ts def test_time_conversion(iter_engines: Engine) -> None: @@ -141,7 +151,8 @@ def test_time_conversion(iter_engines: Engine) -> None: run_conversion(iter_engines, df, from_schema, to_time_zone) -def test_time_conversion_by_geography(iter_engines: Engine) -> None: +@pytest.mark.parametrize("wrap_time_allowed", [False, True]) +def test_time_conversion_by_geography(iter_engines: Engine, wrap_time_allowed: bool) -> None: from_schema = get_datetime_schema( 2018, ZoneInfo("US/Mountain"), @@ -150,4 +161,4 @@ def test_time_conversion_by_geography(iter_engines: Engine) -> None: has_tz_col=True, ) df = generate_dataframe_with_tz_col(from_schema) - run_conversion_by_geography(iter_engines, df, from_schema) + run_conversion_by_geography(iter_engines, df, from_schema, wrap_time_allowed) From ed297e1ff8cb7c25b57dab618ea435085a169ff1 Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Tue, 16 Sep 2025 23:15:58 -0600 Subject: [PATCH 04/27] Rename IndexTimeRangeLocalTime to IndexTimeRangeWithTZColumn --- src/chronify/__init__.py | 4 ++-- src/chronify/time_configs.py | 8 ++++---- src/chronify/time_series_mapper_index_time.py | 6 +++--- tests/test_mapper_index_time_to_datetime.py | 4 ++-- tests/test_store.py | 8 ++++++-- 5 files changed, 17 insertions(+), 13 deletions(-) diff --git a/src/chronify/__init__.py b/src/chronify/__init__.py index 29d5704..0ca60e7 100644 --- a/src/chronify/__init__.py +++ b/src/chronify/__init__.py @@ -24,7 +24,7 @@ DatetimeRangeWithTZColumn, IndexTimeRangeNTZ, IndexTimeRangeTZ, - IndexTimeRangeLocalTime, + IndexTimeRangeWithTZColumn, RepresentativePeriodTimeNTZ, RepresentativePeriodTimeTZ, TimeBaseModel, @@ -39,7 +39,7 @@ "CsvTableSchema", 
"DatetimeRange", "DatetimeRangeWithTZColumn", - "IndexTimeRangeLocalTime", + "IndexTimeRangeWithTZColumn", "IndexTimeRangeNTZ", "IndexTimeRangeTZ", "InvalidOperation", diff --git a/src/chronify/time_configs.py b/src/chronify/time_configs.py index f8f6e03..05a020a 100644 --- a/src/chronify/time_configs.py +++ b/src/chronify/time_configs.py @@ -201,7 +201,7 @@ def get_time_zone_column(self) -> None: return None -class IndexTimeRangeLocalTime(IndexTimeRangeBase): +class IndexTimeRangeWithTZColumn(IndexTimeRangeBase): """Index time that reprsents local time relative to a time zone column. start_timestamp is tz-naive. Used for dataset where the timeseries for all geographies start at the same @@ -217,7 +217,7 @@ class IndexTimeRangeLocalTime(IndexTimeRangeBase): @classmethod def check_start_timestamp(cls, start_timestamp: datetime) -> datetime: if start_timestamp.tzinfo is not None: - msg = "start_timestamp must be tz-naive for IndexTimeRangeLocalTime" + msg = "start_timestamp must be tz-naive for IndexTimeRangeWithTZColumn" raise ValueError(msg) return start_timestamp @@ -228,7 +228,7 @@ def get_time_zone_column(self) -> str: IndexTimeRanges = Union[ IndexTimeRangeNTZ, IndexTimeRangeTZ, - IndexTimeRangeLocalTime, + IndexTimeRangeWithTZColumn, ] @@ -396,7 +396,7 @@ def default_config(cls, length: int, year: int) -> "MonthDayHourTimeNTZ": DatetimeRangeWithTZColumn, IndexTimeRangeNTZ, IndexTimeRangeTZ, - IndexTimeRangeLocalTime, + IndexTimeRangeWithTZColumn, RepresentativePeriodTimeNTZ, RepresentativePeriodTimeTZ, YearMonthDayPeriodTimeNTZ, diff --git a/src/chronify/time_series_mapper_index_time.py b/src/chronify/time_series_mapper_index_time.py index 0cafb0c..2dfc14f 100644 --- a/src/chronify/time_series_mapper_index_time.py +++ b/src/chronify/time_series_mapper_index_time.py @@ -14,7 +14,7 @@ DatetimeRange, IndexTimeRanges, IndexTimeRangeBase, - IndexTimeRangeLocalTime, + IndexTimeRangeWithTZColumn, TimeBasedDataAdjustment, ) from chronify.time_range_generator_factory import make_time_range_generator @@ -227,7 +227,7 @@ def _create_interm_map_with_time_zone( from_time_config = self._from_time_config.model_copy( update={"time_column": from_time_col, "time_zone_column": from_tz_col} ) - assert isinstance(from_time_config, IndexTimeRangeLocalTime) + assert isinstance(from_time_config, IndexTimeRangeWithTZColumn) df_tz = [] to_tz = self._to_time_config.start.tzinfo @@ -297,7 +297,7 @@ def _create_interm_map_with_time_zone_and_dst_adjustment( from_time_config = self._from_time_config.model_copy( update={"time_column": from_time_col, "time_zone_column": from_tz_col} ) - assert isinstance(from_time_config, IndexTimeRangeLocalTime) + assert isinstance(from_time_config, IndexTimeRangeWithTZColumn) df_tz = [] for time_zone in time_zones: diff --git a/tests/test_mapper_index_time_to_datetime.py b/tests/test_mapper_index_time_to_datetime.py index 87bf19d..580537e 100644 --- a/tests/test_mapper_index_time_to_datetime.py +++ b/tests/test_mapper_index_time_to_datetime.py @@ -11,7 +11,7 @@ DatetimeRange, IndexTimeRangeNTZ, IndexTimeRangeTZ, - IndexTimeRangeLocalTime, + IndexTimeRangeWithTZColumn, TimeBasedDataAdjustment, ) from chronify.exceptions import ConflictingInputsError @@ -118,7 +118,7 @@ def data_for_unaligned_time_mapping( ] ) - time_config = IndexTimeRangeLocalTime( + time_config = IndexTimeRangeWithTZColumn( start=1, length=time_array_len, start_timestamp=pd.Timestamp("2018-01-01 00:00"), diff --git a/tests/test_store.py b/tests/test_store.py index 9c5b0cc..b84aebe 100644 --- a/tests/test_store.py 
+++ b/tests/test_store.py @@ -34,7 +34,11 @@ from chronify.models import ColumnDType, CsvTableSchema, PivotedTableSchema, TableSchema from chronify.store import Store from chronify.time import TimeIntervalType, DaylightSavingAdjustmentType -from chronify.time_configs import DatetimeRange, IndexTimeRangeLocalTime, TimeBasedDataAdjustment +from chronify.time_configs import ( + DatetimeRange, + IndexTimeRangeWithTZColumn, + TimeBasedDataAdjustment, +) from chronify.time_range_generator_factory import make_time_range_generator from chronify.time_series_checker import check_timestamp_lists from chronify.utils.sql import make_temp_view_name @@ -516,7 +520,7 @@ def test_map_index_time_to_datetime( name="generators_index", time_array_id_columns=["generator", "time_zone"], value_column="value", - time_config=IndexTimeRangeLocalTime( + time_config=IndexTimeRangeWithTZColumn( start=0, length=time_array_len, start_timestamp=pd.Timestamp(f"{year}-01-01 00:00"), From 87a116e8836ac101d47476e752d59d2885abbc8c Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Tue, 16 Sep 2025 23:46:00 -0600 Subject: [PATCH 05/27] Rename INDEX_LOCAL to INDEX_TZ_COL --- src/chronify/time.py | 2 +- src/chronify/time_configs.py | 13 ++++++++----- src/chronify/time_series_mapper_index_time.py | 10 +++++----- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/chronify/time.py b/src/chronify/time.py index 1661457..e78f508 100644 --- a/src/chronify/time.py +++ b/src/chronify/time.py @@ -15,7 +15,7 @@ class TimeType(StrEnum): ANNUAL = "annual" INDEX_NTZ = "index_ntz" INDEX_TZ = "index_tz" - INDEX_LOCAL = "index_local" + INDEX_TZ_COL = "INDEX_TZ_COL" REPRESENTATIVE_PERIOD_NTZ = "representative_period_ntz" REPRESENTATIVE_PERIOD_TZ = "representative_period_tz" YEAR_MONTH_DAY_HOUR_NTZ = "year_month_day_hour" diff --git a/src/chronify/time_configs.py b/src/chronify/time_configs.py index 05a020a..29c370c 100644 --- a/src/chronify/time_configs.py +++ b/src/chronify/time_configs.py @@ -67,10 +67,6 @@ class DatetimeRangeBase(TimeBaseModel): length: int resolution: timedelta - def start_time_is_tz_naive(self) -> bool: - """Return True if the timestamps in the range do not have time zones.""" - return self.start.tzinfo is None - def list_time_columns(self) -> list[str]: return [self.time_column] @@ -84,6 +80,10 @@ class DatetimeRange(DatetimeRangeBase): "the data must be time zone-aware." ) + def start_time_is_tz_naive(self) -> bool: + """Return True if the timestamps in the range do not have time zones.""" + return self.start.tzinfo is None + def get_time_zone_column(self) -> None: return None @@ -115,6 +115,9 @@ def check_start_timestamp(cls, start_timestamp: datetime) -> datetime: raise ValueError(msg) return start_timestamp + def start_time_is_tz_naive(self) -> bool: + return True + def get_time_zone_column(self) -> str: return self.time_zone_column @@ -208,7 +211,7 @@ class IndexTimeRangeWithTZColumn(IndexTimeRangeBase): clock time. """ - time_type: Literal[TimeType.INDEX_LOCAL] = TimeType.INDEX_LOCAL + time_type: Literal[TimeType.INDEX_TZ_COL] = TimeType.INDEX_TZ_COL time_zone_column: str = Field( description="Column in the table that has time zone or offset information." 
) diff --git a/src/chronify/time_series_mapper_index_time.py b/src/chronify/time_series_mapper_index_time.py index 2dfc14f..34c4094 100644 --- a/src/chronify/time_series_mapper_index_time.py +++ b/src/chronify/time_series_mapper_index_time.py @@ -77,7 +77,7 @@ def map_time( self.check_schema_consistency() # Convert from index time to its represented datetime - if self._from_time_config.time_type == TimeType.INDEX_LOCAL: + if self._from_time_config.time_type == TimeType.INDEX_TZ_COL: if ( self._dst_adjustment == DaylightSavingAdjustmentType.DROP_SPRING_FORWARD_DUPLICATE_FALLBACK @@ -207,7 +207,7 @@ def _create_interm_map(self) -> tuple[pd.DataFrame, MappingTableSchema, TableSch def _create_interm_map_with_time_zone( self, ) -> tuple[pd.DataFrame, MappingTableSchema, TableSchema]: - """Create mapping dataframe for converting INDEX_LOCAL time to its represented datetime""" + """Create mapping dataframe for converting INDEX_TZ_COL time to its represented datetime""" mapped_schema = self._create_intermediate_schema() assert isinstance(mapped_schema.time_config, DatetimeRange) mapped_time_col = mapped_schema.time_config.time_column @@ -216,7 +216,7 @@ def _create_interm_map_with_time_zone( from_time_data = make_time_range_generator(self._from_time_config).list_timestamps() tz_col = self._from_time_config.get_time_zone_column() - assert tz_col is not None, "Expecting a time zone column for INDEX_LOCAL" + assert tz_col is not None, "Expecting a time zone column for INDEX_TZ_COL" from_tz_col = "from_" + tz_col with self._engine.connect() as conn: @@ -262,7 +262,7 @@ def _create_interm_map_with_time_zone_and_dst_adjustment( self, interpolate_fallback: bool = False, ) -> tuple[pd.DataFrame, MappingTableSchema, TableSchema]: - """Create mapping dataframe for converting INDEX_LOCAL time to its represented datetime + """Create mapping dataframe for converting INDEX_TZ_COL time to its represented datetime with time-based daylight_saving adjustment that drops the spring-forward hour and, per user input, interpolates or duplicates the fall-back hour @@ -286,7 +286,7 @@ def _create_interm_map_with_time_zone_and_dst_adjustment( df_ntz["clock_time"] = df_ntz["clock_time"].astype(str) tz_col = self._from_time_config.get_time_zone_column() - assert tz_col is not None, "Expecting a time zone column for INDEX_LOCAL" + assert tz_col is not None, "Expecting a time zone column for INDEX_TZ_COL" from_tz_col = "from_" + tz_col with self._engine.connect() as conn: From d65a2b342cf15c5fc563bdabe52890ae41451662 Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Wed, 24 Sep 2025 10:48:42 -0600 Subject: [PATCH 06/27] Rework time_zone_converter to output tz naive col --- src/chronify/datetime_range_generator.py | 5 +- src/chronify/time_series_mapper_base.py | 6 -- src/chronify/time_zone_converter.py | 71 ++++++++++++++++++++---- tests/test_time_zone_converter.py | 11 +++- 4 files changed, 71 insertions(+), 22 deletions(-) diff --git a/src/chronify/datetime_range_generator.py b/src/chronify/datetime_range_generator.py index 7402c03..50686f7 100644 --- a/src/chronify/datetime_range_generator.py +++ b/src/chronify/datetime_range_generator.py @@ -66,7 +66,10 @@ def iter_timestamps(self) -> Generator[datetime, None, None]: yield pd.Timestamp(cur) def list_distinct_timestamps_from_dataframe(self, df: pd.DataFrame) -> list[datetime]: - return sorted(df[self._model.time_column].unique()) + result = sorted(df[self._model.time_column].unique()) + if not isinstance(result[0], datetime): + result = 
[pd.Timestamp(x) for x in result] + return result def list_time_columns(self) -> list[str]: return self._model.list_time_columns() diff --git a/src/chronify/time_series_mapper_base.py b/src/chronify/time_series_mapper_base.py index d1c7bd4..5de3e15 100644 --- a/src/chronify/time_series_mapper_base.py +++ b/src/chronify/time_series_mapper_base.py @@ -112,12 +112,6 @@ def apply_mapping( scratch_dir=scratch_dir, ) metadata.reflect(engine, views=True) - - # with engine.connect() as conn: - # import pandas as pd - # dfm = pd.read_sql(f"select * from {mapping_schema.name}", conn) - # df = pd.read_sql(f"select * from {from_schema.name}", conn) - # breakpoint() created_tmp_view = False try: _apply_mapping( diff --git a/src/chronify/time_zone_converter.py b/src/chronify/time_zone_converter.py index 4b6f52b..e67df3e 100644 --- a/src/chronify/time_zone_converter.py +++ b/src/chronify/time_zone_converter.py @@ -14,7 +14,6 @@ ) from chronify.exceptions import InvalidParameter from chronify.time_series_mapper_base import apply_mapping -from chronify.time_series_mapper_datetime import MapperDatetimeToDatetime from chronify.time_range_generator_factory import make_time_range_generator from chronify.sqlalchemy.functions import read_database from chronify.time import TimeType @@ -72,11 +71,28 @@ def __init__( self._to_time_zone = to_time_zone self._to_schema = self.generate_to_schema() + def generate_to_time_config(self) -> DatetimeRangeBase: + to_time_config = self._from_schema.time_config.convert_time_zone( + self._to_time_zone + ).replace_time_zone(None) + time_kwargs = to_time_config.model_dump() + # time_kwargs = self._from_schema.time_config.model_dump() + time_kwargs = dict( + filter( + lambda k_v: k_v[0] in DatetimeRangeWithTZColumn.model_fields, + time_kwargs.items(), + ) + ) + time_kwargs["time_type"] = TimeType.DATETIME_TZ_COL + # time_kwargs["start"] = self._from_schema.time_config.start.astimezone(self._to_time_zone).replace(tzinfo=None) + time_kwargs["time_zone_column"] = "time_zone" + return DatetimeRangeWithTZColumn(**time_kwargs) + def generate_to_schema(self) -> TableSchema: to_schema: TableSchema = self._from_schema.model_copy( update={ "name": f"{self._from_schema.name}_tz_converted", - "time_config": self._from_schema.time_config.convert_time_zone(self._to_time_zone), + "time_config": self.generate_to_time_config(), } ) return to_schema @@ -88,17 +104,48 @@ def convert_time_zone( check_mapped_timestamps: bool = False, ) -> None: self.check_from_schema() - MapperDatetimeToDatetime( - self._engine, - self._metadata, + df, mapping_schema = self._create_mapping() + + apply_mapping( + df, + mapping_schema, self._from_schema, self._to_schema, - ).map_time( + self._engine, + self._metadata, + TimeBasedDataAdjustment(), scratch_dir=scratch_dir, output_file=output_file, check_mapped_timestamps=check_mapped_timestamps, ) + def _create_mapping(self) -> tuple[pd.DataFrame, MappingTableSchema]: + """Create mapping dataframe for converting datetime to geography-based time zone""" + time_col = self._from_schema.time_config.time_column + from_time_col = "from_" + time_col + from_time_data = make_time_range_generator(self._from_schema.time_config).list_timestamps() + to_time_data = make_time_range_generator(self._to_schema.time_config).list_timestamps() + + from_time_config = self._from_schema.time_config.model_copy( + update={"time_column": from_time_col} + ) + to_time_config = self.generate_to_time_config() + tz_col = to_time_config.time_zone_column + tz_name = None if self._to_time_zone is None else 
self._to_time_zone.key + df = pd.DataFrame( + { + from_time_col: from_time_data, + tz_col: tz_name, + time_col: to_time_data, + } + ) + + mapping_schema = MappingTableSchema( + name="mapping_table_gtz_conversion", + time_configs=[from_time_config, to_time_config], + ) + return df, mapping_schema + class TimeZoneConverterByGeography(TimeZoneConverterBase): """Class for time zone conversion of time series data based on a geography-based time zone column.""" @@ -148,7 +195,7 @@ def convert_time_zone( check_mapped_timestamps: bool = False, # will not be used ) -> None: self.check_from_schema() - df, mapping_schema = self._create_interm_map_with_time_zone() + df, mapping_schema = self._create_mapping() # Do not check mapped timestamps when not wrap_time_allowed # because they cannot be fully described by the to_schema time_config @@ -165,10 +212,10 @@ def convert_time_zone( check_mapped_timestamps=check_mapped_timestamps if self._wrap_time_allowed else False, ) - def _create_interm_map_with_time_zone(self) -> tuple[pd.DataFrame, MappingTableSchema]: + def _create_mapping(self) -> tuple[pd.DataFrame, MappingTableSchema]: """Create mapping dataframe for converting datetime to geography-based time zone""" - mapped_time_col = self._from_schema.time_config.time_column - from_time_col = "from_" + mapped_time_col + time_col = self._from_schema.time_config.time_column + from_time_col = "from_" + time_col from_time_data = make_time_range_generator(self._from_schema.time_config).list_timestamps() if self._wrap_time_allowed: @@ -194,7 +241,7 @@ def _create_interm_map_with_time_zone(self) -> tuple[pd.DataFrame, MappingTableS df_tz = [] for time_zone in time_zones: - tz = ZoneInfo(time_zone) if time_zone not in [None, "None"] else None + tz = None if time_zone in [None, "None"] else ZoneInfo(time_zone) converted_time_data = [x.tz_convert(tz).tz_localize(None) for x in from_time_data] if self._wrap_time_allowed: final_time_data = wrap_timestamps( @@ -207,7 +254,7 @@ def _create_interm_map_with_time_zone(self) -> tuple[pd.DataFrame, MappingTableS { from_time_col: from_time_data, from_tz_col: time_zone, - mapped_time_col: final_time_data, + time_col: final_time_data, } ) ) diff --git a/tests/test_time_zone_converter.py b/tests/test_time_zone_converter.py index 599fe5a..0307ab4 100644 --- a/tests/test_time_zone_converter.py +++ b/tests/test_time_zone_converter.py @@ -110,7 +110,9 @@ def run_conversion( TZC.convert_time_zone(check_mapped_timestamps=True) dfo = get_mapped_dataframe(engine, TZC._to_schema.name, TZC._to_schema.time_config) - assert (df["timestamp"] == dfo["timestamp"]).prod() == 1 # TODO: these will always be equal + assert df["value"].equals(dfo["value"]) + expected = df["timestamp"].dt.tz_convert(to_time_zone).dt.tz_localize(None) + assert (dfo["timestamp"] == expected).prod() == 1 def run_conversion_by_geography( @@ -142,12 +144,15 @@ def run_conversion_by_geography( assert dfo.loc[i, "timestamp"] == ts -def test_time_conversion(iter_engines: Engine) -> None: +@pytest.mark.parametrize( + "to_time_zone", [ZoneInfo("US/Central"), ZoneInfo("UTC"), ZoneInfo("America/Los_Angeles")] +) +def test_time_conversion(iter_engines: Engine, to_time_zone: ZoneInfo) -> None: from_schema = get_datetime_schema( 2018, ZoneInfo("US/Mountain"), TimeIntervalType.PERIOD_BEGINNING, "base_table" ) df = generate_datetime_dataframe(from_schema) - to_time_zone = ZoneInfo("US/Central") + to_time_zone = ZoneInfo("US/Central") # TODO run_conversion(iter_engines, df, from_schema, to_time_zone) From 
38691a373c22069fbd704f47f1794188c7c61ec6 Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Wed, 24 Sep 2025 11:03:23 -0600 Subject: [PATCH 07/27] Common API --- src/chronify/time_zone_converter.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/src/chronify/time_zone_converter.py b/src/chronify/time_zone_converter.py index e67df3e..2acec03 100644 --- a/src/chronify/time_zone_converter.py +++ b/src/chronify/time_zone_converter.py @@ -20,6 +20,27 @@ from chronify.time_utils import wrap_timestamps +def convert_to_single_time_zone( + engine: Engine, + metadata: MetaData, + from_schema: TableSchema, + to_time_zone: ZoneInfo | None, +) -> None: + TimeZoneConverter(engine, metadata, from_schema, to_time_zone) + + +def convert_to_multiple_time_zones( + engine: Engine, + metadata: MetaData, + from_schema: TableSchema, + time_zone_column: str, + wrap_time_allowed: Optional[bool] = False, +) -> None: + TimeZoneConverterByGeography( + engine, metadata, from_schema, time_zone_column, wrap_time_allowed + ) + + class TimeZoneConverterBase(abc.ABC): """Base class for time zone conversion of time series data.""" @@ -76,7 +97,6 @@ def generate_to_time_config(self) -> DatetimeRangeBase: self._to_time_zone ).replace_time_zone(None) time_kwargs = to_time_config.model_dump() - # time_kwargs = self._from_schema.time_config.model_dump() time_kwargs = dict( filter( lambda k_v: k_v[0] in DatetimeRangeWithTZColumn.model_fields, @@ -84,7 +104,6 @@ def generate_to_time_config(self) -> DatetimeRangeBase: ) ) time_kwargs["time_type"] = TimeType.DATETIME_TZ_COL - # time_kwargs["start"] = self._from_schema.time_config.start.astimezone(self._to_time_zone).replace(tzinfo=None) time_kwargs["time_zone_column"] = "time_zone" return DatetimeRangeWithTZColumn(**time_kwargs) From db8ab6b4320da5c9008e0d5ff56316833655243d Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Thu, 16 Oct 2025 17:00:24 -0600 Subject: [PATCH 08/27] Revise DatetimeRangeWithTZColumn class and downstream funcs --- src/chronify/annual_time_range_generator.py | 5 +- ...umn_representative_time_range_generator.py | 5 +- src/chronify/datetime_range_generator.py | 122 +++++++++-- src/chronify/exceptions.py | 4 + src/chronify/index_time_range_generator.py | 5 +- .../representative_time_range_generator.py | 13 +- src/chronify/store.py | 184 ++++++++++++++++ src/chronify/time_configs.py | 21 +- src/chronify/time_range_generator_base.py | 10 +- src/chronify/time_range_generator_factory.py | 14 +- src/chronify/time_series_checker.py | 72 ++++++- ...apper_column_representative_to_datetime.py | 2 +- src/chronify/time_utils.py | 18 ++ src/chronify/time_zone_converter.py | 197 +++++++++++++----- tests/test_mapper_datetime_to_datetime.py | 2 +- ..._mapper_representative_time_to_datetime.py | 2 +- tests/test_store.py | 155 ++++++++++++++ tests/test_time_zone_converter.py | 84 ++++++-- 18 files changed, 789 insertions(+), 126 deletions(-) diff --git a/src/chronify/annual_time_range_generator.py b/src/chronify/annual_time_range_generator.py index d8e7282..ce127bb 100644 --- a/src/chronify/annual_time_range_generator.py +++ b/src/chronify/annual_time_range_generator.py @@ -11,10 +11,13 @@ def __init__(self, model: AnnualTimeRange) -> None: super().__init__() self._model = model - def iter_timestamps(self) -> Generator[int, None, None]: + def _iter_timestamps(self) -> Generator[int, None, None]: for i in range(1, self._model.length + 1): yield i + def 
list_timestamps(self) -> list[int]: + return list(self._iter_timestamps()) + def list_distinct_timestamps_from_dataframe(self, df: pd.DataFrame) -> list[Any]: raise NotImplementedError diff --git a/src/chronify/column_representative_time_range_generator.py b/src/chronify/column_representative_time_range_generator.py index 74e09b3..5c24429 100644 --- a/src/chronify/column_representative_time_range_generator.py +++ b/src/chronify/column_representative_time_range_generator.py @@ -37,9 +37,12 @@ def __init__(self, model: ColumnRepresentativeBase): msg = f"No time generator for ColumnRepresentative time with time_config {type(self._model)}" raise exceptions.InvalidOperation(msg) - def iter_timestamps(self) -> Generator[tuple[int, ...], None, None]: + def _iter_timestamps(self) -> Generator[tuple[int, ...], None, None]: yield from self._handler._iter_timestamps() + def list_timestamps(self) -> list[tuple[int, ...]]: + return list(self._iter_timestamps()) + def list_distinct_timestamps_from_dataframe(self, df: pd.DataFrame) -> list[tuple[int, ...]]: return self._handler.list_distinct_timestamps_from_dataframe(df) diff --git a/src/chronify/datetime_range_generator.py b/src/chronify/datetime_range_generator.py index 50686f7..39e25ab 100644 --- a/src/chronify/datetime_range_generator.py +++ b/src/chronify/datetime_range_generator.py @@ -7,15 +7,14 @@ from chronify.time import ( LeapDayAdjustmentType, ) -from chronify.time_configs import ( - DatetimeRange, -) +from chronify.time_configs import DatetimeRange, DatetimeRangeWithTZColumn from chronify.time_utils import adjust_timestamp_by_dst_offset from chronify.time_range_generator_base import TimeRangeGeneratorBase +from chronify.exceptions import InvalidValue -class DatetimeRangeGenerator(TimeRangeGeneratorBase): - """Generates datetime ranges based on a DatetimeRange model.""" +class DatetimeRangeGeneratorBase(TimeRangeGeneratorBase): + """Base class that generates datetime ranges based on a DatetimeRange model.""" def __init__( self, @@ -25,18 +24,24 @@ def __init__( self._model = model self._adjustment = leap_day_adjustment or LeapDayAdjustmentType.NONE - def iter_timestamps(self) -> Generator[datetime, None, None]: + def _iter_timestamps( + self, start: Optional[datetime] = None + ) -> Generator[datetime, None, None]: + """ + if start is supplied, override self._model.start + """ + if start is None: + start = self._model.start + tz = start.tzinfo + for i in range(self._model.length): - if self._model.start_time_is_tz_naive(): + if not tz: cur = adjust_timestamp_by_dst_offset( - self._model.start + i * self._model.resolution, self._model.resolution + start + i * self._model.resolution, self._model.resolution ) else: - tz = self._model.start.tzinfo # always step in standard time - cur_utc = ( - self._model.start.astimezone(ZoneInfo("UTC")) + i * self._model.resolution - ) + cur_utc = start.astimezone(ZoneInfo("UTC")) + i * self._model.resolution cur = adjust_timestamp_by_dst_offset( cur_utc.astimezone(tz), self._model.resolution ) @@ -65,11 +70,98 @@ def iter_timestamps(self) -> Generator[datetime, None, None]: ): yield pd.Timestamp(cur) - def list_distinct_timestamps_from_dataframe(self, df: pd.DataFrame) -> list[datetime]: + def list_time_columns(self) -> list[str]: + return self._model.list_time_columns() + + def list_distinct_timestamps_from_dataframe(self, df: pd.DataFrame) -> list[datetime]: # TODO result = sorted(df[self._model.time_column].unique()) if not isinstance(result[0], datetime): result = [pd.Timestamp(x) for x in result] return 
result - def list_time_columns(self) -> list[str]: - return self._model.list_time_columns() + +class DatetimeRangeGenerator(DatetimeRangeGeneratorBase): + """Generates datetime ranges based on a DatetimeRange model.""" + + def __init__( + self, + model: DatetimeRange, + leap_day_adjustment: Optional[LeapDayAdjustmentType] = None, + ) -> None: + super().__init__(model, leap_day_adjustment=leap_day_adjustment) + + def list_timestamps(self) -> list[datetime]: + return list(self._iter_timestamps()) + + +class DatetimeRangeGeneratorExternalTimeZone(DatetimeRangeGeneratorBase): + """Generates datetime ranges based on a DatetimeRangeWithTZColumn model. + datetime ranges will be tz-naive and can be listed by time_zone name using special class func + These ranges may be localized by the time_zone name. + # TODO: add offset as a column + """ + + def __init__( + self, + model: DatetimeRangeWithTZColumn, + leap_day_adjustment: Optional[LeapDayAdjustmentType] = None, + ) -> None: + super().__init__(model, leap_day_adjustment=leap_day_adjustment) + if not self._model.time_zones: + msg = ( + "DatetimeRangeWithTZColumn.time_zones needs to be instantiated for ", + f"DatetimeRangeGeneratorExternalTimeZone: {self._model}", + ) + raise InvalidValue(msg) + + if len(set(self._model.time_zones)) < len(self._model.time_zones): + msg = f"DatetimeRangeWithTZColumn.time_zones has duplicates: {self._model.time_zones}" + raise InvalidValue(msg) + + def _list_timestamps(self, time_zone: ZoneInfo) -> list[datetime]: + """always return tz-naive timestamps relative to input time_zone""" + # assert self._model.start.tzinfo is None + if self._model.start_time_is_tz_naive(): + if time_zone: + start = self._model.start.replace(tzinfo=time_zone) + else: + start = None + else: + if time_zone: + start = self._model.start.astimezone(time_zone) + else: + start = self._model.start.replace(tzinfo=None) + timestamps = list(self._iter_timestamps(start=start)) + return [x.replace(tzinfo=None) for x in timestamps] + + def list_timestamps(self) -> list[datetime]: + """return only unique values, this means no duplicates for prevailing time""" + ts_set = set() + for tz in self._model.time_zones: + ts_set.update(set(self._list_timestamps(tz))) + timestamps = sorted(ts_set) + return timestamps + + def list_timestamps_by_time_zone(self, distinct: bool = False) -> dict[str, list[datetime]]: + """for each time zone, returns full timestamp iteration with duplicates allowed""" + dct: dict[str, datetime] = {} + for tz in self._model.time_zones: + timestamps = self._list_timestamps(tz) + if distinct: + timestamps = sorted(set(timestamps)) + tz_name = tz.key if tz else "None" + dct[tz_name] = timestamps + + return dct + + def list_distinct_timestamps_by_time_zone_from_dataframe( + self, df: pd.DataFrame + ) -> dict[str, list[datetime]]: + tz_col = self._model.time_zone_column + t_col = self._model.time_column + df[t_col] = pd.to_datetime(df[t_col]) + df2 = df[[tz_col, t_col]].drop_duplicates() + dct: dict[str, list[datetime]] = {} + for tz_name in sorted(df2[tz_col].unique()): + dct[tz_name] = sorted(df2.loc[df2[tz_col] == tz_name, t_col].tolist()) + return dct diff --git a/src/chronify/exceptions.py b/src/chronify/exceptions.py index 8326f33..fa0c0b7 100644 --- a/src/chronify/exceptions.py +++ b/src/chronify/exceptions.py @@ -14,6 +14,10 @@ class InvalidOperation(ChronifyExceptionBase): """Raised when an invalid operation is requested.""" +class InvalidModel(ChronifyExceptionBase): + """Raised when an invalid model is passed.""" + + class 
InvalidParameter(ChronifyExceptionBase): """Raised when an invalid parameter is passed.""" diff --git a/src/chronify/index_time_range_generator.py b/src/chronify/index_time_range_generator.py index c78a31f..44402eb 100644 --- a/src/chronify/index_time_range_generator.py +++ b/src/chronify/index_time_range_generator.py @@ -13,9 +13,12 @@ def __init__(self, model: IndexTimeRangeBase) -> None: super().__init__() self._model = model - def iter_timestamps(self) -> Generator[int, None, None]: + def _iter_timestamps(self) -> Generator[int, None, None]: yield from range(self._model.start, self._model.length + self._model.start) + def list_timestamps(self) -> list[int]: + return list(self._iter_timestamps()) + def list_distinct_timestamps_from_dataframe(self, df: pd.DataFrame) -> list[Any]: return sorted(df[self._model.time_column].unique()) diff --git a/src/chronify/representative_time_range_generator.py b/src/chronify/representative_time_range_generator.py index 43dcaa9..9dbc269 100644 --- a/src/chronify/representative_time_range_generator.py +++ b/src/chronify/representative_time_range_generator.py @@ -28,8 +28,11 @@ def __init__(self, model: RepresentativePeriodTimeBase) -> None: case RepresentativePeriodFormat.ONE_WEEKDAY_DAY_AND_ONE_WEEKEND_DAY_PER_MONTH_BY_HOUR: self._handler = OneWeekdayDayAndWeekendDayPerMonthByHourHandler() - def iter_timestamps(self) -> Generator[NamedTuple, None, None]: - return self._handler.iter_timestamps() + def _iter_timestamps(self) -> Generator[NamedTuple, None, None]: + return self._handler._iter_timestamps() + + def list_timestamps(self) -> list[NamedTuple]: + return list(self._iter_timestamps()) def list_distinct_timestamps_from_dataframe(self, df: pd.DataFrame) -> list[Any]: columns = self._model.list_time_columns() @@ -77,7 +80,7 @@ def get_time_type(self) -> str: """Return the time type name representing the data.""" @abc.abstractmethod - def iter_timestamps(self) -> Generator[Any, None, None]: + def _iter_timestamps(self) -> Generator[Any, None, None]: """Return an iterator over all time indexes in the table. Type of the time is dependent on the class. 
""" @@ -97,7 +100,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: def get_time_type() -> str: return OneWeekPerMonthByHour.__name__ - def iter_timestamps(self) -> Generator[OneWeekPerMonthByHour, None, None]: + def _iter_timestamps(self) -> Generator[OneWeekPerMonthByHour, None, None]: for month in range(1, 13): for dow in range(7): for hour in range(24): @@ -123,7 +126,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: def get_time_type() -> str: return OneWeekdayDayOneWeekendDayPerMonthByHour.__name__ - def iter_timestamps(self) -> Generator[OneWeekdayDayOneWeekendDayPerMonthByHour, None, None]: + def _iter_timestamps(self) -> Generator[OneWeekdayDayOneWeekendDayPerMonthByHour, None, None]: for month in range(1, 13): for is_weekday in [False, True]: for hour in range(24): diff --git a/src/chronify/store.py b/src/chronify/store.py index c5ecc31..c34d34d 100644 --- a/src/chronify/store.py +++ b/src/chronify/store.py @@ -3,6 +3,7 @@ import shutil from typing import Any, Optional from chronify.utils.sql import make_temp_view_name +from zoneinfo import ZoneInfo import duckdb import pandas as pd @@ -49,6 +50,7 @@ from chronify.time_configs import DatetimeRange, IndexTimeRangeBase, TimeBasedDataAdjustment from chronify.time_series_checker import check_timestamps from chronify.time_series_mapper import map_time +from chronify.time_zone_converter import TimeZoneConverter, TimeZoneConverterByColumn from chronify.utils.path_utils import check_overwrite, to_path from chronify.utils.sqlalchemy_view import create_view @@ -870,6 +872,8 @@ def map_table_time_config( config in dst_schema when it does not line up with the time config scratch_dir Directory to use for temporary writes. Default to the system's tmp filesystem. + output_file + If set, write the mapped table to this Parquet file. check_mapped_timestamps Perform time checks on the result of the mapping operation. This can be slow and is not required. @@ -940,6 +944,186 @@ def map_table_time_config( with self._engine.begin() as conn: self._schema_mgr.add_schema(conn, dst_schema) + def convert_time_zone( + self, + src_name: str, + time_zone: ZoneInfo, + scratch_dir: Optional[Path] = None, + output_file: Optional[Path] = None, + check_mapped_timestamps: bool = False, + ) -> TableSchema: + """ + Convert the time zone of the existing table represented by src_name to a new time zone + + Parameters + ---------- + src_name + Refers to the table name of the source data. + time_zone + Time zone to convert to. + scratch_dir + Directory to use for temporary writes. Default to the system's tmp filesystem. + output_file + If set, write the mapped table to this Parquet file. + check_mapped_timestamps + Perform time checks on the result of the mapping operation. This can be slow and + is not required. + + Raises + ------ + TableAlreadyExists + Raised if the dst_schema name already exists. + + Examples + -------- + >>> store = Store() + >>> start = datetime(year=2018, month=1, day=1, tzinfo=ZoneInfo("EST")) + >>> freq = timedelta(hours=1) + >>> hours_per_year = 8760 + >>> num_time_arrays = 1 + >>> df = pd.DataFrame( + ... { + ... "id": np.concat( + ... [np.repeat(i, hours_per_year) for i in range(1, 1 + num_time_arrays)] + ... ), + ... "timestamp": np.tile( + ... pd.date_range(start, periods=hours_per_year, freq="h"), num_time_arrays + ... ), + ... "value": np.random.random(hours_per_year * num_time_arrays), + ... } + ... ) + >>> schema = TableSchema( + ... name="some_data", + ... time_config=DatetimeRange( + ... 
time_column="timestamp", + ... start=start, + ... length=hours_per_year, + ... resolution=freq, + ... ), + ... time_array_id_columns=["id"], + ... value_column="value", + ... ) + >>> store.ingest_table(df, schema) + >>> to_time_zone = ZoneInfo("US/Mountain") + >>> dst_schema = store.convert_time_zone( + ... schema.name, to_time_zone, check_mapped_timestamps=True + ... ) + """ + + src_schema = self._schema_mgr.get_schema(src_name) + TZC = TimeZoneConverter(self._engine, self._metadata, src_schema, time_zone) + + dst_schema = TZC.generate_to_schema() + if self.has_table(dst_schema.name): + msg = dst_schema.name + raise TableAlreadyExists(msg) + + TZC.convert_time_zone( + scratch_dir=scratch_dir, + output_file=output_file, + check_mapped_timestamps=check_mapped_timestamps, + ) + + with self._engine.begin() as conn: + self._schema_mgr.add_schema(conn, dst_schema) + + return dst_schema + + def convert_time_zone_by_column( + self, + src_name: str, + time_zone_column: str, + wrap_time_allowed: bool = False, + scratch_dir: Optional[Path] = None, + output_file: Optional[Path] = None, + check_mapped_timestamps: bool = False, + ) -> TableSchema: + """ + Convert the time zone of the existing table represented by src_name to new time zone(s) defined by a column + + Parameters + ---------- + src_name + Refers to the table name of the source data. + time_zone_column + Name of the time zone column for conversion. + wrap_time_allowed + Defines whether the time column is allowed to be wrapped to reflect the same time + range as the src_name schema in tz-naive clock time + scratch_dir + Directory to use for temporary writes. Default to the system's tmp filesystem. + output_file + If set, write the mapped table to this Parquet file. + check_mapped_timestamps + Perform time checks on the result of the mapping operation. This can be slow and + is not required. + + Raises + ------ + TableAlreadyExists + Raised if the dst_schema name already exists. + + Examples + -------- + >>> store = Store() + >>> start = datetime(year=2018, month=1, day=1, tzinfo=ZoneInfo("EST")) + >>> freq = timedelta(hours=1) + >>> hours_per_year = 8760 + >>> num_time_arrays = 3 + >>> df = pd.DataFrame( + ... { + ... "id": np.concat( + ... [np.repeat(i, hours_per_year) for i in range(1, 1 + num_time_arrays)] + ... ), + ... "timestamp": np.tile( + ... pd.date_range(start, periods=hours_per_year, freq="h"), num_time_arrays + ... ), + ... "time_zone": np.repeat(["US/Eastern", "US/Mountain", "None"], hours_per_year), + ... "value": np.random.random(hours_per_year * num_time_arrays), + ... } + ... ) + >>> schema = TableSchema( + ... name="some_data", + ... time_config=DatetimeRange( + ... time_column="timestamp", + ... start=start, + ... length=hours_per_year, + ... resolution=freq, + ... ), + ... time_array_id_columns=["id"], + ... value_column="value", + ... ) + >>> store.ingest_table(df, schema) + >>> time_zone_column = "time_zone" + >>> dst_schema = store.convert_time_zone_by_column( + ... schema.name, + ... time_zone_column, + ... wrap_time_allowed=False, + ... check_mapped_timestamps=True, + ... 
) + """ + + src_schema = self._schema_mgr.get_schema(src_name) + TZC = TimeZoneConverterByColumn( + self._engine, self._metadata, src_schema, time_zone_column, wrap_time_allowed + ) + + dst_schema = TZC.generate_to_schema() + if self.has_table(dst_schema.name): + msg = dst_schema.name + raise TableAlreadyExists(msg) + + TZC.convert_time_zone( + scratch_dir=scratch_dir, + output_file=output_file, + check_mapped_timestamps=check_mapped_timestamps, + ) + + with self._engine.begin() as conn: + self._schema_mgr.add_schema(conn, dst_schema) + + return dst_schema + def read_query( self, name: str, diff --git a/src/chronify/time_configs.py b/src/chronify/time_configs.py index 29c370c..5662fb9 100644 --- a/src/chronify/time_configs.py +++ b/src/chronify/time_configs.py @@ -101,26 +101,27 @@ class DatetimeRangeWithTZColumn(DatetimeRangeBase): time_type: Literal[TimeType.DATETIME_TZ_COL] = TimeType.DATETIME_TZ_COL start: datetime = Field( - description="Start time of the range. The timestamps in the data must be tz-naive." + description=( + "Start time of the range. If tz-naive, timestamps of different time zones ", + "are expected to align in clock time. If tz-aware, timestamps of different ", + "time zones are expected to align in real time.", + ) ) time_zone_column: str = Field( description="Column in the table that has time zone or offset information." ) - - @field_validator("start") - @classmethod - def check_start_timestamp(cls, start_timestamp: datetime) -> datetime: - if start_timestamp.tzinfo is not None: - msg = "start_timestamp must be tz-naive for DATETIME_TZ_COL" - raise ValueError(msg) - return start_timestamp + time_zones: Optional[list[ZoneInfo | None]] = Field( + description="Unique time zones from the table." + ) def start_time_is_tz_naive(self) -> bool: - return True + return self.start.tzinfo is None def get_time_zone_column(self) -> str: return self.time_zone_column + # Lixi TODO: ensure table schema has time_zone col? + DateTimeRanges = Union[ DatetimeRange, diff --git a/src/chronify/time_range_generator_base.py b/src/chronify/time_range_generator_base.py index b1065fe..e845dff 100644 --- a/src/chronify/time_range_generator_base.py +++ b/src/chronify/time_range_generator_base.py @@ -1,5 +1,5 @@ import abc -from typing import Any, Generator +from typing import Any import pandas as pd @@ -8,20 +8,16 @@ class TimeRangeGeneratorBase(abc.ABC): """Base class for classes that generate time ranges.""" @abc.abstractmethod - def iter_timestamps(self) -> Generator[Any, None, None]: - """Return an iterator over all time indexes in the table. - Type of the time is dependent on the class. - """ - def list_timestamps(self) -> list[Any]: """Return a list of timestamps for a time range. Type of the timestamps depends on the class. 
+ Note: For DatetimeRangeGeneratorExternalTimeZone class with more than one time zone, + this shows distinct timestamps only Returns ------- list[Any] """ - return list(self.iter_timestamps()) @abc.abstractmethod def list_distinct_timestamps_from_dataframe(self, df: pd.DataFrame) -> list[Any]: diff --git a/src/chronify/time_range_generator_factory.py b/src/chronify/time_range_generator_factory.py index b69b4ef..d20de0a 100644 --- a/src/chronify/time_range_generator_factory.py +++ b/src/chronify/time_range_generator_factory.py @@ -2,7 +2,8 @@ from chronify.time_configs import ( AnnualTimeRange, - DatetimeRangeBase, + DatetimeRange, + DatetimeRangeWithTZColumn, IndexTimeRangeBase, RepresentativePeriodTimeBase, TimeBaseModel, @@ -10,7 +11,10 @@ ) from chronify.time import LeapDayAdjustmentType from chronify.annual_time_range_generator import AnnualTimeRangeGenerator -from chronify.datetime_range_generator import DatetimeRangeGenerator +from chronify.datetime_range_generator import ( + DatetimeRangeGenerator, + DatetimeRangeGeneratorExternalTimeZone, +) from chronify.index_time_range_generator import IndexTimeRangeGenerator from chronify.representative_time_range_generator import RepresentativePeriodTimeGenerator from chronify.time_range_generator_base import TimeRangeGeneratorBase @@ -22,8 +26,12 @@ def make_time_range_generator( leap_day_adjustment: Optional[LeapDayAdjustmentType] = None, ) -> TimeRangeGeneratorBase: match model: - case DatetimeRangeBase(): + case DatetimeRange(): return DatetimeRangeGenerator(model, leap_day_adjustment=leap_day_adjustment) + case DatetimeRangeWithTZColumn(): + return DatetimeRangeGeneratorExternalTimeZone( + model, leap_day_adjustment=leap_day_adjustment + ) case AnnualTimeRange(): return AnnualTimeRangeGenerator(model) case IndexTimeRangeBase(): diff --git a/src/chronify/time_series_checker.py b/src/chronify/time_series_checker.py index e09f0eb..3a02382 100644 --- a/src/chronify/time_series_checker.py +++ b/src/chronify/time_series_checker.py @@ -1,5 +1,6 @@ from sqlalchemy import Connection, Table, select, text from typing import Optional +from zoneinfo import ZoneInfo import pandas as pd @@ -7,7 +8,9 @@ from chronify.models import TableSchema from chronify.sqlalchemy.functions import read_database from chronify.time_range_generator_factory import make_time_range_generator +from chronify.datetime_range_generator import DatetimeRangeGeneratorExternalTimeZone from chronify.time import LeapDayAdjustmentType +from chronify.time_utils import is_prevailing_time_zone def check_timestamps( @@ -40,9 +43,25 @@ def __init__( ) def check_timestamps(self) -> None: - count = self._check_expected_timestamps() + preserve_duplicates = False + if isinstance(self._time_generator, DatetimeRangeGeneratorExternalTimeZone): + count = self._check_expected_timestamps_with_external_time_zone() + if self._has_prevailing_time_zone(self._schema.time_config.time_zones): + preserve_duplicates = True + else: + count = self._check_expected_timestamps() + self._check_null_consistency() - self._check_expected_timestamps_by_time_array(count) + self._check_expected_timestamps_by_time_array( + count, preserve_duplicates=preserve_duplicates + ) + + @staticmethod + def _has_prevailing_time_zone(lst: list[ZoneInfo]) -> bool: + for tz in lst: + if is_prevailing_time_zone(tz): + return True + return False def _check_expected_timestamps(self) -> int: expected = self._time_generator.list_timestamps() @@ -55,6 +74,29 @@ def _check_expected_timestamps(self) -> int: check_timestamp_lists(actual, 
expected) return len(expected) + def _check_expected_timestamps_with_external_time_zone(self) -> int: + expected_dct = self._time_generator.list_timestamps_by_time_zone() + time_columns = self._time_generator.list_time_columns() + time_columns.append(self._schema.time_config.get_time_zone_column()) + stmt = select(*(self._table.c[x] for x in time_columns)).distinct() + for col in time_columns: + stmt = stmt.where(self._table.c[col].is_not(None)) + df = read_database(stmt, self._conn, self._schema.time_config) + actual_dct = self._time_generator.list_distinct_timestamps_by_time_zone_from_dataframe(df) + + if sorted(expected_dct.keys()) != sorted(actual_dct.keys()): + msg = "Time zone records do not match between expected and actual from table " + msg += f"\nexpected: {sorted(expected_dct.keys())} vs. \neactual: {sorted(actual_dct.keys())}" + raise InvalidTable(msg) + + for tz_name in expected_dct.keys(): + # this drops duplicates in tz-naive prevailing time + expected = sorted(set(expected_dct[tz_name])) + actual = actual_dct[tz_name] + check_timestamp_lists(actual, expected, msg_prefix=f"For {tz_name}\n") + # return len by preserving duplicates for tz-naive prevailing time + return len(expected_dct[tz_name]) + def _check_null_consistency(self) -> None: # If any time column has a NULL, all time columns must have a NULL. time_columns = self._time_generator.list_time_columns() @@ -78,7 +120,9 @@ def _check_null_consistency(self) -> None: ) raise InvalidTable(msg) - def _check_expected_timestamps_by_time_array(self, count: int) -> None: + def _check_expected_timestamps_by_time_array( + self, count: int, preserve_duplicates: bool = False + ) -> None: id_cols = ",".join(self._schema.time_array_id_columns) time_cols = ",".join(self._schema.time_config.list_time_columns()) # NULL consistency was checked above. @@ -137,7 +181,20 @@ def _check_expected_timestamps_by_time_array(self, count: int) -> None: for result in self._conn.execute(text(query)).fetchall(): distinct_count_by_ta = result[0] count_by_ta = result[1] - if not count_by_ta == count == distinct_count_by_ta: + + if preserve_duplicates and not count_by_ta == count: + id_vals = result[2:] + values = ", ".join( + f"{x}={y}" for x, y in zip(self._schema.time_array_id_columns, id_vals) + ) + msg = ( + f"The count of time values in each time array must be {count}" + f"Time array identifiers: {values}. " + f"count = {count_by_ta}" + ) + raise InvalidTable(msg) + + if not preserve_duplicates and not count_by_ta == count == distinct_count_by_ta: id_vals = result[2:] values = ", ".join( f"{x}={y}" for x, y in zip(self._schema.time_array_id_columns, id_vals) @@ -151,13 +208,14 @@ def _check_expected_timestamps_by_time_array(self, count: int) -> None: raise InvalidTable(msg) -def check_timestamp_lists(actual: list[pd.Timestamp], expected: list[pd.Timestamp]) -> None: +def check_timestamp_lists( + actual: list[pd.Timestamp], expected: list[pd.Timestamp], msg_prefix: str = "" +) -> None: match = actual == expected + msg = msg_prefix if not match: if len(actual) != len(expected): msg = f"Mismatch number of timestamps: actual: {len(actual)} vs. expected: {len(expected)}\n" - else: - msg = "" missing = set(expected).difference(set(actual)) extra = set(actual).difference(set(expected)) msg += "Actual timestamps do not match expected timestamps. 
\n" diff --git a/src/chronify/time_series_mapper_column_representative_to_datetime.py b/src/chronify/time_series_mapper_column_representative_to_datetime.py index 166c3a2..2ce44fe 100644 --- a/src/chronify/time_series_mapper_column_representative_to_datetime.py +++ b/src/chronify/time_series_mapper_column_representative_to_datetime.py @@ -195,7 +195,7 @@ def _create_intermediate_ymdh_schema( def _iter_datetime(self) -> Generator[datetime, None, None]: datetime_generator = DatetimeRangeGenerator(self._to_time_config) - yield from datetime_generator.iter_timestamps() + yield from datetime_generator._iter_timestamps() def _create_ymdh_mapping( self, col_names: list[str] = ["year", "month", "day", "hour"] diff --git a/src/chronify/time_utils.py b/src/chronify/time_utils.py index bb7c6a4..8a278e4 100644 --- a/src/chronify/time_utils.py +++ b/src/chronify/time_utils.py @@ -146,6 +146,24 @@ def roll_time_interval( return ser +def is_prevailing_time_zone(tz: tzinfo | None) -> bool: + if not tz: + return False + ts1 = datetime(year=2020, month=1, day=1, tzinfo=tz) + ts2 = datetime(year=2020, month=6, day=1, tzinfo=tz) + + return ts1.utcoffset() != ts2.utcoffset() + + +def is_standard_time_zone(tz: tzinfo | None) -> bool: + if not tz: + return False + ts1 = datetime(year=2020, month=1, day=1, tzinfo=tz) + ts2 = datetime(year=2020, month=6, day=1, tzinfo=tz) + + return ts1.utcoffset() == ts2.utcoffset() + + def get_standard_time_zone(tz: tzinfo | None) -> tzinfo | None: ts = datetime(year=2020, month=1, day=1, tzinfo=tz) std_tz_name = ts.tzname() diff --git a/src/chronify/time_zone_converter.py b/src/chronify/time_zone_converter.py index 2acec03..6ddf2bc 100644 --- a/src/chronify/time_zone_converter.py +++ b/src/chronify/time_zone_converter.py @@ -20,25 +20,103 @@ from chronify.time_utils import wrap_timestamps -def convert_to_single_time_zone( +def convert_time_zone( engine: Engine, metadata: MetaData, - from_schema: TableSchema, + src_schema: TableSchema, to_time_zone: ZoneInfo | None, -) -> None: - TimeZoneConverter(engine, metadata, from_schema, to_time_zone) + scratch_dir: Optional[Path] = None, + output_file: Optional[Path] = None, + check_mapped_timestamps: bool = False, +) -> TableSchema: + """Convert time zone of a table to a specified time zone. + Output timestamp is tz-naive with a new time_zone column added. + Parameters + ---------- + engine + sqlalchemy engine + metadata + sqlalchemy metadata + src_schema + Defines the source table in the database. + to_time_zone + time zone to convert to. If None, convert to tz-naive. + scratch_dir + Directory to use for temporary writes. Default to the system's tmp filesystem. + output_file + If set, write the mapped table to this Parquet file. + check_mapped_timestamps + Perform time checks on the result of the mapping operation. This can be slow and + is not required. 
+ + Returns + ------- + dst_schema + schema of output table with converted timestamps + """ + TZC = TimeZoneConverter(engine, metadata, src_schema, to_time_zone) + TZC.convert_time_zone( + scratch_dir=scratch_dir, + output_file=output_file, + check_mapped_timestamps=check_mapped_timestamps, + ) + + return TZC._to_schema -def convert_to_multiple_time_zones( +def convert_time_zone_by_column( engine: Engine, metadata: MetaData, - from_schema: TableSchema, + src_schema: TableSchema, time_zone_column: str, wrap_time_allowed: Optional[bool] = False, -) -> None: - TimeZoneConverterByGeography( - engine, metadata, from_schema, time_zone_column, wrap_time_allowed + scratch_dir: Optional[Path] = None, + output_file: Optional[Path] = None, + check_mapped_timestamps: bool = False, +) -> TableSchema: + """Convert time zone of a table to multiple time zones specified by a column. + Output timestamp is tz-naive, reflecting the local time relative to the time_zone_column. + Parameters + ---------- + engine + sqlalchemy engine + metadata + sqlalchemy metadata + srd_schema + Defines the source table in the database. + time_zone_column + Column name in the source table that contains the time zone information. + wrap_time_allowed + If False, the converted timestamps will aligned with the original timestamps in real time scale + E.g. 2018-01-01 00:00 ~ 2018-12-31 23:00 in US/Eastern becomes + 2017-12-31 23:00 ~ 2018-12-31 22:00 in US/Central + If True, the converted timestamps will fit into the time range of the src_schema in tz-naive clock time + E.g. 2018-01-01 00:00 ~ 2018-12-31 23:00 in US/Eastern becomes + 2017-12-31 23:00 ~ 2018-12-31 22:00 in US/Central, which is then wrapped such that + no clock time timestamps are in 2017. The final timestamps are: + 2018-12-31 23:00, 2018-01-01 00:00 ~ 2018-12-31 22:00 in US/Central + scratch_dir + Directory to use for temporary writes. Default to the system's tmp filesystem. + output_file + If set, write the mapped table to this Parquet file. + check_mapped_timestamps + Perform time checks on the result of the mapping operation. This can be slow and + is not required. 
+ + Returns + ------- + dst_schema + schema of output table with converted timestamps + """ + TZC = TimeZoneConverterByColumn( + engine, metadata, src_schema, time_zone_column, wrap_time_allowed + ) + TZC.convert_time_zone( + scratch_dir=scratch_dir, + output_file=output_file, + check_mapped_timestamps=check_mapped_timestamps, ) + return TZC._to_schema class TimeZoneConverterBase(abc.ABC): @@ -93,9 +171,12 @@ def __init__( self._to_schema = self.generate_to_schema() def generate_to_time_config(self) -> DatetimeRangeBase: - to_time_config = self._from_schema.time_config.convert_time_zone( - self._to_time_zone - ).replace_time_zone(None) + if self._to_time_zone: + to_time_config = self._from_schema.time_config.convert_time_zone( + self._to_time_zone + ).replace_time_zone(None) + else: + to_time_config = self._from_schema.time_config.replace_time_zone(None) time_kwargs = to_time_config.model_dump() time_kwargs = dict( filter( @@ -105,13 +186,18 @@ def generate_to_time_config(self) -> DatetimeRangeBase: ) time_kwargs["time_type"] = TimeType.DATETIME_TZ_COL time_kwargs["time_zone_column"] = "time_zone" + time_kwargs["time_zones"] = [self._to_time_zone] return DatetimeRangeWithTZColumn(**time_kwargs) def generate_to_schema(self) -> TableSchema: + id_cols = self._from_schema.time_array_id_columns + if "time_zone" not in id_cols: + id_cols.append("time_zone") to_schema: TableSchema = self._from_schema.model_copy( update={ "name": f"{self._from_schema.name}_tz_converted", "time_config": self.generate_to_time_config(), + "time_array_id_columns": id_cols, } ) return to_schema @@ -143,14 +229,17 @@ def _create_mapping(self) -> tuple[pd.DataFrame, MappingTableSchema]: time_col = self._from_schema.time_config.time_column from_time_col = "from_" + time_col from_time_data = make_time_range_generator(self._from_schema.time_config).list_timestamps() - to_time_data = make_time_range_generator(self._to_schema.time_config).list_timestamps() + to_time_data_dct = make_time_range_generator( + self._to_schema.time_config + ).list_timestamps_by_time_zone() from_time_config = self._from_schema.time_config.model_copy( update={"time_column": from_time_col} ) - to_time_config = self.generate_to_time_config() + to_time_config = self._to_schema.time_config tz_col = to_time_config.time_zone_column - tz_name = None if self._to_time_zone is None else self._to_time_zone.key + tz_name = self._to_time_zone.key if self._to_time_zone else "None" + to_time_data = to_time_data_dct[tz_name] df = pd.DataFrame( { from_time_col: from_time_data, @@ -166,8 +255,8 @@ def _create_mapping(self) -> tuple[pd.DataFrame, MappingTableSchema]: return df, mapping_schema -class TimeZoneConverterByGeography(TimeZoneConverterBase): - """Class for time zone conversion of time series data based on a geography-based time zone column.""" +class TimeZoneConverterByColumn(TimeZoneConverterBase): + """Class for time zone conversion of time series data based on a time zone column.""" def __init__( self, @@ -184,19 +273,20 @@ def __init__( def generate_to_time_config(self) -> DatetimeRangeBase: if self._wrap_time_allowed: - time_kwargs = self._from_schema.time_config.model_dump() - time_kwargs = dict( - filter( - lambda k_v: k_v[0] in DatetimeRangeWithTZColumn.model_fields, - time_kwargs.items(), - ) + to_time_config = self._from_schema.time_config.replace_time_zone(None) + else: + to_time_config = self._from_schema.time_config + time_kwargs = to_time_config.model_dump() + time_kwargs = dict( + filter( + lambda k_v: k_v[0] in 
DatetimeRangeWithTZColumn.model_fields, + time_kwargs.items(), ) - time_kwargs["time_type"] = TimeType.DATETIME_TZ_COL - time_kwargs["start"] = self._from_schema.time_config.start.replace(tzinfo=None) - time_kwargs["time_zone_column"] = self.time_zone_column - return DatetimeRangeWithTZColumn(**time_kwargs) - - return self._from_schema.time_config.replace_time_zone(None) + ) + time_kwargs["time_type"] = TimeType.DATETIME_TZ_COL + time_kwargs["time_zone_column"] = self.time_zone_column + time_kwargs["time_zones"] = self._get_time_zones() + return DatetimeRangeWithTZColumn(**time_kwargs) def generate_to_schema(self) -> TableSchema: to_schema: TableSchema = self._from_schema.model_copy( @@ -211,13 +301,11 @@ def convert_time_zone( self, scratch_dir: Optional[Path] = None, output_file: Optional[Path] = None, - check_mapped_timestamps: bool = False, # will not be used + check_mapped_timestamps: bool = False, ) -> None: self.check_from_schema() df, mapping_schema = self._create_mapping() - # Do not check mapped timestamps when not wrap_time_allowed - # because they cannot be fully described by the to_schema time_config apply_mapping( df, mapping_schema, @@ -228,20 +316,10 @@ def convert_time_zone( TimeBasedDataAdjustment(), scratch_dir=scratch_dir, output_file=output_file, - check_mapped_timestamps=check_mapped_timestamps if self._wrap_time_allowed else False, + check_mapped_timestamps=check_mapped_timestamps, ) - def _create_mapping(self) -> tuple[pd.DataFrame, MappingTableSchema]: - """Create mapping dataframe for converting datetime to geography-based time zone""" - time_col = self._from_schema.time_config.time_column - from_time_col = "from_" + time_col - from_time_data = make_time_range_generator(self._from_schema.time_config).list_timestamps() - - if self._wrap_time_allowed: - to_time_data = make_time_range_generator(self._to_schema.time_config).list_timestamps() - - from_tz_col = "from_" + self.time_zone_column - + def _get_time_zones(self) -> list[ZoneInfo | None]: with self._engine.connect() as conn: table = Table(self._from_schema.name, self._metadata) stmt = ( @@ -253,27 +331,38 @@ def _create_mapping(self) -> tuple[pd.DataFrame, MappingTableSchema]: self.time_zone_column ].to_list() + time_zones = [None if tz == "None" else ZoneInfo(tz) for tz in time_zones] + return time_zones + + def _create_mapping(self) -> tuple[pd.DataFrame, MappingTableSchema]: + """Create mapping dataframe for converting datetime to column time zones""" + time_col = self._from_schema.time_config.time_column + from_time_col = "from_" + time_col + from_time_data = make_time_range_generator(self._from_schema.time_config).list_timestamps() + to_time_data_dct = make_time_range_generator( + self._to_schema.time_config + ).list_timestamps_by_time_zone() + + from_tz_col = "from_" + self.time_zone_column from_time_config = self._from_schema.time_config.model_copy( update={"time_column": from_time_col} ) - to_time_config = self.generate_to_time_config() + to_time_config = self._to_schema.time_config df_tz = [] - for time_zone in time_zones: - tz = None if time_zone in [None, "None"] else ZoneInfo(time_zone) - converted_time_data = [x.tz_convert(tz).tz_localize(None) for x in from_time_data] + for tz_name, time_data in to_time_data_dct.items(): if self._wrap_time_allowed: - final_time_data = wrap_timestamps( - pd.Series(converted_time_data), pd.Series(to_time_data) - ) + # assume it is being wrapped based on the tz-naive version of the original time data + final_time_data = [x.replace(tzinfo=None) for x in 
from_time_data] + to_time_data = wrap_timestamps(pd.Series(time_data), pd.Series(final_time_data)) else: - final_time_data = converted_time_data + to_time_data = time_data df_tz.append( pd.DataFrame( { from_time_col: from_time_data, - from_tz_col: time_zone, - time_col: final_time_data, + from_tz_col: tz_name, + time_col: to_time_data, } ) ) diff --git a/tests/test_mapper_datetime_to_datetime.py b/tests/test_mapper_datetime_to_datetime.py index ba9fa00..8d6303d 100644 --- a/tests/test_mapper_datetime_to_datetime.py +++ b/tests/test_mapper_datetime_to_datetime.py @@ -18,7 +18,7 @@ def generate_datetime_data(time_config: DatetimeRange) -> pd.Series: # type: ignore - return pd.to_datetime(list(DatetimeRangeGenerator(time_config).iter_timestamps())) + return pd.to_datetime(list(DatetimeRangeGenerator(time_config)._iter_timestamps())) def generate_datetime_dataframe(schema: TableSchema) -> pd.DataFrame: diff --git a/tests/test_mapper_representative_time_to_datetime.py b/tests/test_mapper_representative_time_to_datetime.py index f1e141f..1bec752 100644 --- a/tests/test_mapper_representative_time_to_datetime.py +++ b/tests/test_mapper_representative_time_to_datetime.py @@ -15,7 +15,7 @@ def generate_datetime_data(time_config: DatetimeRange) -> pd.Series: - return pd.to_datetime(list(DatetimeRangeGenerator(time_config).iter_timestamps())) + return pd.to_datetime(list(DatetimeRangeGenerator(time_config)._iter_timestamps())) def get_datetime_schema(year: int, tzinfo: ZoneInfo | None) -> TableSchema: diff --git a/tests/test_store.py b/tests/test_store.py index b84aebe..d36a1ed 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -36,6 +36,7 @@ from chronify.time import TimeIntervalType, DaylightSavingAdjustmentType from chronify.time_configs import ( DatetimeRange, + DatetimeRangeWithTZColumn, IndexTimeRangeWithTZColumn, TimeBasedDataAdjustment, ) @@ -783,3 +784,157 @@ def test_check_timestamps(iter_stores_by_engine: Store, one_week_per_month_by_ho store.check_timestamps(schema.name) with store.engine.begin() as conn: store.check_timestamps(schema.name, connection=conn) + + +@pytest.mark.parametrize("to_time_zone", [ZoneInfo("US/Eastern"), ZoneInfo("US/Mountain"), None]) +def test_convert_time_zone( + tmp_path, iter_stores_by_engine_no_data_ingestion: Store, to_time_zone: ZoneInfo | None +): + store = iter_stores_by_engine_no_data_ingestion + time_array_len = 8784 + year = 2020 + tzinfo = ZoneInfo("EST") + + src_time_config = DatetimeRange( + start=datetime(year=year, month=1, day=1, hour=0, tzinfo=tzinfo), + resolution=timedelta(hours=1), + length=time_array_len, + interval_type=TimeIntervalType.PERIOD_BEGINNING, + time_column="timestamp", + ) + + src_csv_schema = CsvTableSchema( + time_config=src_time_config, + column_dtypes=[ + ColumnDType(name="timestamp", dtype=DateTime(timezone=False)), + ColumnDType(name="gen1", dtype=Double()), + ColumnDType(name="gen2", dtype=Double()), + ColumnDType(name="gen3", dtype=Double()), + ], + value_columns=["gen1", "gen2", "gen3"], + pivoted_dimension_name="generator", + time_array_id_columns=[], + ) + rel = read_csv(GENERATOR_TIME_SERIES_FILE, src_csv_schema) + rel2 = unpivot(rel, ("gen1", "gen2", "gen3"), "generator", "value") # noqa: F841 + + src_schema = TableSchema( + name="generators_pb", + time_config=src_time_config, + time_array_id_columns=["generator"], + value_column="value", + ) + if store.engine.name == "hive": + out_file = tmp_path / "data.parquet" + rel2.to_df().to_parquet(out_file) + store.create_view_from_parquet(out_file, src_schema) + 
else: + store.ingest_table(rel2, src_schema) + + if tzinfo is None and store.engine.name != "sqlite": + output_file = tmp_path / "mapped_data" + else: + output_file = None + + dst_schema = store.convert_time_zone( + src_schema.name, to_time_zone, output_file=output_file, check_mapped_timestamps=True + ) + if output_file is None or store.engine.name == "sqlite": + df2 = store.read_table(dst_schema.name) + else: + df2 = pd.read_parquet(output_file) + df2["timestamp"] = pd.to_datetime(df2["timestamp"]) + assert len(df2) == time_array_len * 3 + actual = sorted(df2["timestamp"].unique()) + assert isinstance(dst_schema.time_config, DatetimeRangeWithTZColumn) + if to_time_zone: + expected_start = src_time_config.start.astimezone(to_time_zone).replace(tzinfo=None) + else: + expected_start = src_time_config.start.replace(tzinfo=None) + assert dst_schema.time_config.start == expected_start + assert pd.Timestamp(actual[0]) == dst_schema.time_config.start + expected = make_time_range_generator(dst_schema.time_config).list_timestamps() + check_timestamp_lists(actual, expected) + + +@pytest.mark.parametrize("wrapped_time_allowed", [False, True]) +def test_convert_time_zone_by_column( + tmp_path, iter_stores_by_engine_no_data_ingestion: Store, wrapped_time_allowed: bool +): + store = iter_stores_by_engine_no_data_ingestion + time_array_len = 8784 + year = 2020 + tzinfo = ZoneInfo("EST") + + src_time_config = DatetimeRange( + start=datetime(year=year, month=1, day=1, hour=0, tzinfo=tzinfo), + resolution=timedelta(hours=1), + length=time_array_len, + interval_type=TimeIntervalType.PERIOD_BEGINNING, + time_column="timestamp", + ) + + src_csv_schema = CsvTableSchema( + time_config=src_time_config, + column_dtypes=[ + ColumnDType(name="timestamp", dtype=DateTime(timezone=False)), + ColumnDType(name="gen1", dtype=Double()), + ColumnDType(name="gen2", dtype=Double()), + ColumnDType(name="gen3", dtype=Double()), + ], + value_columns=["gen1", "gen2", "gen3"], + pivoted_dimension_name="generator", + time_array_id_columns=[], + ) + rel = read_csv(GENERATOR_TIME_SERIES_FILE, src_csv_schema) + rel2 = unpivot(rel, ("gen1", "gen2", "gen3"), "generator", "value") # noqa: F841 + # add time_zone column + stmt = ", ".join(rel2.columns) + tz_col_stmt = "CASE WHEN generator='gen1' THEN 'US/Eastern' WHEN generator='gen2' THEN 'US/Central' ELSE 'None' END AS time_zone" + stmt += f", {tz_col_stmt}" + rel2 = rel2.project(stmt) + + src_schema = TableSchema( + name="generators_pb", + time_config=src_time_config, + time_array_id_columns=["generator", "time_zone"], + value_column="value", + ) + if store.engine.name == "hive": + out_file = tmp_path / "data.parquet" + rel2.to_df().to_parquet(out_file) + store.create_view_from_parquet(out_file, src_schema) + else: + store.ingest_table(rel2, src_schema) + + if tzinfo is None and store.engine.name != "sqlite": + output_file = tmp_path / "mapped_data" + else: + output_file = None + + dst_schema = store.convert_time_zone_by_column( + src_schema.name, + "time_zone", + output_file=output_file, + wrap_time_allowed=wrapped_time_allowed, + check_mapped_timestamps=True, + ) + if output_file is None or store.engine.name == "sqlite": + df2 = store.read_table(dst_schema.name) + else: + df2 = pd.read_parquet(output_file) + df2["timestamp"] = pd.to_datetime(df2["timestamp"]) + df_stats = df2.groupby(["time_zone"])["timestamp"].agg(["min", "max", "count"]) + assert set(df_stats["count"]) == {time_array_len} + if wrapped_time_allowed: + assert set(df_stats["min"]) == 
{dst_schema.time_config.start.replace(tzinfo=None)} + else: + assert (df_stats.loc["US/Eastern"] == df_stats.loc["None"]).prod() == 1 + assert df_stats.loc["US/Central", "min"] == dst_schema.time_config.start.astimezone( + ZoneInfo("US/Central") + ).replace(tzinfo=None) + assert isinstance(dst_schema.time_config, DatetimeRangeWithTZColumn) + expected_dct = make_time_range_generator(dst_schema.time_config).list_timestamps_by_time_zone() + for tz, expected in expected_dct.items(): + actual = sorted(df2.loc[df2["time_zone"] == tz, "timestamp"]) + check_timestamp_lists(actual, expected) diff --git a/tests/test_time_zone_converter.py b/tests/test_time_zone_converter.py index 0307ab4..9a6d474 100644 --- a/tests/test_time_zone_converter.py +++ b/tests/test_time_zone_converter.py @@ -2,20 +2,27 @@ from datetime import datetime, timedelta import numpy as np import pytest +from typing import Any import pandas as pd from sqlalchemy import Engine, MetaData from chronify.sqlalchemy.functions import read_database, write_database -from chronify.time_zone_converter import TimeZoneConverter, TimeZoneConverterByGeography +from chronify.time_zone_converter import ( + TimeZoneConverter, + TimeZoneConverterByColumn, + convert_time_zone, + convert_time_zone_by_column, +) from chronify.time_configs import DatetimeRange from chronify.models import TableSchema from chronify.time import TimeIntervalType from chronify.datetime_range_generator import DatetimeRangeGenerator +from chronify.exceptions import InvalidParameter def generate_datetime_data(time_config: DatetimeRange) -> pd.Series: # type: ignore - return pd.to_datetime(list(DatetimeRangeGenerator(time_config).iter_timestamps())) + return pd.to_datetime(list(DatetimeRangeGenerator(time_config)._iter_timestamps())) def generate_datetime_dataframe(schema: TableSchema) -> pd.DataFrame: @@ -35,7 +42,7 @@ def generate_dataframe_with_tz_col(schema: TableSchema) -> pd.DataFrame: ZoneInfo("US/Mountain"), None, ] - time_zones = [tz.key if tz is not None else "None" for tz in time_zones] + time_zones = [tz.key if tz else "None" for tz in time_zones] dfo = pd.merge( df, pd.DataFrame({"id": range(len(time_zones)), "time_zone": time_zones}), how="cross" ) @@ -106,16 +113,19 @@ def run_conversion( ) -> None: metadata = MetaData() ingest_data(engine, metadata, df, from_schema) - TZC = TimeZoneConverter(engine, metadata, from_schema, to_time_zone) - TZC.convert_time_zone(check_mapped_timestamps=True) - dfo = get_mapped_dataframe(engine, TZC._to_schema.name, TZC._to_schema.time_config) - + to_schema = convert_time_zone( + engine, metadata, from_schema, to_time_zone, check_mapped_timestamps=True + ) + dfo = get_mapped_dataframe(engine, to_schema.name, to_schema.time_config) assert df["value"].equals(dfo["value"]) - expected = df["timestamp"].dt.tz_convert(to_time_zone).dt.tz_localize(None) + if to_time_zone is None: + expected = df["timestamp"].dt.tz_localize(None) + else: + expected = df["timestamp"].dt.tz_convert(to_time_zone).dt.tz_localize(None) assert (dfo["timestamp"] == expected).prod() == 1 -def run_conversion_by_geography( +def run_conversion_to_column_time_zones( engine: Engine, df: pd.DataFrame, from_schema: TableSchema, @@ -123,11 +133,15 @@ def run_conversion_by_geography( ) -> None: metadata = MetaData() ingest_data(engine, metadata, df, from_schema) - TZC = TimeZoneConverterByGeography( - engine, metadata, from_schema, "time_zone", wrap_time_allowed=wrap_time_allowed + to_schema = convert_time_zone_by_column( + engine, + metadata, + from_schema, + "time_zone", + 
wrap_time_allowed=wrap_time_allowed, + check_mapped_timestamps=True, ) - TZC.convert_time_zone() - dfo = get_mapped_dataframe(engine, TZC._to_schema.name, TZC._to_schema.time_config) + dfo = get_mapped_dataframe(engine, to_schema.name, to_schema.time_config) dfo = dfo[df.columns].sort_values(by="index").reset_index(drop=True) dfo["timestamp"] = pd.to_datetime(dfo["timestamp"]) # needed for engine 2, not sure why @@ -139,25 +153,57 @@ def run_conversion_by_geography( else: for i in range(len(df)): tzn = df.loc[i, "time_zone"] - tz = ZoneInfo(tzn) if tzn != "None" else None - ts = df.loc[i, "timestamp"].tz_convert(tz).replace(tzinfo=None) + if tzn == "None": + ts = df.loc[i, "timestamp"].replace(tzinfo=None) + else: + ts = df.loc[i, "timestamp"].tz_convert(ZoneInfo(tzn)).replace(tzinfo=None) assert dfo.loc[i, "timestamp"] == ts +def run_conversion_with_error( + engine: Engine, + df: pd.DataFrame, + from_schema: TableSchema, + use_tz_col: bool, + error: tuple[Any, str], +) -> None: + metadata = MetaData() + ingest_data(engine, metadata, df, from_schema) + with pytest.raises(error[0], match=error[1]): + if use_tz_col: + TZC = TimeZoneConverterByColumn( + engine, metadata, from_schema, "time_zone", wrap_time_allowed=False + ) + TZC.convert_time_zone(check_mapped_timestamps=True) + else: + TZC2 = TimeZoneConverter(engine, metadata, from_schema, None) + TZC2.convert_time_zone(check_mapped_timestamps=True) + + +def test_src_table_no_time_zone(iter_engines: Engine) -> None: + from_schema = get_datetime_schema(2018, None, TimeIntervalType.PERIOD_BEGINNING, "base_table") + df = generate_datetime_dataframe(from_schema) + error = (InvalidParameter, "Source schema start_time must be timezone-aware") + run_conversion_with_error( + iter_engines, df, from_schema, False, error + ) # TODO, support tz-naive to tz-aware conversion + + @pytest.mark.parametrize( - "to_time_zone", [ZoneInfo("US/Central"), ZoneInfo("UTC"), ZoneInfo("America/Los_Angeles")] + "to_time_zone", [None, ZoneInfo("US/Central"), ZoneInfo("America/Los_Angeles")] ) def test_time_conversion(iter_engines: Engine, to_time_zone: ZoneInfo) -> None: from_schema = get_datetime_schema( 2018, ZoneInfo("US/Mountain"), TimeIntervalType.PERIOD_BEGINNING, "base_table" ) df = generate_datetime_dataframe(from_schema) - to_time_zone = ZoneInfo("US/Central") # TODO run_conversion(iter_engines, df, from_schema, to_time_zone) @pytest.mark.parametrize("wrap_time_allowed", [False, True]) -def test_time_conversion_by_geography(iter_engines: Engine, wrap_time_allowed: bool) -> None: +def test_time_conversion_to_column_time_zones( + iter_engines: Engine, wrap_time_allowed: bool +) -> None: from_schema = get_datetime_schema( 2018, ZoneInfo("US/Mountain"), @@ -166,4 +212,4 @@ def test_time_conversion_by_geography(iter_engines: Engine, wrap_time_allowed: b has_tz_col=True, ) df = generate_dataframe_with_tz_col(from_schema) - run_conversion_by_geography(iter_engines, df, from_schema, wrap_time_allowed) + run_conversion_to_column_time_zones(iter_engines, df, from_schema, wrap_time_allowed) From ceb6692bc7598ee4859b9c7b1cfe875983ce7d62 Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Sat, 18 Oct 2025 01:52:49 -0600 Subject: [PATCH 09/27] keep time_zone in output for convert by col --- src/chronify/time_zone_converter.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/chronify/time_zone_converter.py b/src/chronify/time_zone_converter.py index 6ddf2bc..027db0b 100644 --- a/src/chronify/time_zone_converter.py +++ 
b/src/chronify/time_zone_converter.py @@ -289,10 +289,14 @@ def generate_to_time_config(self) -> DatetimeRangeBase: return DatetimeRangeWithTZColumn(**time_kwargs) def generate_to_schema(self) -> TableSchema: + id_cols = self._from_schema.time_array_id_columns + if "time_zone" not in id_cols: + id_cols.append("time_zone") to_schema: TableSchema = self._from_schema.model_copy( update={ "name": f"{self._from_schema.name}_tz_converted", "time_config": self.generate_to_time_config(), + "time_array_id_columns": id_cols, } ) return to_schema From 977dfddbc614af7203b238befb5207059c6b3d04 Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Fri, 24 Oct 2025 18:07:32 -0600 Subject: [PATCH 10/27] some cleanup --- src/chronify/datetime_range_generator.py | 19 +++---- src/chronify/exceptions.py | 4 ++ src/chronify/models.py | 14 ++--- src/chronify/time_configs.py | 66 ++++++++++++++---------- src/chronify/time_series_mapper_base.py | 4 +- src/chronify/time_zone_converter.py | 5 +- tests/test_models.py | 5 +- 7 files changed, 67 insertions(+), 50 deletions(-) diff --git a/src/chronify/datetime_range_generator.py b/src/chronify/datetime_range_generator.py index 39e25ab..ade2188 100644 --- a/src/chronify/datetime_range_generator.py +++ b/src/chronify/datetime_range_generator.py @@ -7,7 +7,7 @@ from chronify.time import ( LeapDayAdjustmentType, ) -from chronify.time_configs import DatetimeRange, DatetimeRangeWithTZColumn +from chronify.time_configs import DatetimeRanges, DatetimeRange, DatetimeRangeWithTZColumn from chronify.time_utils import adjust_timestamp_by_dst_offset from chronify.time_range_generator_base import TimeRangeGeneratorBase from chronify.exceptions import InvalidValue @@ -18,7 +18,7 @@ class DatetimeRangeGeneratorBase(TimeRangeGeneratorBase): def __init__( self, - model: DatetimeRange, + model: DatetimeRanges, leap_day_adjustment: Optional[LeapDayAdjustmentType] = None, ) -> None: self._model = model @@ -89,6 +89,7 @@ def __init__( leap_day_adjustment: Optional[LeapDayAdjustmentType] = None, ) -> None: super().__init__(model, leap_day_adjustment=leap_day_adjustment) + assert isinstance(self._model, DatetimeRange) def list_timestamps(self) -> list[datetime]: return list(self._iter_timestamps()) @@ -107,20 +108,16 @@ def __init__( leap_day_adjustment: Optional[LeapDayAdjustmentType] = None, ) -> None: super().__init__(model, leap_day_adjustment=leap_day_adjustment) - if not self._model.time_zones: + assert isinstance(self._model, DatetimeRangeWithTZColumn) + if self._model.get_time_zones() == []: msg = ( "DatetimeRangeWithTZColumn.time_zones needs to be instantiated for ", f"DatetimeRangeGeneratorExternalTimeZone: {self._model}", ) raise InvalidValue(msg) - if len(set(self._model.time_zones)) < len(self._model.time_zones): - msg = f"DatetimeRangeWithTZColumn.time_zones has duplicates: {self._model.time_zones}" - raise InvalidValue(msg) - - def _list_timestamps(self, time_zone: ZoneInfo) -> list[datetime]: + def _list_timestamps(self, time_zone: Optional[ZoneInfo]) -> list[datetime]: """always return tz-naive timestamps relative to input time_zone""" - # assert self._model.start.tzinfo is None if self._model.start_time_is_tz_naive(): if time_zone: start = self._model.start.replace(tzinfo=time_zone) @@ -137,7 +134,7 @@ def _list_timestamps(self, time_zone: ZoneInfo) -> list[datetime]: def list_timestamps(self) -> list[datetime]: """return only unique values, this means no duplicates for prevailing time""" ts_set = set() - for tz in self._model.time_zones: 
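# get_time_zones() is the accessor shared by both datetime configs: DatetimeRange
# returns [] and DatetimeRangeWithTZColumn returns its populated time_zones list,
# so this loop only iterates zones actually declared on the with-TZ-column config.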
+ for tz in self._model.get_time_zones(): ts_set.update(set(self._list_timestamps(tz))) timestamps = sorted(ts_set) return timestamps @@ -145,7 +142,7 @@ def list_timestamps(self) -> list[datetime]: def list_timestamps_by_time_zone(self, distinct: bool = False) -> dict[str, list[datetime]]: """for each time zone, returns full timestamp iteration with duplicates allowed""" dct: dict[str, datetime] = {} - for tz in self._model.time_zones: + for tz in self._model.get_time_zones(): timestamps = self._list_timestamps(tz) if distinct: timestamps = sorted(set(timestamps)) diff --git a/src/chronify/exceptions.py b/src/chronify/exceptions.py index fa0c0b7..8186e62 100644 --- a/src/chronify/exceptions.py +++ b/src/chronify/exceptions.py @@ -26,6 +26,10 @@ class InvalidValue(ChronifyExceptionBase): """Raised when an invalid value is passed.""" +class MissingValue(ChronifyExceptionBase): + """Raised when an expecting value is missing.""" + + class MissingParameter(ChronifyExceptionBase): """Raised when a parameter is not found or missing.""" diff --git a/src/chronify/models.py b/src/chronify/models.py index 348fba1..b770200 100644 --- a/src/chronify/models.py +++ b/src/chronify/models.py @@ -9,7 +9,7 @@ from typing_extensions import Annotated from chronify.base_models import ChronifyBaseModel -from chronify.exceptions import InvalidParameter +from chronify.exceptions import InvalidParameter, InvalidValue from chronify.time_configs import TimeConfig @@ -67,7 +67,7 @@ def check_name(cls, name: str) -> str: _check_name(name) if name.lower() == "table": msg = f"Table schema cannot use {name=}." - raise ValueError(msg) + raise InvalidValue(msg) return name @field_validator("value_column") @@ -102,7 +102,7 @@ def check_column(cls, value_columns: str) -> str: def check_time_array_id_columns(cls, value: list[str]) -> list[str]: if value: msg = f"PivotedTableSchema doesn't yet support time_array_id_columns: {value}" - raise ValueError(msg) + raise InvalidValue(msg) return value def list_columns(self) -> list[str]: @@ -124,7 +124,7 @@ def check_name(cls, name: str) -> str: _check_name(name) if name.lower() == "table": msg = f"Table schema cannot use {name=}." - raise ValueError(msg) + raise InvalidValue(msg) return name @field_validator("time_configs") @@ -239,11 +239,11 @@ def fix_data_type(cls, data: dict[str, Any]) -> dict[str, Any]: if val is None: options = sorted(_COLUMN_TYPES.keys()) + list(_DB_TYPES) msg = f"{dtype=} must be one of {options}" - raise ValueError(msg) + raise InvalidValue(msg) data["dtype"] = val() else: msg = f"dtype is an unsupported type: {type(dtype)}. It must be a str or type." 
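# InvalidValue, like the other chronify exceptions, subclasses ChronifyExceptionBase,
# so callers can catch chronify-specific validation failures here instead of trapping
# unrelated built-in ValueErrors.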
- raise ValueError(msg) + raise InvalidValue(msg) return data @@ -287,4 +287,4 @@ class CsvTableSchemaSingleTimeArrayPivotedByComponent(CsvTableSchema): def _check_name(name: str) -> None: if not REGEX_NAME_REQUIREMENT.search(name): msg = f"A name can only have alphanumeric characters: {name=}" - raise ValueError(msg) + raise InvalidValue(msg) diff --git a/src/chronify/time_configs.py b/src/chronify/time_configs.py index 5662fb9..94d3532 100644 --- a/src/chronify/time_configs.py +++ b/src/chronify/time_configs.py @@ -1,7 +1,7 @@ import abc import logging from datetime import datetime, timedelta -from typing import Union, Literal, Optional +from typing import Union, Literal, Optional, Any from pydantic import Field, field_validator from typing_extensions import Annotated from zoneinfo import ZoneInfo @@ -16,7 +16,7 @@ RepresentativePeriodFormat, list_representative_time_columns, ) - +from chronify.exceptions import InvalidValue, InvalidParameter logger = logging.getLogger(__name__) @@ -66,10 +66,23 @@ class DatetimeRangeBase(TimeBaseModel): time_column: str = Field(description="Column in the table that represents time.") length: int resolution: timedelta + start: datetime def list_time_columns(self) -> list[str]: return [self.time_column] + def start_time_is_tz_naive(self) -> bool: + """Return True if the timestamps in the range do not have time zones.""" + return self.start.tzinfo is None + + def convert_time_zone(self, tz: ZoneInfo | None) -> Any: # "DatetimeRanges" + """Return a copy of the DatetimeRange with the start timestamp converted to time zone.""" + return self.model_copy(update={"start": self.start.astimezone(tz)}) + + def replace_time_zone(self, tz: ZoneInfo | None) -> Any: # "DatetimeRanges" + """Return a copy of the DatetimeRange with the time zone replaced.""" + return self.model_copy(update={"start": self.start.replace(tzinfo=tz)}) + class DatetimeRange(DatetimeRangeBase): """Defines a time range with a single time zone.""" @@ -80,20 +93,11 @@ class DatetimeRange(DatetimeRangeBase): "the data must be time zone-aware." ) - def start_time_is_tz_naive(self) -> bool: - """Return True if the timestamps in the range do not have time zones.""" - return self.start.tzinfo is None - def get_time_zone_column(self) -> None: return None - def convert_time_zone(self, tz: ZoneInfo | None) -> "DatetimeRange": - """Return a copy of the DatetimeRange with the start timestamp converted to time zone.""" - return self.model_copy(update={"start": self.start.astimezone(tz)}) - - def replace_time_zone(self, tz: ZoneInfo | None) -> "DatetimeRange": - """Return a copy of the DatetimeRange with the time zone replaced.""" - return self.model_copy(update={"start": self.start.replace(tzinfo=tz)}) + def get_time_zones(self) -> list[ZoneInfo | None]: + return [] class DatetimeRangeWithTZColumn(DatetimeRangeBase): @@ -102,28 +106,36 @@ class DatetimeRangeWithTZColumn(DatetimeRangeBase): time_type: Literal[TimeType.DATETIME_TZ_COL] = TimeType.DATETIME_TZ_COL start: datetime = Field( description=( - "Start time of the range. If tz-naive, timestamps of different time zones ", - "are expected to align in clock time. If tz-aware, timestamps of different ", - "time zones are expected to align in real time.", + "Start time of the range. If tz-naive, timestamps of different time zones " + "are expected to align in clock time. If tz-aware, timestamps of different " + "time zones are expected to align in real time." 
) ) time_zone_column: str = Field( description="Column in the table that has time zone or offset information." ) - time_zones: Optional[list[ZoneInfo | None]] = Field( - description="Unique time zones from the table." - ) - - def start_time_is_tz_naive(self) -> bool: - return self.start.tzinfo is None + time_zones: list[ZoneInfo | None] = Field(description="Unique time zones from the table.") def get_time_zone_column(self) -> str: return self.time_zone_column + def get_time_zones(self) -> list[ZoneInfo | None]: + return self.time_zones + + @field_validator("time_zones") + @classmethod + def check_duplicated_time_zones( + cls, time_zones: list[ZoneInfo | None] + ) -> list[ZoneInfo | None]: + if len(set(time_zones)) < len(time_zones): + msg = ("DatetimeRangeWithTZColumn.time_zones has duplicates: ", time_zones) + raise InvalidValue(msg) + return time_zones + # Lixi TODO: ensure table schema has time_zone col? -DateTimeRanges = Union[ +DatetimeRanges = Union[ DatetimeRange, DatetimeRangeWithTZColumn, ] @@ -177,7 +189,7 @@ class IndexTimeRangeNTZ(IndexTimeRangeBase): def check_start_timestamp(cls, start_timestamp: datetime) -> datetime: if start_timestamp.tzinfo is not None: msg = "start_timestamp must be tz-naive for IndexTimeRangeNTZ" - raise ValueError(msg) + raise InvalidValue(msg) return start_timestamp def get_time_zone_column(self) -> None: @@ -198,7 +210,7 @@ class IndexTimeRangeTZ(IndexTimeRangeBase): def check_start_timestamp(cls, start_timestamp: datetime) -> datetime: if start_timestamp.tzinfo is None: msg = "start_timestamp must be tz-aware for IndexTimeRangeTZ" - raise ValueError(msg) + raise InvalidValue(msg) return start_timestamp def get_time_zone_column(self) -> None: @@ -222,7 +234,7 @@ class IndexTimeRangeWithTZColumn(IndexTimeRangeBase): def check_start_timestamp(cls, start_timestamp: datetime) -> datetime: if start_timestamp.tzinfo is not None: msg = "start_timestamp must be tz-naive for IndexTimeRangeWithTZColumn" - raise ValueError(msg) + raise InvalidValue(msg) return start_timestamp def get_time_zone_column(self) -> str: @@ -308,7 +320,7 @@ class YearMonthDayPeriodTimeNTZ(ColumnRepresentativeBase): def one_hour_column(cls, value: list[str]) -> list[str]: if len(value) != 1: msg = "YearMonthDayPeriodTimeNTZ requires exactly one hour column." 
- raise ValueError(msg) + raise InvalidParameter(msg) return value def list_time_columns(self) -> list[str]: diff --git a/src/chronify/time_series_mapper_base.py b/src/chronify/time_series_mapper_base.py index 5de3e15..127d9db 100644 --- a/src/chronify/time_series_mapper_base.py +++ b/src/chronify/time_series_mapper_base.py @@ -15,7 +15,7 @@ write_query_to_parquet, ) from chronify.models import TableSchema, MappingTableSchema -from chronify.exceptions import ConflictingInputsError +from chronify.exceptions import ConflictingInputsError, InvalidOperation from chronify.utils.sqlalchemy_table import create_table from chronify.time_series_checker import check_timestamps from chronify.time import TimeIntervalType, ResamplingOperationType, AggregationType @@ -208,7 +208,7 @@ def _apply_mapping( # select_stmt.append(func.max(tval_col).label(val_col)) case _: msg = f"Unsupported {resampling_operation=}" - raise ValueError(msg) + raise InvalidOperation(msg) from_keys = [x for x in right_table_columns if x.startswith("from_")] keys = [x.removeprefix("from_") for x in from_keys] diff --git a/src/chronify/time_zone_converter.py b/src/chronify/time_zone_converter.py index 027db0b..9e83752 100644 --- a/src/chronify/time_zone_converter.py +++ b/src/chronify/time_zone_converter.py @@ -12,7 +12,7 @@ DatetimeRangeWithTZColumn, TimeBasedDataAdjustment, ) -from chronify.exceptions import InvalidParameter +from chronify.exceptions import InvalidParameter, MissingValue from chronify.time_series_mapper_base import apply_mapping from chronify.time_range_generator_factory import make_time_range_generator from chronify.sqlalchemy.functions import read_database @@ -266,6 +266,9 @@ def __init__( time_zone_column: str, wrap_time_allowed: Optional[bool] = False, ): + if time_zone_column not in from_schema.time_array_id_columns: + msg = f"{time_zone_column=} is missing from {from_schema.time_array_id_columns=}" + raise MissingValue(msg) super().__init__(engine, metadata, from_schema) self.time_zone_column = time_zone_column self._wrap_time_allowed = wrap_time_allowed diff --git a/tests/test_models.py b/tests/test_models.py index 18a05c3..480931a 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -2,6 +2,7 @@ from sqlalchemy import BigInteger, Boolean, DateTime, Double, Integer, String from chronify.models import ColumnDType, _check_name +from chronify.exceptions import InvalidValue def test_column_dtypes() -> None: @@ -12,10 +13,10 @@ def test_column_dtypes() -> None: for string_type in ("int", "bigint", "bool", "datetime", "float", "str"): ColumnDType(name="col1", dtype=string_type) - with pytest.raises(ValueError): + with pytest.raises(InvalidValue): ColumnDType(name="col1", dtype="invalid") def test_invalid_column_name() -> None: - with pytest.raises(ValueError): + with pytest.raises(InvalidValue): _check_name(name="invalid - name") From 4d5922ab223220629d832820c7805103f9a6ce83 Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Thu, 30 Oct 2025 11:53:02 -0600 Subject: [PATCH 11/27] mostly changing from ZoneInfo to tzinfo as typehint --- src/chronify/datetime_range_generator.py | 18 ++++--- src/chronify/schema_manager.py | 1 + src/chronify/sqlalchemy/functions.py | 6 +-- src/chronify/time_configs.py | 48 +++++++++++++++---- src/chronify/time_series_checker.py | 2 +- src/chronify/time_zone_converter.py | 11 +++-- tests/test_mapper_datetime_to_datetime.py | 9 ++-- ..._mapper_representative_time_to_datetime.py | 4 +- tests/test_store.py | 4 +- 
tests/test_time_series_checker.py | 20 ++++---- tests/test_time_zone_converter.py | 6 +-- 11 files changed, 83 insertions(+), 46 deletions(-) diff --git a/src/chronify/datetime_range_generator.py b/src/chronify/datetime_range_generator.py index ade2188..4e03e3a 100644 --- a/src/chronify/datetime_range_generator.py +++ b/src/chronify/datetime_range_generator.py @@ -1,4 +1,4 @@ -from datetime import datetime, timedelta +from datetime import datetime, timedelta, tzinfo from typing import Generator, Optional from zoneinfo import ZoneInfo @@ -116,7 +116,7 @@ def __init__( ) raise InvalidValue(msg) - def _list_timestamps(self, time_zone: Optional[ZoneInfo]) -> list[datetime]: + def _list_timestamps(self, time_zone: Optional[tzinfo]) -> list[datetime]: """always return tz-naive timestamps relative to input time_zone""" if self._model.start_time_is_tz_naive(): if time_zone: @@ -141,12 +141,18 @@ def list_timestamps(self) -> list[datetime]: def list_timestamps_by_time_zone(self, distinct: bool = False) -> dict[str, list[datetime]]: """for each time zone, returns full timestamp iteration with duplicates allowed""" - dct: dict[str, datetime] = {} + dct = {} for tz in self._model.get_time_zones(): timestamps = self._list_timestamps(tz) if distinct: timestamps = sorted(set(timestamps)) - tz_name = tz.key if tz else "None" + if not tz: + tz_name = "None" + elif isinstance(tz, ZoneInfo): + tz_name = tz.key + else: + tz_name = tz.tzname(datetime(2020, 1, 1, tzinfo=tz)) # type: ignore + dct[tz_name] = timestamps return dct @@ -154,11 +160,11 @@ def list_timestamps_by_time_zone(self, distinct: bool = False) -> dict[str, list def list_distinct_timestamps_by_time_zone_from_dataframe( self, df: pd.DataFrame ) -> dict[str, list[datetime]]: - tz_col = self._model.time_zone_column + tz_col = self._model.get_time_zone_column() t_col = self._model.time_column df[t_col] = pd.to_datetime(df[t_col]) df2 = df[[tz_col, t_col]].drop_duplicates() - dct: dict[str, list[datetime]] = {} + dct = {} for tz_name in sorted(df2[tz_col].unique()): dct[tz_name] = sorted(df2.loc[df2[tz_col] == tz_name, t_col].tolist()) return dct diff --git a/src/chronify/schema_manager.py b/src/chronify/schema_manager.py index 23828cc..4987266 100644 --- a/src/chronify/schema_manager.py +++ b/src/chronify/schema_manager.py @@ -68,6 +68,7 @@ def _get_schema_table(self) -> Table: def add_schema(self, conn: Connection, schema: TableSchema) -> Table: """Add the schema to the store.""" table = self._get_schema_table() + breakpoint() stmt = insert(table).values(name=schema.name, schema=schema.model_dump_json()) conn.execute(stmt) # If there is a rollback after this addition to cached, things _should_ still be OK. 
diff --git a/src/chronify/sqlalchemy/functions.py b/src/chronify/sqlalchemy/functions.py index 7203d60..4a41bfa 100644 --- a/src/chronify/sqlalchemy/functions.py +++ b/src/chronify/sqlalchemy/functions.py @@ -103,12 +103,12 @@ def _convert_database_input_for_datetime( def _convert_database_output_for_datetime(df: pd.DataFrame, config: DatetimeRange) -> None: if config.time_column in df.columns: if not config.start_time_is_tz_naive(): - if isinstance(df[config.time_column].dtype, ObjectDType): + if isinstance(df[config.time_column].dtype, ObjectDType): # type: ignore df[config.time_column] = pd.to_datetime(df[config.time_column], utc=True) else: df[config.time_column] = df[config.time_column].dt.tz_localize("UTC") else: - if isinstance(df[config.time_column].dtype, ObjectDType): + if isinstance(df[config.time_column].dtype, ObjectDType): # type: ignore df[config.time_column] = pd.to_datetime(df[config.time_column], utc=False) @@ -152,7 +152,7 @@ def _write_to_hive( "datetime64[ns", "datetime64[us" ) df2[config.time_column] = df2[config.time_column].astype(new_dtype) # type: ignore - elif isinstance(df2[config.time_column].dtype, DateTime64DType): + elif isinstance(df2[config.time_column].dtype, DateTime64DType): # type: ignore new_dtype = "datetime64[us]" df2[config.time_column] = df2[config.time_column].astype(new_dtype) # type: ignore diff --git a/src/chronify/time_configs.py b/src/chronify/time_configs.py index 94d3532..e67554d 100644 --- a/src/chronify/time_configs.py +++ b/src/chronify/time_configs.py @@ -1,10 +1,9 @@ import abc import logging -from datetime import datetime, timedelta +from datetime import datetime, timedelta, tzinfo from typing import Union, Literal, Optional, Any from pydantic import Field, field_validator from typing_extensions import Annotated -from zoneinfo import ZoneInfo from chronify.base_models import ChronifyBaseModel from chronify.time import ( @@ -59,6 +58,10 @@ def list_time_columns(self) -> list[str]: def get_time_zone_column(self) -> Optional[str]: """Return the column in the table that contains time zone or offset information.""" + @abc.abstractmethod + def get_time_zones(self) -> list[tzinfo | None]: + """Return a list of unique time zones represented by the time column(s).""" + class DatetimeRangeBase(TimeBaseModel): """Defines a time range base class that uses Python datetime instances.""" @@ -75,11 +78,11 @@ def start_time_is_tz_naive(self) -> bool: """Return True if the timestamps in the range do not have time zones.""" return self.start.tzinfo is None - def convert_time_zone(self, tz: ZoneInfo | None) -> Any: # "DatetimeRanges" + def convert_time_zone(self, tz: tzinfo | None) -> Any: # "DatetimeRanges" """Return a copy of the DatetimeRange with the start timestamp converted to time zone.""" return self.model_copy(update={"start": self.start.astimezone(tz)}) - def replace_time_zone(self, tz: ZoneInfo | None) -> Any: # "DatetimeRanges" + def replace_time_zone(self, tz: tzinfo | None) -> Any: # "DatetimeRanges" """Return a copy of the DatetimeRange with the time zone replaced.""" return self.model_copy(update={"start": self.start.replace(tzinfo=tz)}) @@ -96,7 +99,7 @@ class DatetimeRange(DatetimeRangeBase): def get_time_zone_column(self) -> None: return None - def get_time_zones(self) -> list[ZoneInfo | None]: + def get_time_zones(self) -> list[tzinfo | None]: return [] @@ -114,19 +117,17 @@ class DatetimeRangeWithTZColumn(DatetimeRangeBase): time_zone_column: str = Field( description="Column in the table that has time zone or offset information." 
) - time_zones: list[ZoneInfo | None] = Field(description="Unique time zones from the table.") + time_zones: list[tzinfo | None] = Field(description="Unique time zones from the table.") def get_time_zone_column(self) -> str: return self.time_zone_column - def get_time_zones(self) -> list[ZoneInfo | None]: + def get_time_zones(self) -> list[tzinfo | None]: return self.time_zones @field_validator("time_zones") @classmethod - def check_duplicated_time_zones( - cls, time_zones: list[ZoneInfo | None] - ) -> list[ZoneInfo | None]: + def check_duplicated_time_zones(cls, time_zones: list[tzinfo | None]) -> list[tzinfo | None]: if len(set(time_zones)) < len(time_zones): msg = ("DatetimeRangeWithTZColumn.time_zones has duplicates: ", time_zones) raise InvalidValue(msg) @@ -156,6 +157,9 @@ def list_time_columns(self) -> list[str]: def get_time_zone_column(self) -> None: return None + def get_time_zones(self) -> list[tzinfo | None]: + return [] + class IndexTimeRangeBase(TimeBaseModel): """Defines a time range in the form of indexes""" @@ -195,6 +199,9 @@ def check_start_timestamp(cls, start_timestamp: datetime) -> datetime: def get_time_zone_column(self) -> None: return None + def get_time_zones(self) -> list[tzinfo | None]: + return [] + class IndexTimeRangeTZ(IndexTimeRangeBase): """Index time that represents tz-aware timestamps of a single time zone. @@ -216,6 +223,9 @@ def check_start_timestamp(cls, start_timestamp: datetime) -> datetime: def get_time_zone_column(self) -> None: return None + def get_time_zones(self) -> list[tzinfo | None]: + return [] + class IndexTimeRangeWithTZColumn(IndexTimeRangeBase): """Index time that reprsents local time relative to a time zone column. @@ -240,6 +250,9 @@ def check_start_timestamp(cls, start_timestamp: datetime) -> datetime: def get_time_zone_column(self) -> str: return self.time_zone_column + def get_time_zones(self) -> list[tzinfo | None]: + return [] # LIXI TODO + IndexTimeRanges = Union[ IndexTimeRangeNTZ, @@ -265,6 +278,9 @@ class RepresentativePeriodTimeNTZ(RepresentativePeriodTimeBase): def get_time_zone_column(self) -> None: return None + def get_time_zones(self) -> list[tzinfo | None]: + return [] + class RepresentativePeriodTimeTZ(RepresentativePeriodTimeBase): """Defines a tz-aware representative time dimension that covers one full year of time.""" @@ -277,6 +293,9 @@ class RepresentativePeriodTimeTZ(RepresentativePeriodTimeBase): def get_time_zone_column(self) -> str: return self.time_zone_column + def get_time_zones(self) -> list[tzinfo | None]: + return [] # LIXI TODO + class ColumnRepresentativeBase(TimeBaseModel): """Base class for time formats that use multiple integer columns to represent time.""" @@ -329,6 +348,9 @@ def list_time_columns(self) -> list[str]: def get_time_zone_column(self) -> None: return None + def get_time_zones(self) -> list[tzinfo | None]: + return [] + @property def unique_timestamps_length(self) -> int: return int(self.length / 24) @@ -361,6 +383,9 @@ def list_time_columns(self) -> list[str]: def get_time_zone_column(self) -> None: return None + def get_time_zones(self) -> list[tzinfo | None]: + return [] + @classmethod def default_config(cls, length: int, year: int) -> "YearMonthDayHourTimeNTZ": return cls( @@ -384,6 +409,9 @@ def list_time_columns(self) -> list[str]: def get_time_zone_column(self) -> None: return None + def get_time_zones(self) -> list[tzinfo | None]: + return [] + @classmethod def default_config(cls, length: int, year: int) -> "MonthDayHourTimeNTZ": return cls( diff --git 
a/src/chronify/time_series_checker.py b/src/chronify/time_series_checker.py index 3a02382..faea0d0 100644 --- a/src/chronify/time_series_checker.py +++ b/src/chronify/time_series_checker.py @@ -46,7 +46,7 @@ def check_timestamps(self) -> None: preserve_duplicates = False if isinstance(self._time_generator, DatetimeRangeGeneratorExternalTimeZone): count = self._check_expected_timestamps_with_external_time_zone() - if self._has_prevailing_time_zone(self._schema.time_config.time_zones): + if self._has_prevailing_time_zone(self._schema.time_config.get_time_zones()): preserve_duplicates = True else: count = self._check_expected_timestamps() diff --git a/src/chronify/time_zone_converter.py b/src/chronify/time_zone_converter.py index 9e83752..3d77254 100644 --- a/src/chronify/time_zone_converter.py +++ b/src/chronify/time_zone_converter.py @@ -1,5 +1,6 @@ import abc from zoneinfo import ZoneInfo +from datetime import tzinfo from sqlalchemy import Engine, MetaData, Table, select from typing import Optional from pathlib import Path @@ -24,7 +25,7 @@ def convert_time_zone( engine: Engine, metadata: MetaData, src_schema: TableSchema, - to_time_zone: ZoneInfo | None, + to_time_zone: tzinfo | None, scratch_dir: Optional[Path] = None, output_file: Optional[Path] = None, check_mapped_timestamps: bool = False, @@ -164,15 +165,15 @@ def __init__( engine: Engine, metadata: MetaData, from_schema: TableSchema, - to_time_zone: ZoneInfo | None, + to_time_zone: tzinfo | None, ): super().__init__(engine, metadata, from_schema) self._to_time_zone = to_time_zone self._to_schema = self.generate_to_schema() - def generate_to_time_config(self) -> DatetimeRangeBase: + def generate_to_time_config(self) -> DatetimeRangeWithTZColumn: if self._to_time_zone: - to_time_config = self._from_schema.time_config.convert_time_zone( + to_time_config: DatetimeRange = self._from_schema.time_config.convert_time_zone( self._to_time_zone ).replace_time_zone(None) else: @@ -326,7 +327,7 @@ def convert_time_zone( check_mapped_timestamps=check_mapped_timestamps, ) - def _get_time_zones(self) -> list[ZoneInfo | None]: + def _get_time_zones(self) -> list[tzinfo | None]: with self._engine.connect() as conn: table = Table(self._from_schema.name, self._metadata) stmt = ( diff --git a/tests/test_mapper_datetime_to_datetime.py b/tests/test_mapper_datetime_to_datetime.py index 8d6303d..fec6504 100644 --- a/tests/test_mapper_datetime_to_datetime.py +++ b/tests/test_mapper_datetime_to_datetime.py @@ -1,4 +1,5 @@ from zoneinfo import ZoneInfo +from datetime import tzinfo import pytest from datetime import datetime, timedelta from typing import Any @@ -31,7 +32,7 @@ def generate_datetime_dataframe(schema: TableSchema) -> pd.DataFrame: def get_datetime_schema( - year: int, tzinfo: ZoneInfo | None, interval_type: TimeIntervalType, name: str + year: int, tzinfo: tzinfo | None, interval_type: TimeIntervalType, name: str ) -> TableSchema: start = datetime(year=year, month=1, day=1, tzinfo=tzinfo) end = datetime(year=year + 1, month=1, day=1, tzinfo=tzinfo) @@ -155,7 +156,7 @@ def test_roll_time_using_shift_and_wrap() -> None: @pytest.mark.parametrize("tzinfo", [ZoneInfo("US/Eastern"), None]) def test_time_interval_shift( iter_engines: Engine, - tzinfo: ZoneInfo | None, + tzinfo: tzinfo | None, ) -> None: from_schema = get_datetime_schema( 2020, tzinfo, TimeIntervalType.PERIOD_BEGINNING, "from_table" @@ -171,7 +172,7 @@ def test_time_interval_shift( @pytest.mark.parametrize("tzinfo", [ZoneInfo("US/Eastern"), None]) def 
test_time_interval_shift_different_time_ranges( iter_engines: Engine, - tzinfo: ZoneInfo | None, + tzinfo: tzinfo | None, ) -> None: from_schema = get_datetime_schema( 2020, tzinfo, TimeIntervalType.PERIOD_BEGINNING, "from_table" @@ -194,7 +195,7 @@ def test_time_interval_shift_different_time_ranges( ], ) def test_time_shift_different_timezones( - iter_engines: Engine, tzinfo_tuple: tuple[ZoneInfo | None] + iter_engines: Engine, tzinfo_tuple: tuple[tzinfo | None] ) -> None: from_schema = get_datetime_schema( 2020, tzinfo_tuple[0], TimeIntervalType.PERIOD_BEGINNING, "from_table" diff --git a/tests/test_mapper_representative_time_to_datetime.py b/tests/test_mapper_representative_time_to_datetime.py index 1bec752..06ead60 100644 --- a/tests/test_mapper_representative_time_to_datetime.py +++ b/tests/test_mapper_representative_time_to_datetime.py @@ -1,6 +1,6 @@ from zoneinfo import ZoneInfo import pytest -from datetime import datetime, timedelta +from datetime import datetime, timedelta, tzinfo from typing import Any, Optional import pandas as pd @@ -18,7 +18,7 @@ def generate_datetime_data(time_config: DatetimeRange) -> pd.Series: return pd.to_datetime(list(DatetimeRangeGenerator(time_config)._iter_timestamps())) -def get_datetime_schema(year: int, tzinfo: ZoneInfo | None) -> TableSchema: +def get_datetime_schema(year: int, tzinfo: tzinfo | None) -> TableSchema: start = datetime(year=year, month=1, day=1, tzinfo=tzinfo) end = datetime(year=year + 1, month=1, day=1, tzinfo=tzinfo) resolution = timedelta(hours=1) diff --git a/tests/test_store.py b/tests/test_store.py index d36a1ed..fc9b566 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -1,7 +1,7 @@ import fileinput import gc import shutil -from datetime import datetime, timedelta +from datetime import datetime, timedelta, tzinfo from pathlib import Path from zoneinfo import ZoneInfo from itertools import chain @@ -788,7 +788,7 @@ def test_check_timestamps(iter_stores_by_engine: Store, one_week_per_month_by_ho @pytest.mark.parametrize("to_time_zone", [ZoneInfo("US/Eastern"), ZoneInfo("US/Mountain"), None]) def test_convert_time_zone( - tmp_path, iter_stores_by_engine_no_data_ingestion: Store, to_time_zone: ZoneInfo | None + tmp_path, iter_stores_by_engine_no_data_ingestion: Store, to_time_zone: tzinfo | None ): store = iter_stores_by_engine_no_data_ingestion time_array_len = 8784 diff --git a/tests/test_time_series_checker.py b/tests/test_time_series_checker.py index eba800c..cf05c30 100644 --- a/tests/test_time_series_checker.py +++ b/tests/test_time_series_checker.py @@ -1,4 +1,4 @@ -from datetime import datetime, timedelta +from datetime import datetime, timedelta, tzinfo from typing import Optional from zoneinfo import ZoneInfo @@ -17,42 +17,42 @@ from chronify.time_series_checker import check_timestamps -def test_valid_datetimes_with_tz(iter_engines: Engine): +def test_valid_datetimes_with_tz(iter_engines: Engine) -> None: """Valid timestamps with time zones.""" _run_test(iter_engines, *_get_inputs_for_valid_datetimes_with_tz()) -def test_valid_datetimes_without_tz(iter_engines: Engine): +def test_valid_datetimes_without_tz(iter_engines: Engine) -> None: """Valid timestamps without time zones.""" _run_test(iter_engines, *_get_inputs_for_valid_datetimes_without_tz()) -def test_invalid_datetimes(iter_engines: Engine): +def test_invalid_datetimes(iter_engines: Engine) -> None: """Timestamps do not match the schema.""" _run_test(iter_engines, *_get_inputs_for_incorrect_datetimes()) -def test_invalid_datetime_length(iter_engines: 
Engine): +def test_invalid_datetime_length(iter_engines: Engine) -> None: """Timestamps do not match the schema.""" _run_test(iter_engines, *_get_inputs_for_incorrect_datetime_length()) -def test_mismatched_time_array_lengths(iter_engines: Engine): +def test_mismatched_time_array_lengths(iter_engines: Engine) -> None: """Some time arrays have different lengths.""" _run_test(iter_engines, *_get_inputs_for_mismatched_time_array_lengths()) -def test_incorrect_lengths(iter_engines: Engine): +def test_incorrect_lengths(iter_engines: Engine) -> None: """All time arrays are consistent but have the wrong length.""" _run_test(iter_engines, *_get_inputs_for_incorrect_lengths()) -def test_incorrect_time_arrays(iter_engines: Engine): +def test_incorrect_time_arrays(iter_engines: Engine) -> None: """The time arrays form a complete set but are individually incorrect.""" _run_test(iter_engines, *_get_inputs_for_incorrect_time_arrays()) -def test_incorrect_time_arrays_with_duplicates(iter_engines: Engine): +def test_incorrect_time_arrays_with_duplicates(iter_engines: Engine) -> None: """The time arrays form a complete set but are individually incorrect.""" _run_test(iter_engines, *_get_inputs_for_incorrect_time_arrays_with_duplicates()) @@ -60,7 +60,7 @@ def test_incorrect_time_arrays_with_duplicates(iter_engines: Engine): def _run_test( engine: Engine, df: pd.DataFrame, - tzinfo: Optional[ZoneInfo], + tzinfo: Optional[tzinfo], length: int, message: Optional[str], ) -> None: diff --git a/tests/test_time_zone_converter.py b/tests/test_time_zone_converter.py index 9a6d474..e36c7e9 100644 --- a/tests/test_time_zone_converter.py +++ b/tests/test_time_zone_converter.py @@ -1,5 +1,5 @@ from zoneinfo import ZoneInfo -from datetime import datetime, timedelta +from datetime import datetime, timedelta, tzinfo import numpy as np import pytest from typing import Any @@ -56,7 +56,7 @@ def generate_dataframe_with_tz_col(schema: TableSchema) -> pd.DataFrame: def get_datetime_schema( year: int, - tzinfo: ZoneInfo | None, + tzinfo: tzinfo | None, interval_type: TimeIntervalType, name: str, has_tz_col: bool = False, @@ -109,7 +109,7 @@ def run_conversion( engine: Engine, df: pd.DataFrame, from_schema: TableSchema, - to_time_zone: ZoneInfo | None, + to_time_zone: tzinfo | None, ) -> None: metadata = MetaData() ingest_data(engine, metadata, df, from_schema) From 329ed70e01a2bac39e6a894d68c7cc6dd41c3943 Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Thu, 30 Oct 2025 13:49:57 -0600 Subject: [PATCH 12/27] fix mypy errors --- src/chronify/csv_time_series_parser.py | 2 +- src/chronify/datetime_range_generator.py | 10 +--- src/chronify/schema_manager.py | 1 - src/chronify/time_configs.py | 10 +--- src/chronify/time_series_checker.py | 11 +++- ...apper_column_representative_to_datetime.py | 2 +- src/chronify/time_utils.py | 19 ++++++- src/chronify/time_zone_converter.py | 57 ++++++++++++------- 8 files changed, 64 insertions(+), 48 deletions(-) diff --git a/src/chronify/csv_time_series_parser.py b/src/chronify/csv_time_series_parser.py index b3484eb..49393be 100644 --- a/src/chronify/csv_time_series_parser.py +++ b/src/chronify/csv_time_series_parser.py @@ -114,7 +114,7 @@ def _check_input_format(data_file: Path) -> None: @staticmethod def _read_data_file(data_file: Path) -> pd.DataFrame: - return pd.read_csv(data_file, header=0, dtype=COLUMN_DTYPES) # type: ignore + return pd.read_csv(data_file, header=0, dtype=COLUMN_DTYPES) def _ingest_data(self, data: pd.DataFrame, table_name: str, 
year: int, length: int) -> None: csv_fmt = CsvTimeSeriesFormats.from_columns(data.columns) diff --git a/src/chronify/datetime_range_generator.py b/src/chronify/datetime_range_generator.py index 4e03e3a..033c2e1 100644 --- a/src/chronify/datetime_range_generator.py +++ b/src/chronify/datetime_range_generator.py @@ -8,7 +8,7 @@ LeapDayAdjustmentType, ) from chronify.time_configs import DatetimeRanges, DatetimeRange, DatetimeRangeWithTZColumn -from chronify.time_utils import adjust_timestamp_by_dst_offset +from chronify.time_utils import adjust_timestamp_by_dst_offset, get_tzname from chronify.time_range_generator_base import TimeRangeGeneratorBase from chronify.exceptions import InvalidValue @@ -146,13 +146,7 @@ def list_timestamps_by_time_zone(self, distinct: bool = False) -> dict[str, list timestamps = self._list_timestamps(tz) if distinct: timestamps = sorted(set(timestamps)) - if not tz: - tz_name = "None" - elif isinstance(tz, ZoneInfo): - tz_name = tz.key - else: - tz_name = tz.tzname(datetime(2020, 1, 1, tzinfo=tz)) # type: ignore - + tz_name = get_tzname(tz) dct[tz_name] = timestamps return dct diff --git a/src/chronify/schema_manager.py b/src/chronify/schema_manager.py index 4987266..23828cc 100644 --- a/src/chronify/schema_manager.py +++ b/src/chronify/schema_manager.py @@ -68,7 +68,6 @@ def _get_schema_table(self) -> Table: def add_schema(self, conn: Connection, schema: TableSchema) -> Table: """Add the schema to the store.""" table = self._get_schema_table() - breakpoint() stmt = insert(table).values(name=schema.name, schema=schema.model_dump_json()) conn.execute(stmt) # If there is a rollback after this addition to cached, things _should_ still be OK. diff --git a/src/chronify/time_configs.py b/src/chronify/time_configs.py index e67554d..1babd51 100644 --- a/src/chronify/time_configs.py +++ b/src/chronify/time_configs.py @@ -1,7 +1,7 @@ import abc import logging from datetime import datetime, timedelta, tzinfo -from typing import Union, Literal, Optional, Any +from typing import Union, Literal, Optional from pydantic import Field, field_validator from typing_extensions import Annotated @@ -78,14 +78,6 @@ def start_time_is_tz_naive(self) -> bool: """Return True if the timestamps in the range do not have time zones.""" return self.start.tzinfo is None - def convert_time_zone(self, tz: tzinfo | None) -> Any: # "DatetimeRanges" - """Return a copy of the DatetimeRange with the start timestamp converted to time zone.""" - return self.model_copy(update={"start": self.start.astimezone(tz)}) - - def replace_time_zone(self, tz: tzinfo | None) -> Any: # "DatetimeRanges" - """Return a copy of the DatetimeRange with the time zone replaced.""" - return self.model_copy(update={"start": self.start.replace(tzinfo=tz)}) - class DatetimeRange(DatetimeRangeBase): """Defines a time range with a single time zone.""" diff --git a/src/chronify/time_series_checker.py b/src/chronify/time_series_checker.py index faea0d0..8af4517 100644 --- a/src/chronify/time_series_checker.py +++ b/src/chronify/time_series_checker.py @@ -1,11 +1,12 @@ from sqlalchemy import Connection, Table, select, text from typing import Optional -from zoneinfo import ZoneInfo +from datetime import datetime, tzinfo import pandas as pd from chronify.exceptions import InvalidTable from chronify.models import TableSchema +from chronify.time_configs import DatetimeRangeWithTZColumn from chronify.sqlalchemy.functions import read_database from chronify.time_range_generator_factory import make_time_range_generator from 
chronify.datetime_range_generator import DatetimeRangeGeneratorExternalTimeZone @@ -57,7 +58,7 @@ def check_timestamps(self) -> None: ) @staticmethod - def _has_prevailing_time_zone(lst: list[ZoneInfo]) -> bool: + def _has_prevailing_time_zone(lst: list[tzinfo | None]) -> bool: for tz in lst: if is_prevailing_time_zone(tz): return True @@ -75,8 +76,10 @@ def _check_expected_timestamps(self) -> int: return len(expected) def _check_expected_timestamps_with_external_time_zone(self) -> int: + assert isinstance(self._time_generator, DatetimeRangeGeneratorExternalTimeZone) expected_dct = self._time_generator.list_timestamps_by_time_zone() time_columns = self._time_generator.list_time_columns() + assert isinstance(self._schema.time_config, DatetimeRangeWithTZColumn) time_columns.append(self._schema.time_config.get_time_zone_column()) stmt = select(*(self._table.c[x] for x in time_columns)).distinct() for col in time_columns: @@ -209,7 +212,9 @@ def _check_expected_timestamps_by_time_array( def check_timestamp_lists( - actual: list[pd.Timestamp], expected: list[pd.Timestamp], msg_prefix: str = "" + actual: list[pd.Timestamp] | list[datetime], + expected: list[pd.Timestamp] | list[datetime], + msg_prefix: str = "", ) -> None: match = actual == expected msg = msg_prefix diff --git a/src/chronify/time_series_mapper_column_representative_to_datetime.py b/src/chronify/time_series_mapper_column_representative_to_datetime.py index 2ce44fe..ed549de 100644 --- a/src/chronify/time_series_mapper_column_representative_to_datetime.py +++ b/src/chronify/time_series_mapper_column_representative_to_datetime.py @@ -232,7 +232,7 @@ def mdh_from_datetime(timestamp: datetime) -> tuple[int, int, int]: return timestamp.month, timestamp.day, timestamp.hour + 1 -def generate_period_mapping(periods: pd.Series) -> pd.DataFrame: +def generate_period_mapping(periods: "pd.Series[str]") -> pd.DataFrame: unique_periods = periods.unique() mappings = [] for period_str in unique_periods: diff --git a/src/chronify/time_utils.py b/src/chronify/time_utils.py index 8a278e4..26b2ad2 100644 --- a/src/chronify/time_utils.py +++ b/src/chronify/time_utils.py @@ -3,7 +3,7 @@ import logging import numpy as np from datetime import datetime, timedelta, timezone, tzinfo -import zoneinfo +from zoneinfo import ZoneInfo, ZoneInfoNotFoundError import pandas as pd from chronify.time import ( @@ -147,6 +147,7 @@ def roll_time_interval( def is_prevailing_time_zone(tz: tzinfo | None) -> bool: + """Check that tz is a prevailing time zone""" if not tz: return False ts1 = datetime(year=2020, month=1, day=1, tzinfo=tz) @@ -156,6 +157,7 @@ def is_prevailing_time_zone(tz: tzinfo | None) -> bool: def is_standard_time_zone(tz: tzinfo | None) -> bool: + """Check that tz is a standard time zone""" if not tz: return False ts1 = datetime(year=2020, month=1, day=1, tzinfo=tz) @@ -165,14 +167,25 @@ def is_standard_time_zone(tz: tzinfo | None) -> bool: def get_standard_time_zone(tz: tzinfo | None) -> tzinfo | None: + """Get the standard time zone counterpart of tz""" ts = datetime(year=2020, month=1, day=1, tzinfo=tz) std_tz_name = ts.tzname() if not std_tz_name: return None try: - return zoneinfo.ZoneInfo(std_tz_name) - except zoneinfo.ZoneInfoNotFoundError: + return ZoneInfo(std_tz_name) + except ZoneInfoNotFoundError: utcoffset = ts.utcoffset() if not utcoffset: return None return timezone(utcoffset) + + +def get_tzname(tz: tzinfo | None) -> str: + """Get the time zone name of tz""" + if not tz: + return "None" + if isinstance(tz, ZoneInfo): + return tz.key + ts = 
datetime(year=2020, month=1, day=1, tzinfo=tz) + return tz.tzname(ts) # type: ignore # LIXI TODO diff --git a/src/chronify/time_zone_converter.py b/src/chronify/time_zone_converter.py index 3d77254..ba03098 100644 --- a/src/chronify/time_zone_converter.py +++ b/src/chronify/time_zone_converter.py @@ -1,6 +1,6 @@ import abc from zoneinfo import ZoneInfo -from datetime import tzinfo +from datetime import datetime, tzinfo from sqlalchemy import Engine, MetaData, Table, select from typing import Optional from pathlib import Path @@ -13,12 +13,15 @@ DatetimeRangeWithTZColumn, TimeBasedDataAdjustment, ) +from chronify.datetime_range_generator import ( + DatetimeRangeGeneratorExternalTimeZone, +) from chronify.exceptions import InvalidParameter, MissingValue from chronify.time_series_mapper_base import apply_mapping from chronify.time_range_generator_factory import make_time_range_generator from chronify.sqlalchemy.functions import read_database from chronify.time import TimeType -from chronify.time_utils import wrap_timestamps +from chronify.time_utils import wrap_timestamps, get_tzname def convert_time_zone( @@ -131,13 +134,17 @@ def __init__( ): self._engine = engine self._metadata = metadata + self._check_from_schema(from_schema) self._from_schema = from_schema - def check_from_schema(self) -> None: + def _check_from_schema(self, from_schema: TableSchema) -> None: msg = "" - if not isinstance(self._from_schema.time_config, DatetimeRange): + if not isinstance(from_schema.time_config, DatetimeRange): msg += "Source schema does not have DatetimeRange time config. " - if self._from_schema.time_config.start_time_is_tz_naive(): + if ( + isinstance(from_schema.time_config, DatetimeRange) + and from_schema.time_config.start_time_is_tz_naive() + ): msg += "Source schema start_time must be timezone-aware. " msg += "To convert from timezone-naive to timezone-aware, use the TimeSeriesMapperDatetime.map_time() method instead. 
" if msg != "": @@ -172,12 +179,14 @@ def __init__( self._to_schema = self.generate_to_schema() def generate_to_time_config(self) -> DatetimeRangeWithTZColumn: + assert isinstance(self._from_schema.time_config, DatetimeRange) # mypy + to_time_config = self._from_schema.time_config.model_copy() if self._to_time_zone: - to_time_config: DatetimeRange = self._from_schema.time_config.convert_time_zone( - self._to_time_zone - ).replace_time_zone(None) + to_time_config.start = to_time_config.start.astimezone(self._to_time_zone).replace( + tzinfo=None + ) else: - to_time_config = self._from_schema.time_config.replace_time_zone(None) + to_time_config.start = to_time_config.start.replace(tzinfo=None) time_kwargs = to_time_config.model_dump() time_kwargs = dict( filter( @@ -209,7 +218,6 @@ def convert_time_zone( output_file: Optional[Path] = None, check_mapped_timestamps: bool = False, ) -> None: - self.check_from_schema() df, mapping_schema = self._create_mapping() apply_mapping( @@ -227,19 +235,21 @@ def convert_time_zone( def _create_mapping(self) -> tuple[pd.DataFrame, MappingTableSchema]: """Create mapping dataframe for converting datetime to geography-based time zone""" + assert isinstance(self._from_schema.time_config, DatetimeRange) # mypy time_col = self._from_schema.time_config.time_column from_time_col = "from_" + time_col from_time_data = make_time_range_generator(self._from_schema.time_config).list_timestamps() - to_time_data_dct = make_time_range_generator( - self._to_schema.time_config - ).list_timestamps_by_time_zone() + to_time_generator = make_time_range_generator(self._to_schema.time_config) + assert isinstance(to_time_generator, DatetimeRangeGeneratorExternalTimeZone) # mypy + to_time_data_dct = to_time_generator.list_timestamps_by_time_zone() from_time_config = self._from_schema.time_config.model_copy( update={"time_column": from_time_col} ) to_time_config = self._to_schema.time_config + assert isinstance(to_time_config, DatetimeRangeWithTZColumn) # mypy tz_col = to_time_config.time_zone_column - tz_name = self._to_time_zone.key if self._to_time_zone else "None" + tz_name = get_tzname(self._to_time_zone) to_time_data = to_time_data_dct[tz_name] df = pd.DataFrame( { @@ -276,10 +286,10 @@ def __init__( self._to_schema = self.generate_to_schema() def generate_to_time_config(self) -> DatetimeRangeBase: + assert isinstance(self._from_schema.time_config, DatetimeRange) # mypy + to_time_config = self._from_schema.time_config.model_copy() if self._wrap_time_allowed: - to_time_config = self._from_schema.time_config.replace_time_zone(None) - else: - to_time_config = self._from_schema.time_config + to_time_config.start = to_time_config.start.replace(tzinfo=None) time_kwargs = to_time_config.model_dump() time_kwargs = dict( filter( @@ -311,7 +321,6 @@ def convert_time_zone( output_file: Optional[Path] = None, check_mapped_timestamps: bool = False, ) -> None: - self.check_from_schema() df, mapping_schema = self._create_mapping() apply_mapping( @@ -344,12 +353,13 @@ def _get_time_zones(self) -> list[tzinfo | None]: def _create_mapping(self) -> tuple[pd.DataFrame, MappingTableSchema]: """Create mapping dataframe for converting datetime to column time zones""" + assert isinstance(self._from_schema.time_config, DatetimeRange) # mypy time_col = self._from_schema.time_config.time_column from_time_col = "from_" + time_col from_time_data = make_time_range_generator(self._from_schema.time_config).list_timestamps() - to_time_data_dct = make_time_range_generator( - self._to_schema.time_config - 
).list_timestamps_by_time_zone() + to_time_generator = make_time_range_generator(self._to_schema.time_config) + assert isinstance(to_time_generator, DatetimeRangeGeneratorExternalTimeZone) # mypy + to_time_data_dct = to_time_generator.list_timestamps_by_time_zone() from_tz_col = "from_" + self.time_zone_column from_time_config = self._from_schema.time_config.model_copy( @@ -359,10 +369,13 @@ def _create_mapping(self) -> tuple[pd.DataFrame, MappingTableSchema]: df_tz = [] for tz_name, time_data in to_time_data_dct.items(): + to_time_data: list[datetime] | list[pd.Timestamp] if self._wrap_time_allowed: # assume it is being wrapped based on the tz-naive version of the original time data final_time_data = [x.replace(tzinfo=None) for x in from_time_data] - to_time_data = wrap_timestamps(pd.Series(time_data), pd.Series(final_time_data)) + to_time_data = wrap_timestamps( + pd.Series(time_data), pd.Series(final_time_data) + ).tolist() else: to_time_data = time_data df_tz.append( From 63798a7551805f4784bd30d3c58ac78dc8064dda Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Thu, 30 Oct 2025 14:04:23 -0600 Subject: [PATCH 13/27] more mypy stuff --- pyproject.toml | 1 + src/chronify/sqlalchemy/functions.py | 6 +++--- src/chronify/utils/sqlalchemy_table.py | 4 ++-- src/chronify/utils/sqlalchemy_view.py | 4 ++-- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e5e1ae7..b1dd311 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,6 +71,7 @@ Source = "https://github.com/NREL/chronify" files = [ "src", ] +disable_error_code = ["no-untyped-call"] strict = true [tool.pytest.ini_options] diff --git a/src/chronify/sqlalchemy/functions.py b/src/chronify/sqlalchemy/functions.py index 4a41bfa..7203d60 100644 --- a/src/chronify/sqlalchemy/functions.py +++ b/src/chronify/sqlalchemy/functions.py @@ -103,12 +103,12 @@ def _convert_database_input_for_datetime( def _convert_database_output_for_datetime(df: pd.DataFrame, config: DatetimeRange) -> None: if config.time_column in df.columns: if not config.start_time_is_tz_naive(): - if isinstance(df[config.time_column].dtype, ObjectDType): # type: ignore + if isinstance(df[config.time_column].dtype, ObjectDType): df[config.time_column] = pd.to_datetime(df[config.time_column], utc=True) else: df[config.time_column] = df[config.time_column].dt.tz_localize("UTC") else: - if isinstance(df[config.time_column].dtype, ObjectDType): # type: ignore + if isinstance(df[config.time_column].dtype, ObjectDType): df[config.time_column] = pd.to_datetime(df[config.time_column], utc=False) @@ -152,7 +152,7 @@ def _write_to_hive( "datetime64[ns", "datetime64[us" ) df2[config.time_column] = df2[config.time_column].astype(new_dtype) # type: ignore - elif isinstance(df2[config.time_column].dtype, DateTime64DType): # type: ignore + elif isinstance(df2[config.time_column].dtype, DateTime64DType): new_dtype = "datetime64[us]" df2[config.time_column] = df2[config.time_column].astype(new_dtype) # type: ignore diff --git a/src/chronify/utils/sqlalchemy_table.py b/src/chronify/utils/sqlalchemy_table.py index ec7e0b7..c22c77b 100644 --- a/src/chronify/utils/sqlalchemy_table.py +++ b/src/chronify/utils/sqlalchemy_table.py @@ -21,7 +21,7 @@ def __init__(self, name: str) -> None: self.name = name -@compiler.compiles(CreateTable) +@compiler.compiles(CreateTable) # type: ignore def _create_table(element: Any, compiler: Any, **kw: Any) -> str: return "CREATE TABLE %s AS %s" % ( element.name, @@ -29,7 +29,7 @@ def 
_create_table(element: Any, compiler: Any, **kw: Any) -> str: ) -@compiler.compiles(DropTable) +@compiler.compiles(DropTable) # type: ignore def _drop_table(element: Any, compiler: Any, **kw: Any) -> str: return "DROP TABLE %s" % (element.name) diff --git a/src/chronify/utils/sqlalchemy_view.py b/src/chronify/utils/sqlalchemy_view.py index b7eec2f..bd53334 100644 --- a/src/chronify/utils/sqlalchemy_view.py +++ b/src/chronify/utils/sqlalchemy_view.py @@ -20,7 +20,7 @@ def __init__(self, name: str) -> None: self.name = name -@compiler.compiles(CreateView) +@compiler.compiles(CreateView) # type: ignore def _create_view(element: Any, compiler: Any, **kw: Any) -> str: return "CREATE VIEW %s AS %s" % ( element.name, @@ -28,7 +28,7 @@ def _create_view(element: Any, compiler: Any, **kw: Any) -> str: ) -@compiler.compiles(DropView) +@compiler.compiles(DropView) # type: ignore def _drop_view(element: Any, compiler: Any, **kw: Any) -> str: return "DROP VIEW %s" % (element.name) From aebd0a24d67b20e85f3152e2e6fbaf1c954d0d37 Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Thu, 30 Oct 2025 17:34:16 -0600 Subject: [PATCH 14/27] Add pytest for time_utils --- src/chronify/time_utils.py | 9 ++- tests/test_time_utils.py | 128 +++++++++++++++++++++++++++++++++++++ 2 files changed, 134 insertions(+), 3 deletions(-) create mode 100644 tests/test_time_utils.py diff --git a/src/chronify/time_utils.py b/src/chronify/time_utils.py index 26b2ad2..a273a57 100644 --- a/src/chronify/time_utils.py +++ b/src/chronify/time_utils.py @@ -18,8 +18,8 @@ def adjust_timestamp_by_dst_offset(timestamp: datetime, resolution: timedelta) - """Reduce the timestamps within the daylight saving range by 1 hour. Used to ensure that a time series at daily (or lower) resolution returns each day at the same timestamp in prevailing time, an expected behavior in most standard libraries. - (e.g., ensure a time series can return 2018-03-11 00:00, 2018-03-12 00:00... - instead of 2018-03-11 00:00, 2018-03-12 01:00...) + (e.g., ensure a time series can return 2018-03-11 00:00, 2018-03-12 00:00, 2018-03-13 00:00... + instead of 2018-03-11 00:00, 2018-03-12 01:00, 2018-03-13 01:00...) 
""" if resolution < timedelta(hours=24): return timestamp @@ -182,7 +182,10 @@ def get_standard_time_zone(tz: tzinfo | None) -> tzinfo | None: def get_tzname(tz: tzinfo | None) -> str: - """Get the time zone name of tz""" + """Get the time zone name of tz + Note: except for the tzname extracted from ZoneInfo, + tzname may not be reinstantiated into a tzinfo object + """ if not tz: return "None" if isinstance(tz, ZoneInfo): diff --git a/tests/test_time_utils.py b/tests/test_time_utils.py new file mode 100644 index 0000000..4594297 --- /dev/null +++ b/tests/test_time_utils.py @@ -0,0 +1,128 @@ +import pytest +import pandas as pd +from datetime import datetime, timedelta, timezone +from zoneinfo import ZoneInfo +from chronify.time_utils import ( + adjust_timestamp_by_dst_offset, + shift_time_interval, + wrap_timestamps, + roll_time_interval, + is_prevailing_time_zone, + is_standard_time_zone, + get_standard_time_zone, + get_tzname, +) +from chronify.time import TimeIntervalType + + +def test_adjust_timestamp_by_dst_offset() -> None: + # DST-aware datetime vs standard time zone + tzs = [ZoneInfo("America/New_York"), ZoneInfo("EST")] + hours = [23, 0] + for tz, hour in zip(tzs, hours): + dt = datetime(2020, 7, 1, 0, 0, tzinfo=tz) + res = adjust_timestamp_by_dst_offset(dt, timedelta(days=1)) + assert res.hour == hour + + +def test_shift_time_interval_period_beginning_to_ending() -> None: + ser = pd.Series(pd.date_range("2018-12-31 22:00", periods=4, freq="h")) + shifted = shift_time_interval( + ser, + TimeIntervalType.PERIOD_BEGINNING, + TimeIntervalType.PERIOD_ENDING, + ) + breakpoint() + assert all(shifted == ser + pd.Timedelta(hours=1)) + + +def test_shift_time_interval_period_ending_to_beginning() -> None: + ser = pd.Series(pd.date_range("2018-12-31 22:00", periods=4, freq="h")) + shifted = shift_time_interval( + ser, + TimeIntervalType.PERIOD_ENDING, + TimeIntervalType.PERIOD_BEGINNING, + ) + assert all(shifted == ser - pd.Timedelta(hours=1)) + + +def test_shift_time_interval_invalid() -> None: + ser = pd.Series(pd.date_range("2018-12-31 22:00", periods=4, freq="h")) + with pytest.raises(Exception): + shift_time_interval( + ser, + TimeIntervalType.PERIOD_BEGINNING, + TimeIntervalType.PERIOD_BEGINNING, + ) + + +def test_wrap_timestamps() -> None: + ser = pd.Series(pd.date_range("2018-12-31 22:00", periods=4, freq="h")) + to_timestamps = pd.date_range("2019-01-01 00:00", periods=4, freq="h").tolist() + wrapped = wrap_timestamps(ser, to_timestamps) + assert set(wrapped) <= set(to_timestamps) + + +def test_roll_time_interval() -> None: + ser = pd.Series(pd.date_range("2018-12-31 22:00", periods=4, freq="h")) + to_timestamps = pd.date_range("2019-01-01 00:00", periods=4, freq="h").tolist() + rolled = roll_time_interval( + ser, + TimeIntervalType.PERIOD_BEGINNING, + TimeIntervalType.PERIOD_ENDING, + to_timestamps, + ) + assert set(rolled) <= set(to_timestamps) + + +def test_is_prevailing_time_zone() -> None: + tz = ZoneInfo("America/New_York") + assert is_prevailing_time_zone(tz) is True + assert is_prevailing_time_zone(None) is False + + +def test_is_standard_time_zone() -> None: + tz = timezone(timedelta(hours=0)) + assert is_standard_time_zone(tz) is True + assert is_standard_time_zone(None) is False + + +def test_get_standard_time_zone() -> None: + tzs = [ + ZoneInfo("America/New_York"), + ZoneInfo("EST"), + timezone(timedelta(hours=-5)), + None, + ] + stzs = [ + ZoneInfo("EST"), + ZoneInfo("EST"), + timezone(timedelta(hours=-5)), + None, + ] + for tz, stz in zip(tzs, stzs): + std_tz = 
get_standard_time_zone(tz) + if tz is None: + assert std_tz is None + continue + assert std_tz == stz + + +def test_get_tzname() -> None: + tzs = [ + ZoneInfo("America/New_York"), + ZoneInfo("EST"), + timezone(timedelta(hours=-5)), + None, + ] + etzs = [ + "America/New_York", + "EST", + "UTC-05:00", + "None", + ] + + for tz, etz in zip(tzs, etzs): + name = get_tzname(tz) + assert isinstance(name, str) + assert name == etz From f2d245d359b43dc15da6d6a038cfc0553cac29aa Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Thu, 30 Oct 2025 18:43:00 -0600 Subject: [PATCH 15/27] refactor time util funcs --- src/chronify/time_series_mapper_datetime.py | 17 ++- .../time_series_mapper_representative.py | 2 +- src/chronify/time_utils.py | 113 ++++++++---------- src/chronify/time_zone_converter.py | 4 +- tests/test_mapper_datetime_to_datetime.py | 6 +- tests/test_time_utils.py | 17 +-- 6 files changed, 76 insertions(+), 83 deletions(-) diff --git a/src/chronify/time_series_mapper_datetime.py b/src/chronify/time_series_mapper_datetime.py index 2637fb3..08412ec 100644 --- a/src/chronify/time_series_mapper_datetime.py +++ b/src/chronify/time_series_mapper_datetime.py @@ -110,14 +110,19 @@ def _create_mapping(self) -> tuple[pd.DataFrame, MappingTableSchema]: ser_from = ser_from.dt.tz_convert(fm_tz_std).dt.tz_localize(to_tz) match (self._adjust_interval, self._wrap_time_allowed): case (True, _): - ser = roll_time_interval( - ser_from, - self._from_time_config.interval_type, - self._to_time_config.interval_type, - to_time_data, + ser = pd.Series( + roll_time_interval( + ser_from.tolist(), + self._from_time_config.interval_type, + self._to_time_config.interval_type, + to_time_data, + ), + index=ser_from.index, ) case (False, True): - ser = wrap_timestamps(ser_from, to_time_data) + ser = pd.Series( + wrap_timestamps(ser_from.tolist(), to_time_data), index=ser_from.index + ) case (False, False): ser = ser_from diff --git a/src/chronify/time_series_mapper_representative.py b/src/chronify/time_series_mapper_representative.py index 7670e5b..8a70067 100644 --- a/src/chronify/time_series_mapper_representative.py +++ b/src/chronify/time_series_mapper_representative.py @@ -94,7 +94,7 @@ def _create_mapping(self, is_tz_naive: bool) -> tuple[pd.DataFrame, MappingTable # Mapping works backward for representative time by shifting interval type of # to_time_config to match from_time_config before extracting time info dft[time_col] = shift_time_interval( - dft[to_time_col], + dft[to_time_col].tolist(), self._to_time_config.interval_type, self._from_time_config.interval_type, ) diff --git a/src/chronify/time_utils.py b/src/chronify/time_utils.py index a273a57..b07118e 100644 --- a/src/chronify/time_utils.py +++ b/src/chronify/time_utils.py @@ -1,6 +1,7 @@ """Functions related to time""" import logging +from numpy.typing import NDArray import numpy as np from datetime import datetime, timedelta, timezone, tzinfo from zoneinfo import ZoneInfo, ZoneInfoNotFoundError @@ -29,33 +30,25 @@ def adjust_timestamp_by_dst_offset(timestamp: datetime, resolution: timedelta) - def shift_time_interval( - ser: "pd.Series[pd.Timestamp]", + ts_list: list[datetime], from_interval_type: TimeIntervalType, to_interval_type: TimeIntervalType, -) -> "pd.Series[pd.Timestamp]": +) -> list[datetime]: """Shift pandas timeseries by ONE time interval based on interval type. 
Example: - >>> ser = pd.Series(pd.date_range("2018-12-31 22:00", periods=4, freq="h")) - 0 2018-12-31 22:00:00 - 1 2018-12-31 23:00:00 - 2 2019-01-01 00:00:00 - 3 2019-01-01 01:00:00 - dtype: datetime64[ns] - - >>> ser2 = shift_time_interval( - ... ser, TimeIntervalType.PERIOD_BEGINNING, TimeIntervalType.PERIOD_ENDING + >>> ts_list = pd.date_range("2018-12-31 22:00", periods=4, freq="h").tolist() + [Timestamp('2018-12-31 22:00:00'), Timestamp('2018-12-31 23:00:00'), Timestamp('2019-01-01 00:00:00'), Timestamp('2019-01-01 01:00:00')] + + >>> ts_list2 = shift_time_interval( + ... ts_list, TimeIntervalType.PERIOD_BEGINNING, TimeIntervalType.PERIOD_ENDING ... ) - 0 2018-12-31 23:00:00 - 1 2019-01-01 00:00:00 - 2 2019-01-01 01:00:00 - 3 2019-01-01 02:00:00 - dtype: datetime64[ns] + [Timestamp('2018-12-31 23:00:00'), Timestamp('2019-01-01 00:00:00'), Timestamp('2019-01-01 01:00:00'), Timestamp('2019-01-01 02:00:00')] """ assert ( from_interval_type != to_interval_type ), f"from_ and to_interval_type are the same: {from_interval_type}" - arr = np.sort(ser) + arr: NDArray[np.datetime64] = np.sort(ts_list) # type: ignore freqs = set((np.roll(arr, -1) - arr)[:-1]) assert len(freqs), f"Timeseries has more than one frequency, {freqs}" freq: np.timedelta64 = next(iter(freqs)) @@ -70,80 +63,76 @@ def shift_time_interval( case _: msg = f"Cannot handle from {from_interval_type} to {to_interval_type}" raise InvalidParameter(msg) - return ser + freq * mult + ts_list2 = (arr + freq * mult).tolist() + return ts_list2 # type: ignore def wrap_timestamps( - ser: "pd.Series[pd.Timestamp]", to_timestamps: list[pd.Timestamp] -) -> "pd.Series[pd.Timestamp]": + ts_list: list[datetime], + to_timestamps: list[datetime], +) -> list[datetime]: """Wrap pandas timeseries so it stays within a list of timestamps. 
Example: - >>> ser = pd.Series(pd.date_range("2018-12-31 22:00", periods=4, freq="h")) - 0 2018-12-31 22:00:00 - 1 2018-12-31 23:00:00 - 2 2019-01-01 00:00:00 - 3 2019-01-01 01:00:00 - dtype: datetime64[ns] + >>> ts_list = pd.date_range("2018-12-31 22:00", periods=4, freq="h").tolist() + [Timestamp('2018-12-31 22:00:00'), Timestamp('2018-12-31 23:00:00'), Timestamp('2019-01-01 00:00:00'), Timestamp('2019-01-01 01:00:00')] >>> to_timestamps = pd.date_range("2019-01-01 00:00", periods=4, freq="h").tolist() [Timestamp('2019-01-01 00:00:00'), Timestamp('2019-01-01 01:00:00'), Timestamp('2019-01-01 02:00:00'), Timestamp('2019-01-01 03:00:00')] - >>> ser2 = wrap_timestamps(ser, to_timestamps) - 0 2019-01-01 02:00:00 - 1 2019-01-01 03:00:00 - 2 2019-01-01 00:00:00 - 3 2019-01-01 01:00:00 - dtype: datetime64[ns] + >>> ts_list2 = wrap_timestamps(ts_list, to_timestamps) + [Timestamp('2019-01-01 02:00:00'), Timestamp('2019-01-01 03:00:00'), Timestamp('2019-01-01 00:00:00'), Timestamp('2019-01-01 01:00:00')] """ - arr = np.sort(np.array(to_timestamps)) - freqs = set((np.roll(arr, -1) - arr)[:-1]) + to_arr = np.sort(np.array(to_timestamps)) + freqs = set((np.roll(to_arr, -1) - to_arr)[:-1]) assert len(freqs), f"Timeseries has more than one frequency, {freqs}" freq = next(iter(freqs)) - tmin, tmax = arr[0], arr[-1] + tmin, tmax = to_arr[0], to_arr[-1] tdelta = tmax - tmin + freq - ser2 = ser.copy() - lower_cond = ser < tmin + + arr = pd.Series(ts_list) # np.array is not as robust as pd.Series here + arr2 = arr.copy() + lower_cond = arr < tmin if lower_cond.sum() > 0: - ser2.loc[lower_cond] += tdelta - upper_cond = ser > tmax + arr2.loc[lower_cond] += tdelta + upper_cond = arr > tmax if upper_cond.sum() > 0: - ser2.loc[upper_cond] -= tdelta - return ser2 + arr2.loc[upper_cond] -= tdelta + ts_list2 = arr2.tolist() + return ts_list2 # type: ignore def roll_time_interval( - ser: "pd.Series[pd.Timestamp]", + ts_list: list[datetime], from_interval_type: TimeIntervalType, to_interval_type: TimeIntervalType, - to_timestamps: list[pd.Timestamp], -) -> "pd.Series[pd.Timestamp]": + to_timestamps: list[datetime], +) -> list[datetime]: """Roll pandas timeseries by shifting time interval based on interval type and then wrapping timestamps Example: - >>> ser = pd.Series(pd.date_range("2018-12-31 22:00", periods=4, freq="h")) - 0 2018-12-31 22:00:00 - 1 2018-12-31 23:00:00 - 2 2019-01-01 00:00:00 - 3 2019-01-01 01:00:00 - dtype: datetime64[ns] - - >>> to_timestamps = pd.date_range("2019-01-01 00:00", periods=4, freq="h").tolist() + >>> ts_list = pd.date_range( + ... "2018-12-31 22:00", periods=4, freq="h" + ... ).tolist() # period-beginning + [Timestamp('2018-12-31 22:00:00'), Timestamp('2018-12-31 23:00:00'), Timestamp('2019-01-01 00:00:00'), Timestamp('2019-01-01 01:00:00')] + + >>> to_timestamps = pd.date_range( + ... "2019-01-01 00:00", periods=4, freq="h" + ... ).tolist() # period-ending [Timestamp('2019-01-01 00:00:00'), Timestamp('2019-01-01 01:00:00'), Timestamp('2019-01-01 02:00:00'), Timestamp('2019-01-01 03:00:00')] - >>> ser2 = roll_time_interval( - ... ser, TimeIntervalType.PERIOD_BEGINNING, TimeIntervalType.PERIOD_ENDING, to_timestamps + >>> ts_list2 = roll_time_interval( + ... ts_list, + ... TimeIntervalType.PERIOD_BEGINNING, + ... TimeIntervalType.PERIOD_ENDING, + ... to_timestamps, ... 
) - 0 2019-01-01 03:00:00 - 1 2019-01-01 00:00:00 - 2 2019-01-01 01:00:00 - 3 2019-01-01 02:00:00 - dtype: datetime64[ns] + [Timestamp('2019-01-01 03:00:00'), Timestamp('2019-01-01 00:00:00'), Timestamp('2019-01-01 01:00:00'), Timestamp('2019-01-01 02:00:00')] """ - ser = shift_time_interval(ser, from_interval_type, to_interval_type) - ser = wrap_timestamps(ser, to_timestamps) - return ser + ts_list2 = shift_time_interval(ts_list, from_interval_type, to_interval_type) + ts_list3 = wrap_timestamps(ts_list2, to_timestamps) + return ts_list3 def is_prevailing_time_zone(tz: tzinfo | None) -> bool: diff --git a/src/chronify/time_zone_converter.py b/src/chronify/time_zone_converter.py index ba03098..b0ece3b 100644 --- a/src/chronify/time_zone_converter.py +++ b/src/chronify/time_zone_converter.py @@ -373,9 +373,7 @@ def _create_mapping(self) -> tuple[pd.DataFrame, MappingTableSchema]: if self._wrap_time_allowed: # assume it is being wrapped based on the tz-naive version of the original time data final_time_data = [x.replace(tzinfo=None) for x in from_time_data] - to_time_data = wrap_timestamps( - pd.Series(time_data), pd.Series(final_time_data) - ).tolist() + to_time_data = wrap_timestamps(time_data, final_time_data) else: to_time_data = time_data df_tz.append( diff --git a/tests/test_mapper_datetime_to_datetime.py b/tests/test_mapper_datetime_to_datetime.py index fec6504..9776c87 100644 --- a/tests/test_mapper_datetime_to_datetime.py +++ b/tests/test_mapper_datetime_to_datetime.py @@ -135,18 +135,18 @@ def test_roll_time_using_shift_and_wrap() -> None: data = generate_datetime_data(to_schema.time_config) df["rolled"] = roll_time_interval( - df[from_schema.time_config.time_column], + df[from_schema.time_config.time_column].tolist(), from_schema.time_config.interval_type, to_schema.time_config.interval_type, data, ) df["rolled2"] = shift_time_interval( - df[from_schema.time_config.time_column], + df[from_schema.time_config.time_column].tolist(), from_schema.time_config.interval_type, to_schema.time_config.interval_type, ) df["rolled2"] = wrap_timestamps( - df["rolled2"], + df["rolled2"].tolist(), data, ) assert df["rolled"].equals(df["rolled2"]) diff --git a/tests/test_time_utils.py b/tests/test_time_utils.py index 4594297..4cb4aa6 100644 --- a/tests/test_time_utils.py +++ b/tests/test_time_utils.py @@ -1,7 +1,9 @@ import pytest +import numpy as np import pandas as pd from datetime import datetime, timedelta, timezone from zoneinfo import ZoneInfo + from chronify.time_utils import ( adjust_timestamp_by_dst_offset, shift_time_interval, @@ -26,28 +28,27 @@ def test_adjust_timestamp_by_dst_offset() -> None: def test_shift_time_interval_period_beginning_to_ending() -> None: - ser = pd.Series(pd.date_range("2018-12-31 22:00", periods=4, freq="h")) + ser = pd.date_range("2018-12-31 22:00", periods=4, freq="h").tolist() shifted = shift_time_interval( ser, TimeIntervalType.PERIOD_BEGINNING, TimeIntervalType.PERIOD_ENDING, ) - breakpoint() - assert all(shifted == ser + pd.Timedelta(hours=1)) + assert all(np.array(shifted) == np.array(ser) + pd.Timedelta(hours=1)) def test_shift_time_interval_period_ending_to_beginning() -> None: - ser = pd.Series(pd.date_range("2018-12-31 22:00", periods=4, freq="h")) + ser = pd.date_range("2018-12-31 22:00", periods=4, freq="h").tolist() shifted = shift_time_interval( ser, TimeIntervalType.PERIOD_ENDING, TimeIntervalType.PERIOD_BEGINNING, ) - assert all(shifted == ser - pd.Timedelta(hours=1)) + assert all(np.array(shifted) == np.array(ser) - pd.Timedelta(hours=1)) def 
test_shift_time_interval_invalid() -> None: - ser = pd.Series(pd.date_range("2018-12-31 22:00", periods=4, freq="h")) + ser = pd.date_range("2018-12-31 22:00", periods=4, freq="h").tolist() with pytest.raises(Exception): shift_time_interval( ser, @@ -57,14 +58,14 @@ def test_shift_time_interval_invalid() -> None: def test_wrap_timestamps() -> None: - ser = pd.Series(pd.date_range("2018-12-31 22:00", periods=4, freq="h")) + ser = pd.date_range("2018-12-31 22:00", periods=4, freq="h").tolist() to_timestamps = pd.date_range("2019-01-01 00:00", periods=4, freq="h").tolist() wrapped = wrap_timestamps(ser, to_timestamps) assert set(wrapped) <= set(to_timestamps) def test_roll_time_interval() -> None: - ser = pd.Series(pd.date_range("2018-12-31 22:00", periods=4, freq="h")) + ser = pd.date_range("2018-12-31 22:00", periods=4, freq="h").tolist() to_timestamps = pd.date_range("2019-01-01 00:00", periods=4, freq="h").tolist() rolled = roll_time_interval( ser, From 3d7464a5d84b724f3d074ddbb2bcae3468645657 Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Thu, 30 Oct 2025 22:00:03 -0600 Subject: [PATCH 16/27] Fix pytest issue --- src/chronify/time_configs.py | 11 ++++++----- src/chronify/time_utils.py | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/chronify/time_configs.py b/src/chronify/time_configs.py index 1babd51..e2b862a 100644 --- a/src/chronify/time_configs.py +++ b/src/chronify/time_configs.py @@ -1,5 +1,6 @@ import abc import logging +from zoneinfo import ZoneInfo from datetime import datetime, timedelta, tzinfo from typing import Union, Literal, Optional from pydantic import Field, field_validator @@ -109,7 +110,9 @@ class DatetimeRangeWithTZColumn(DatetimeRangeBase): time_zone_column: str = Field( description="Column in the table that has time zone or offset information." ) - time_zones: list[tzinfo | None] = Field(description="Unique time zones from the table.") + time_zones: list[tzinfo | ZoneInfo | None] = Field( + description="Unique time zones from the table." + ) def get_time_zone_column(self) -> str: return self.time_zone_column @@ -125,8 +128,6 @@ def check_duplicated_time_zones(cls, time_zones: list[tzinfo | None]) -> list[tz raise InvalidValue(msg) return time_zones - # Lixi TODO: ensure table schema has time_zone col? 
- DatetimeRanges = Union[ DatetimeRange, @@ -243,7 +244,7 @@ def get_time_zone_column(self) -> str: return self.time_zone_column def get_time_zones(self) -> list[tzinfo | None]: - return [] # LIXI TODO + return [] # Issue 57 IndexTimeRanges = Union[ @@ -286,7 +287,7 @@ def get_time_zone_column(self) -> str: return self.time_zone_column def get_time_zones(self) -> list[tzinfo | None]: - return [] # LIXI TODO + return [] # Issue 57 class ColumnRepresentativeBase(TimeBaseModel): diff --git a/src/chronify/time_utils.py b/src/chronify/time_utils.py index b07118e..b4a1449 100644 --- a/src/chronify/time_utils.py +++ b/src/chronify/time_utils.py @@ -180,4 +180,4 @@ def get_tzname(tz: tzinfo | None) -> str: if isinstance(tz, ZoneInfo): return tz.key ts = datetime(year=2020, month=1, day=1, tzinfo=tz) - return tz.tzname(ts) # type: ignore # LIXI TODO + return tz.tzname(ts) # type: ignore From df784af3ef46e0bd85d2bea7980344866cb100fd Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Thu, 30 Oct 2025 22:16:23 -0600 Subject: [PATCH 17/27] Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/chronify/time_configs.py | 2 +- src/chronify/time_series_checker.py | 2 +- src/chronify/time_zone_converter.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/chronify/time_configs.py b/src/chronify/time_configs.py index e2b862a..32007da 100644 --- a/src/chronify/time_configs.py +++ b/src/chronify/time_configs.py @@ -124,7 +124,7 @@ def get_time_zones(self) -> list[tzinfo | None]: @classmethod def check_duplicated_time_zones(cls, time_zones: list[tzinfo | None]) -> list[tzinfo | None]: if len(set(time_zones)) < len(time_zones): - msg = ("DatetimeRangeWithTZColumn.time_zones has duplicates: ", time_zones) + msg = f"DatetimeRangeWithTZColumn.time_zones has duplicates: {time_zones}" raise InvalidValue(msg) return time_zones diff --git a/src/chronify/time_series_checker.py b/src/chronify/time_series_checker.py index 8af4517..ce6df20 100644 --- a/src/chronify/time_series_checker.py +++ b/src/chronify/time_series_checker.py @@ -89,7 +89,7 @@ def _check_expected_timestamps_with_external_time_zone(self) -> int: if sorted(expected_dct.keys()) != sorted(actual_dct.keys()): msg = "Time zone records do not match between expected and actual from table " - msg += f"\nexpected: {sorted(expected_dct.keys())} vs. \neactual: {sorted(actual_dct.keys())}" + msg += f"\nexpected: {sorted(expected_dct.keys())} vs. \nactual: {sorted(actual_dct.keys())}" raise InvalidTable(msg) for tz_name in expected_dct.keys(): diff --git a/src/chronify/time_zone_converter.py b/src/chronify/time_zone_converter.py index b0ece3b..45ee0b4 100644 --- a/src/chronify/time_zone_converter.py +++ b/src/chronify/time_zone_converter.py @@ -86,12 +86,12 @@ def convert_time_zone_by_column( sqlalchemy engine metadata sqlalchemy metadata - srd_schema + src_schema Defines the source table in the database. time_zone_column Column name in the source table that contains the time zone information. wrap_time_allowed - If False, the converted timestamps will aligned with the original timestamps in real time scale + If False, the converted timestamps will be aligned with the original timestamps in real time scale E.g. 
2018-01-01 00:00 ~ 2018-12-31 23:00 in US/Eastern becomes 2017-12-31 23:00 ~ 2018-12-31 22:00 in US/Central If True, the converted timestamps will fit into the time range of the src_schema in tz-naive clock time From dda0b781b9dcddb8c087e3c4b6dc29f1382ae034 Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Thu, 30 Oct 2025 22:18:00 -0600 Subject: [PATCH 18/27] fix mypy! mypy mypy attempt 3 fix mypy error again --- src/chronify/csv_time_series_parser.py | 17 ++++++++++++----- src/chronify/time_zone_converter.py | 7 ++++--- src/chronify/utils/sqlalchemy_table.py | 4 ++-- src/chronify/utils/sqlalchemy_view.py | 16 ++++++++++++---- 4 files changed, 30 insertions(+), 14 deletions(-) diff --git a/src/chronify/csv_time_series_parser.py b/src/chronify/csv_time_series_parser.py index 49393be..2dddc88 100644 --- a/src/chronify/csv_time_series_parser.py +++ b/src/chronify/csv_time_series_parser.py @@ -109,12 +109,15 @@ def __init__(self, store: Store) -> None: def _check_input_format(data_file: Path) -> None: valid_extensions = [".csv"] if data_file.suffix not in valid_extensions: - msg = f"{data_file.name} does not have a file extension in the supported extensions: {valid_extensions}" + msg = ( + f"{data_file.name} does not have a file extension in the " + f"supported extensions: {valid_extensions}" + ) raise InvalidValue(msg) @staticmethod def _read_data_file(data_file: Path) -> pd.DataFrame: - return pd.read_csv(data_file, header=0, dtype=COLUMN_DTYPES) + return pd.read_csv(data_file, header=0, dtype=COLUMN_DTYPES) # type: ignore def _ingest_data(self, data: pd.DataFrame, table_name: str, year: int, length: int) -> None: csv_fmt = CsvTimeSeriesFormats.from_columns(data.columns) @@ -130,7 +133,10 @@ def _ingest_data(self, data: pd.DataFrame, table_name: str, year: int, length: i def _create_schemas( csv_fmt: CsvTimeSeriesFormats, name: str, year: int, length: int ) -> tuple[PivotedTableSchema | None, TableSchema]: - """Create a PivotedTableSchema if necessary, and a TableSchema for both the time format and datetime format.""" + """ + Create a PivotedTableSchema if necessary, and a TableSchema for both + the time format and datetime format. 
+ """ create_pivoted_schema = True pivoted_dimension_name = "hour" value_columns = [str(x) for x in range(1, 25)] @@ -166,8 +172,9 @@ def ingest_to_datetime( self, data_file: Path, table_name: str, data_year: int, length: int ) -> None: """ - Given a file of csv time series data, convert the time format to datetime timestamps - and ingest into database + Given a file of csv time series data, convert the time format to datetime + timestamps + and ingest into database """ self._check_input_format(data_file) df = self._read_data_file(data_file) diff --git a/src/chronify/time_zone_converter.py b/src/chronify/time_zone_converter.py index 45ee0b4..e0913d3 100644 --- a/src/chronify/time_zone_converter.py +++ b/src/chronify/time_zone_converter.py @@ -200,13 +200,14 @@ def generate_to_time_config(self) -> DatetimeRangeWithTZColumn: return DatetimeRangeWithTZColumn(**time_kwargs) def generate_to_schema(self) -> TableSchema: + to_time_config = self.generate_to_time_config() id_cols = self._from_schema.time_array_id_columns - if "time_zone" not in id_cols: - id_cols.append("time_zone") + if to_time_config.time_zone_column not in id_cols: + id_cols.append(to_time_config.time_zone_column) to_schema: TableSchema = self._from_schema.model_copy( update={ "name": f"{self._from_schema.name}_tz_converted", - "time_config": self.generate_to_time_config(), + "time_config": to_time_config, "time_array_id_columns": id_cols, } ) diff --git a/src/chronify/utils/sqlalchemy_table.py b/src/chronify/utils/sqlalchemy_table.py index c22c77b..ec7e0b7 100644 --- a/src/chronify/utils/sqlalchemy_table.py +++ b/src/chronify/utils/sqlalchemy_table.py @@ -21,7 +21,7 @@ def __init__(self, name: str) -> None: self.name = name -@compiler.compiles(CreateTable) # type: ignore +@compiler.compiles(CreateTable) def _create_table(element: Any, compiler: Any, **kw: Any) -> str: return "CREATE TABLE %s AS %s" % ( element.name, @@ -29,7 +29,7 @@ def _create_table(element: Any, compiler: Any, **kw: Any) -> str: ) -@compiler.compiles(DropTable) # type: ignore +@compiler.compiles(DropTable) def _drop_table(element: Any, compiler: Any, **kw: Any) -> str: return "DROP TABLE %s" % (element.name) diff --git a/src/chronify/utils/sqlalchemy_view.py b/src/chronify/utils/sqlalchemy_view.py index bd53334..bf4c495 100644 --- a/src/chronify/utils/sqlalchemy_view.py +++ b/src/chronify/utils/sqlalchemy_view.py @@ -20,7 +20,7 @@ def __init__(self, name: str) -> None: self.name = name -@compiler.compiles(CreateView) # type: ignore +@compiler.compiles(CreateView) def _create_view(element: Any, compiler: Any, **kw: Any) -> str: return "CREATE VIEW %s AS %s" % ( element.name, @@ -28,7 +28,7 @@ def _create_view(element: Any, compiler: Any, **kw: Any) -> str: ) -@compiler.compiles(DropView) # type: ignore +@compiler.compiles(DropView) def _drop_view(element: Any, compiler: Any, **kw: Any) -> str: return "DROP VIEW %s" % (element.name) @@ -53,9 +53,17 @@ def create_view( sa.event.listen( metadata, "after_create", - CreateView(name, selectable).execute_if(callable_=_view_doesnt_exist), # type: ignore + CreateView(name, selectable).execute_if( + callable_=_view_doesnt_exist # type: ignore + ), + ) + sa.event.listen( + metadata, + "before_drop", + DropView(name).execute_if( + callable_=_view_exists # type: ignore + ), ) - sa.event.listen(metadata, "before_drop", DropView(name).execute_if(callable_=_view_exists)) # type: ignore metadata.create_all(engine) metadata.reflect(engine, views=True) return view From f95785f27146579de0518d2eabcb214b81cc3a7a Mon Sep 17 
00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Fri, 7 Nov 2025 10:17:16 -0700 Subject: [PATCH 19/27] Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/chronify/csv_time_series_parser.py | 2 +- src/chronify/datetime_range_generator.py | 4 +- src/chronify/time_configs.py | 2 +- src/chronify/time_series_checker.py | 4 +- src/chronify/time_utils.py | 4 +- src/chronify/time_zone_converter.py | 105 ++++++++++++----------- 6 files changed, 62 insertions(+), 59 deletions(-) diff --git a/src/chronify/csv_time_series_parser.py b/src/chronify/csv_time_series_parser.py index 2dddc88..1364b3d 100644 --- a/src/chronify/csv_time_series_parser.py +++ b/src/chronify/csv_time_series_parser.py @@ -174,7 +174,7 @@ def ingest_to_datetime( """ Given a file of csv time series data, convert the time format to datetime timestamps - and ingest into database + and ingest into database """ self._check_input_format(data_file) df = self._read_data_file(data_file) diff --git a/src/chronify/datetime_range_generator.py b/src/chronify/datetime_range_generator.py index 033c2e1..b4d1fcd 100644 --- a/src/chronify/datetime_range_generator.py +++ b/src/chronify/datetime_range_generator.py @@ -111,8 +111,8 @@ def __init__( assert isinstance(self._model, DatetimeRangeWithTZColumn) if self._model.get_time_zones() == []: msg = ( - "DatetimeRangeWithTZColumn.time_zones needs to be instantiated for ", - f"DatetimeRangeGeneratorExternalTimeZone: {self._model}", + f"DatetimeRangeWithTZColumn.time_zones needs to be instantiated for " + f"DatetimeRangeGeneratorExternalTimeZone: {self._model}" ) raise InvalidValue(msg) diff --git a/src/chronify/time_configs.py b/src/chronify/time_configs.py index 32007da..e14709c 100644 --- a/src/chronify/time_configs.py +++ b/src/chronify/time_configs.py @@ -221,7 +221,7 @@ def get_time_zones(self) -> list[tzinfo | None]: class IndexTimeRangeWithTZColumn(IndexTimeRangeBase): - """Index time that reprsents local time relative to a time zone column. + """Index time that represents local time relative to a time zone column. start_timestamp is tz-naive. Used for dataset where the timeseries for all geographies start at the same clock time. diff --git a/src/chronify/time_series_checker.py b/src/chronify/time_series_checker.py index ce6df20..78d5b2c 100644 --- a/src/chronify/time_series_checker.py +++ b/src/chronify/time_series_checker.py @@ -191,7 +191,7 @@ def _check_expected_timestamps_by_time_array( f"{x}={y}" for x, y in zip(self._schema.time_array_id_columns, id_vals) ) msg = ( - f"The count of time values in each time array must be {count}" + f"The count of time values in each time array must be {count}." f"Time array identifiers: {values}. " f"count = {count_by_ta}" ) @@ -220,7 +220,7 @@ def check_timestamp_lists( msg = msg_prefix if not match: if len(actual) != len(expected): - msg = f"Mismatch number of timestamps: actual: {len(actual)} vs. expected: {len(expected)}\n" + msg += f"Mismatch number of timestamps: actual: {len(actual)} vs. expected: {len(expected)}\n" missing = set(expected).difference(set(actual)) extra = set(actual).difference(set(expected)) msg += "Actual timestamps do not match expected timestamps. 
\n" diff --git a/src/chronify/time_utils.py b/src/chronify/time_utils.py index b4a1449..58115e1 100644 --- a/src/chronify/time_utils.py +++ b/src/chronify/time_utils.py @@ -50,7 +50,7 @@ def shift_time_interval( ), f"from_ and to_interval_type are the same: {from_interval_type}" arr: NDArray[np.datetime64] = np.sort(ts_list) # type: ignore freqs = set((np.roll(arr, -1) - arr)[:-1]) - assert len(freqs), f"Timeseries has more than one frequency, {freqs}" + assert len(freqs) == 1, f"Timeseries must have exactly one frequency, found: {freqs}" freq: np.timedelta64 = next(iter(freqs)) match (from_interval_type, to_interval_type): @@ -85,7 +85,7 @@ def wrap_timestamps( """ to_arr = np.sort(np.array(to_timestamps)) freqs = set((np.roll(to_arr, -1) - to_arr)[:-1]) - assert len(freqs), f"Timeseries has more than one frequency, {freqs}" + assert len(freqs) == 1, f"Timeseries must have exactly one frequency, found: {freqs}" freq = next(iter(freqs)) tmin, tmax = to_arr[0], to_arr[-1] tdelta = tmax - tmin + freq diff --git a/src/chronify/time_zone_converter.py b/src/chronify/time_zone_converter.py index e0913d3..a933f78 100644 --- a/src/chronify/time_zone_converter.py +++ b/src/chronify/time_zone_converter.py @@ -34,29 +34,31 @@ def convert_time_zone( check_mapped_timestamps: bool = False, ) -> TableSchema: """Convert time zone of a table to a specified time zone. + Output timestamp is tz-naive with a new time_zone column added. + Parameters - ---------- - engine - sqlalchemy engine - metadata - sqlalchemy metadata - src_schema - Defines the source table in the database. - to_time_zone - time zone to convert to. If None, convert to tz-naive. - scratch_dir - Directory to use for temporary writes. Default to the system's tmp filesystem. - output_file - If set, write the mapped table to this Parquet file. - check_mapped_timestamps - Perform time checks on the result of the mapping operation. This can be slow and - is not required. - - Returns - ------- - dst_schema - schema of output table with converted timestamps + ---------- + engine : sqlalchemy.Engine + SQLAlchemy engine. + metadata : sqlalchemy.MetaData + SQLAlchemy metadata. + src_schema : TableSchema + Defines the source table in the database. + to_time_zone : tzinfo or None + Time zone to convert to. If None, convert to tz-naive. + scratch_dir : pathlib.Path, optional + Directory to use for temporary writes. Defaults to the system's tmp filesystem. + output_file : pathlib.Path, optional + If set, write the mapped table to this Parquet file. + check_mapped_timestamps : bool, optional + Perform time checks on the result of the mapping operation. This can be slow and + is not required. + + Returns + ------- + TableSchema + Schema of output table with converted timestamps. """ TZC = TimeZoneConverter(engine, metadata, src_schema, to_time_zone) TZC.convert_time_zone( @@ -80,37 +82,38 @@ def convert_time_zone_by_column( ) -> TableSchema: """Convert time zone of a table to multiple time zones specified by a column. Output timestamp is tz-naive, reflecting the local time relative to the time_zone_column. + Parameters - ---------- - engine - sqlalchemy engine - metadata - sqlalchemy metadata - src_schema - Defines the source table in the database. - time_zone_column - Column name in the source table that contains the time zone information. - wrap_time_allowed - If False, the converted timestamps will be aligned with the original timestamps in real time scale - E.g. 
2018-01-01 00:00 ~ 2018-12-31 23:00 in US/Eastern becomes - 2017-12-31 23:00 ~ 2018-12-31 22:00 in US/Central - If True, the converted timestamps will fit into the time range of the src_schema in tz-naive clock time - E.g. 2018-01-01 00:00 ~ 2018-12-31 23:00 in US/Eastern becomes - 2017-12-31 23:00 ~ 2018-12-31 22:00 in US/Central, which is then wrapped such that - no clock time timestamps are in 2017. The final timestamps are: - 2018-12-31 23:00, 2018-01-01 00:00 ~ 2018-12-31 22:00 in US/Central - scratch_dir - Directory to use for temporary writes. Default to the system's tmp filesystem. - output_file - If set, write the mapped table to this Parquet file. - check_mapped_timestamps - Perform time checks on the result of the mapping operation. This can be slow and - is not required. - - Returns - ------- - dst_schema - schema of output table with converted timestamps + ---------- + engine : sqlalchemy.Engine + sqlalchemy engine + metadata : sqlalchemy.MetaData + sqlalchemy metadata + src_schema : TableSchema + Defines the source table in the database. + time_zone_column : str + Column name in the source table that contains the time zone information. + wrap_time_allowed : bool + If False, the converted timestamps will be aligned with the original timestamps in real time scale + E.g. 2018-01-01 00:00 ~ 2018-12-31 23:00 in US/Eastern becomes + 2017-12-31 23:00 ~ 2018-12-31 22:00 in US/Central + If True, the converted timestamps will fit into the time range of the src_schema in tz-naive clock time + E.g. 2018-01-01 00:00 ~ 2018-12-31 23:00 in US/Eastern becomes + 2017-12-31 23:00 ~ 2018-12-31 22:00 in US/Central, which is then wrapped such that + no clock time timestamps are in 2017. The final timestamps are: + 2018-12-31 23:00, 2018-01-01 00:00 ~ 2018-12-31 22:00 in US/Central + scratch_dir : pathlib.Path, optional + Directory to use for temporary writes. Default to the system's tmp filesystem. + output_file : pathlib.Path, optional + If set, write the mapped table to this Parquet file. + check_mapped_timestamps : bool, optional + Perform time checks on the result of the mapping operation. This can be slow and + is not required. 
+ + Returns + ------- + dst_schema : TableSchema + schema of output table with converted timestamps """ TZC = TimeZoneConverterByColumn( engine, metadata, src_schema, time_zone_column, wrap_time_allowed From 9ac6e698ab86c0842e2c6971a19f17b3536ba931 Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Fri, 7 Nov 2025 10:17:34 -0700 Subject: [PATCH 20/27] Apply code review suggestions 2 --- src/chronify/store.py | 4 ++-- tests/test_time_zone_converter.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/chronify/store.py b/src/chronify/store.py index c34d34d..7b15e48 100644 --- a/src/chronify/store.py +++ b/src/chronify/store.py @@ -3,7 +3,7 @@ import shutil from typing import Any, Optional from chronify.utils.sql import make_temp_view_name -from zoneinfo import ZoneInfo +from datetime import tzinfo import duckdb import pandas as pd @@ -947,7 +947,7 @@ def map_table_time_config( def convert_time_zone( self, src_name: str, - time_zone: ZoneInfo, + time_zone: tzinfo | None, scratch_dir: Optional[Path] = None, output_file: Optional[Path] = None, check_mapped_timestamps: bool = False, diff --git a/tests/test_time_zone_converter.py b/tests/test_time_zone_converter.py index e36c7e9..85c51a6 100644 --- a/tests/test_time_zone_converter.py +++ b/tests/test_time_zone_converter.py @@ -192,7 +192,7 @@ def test_src_table_no_time_zone(iter_engines: Engine) -> None: @pytest.mark.parametrize( "to_time_zone", [None, ZoneInfo("US/Central"), ZoneInfo("America/Los_Angeles")] ) -def test_time_conversion(iter_engines: Engine, to_time_zone: ZoneInfo) -> None: +def test_time_conversion(iter_engines: Engine, to_time_zone: tzinfo | None) -> None: from_schema = get_datetime_schema( 2018, ZoneInfo("US/Mountain"), TimeIntervalType.PERIOD_BEGINNING, "base_table" ) From 191eacb53783a6746717f8f5f9e59a0ec59415da Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Fri, 7 Nov 2025 17:23:07 -0700 Subject: [PATCH 21/27] review suggestions --- src/chronify/datetime_range_generator.py | 25 +++++------- src/chronify/store.py | 12 +++--- src/chronify/time.py | 2 +- src/chronify/time_range_generator_base.py | 2 +- src/chronify/time_series_checker.py | 50 +++++++++++++---------- src/chronify/time_utils.py | 45 ++++++++++---------- src/chronify/time_zone_converter.py | 19 +++++---- tests/test_store.py | 1 + tests/test_time_zone_converter.py | 8 ++-- 9 files changed, 87 insertions(+), 77 deletions(-) diff --git a/src/chronify/datetime_range_generator.py b/src/chronify/datetime_range_generator.py index b4d1fcd..b030e68 100644 --- a/src/chronify/datetime_range_generator.py +++ b/src/chronify/datetime_range_generator.py @@ -1,6 +1,7 @@ -from datetime import datetime, timedelta, tzinfo +from datetime import datetime, tzinfo from typing import Generator, Optional from zoneinfo import ZoneInfo +from itertools import chain import pandas as pd @@ -47,7 +48,7 @@ def _iter_timestamps( ) is_leap_year = ( - pd.Timestamp(f"{cur.year}-01-01") + timedelta(days=365) + pd.Timestamp(f"{cur.year}-01-01") + pd.Timedelta(days=365) ).year == cur.year if not is_leap_year: yield pd.Timestamp(cur) @@ -73,7 +74,7 @@ def _iter_timestamps( def list_time_columns(self) -> list[str]: return self._model.list_time_columns() - def list_distinct_timestamps_from_dataframe(self, df: pd.DataFrame) -> list[datetime]: # TODO + def list_distinct_timestamps_from_dataframe(self, df: pd.DataFrame) -> list[datetime]: result = sorted(df[self._model.time_column].unique()) if not 
isinstance(result[0], datetime): result = [pd.Timestamp(x) for x in result] @@ -132,22 +133,16 @@ def _list_timestamps(self, time_zone: Optional[tzinfo]) -> list[datetime]: return [x.replace(tzinfo=None) for x in timestamps] def list_timestamps(self) -> list[datetime]: - """return only unique values, this means no duplicates for prevailing time""" - ts_set = set() - for tz in self._model.get_time_zones(): - ts_set.update(set(self._list_timestamps(tz))) - timestamps = sorted(ts_set) - return timestamps + """return ordered timestamps across all time zones in the order of the time zones.""" + dct = self.list_timestamps_by_time_zone() + return list(chain(*dct.values())) - def list_timestamps_by_time_zone(self, distinct: bool = False) -> dict[str, list[datetime]]: - """for each time zone, returns full timestamp iteration with duplicates allowed""" + def list_timestamps_by_time_zone(self) -> dict[str, list[datetime]]: + """for each time zone, returns full timestamp iteration (duplicates allowed)""" dct = {} for tz in self._model.get_time_zones(): - timestamps = self._list_timestamps(tz) - if distinct: - timestamps = sorted(set(timestamps)) tz_name = get_tzname(tz) - dct[tz_name] = timestamps + dct[tz_name] = self._list_timestamps(tz) return dct diff --git a/src/chronify/store.py b/src/chronify/store.py index 7b15e48..dec92f6 100644 --- a/src/chronify/store.py +++ b/src/chronify/store.py @@ -1011,14 +1011,14 @@ def convert_time_zone( """ src_schema = self._schema_mgr.get_schema(src_name) - TZC = TimeZoneConverter(self._engine, self._metadata, src_schema, time_zone) + tzc = TimeZoneConverter(self._engine, self._metadata, src_schema, time_zone) - dst_schema = TZC.generate_to_schema() + dst_schema = tzc.generate_to_schema() if self.has_table(dst_schema.name): msg = dst_schema.name raise TableAlreadyExists(msg) - TZC.convert_time_zone( + tzc.convert_time_zone( scratch_dir=scratch_dir, output_file=output_file, check_mapped_timestamps=check_mapped_timestamps, @@ -1104,16 +1104,16 @@ def convert_time_zone_by_column( """ src_schema = self._schema_mgr.get_schema(src_name) - TZC = TimeZoneConverterByColumn( + tzc = TimeZoneConverterByColumn( self._engine, self._metadata, src_schema, time_zone_column, wrap_time_allowed ) - dst_schema = TZC.generate_to_schema() + dst_schema = tzc.generate_to_schema() if self.has_table(dst_schema.name): msg = dst_schema.name raise TableAlreadyExists(msg) - TZC.convert_time_zone( + tzc.convert_time_zone( scratch_dir=scratch_dir, output_file=output_file, check_mapped_timestamps=check_mapped_timestamps, diff --git a/src/chronify/time.py b/src/chronify/time.py index e78f508..08651ef 100644 --- a/src/chronify/time.py +++ b/src/chronify/time.py @@ -15,7 +15,7 @@ class TimeType(StrEnum): ANNUAL = "annual" INDEX_NTZ = "index_ntz" INDEX_TZ = "index_tz" - INDEX_TZ_COL = "INDEX_TZ_COL" + INDEX_TZ_COL = "index_tz_col" REPRESENTATIVE_PERIOD_NTZ = "representative_period_ntz" REPRESENTATIVE_PERIOD_TZ = "representative_period_tz" YEAR_MONTH_DAY_HOUR_NTZ = "year_month_day_hour" diff --git a/src/chronify/time_range_generator_base.py b/src/chronify/time_range_generator_base.py index e845dff..a009973 100644 --- a/src/chronify/time_range_generator_base.py +++ b/src/chronify/time_range_generator_base.py @@ -12,7 +12,7 @@ def list_timestamps(self) -> list[Any]: """Return a list of timestamps for a time range. Type of the timestamps depends on the class. 
Note: For DatetimeRangeGeneratorExternalTimeZone class with more than one time zone, - this shows distinct timestamps only + this shows all timestamps across all time zones in the order of the time zones. Returns ------- diff --git a/src/chronify/time_series_checker.py b/src/chronify/time_series_checker.py index 78d5b2c..414ab6c 100644 --- a/src/chronify/time_series_checker.py +++ b/src/chronify/time_series_checker.py @@ -44,18 +44,9 @@ def __init__( ) def check_timestamps(self) -> None: - preserve_duplicates = False - if isinstance(self._time_generator, DatetimeRangeGeneratorExternalTimeZone): - count = self._check_expected_timestamps_with_external_time_zone() - if self._has_prevailing_time_zone(self._schema.time_config.get_time_zones()): - preserve_duplicates = True - else: - count = self._check_expected_timestamps() - + count = self._check_expected_timestamps() self._check_null_consistency() - self._check_expected_timestamps_by_time_array( - count, preserve_duplicates=preserve_duplicates - ) + self._check_expected_timestamps_by_time_array(count) @staticmethod def _has_prevailing_time_zone(lst: list[tzinfo | None]) -> bool: @@ -65,6 +56,13 @@ def _has_prevailing_time_zone(lst: list[tzinfo | None]) -> bool: return False def _check_expected_timestamps(self) -> int: + """Check that the timestamps in the table match the expected timestamps.""" + if isinstance(self._time_generator, DatetimeRangeGeneratorExternalTimeZone): + return self._check_expected_timestamps_with_external_time_zone() + return self._check_expected_timestamps_datetime() + + def _check_expected_timestamps_datetime(self) -> int: + """For tz-naive or tz-aware time without external time zone column""" expected = self._time_generator.list_timestamps() time_columns = self._time_generator.list_time_columns() stmt = select(*(self._table.c[x] for x in time_columns)).distinct() @@ -72,14 +70,16 @@ def _check_expected_timestamps(self) -> int: stmt = stmt.where(self._table.c[col].is_not(None)) df = read_database(stmt, self._conn, self._schema.time_config) actual = self._time_generator.list_distinct_timestamps_from_dataframe(df) + expected = sorted(set(expected)) # drop duplicates for tz-naive prevailing time check_timestamp_lists(actual, expected) return len(expected) def _check_expected_timestamps_with_external_time_zone(self) -> int: - assert isinstance(self._time_generator, DatetimeRangeGeneratorExternalTimeZone) + """For tz-naive time with external time zone column""" + assert isinstance(self._time_generator, DatetimeRangeGeneratorExternalTimeZone) # for mypy expected_dct = self._time_generator.list_timestamps_by_time_zone() time_columns = self._time_generator.list_time_columns() - assert isinstance(self._schema.time_config, DatetimeRangeWithTZColumn) + assert isinstance(self._schema.time_config, DatetimeRangeWithTZColumn) # for mypy time_columns.append(self._schema.time_config.get_time_zone_column()) stmt = select(*(self._table.c[x] for x in time_columns)).distinct() for col in time_columns: @@ -88,12 +88,14 @@ def _check_expected_timestamps_with_external_time_zone(self) -> int: actual_dct = self._time_generator.list_distinct_timestamps_by_time_zone_from_dataframe(df) if sorted(expected_dct.keys()) != sorted(actual_dct.keys()): - msg = "Time zone records do not match between expected and actual from table " - msg += f"\nexpected: {sorted(expected_dct.keys())} vs. 
\nactual: {sorted(actual_dct.keys())}" + msg = ( + "Time zone records do not match between expected and actual from table " + f"\nexpected: {sorted(expected_dct.keys())} vs. \nactual: {sorted(actual_dct.keys())}" + ) raise InvalidTable(msg) for tz_name in expected_dct.keys(): - # this drops duplicates in tz-naive prevailing time + # drops duplicates for tz-naive prevailing time expected = sorted(set(expected_dct[tz_name])) actual = actual_dct[tz_name] check_timestamp_lists(actual, expected, msg_prefix=f"For {tz_name}\n") @@ -123,9 +125,15 @@ def _check_null_consistency(self) -> None: ) raise InvalidTable(msg) - def _check_expected_timestamps_by_time_array( - self, count: int, preserve_duplicates: bool = False - ) -> None: + def _check_expected_timestamps_by_time_array(self, count: int) -> None: + if isinstance( + self._time_generator, DatetimeRangeGeneratorExternalTimeZone + ) and self._has_prevailing_time_zone(self._schema.time_config.get_time_zones()): + # cannot check counts by timestamps when tz-naive prevailing time zones are present + has_tz_naive_prevailing = True + else: + has_tz_naive_prevailing = False + id_cols = ",".join(self._schema.time_array_id_columns) time_cols = ",".join(self._schema.time_config.list_time_columns()) # NULL consistency was checked above. @@ -185,7 +193,7 @@ def _check_expected_timestamps_by_time_array( distinct_count_by_ta = result[0] count_by_ta = result[1] - if preserve_duplicates and not count_by_ta == count: + if has_tz_naive_prevailing and not count_by_ta == count: id_vals = result[2:] values = ", ".join( f"{x}={y}" for x, y in zip(self._schema.time_array_id_columns, id_vals) @@ -197,7 +205,7 @@ def _check_expected_timestamps_by_time_array( ) raise InvalidTable(msg) - if not preserve_duplicates and not count_by_ta == count == distinct_count_by_ta: + if not has_tz_naive_prevailing and not count_by_ta == count == distinct_count_by_ta: id_vals = result[2:] values = ", ".join( f"{x}={y}" for x, y in zip(self._schema.time_array_id_columns, id_vals) diff --git a/src/chronify/time_utils.py b/src/chronify/time_utils.py index 58115e1..3485236 100644 --- a/src/chronify/time_utils.py +++ b/src/chronify/time_utils.py @@ -34,16 +34,21 @@ def shift_time_interval( from_interval_type: TimeIntervalType, to_interval_type: TimeIntervalType, ) -> list[datetime]: - """Shift pandas timeseries by ONE time interval based on interval type. + """Shift ts_list by ONE time interval based on interval type. Example: - >>> ts_list = pd.date_range("2018-12-31 22:00", periods=4, freq="h").tolist() - [Timestamp('2018-12-31 22:00:00'), Timestamp('2018-12-31 23:00:00'), Timestamp('2019-01-01 00:00:00'), Timestamp('2019-01-01 01:00:00')] + >>> ts_list = pd.date_range("2018-12-31 23:00", periods=3, freq="h").tolist() + [Timestamp('2018-12-31 23:00:00'), Timestamp('2019-01-01 00:00:00'), Timestamp('2019-01-01 01:00:00')] >>> ts_list2 = shift_time_interval( ... ts_list, TimeIntervalType.PERIOD_BEGINNING, TimeIntervalType.PERIOD_ENDING ... ) - [Timestamp('2018-12-31 23:00:00'), Timestamp('2019-01-01 00:00:00'), Timestamp('2019-01-01 01:00:00'), Timestamp('2019-01-01 02:00:00')] + [Timestamp('2019-01-01 00:00:00'), Timestamp('2019-01-01 01:00:00'), Timestamp('2019-01-01 02:00:00')] + + >>> ts_list2 = shift_time_interval( + ... ts_list, TimeIntervalType.PERIOD_ENDING, TimeIntervalType.PERIOD_BEGINNING + ... 
) + [Timestamp('2018-12-31 22:00:00'), Timestamp('2018-12-31 23:00:00'), Timestamp('2019-01-01 00:00:00')] """ assert ( from_interval_type != to_interval_type @@ -71,17 +76,17 @@ def wrap_timestamps( ts_list: list[datetime], to_timestamps: list[datetime], ) -> list[datetime]: - """Wrap pandas timeseries so it stays within a list of timestamps. + """Returns the replacement timestamps in order to wrap the ts_list into the to_timestamps range. Example: - >>> ts_list = pd.date_range("2018-12-31 22:00", periods=4, freq="h").tolist() - [Timestamp('2018-12-31 22:00:00'), Timestamp('2018-12-31 23:00:00'), Timestamp('2019-01-01 00:00:00'), Timestamp('2019-01-01 01:00:00')] + >>> ts_list = pd.date_range("2018-12-31 23:00", periods=3, freq="h").tolist() + [Timestamp('2018-12-31 23:00:00'), Timestamp('2019-01-01 00:00:00'), Timestamp('2019-01-01 01:00:00')] - >>> to_timestamps = pd.date_range("2019-01-01 00:00", periods=4, freq="h").tolist() - [Timestamp('2019-01-01 00:00:00'), Timestamp('2019-01-01 01:00:00'), Timestamp('2019-01-01 02:00:00'), Timestamp('2019-01-01 03:00:00')] + >>> to_timestamps = pd.date_range("2019-01-01 00:00", periods=3, freq="h").tolist() + [Timestamp('2019-01-01 00:00:00'), Timestamp('2019-01-01 01:00:00'), Timestamp('2019-01-01 02:00:00')] >>> ts_list2 = wrap_timestamps(ts_list, to_timestamps) - [Timestamp('2019-01-01 02:00:00'), Timestamp('2019-01-01 03:00:00'), Timestamp('2019-01-01 00:00:00'), Timestamp('2019-01-01 01:00:00')] + [Timestamp('2019-01-01 02:00:00'), Timestamp('2019-01-01 00:00:00'), Timestamp('2019-01-01 01:00:00')] """ to_arr = np.sort(np.array(to_timestamps)) freqs = set((np.roll(to_arr, -1) - to_arr)[:-1]) @@ -108,27 +113,25 @@ def roll_time_interval( to_interval_type: TimeIntervalType, to_timestamps: list[datetime], ) -> list[datetime]: - """Roll pandas timeseries by shifting time interval based on interval type and then - wrapping timestamps + """Roll ts_list by shifting time interval based on interval type and then + wrapping timestamps according to to_timestamps. Example: - >>> ts_list = pd.date_range( - ... "2018-12-31 22:00", periods=4, freq="h" - ... ).tolist() # period-beginning - [Timestamp('2018-12-31 22:00:00'), Timestamp('2018-12-31 23:00:00'), Timestamp('2019-01-01 00:00:00'), Timestamp('2019-01-01 01:00:00')] + >>> ts_list = pd.date_range("2019-01-01 00:00", periods=3, freq="h").tolist() # period-ending + [Timestamp('2018-12-31 23:00:00'), Timestamp('2019-01-01 00:00:00'), Timestamp('2019-01-01 01:00:00')] >>> to_timestamps = pd.date_range( - ... "2019-01-01 00:00", periods=4, freq="h" - ... ).tolist() # period-ending - [Timestamp('2019-01-01 00:00:00'), Timestamp('2019-01-01 01:00:00'), Timestamp('2019-01-01 02:00:00'), Timestamp('2019-01-01 03:00:00')] + ... "2019-01-01 00:00", periods=3, freq="h" + ... ).tolist() # period-beginning + [Timestamp('2019-01-01 00:00:00'), Timestamp('2019-01-01 01:00:00'), Timestamp('2019-01-01 02:00:00')] >>> ts_list2 = roll_time_interval( ... ts_list, - ... TimeIntervalType.PERIOD_BEGINNING, ... TimeIntervalType.PERIOD_ENDING, + ... TimeIntervalType.PERIOD_BEGINNING, ... to_timestamps, ... 
) - [Timestamp('2019-01-01 03:00:00'), Timestamp('2019-01-01 00:00:00'), Timestamp('2019-01-01 01:00:00'), Timestamp('2019-01-01 02:00:00')] + [Timestamp('2019-01-01 02:00:00'), Timestamp('2019-01-01 00:00:00'), Timestamp('2019-01-01 01:00:00')] """ ts_list2 = shift_time_interval(ts_list, from_interval_type, to_interval_type) ts_list3 = wrap_timestamps(ts_list2, to_timestamps) diff --git a/src/chronify/time_zone_converter.py b/src/chronify/time_zone_converter.py index a933f78..ba98614 100644 --- a/src/chronify/time_zone_converter.py +++ b/src/chronify/time_zone_converter.py @@ -60,14 +60,14 @@ def convert_time_zone( TableSchema Schema of output table with converted timestamps. """ - TZC = TimeZoneConverter(engine, metadata, src_schema, to_time_zone) - TZC.convert_time_zone( + tzc = TimeZoneConverter(engine, metadata, src_schema, to_time_zone) + tzc.convert_time_zone( scratch_dir=scratch_dir, output_file=output_file, check_mapped_timestamps=check_mapped_timestamps, ) - return TZC._to_schema + return tzc._to_schema def convert_time_zone_by_column( @@ -115,15 +115,15 @@ def convert_time_zone_by_column( dst_schema : TableSchema schema of output table with converted timestamps """ - TZC = TimeZoneConverterByColumn( + tzc = TimeZoneConverterByColumn( engine, metadata, src_schema, time_zone_column, wrap_time_allowed ) - TZC.convert_time_zone( + tzc.convert_time_zone( scratch_dir=scratch_dir, output_file=output_file, check_mapped_timestamps=check_mapped_timestamps, ) - return TZC._to_schema + return tzc._to_schema class TimeZoneConverterBase(abc.ABC): @@ -148,8 +148,11 @@ def _check_from_schema(self, from_schema: TableSchema) -> None: isinstance(from_schema.time_config, DatetimeRange) and from_schema.time_config.start_time_is_tz_naive() ): - msg += "Source schema start_time must be timezone-aware. " - msg += "To convert from timezone-naive to timezone-aware, use the TimeSeriesMapperDatetime.map_time() method instead. " + msg += ( + "Source schema start_time must be timezone-aware. " + "To convert from timezone-naive to timezone-aware, " + "use the TimeSeriesMapperDatetime.map_time() method instead. 
" + ) if msg != "": raise InvalidParameter(msg) diff --git a/tests/test_store.py b/tests/test_store.py index fc9b566..e9204a9 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -854,6 +854,7 @@ def test_convert_time_zone( assert dst_schema.time_config.start == expected_start assert pd.Timestamp(actual[0]) == dst_schema.time_config.start expected = make_time_range_generator(dst_schema.time_config).list_timestamps() + expected = sorted(set(expected)) check_timestamp_lists(actual, expected) diff --git a/tests/test_time_zone_converter.py b/tests/test_time_zone_converter.py index 85c51a6..3b0230a 100644 --- a/tests/test_time_zone_converter.py +++ b/tests/test_time_zone_converter.py @@ -171,13 +171,13 @@ def run_conversion_with_error( ingest_data(engine, metadata, df, from_schema) with pytest.raises(error[0], match=error[1]): if use_tz_col: - TZC = TimeZoneConverterByColumn( + tzc = TimeZoneConverterByColumn( engine, metadata, from_schema, "time_zone", wrap_time_allowed=False ) - TZC.convert_time_zone(check_mapped_timestamps=True) + tzc.convert_time_zone(check_mapped_timestamps=True) else: - TZC2 = TimeZoneConverter(engine, metadata, from_schema, None) - TZC2.convert_time_zone(check_mapped_timestamps=True) + tzc2 = TimeZoneConverter(engine, metadata, from_schema, None) + tzc2.convert_time_zone(check_mapped_timestamps=True) def test_src_table_no_time_zone(iter_engines: Engine) -> None: From 72c886c457cc3359f5403d3f7deb8af7dd4e56fb Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Thu, 13 Nov 2025 13:54:48 -0700 Subject: [PATCH 22/27] address comments v3 --- src/chronify/time_series_checker.py | 6 ++++- src/chronify/time_series_mapper_datetime.py | 10 ++++--- .../time_series_mapper_representative.py | 4 +-- src/chronify/time_utils.py | 18 ++++++------- src/chronify/time_zone_converter.py | 4 +-- tests/test_mapper_datetime_to_datetime.py | 12 ++++++--- tests/test_time_utils.py | 26 +++++++++---------- 7 files changed, 46 insertions(+), 34 deletions(-) diff --git a/src/chronify/time_series_checker.py b/src/chronify/time_series_checker.py index 414ab6c..17c94c6 100644 --- a/src/chronify/time_series_checker.py +++ b/src/chronify/time_series_checker.py @@ -94,13 +94,17 @@ def _check_expected_timestamps_with_external_time_zone(self) -> int: ) raise InvalidTable(msg) + assert len(expected_dct) > 0 # for mypy + count = set() for tz_name in expected_dct.keys(): + count.add(len(expected_dct[tz_name])) # drops duplicates for tz-naive prevailing time expected = sorted(set(expected_dct[tz_name])) actual = actual_dct[tz_name] check_timestamp_lists(actual, expected, msg_prefix=f"For {tz_name}\n") # return len by preserving duplicates for tz-naive prevailing time - return len(expected_dct[tz_name]) + assert len(count) == 1, "Mismatch in counts among time zones" + return count.pop() def _check_null_consistency(self) -> None: # If any time column has a NULL, all time columns must have a NULL. 
diff --git a/src/chronify/time_series_mapper_datetime.py b/src/chronify/time_series_mapper_datetime.py index 08412ec..2d70551 100644 --- a/src/chronify/time_series_mapper_datetime.py +++ b/src/chronify/time_series_mapper_datetime.py @@ -10,7 +10,11 @@ from chronify.time_series_mapper_base import TimeSeriesMapperBase, apply_mapping from chronify.time_configs import DatetimeRange, TimeBasedDataAdjustment from chronify.time_range_generator_factory import make_time_range_generator -from chronify.time_utils import roll_time_interval, wrap_timestamps, get_standard_time_zone +from chronify.time_utils import ( + rolled_interval_timestamps, + wrapped_time_timestamps, + get_standard_time_zone, +) logger = logging.getLogger(__name__) @@ -111,7 +115,7 @@ def _create_mapping(self) -> tuple[pd.DataFrame, MappingTableSchema]: match (self._adjust_interval, self._wrap_time_allowed): case (True, _): ser = pd.Series( - roll_time_interval( + rolled_interval_timestamps( ser_from.tolist(), self._from_time_config.interval_type, self._to_time_config.interval_type, @@ -121,7 +125,7 @@ def _create_mapping(self) -> tuple[pd.DataFrame, MappingTableSchema]: ) case (False, True): ser = pd.Series( - wrap_timestamps(ser_from.tolist(), to_time_data), index=ser_from.index + wrapped_time_timestamps(ser_from.tolist(), to_time_data), index=ser_from.index ) case (False, False): ser = ser_from diff --git a/src/chronify/time_series_mapper_representative.py b/src/chronify/time_series_mapper_representative.py index 8a70067..9c545cd 100644 --- a/src/chronify/time_series_mapper_representative.py +++ b/src/chronify/time_series_mapper_representative.py @@ -19,7 +19,7 @@ RepresentativePeriodTimeBase, TimeBasedDataAdjustment, ) -from chronify.time_utils import shift_time_interval +from chronify.time_utils import shifted_interval_timestamps logger = logging.getLogger(__name__) @@ -93,7 +93,7 @@ def _create_mapping(self, is_tz_naive: bool) -> tuple[pd.DataFrame, MappingTable time_col = "to_" + to_time_col # Mapping works backward for representative time by shifting interval type of # to_time_config to match from_time_config before extracting time info - dft[time_col] = shift_time_interval( + dft[time_col] = shifted_interval_timestamps( dft[to_time_col].tolist(), self._to_time_config.interval_type, self._from_time_config.interval_type, diff --git a/src/chronify/time_utils.py b/src/chronify/time_utils.py index 3485236..0744c46 100644 --- a/src/chronify/time_utils.py +++ b/src/chronify/time_utils.py @@ -29,7 +29,7 @@ def adjust_timestamp_by_dst_offset(timestamp: datetime, resolution: timedelta) - return timestamp - offset -def shift_time_interval( +def shifted_interval_timestamps( ts_list: list[datetime], from_interval_type: TimeIntervalType, to_interval_type: TimeIntervalType, @@ -40,12 +40,12 @@ def shift_time_interval( >>> ts_list = pd.date_range("2018-12-31 23:00", periods=3, freq="h").tolist() [Timestamp('2018-12-31 23:00:00'), Timestamp('2019-01-01 00:00:00'), Timestamp('2019-01-01 01:00:00')] - >>> ts_list2 = shift_time_interval( + >>> ts_list2 = shifted_interval_timestamps( ... ts_list, TimeIntervalType.PERIOD_BEGINNING, TimeIntervalType.PERIOD_ENDING ... ) [Timestamp('2019-01-01 00:00:00'), Timestamp('2019-01-01 01:00:00'), Timestamp('2019-01-01 02:00:00')] - >>> ts_list2 = shift_time_interval( + >>> ts_list2 = shifted_interval_timestamps( ... ts_list, TimeIntervalType.PERIOD_ENDING, TimeIntervalType.PERIOD_BEGINNING ... 
) [Timestamp('2018-12-31 22:00:00'), Timestamp('2018-12-31 23:00:00'), Timestamp('2019-01-01 00:00:00')] @@ -72,7 +72,7 @@ def shift_time_interval( return ts_list2 # type: ignore -def wrap_timestamps( +def wrapped_time_timestamps( ts_list: list[datetime], to_timestamps: list[datetime], ) -> list[datetime]: @@ -85,7 +85,7 @@ def wrap_timestamps( >>> to_timestamps = pd.date_range("2019-01-01 00:00", periods=3, freq="h").tolist() [Timestamp('2019-01-01 00:00:00'), Timestamp('2019-01-01 01:00:00'), Timestamp('2019-01-01 02:00:00')] - >>> ts_list2 = wrap_timestamps(ts_list, to_timestamps) + >>> ts_list2 = wrapped_time_timestamps(ts_list, to_timestamps) [Timestamp('2019-01-01 02:00:00'), Timestamp('2019-01-01 00:00:00'), Timestamp('2019-01-01 01:00:00')] """ to_arr = np.sort(np.array(to_timestamps)) @@ -107,7 +107,7 @@ def wrap_timestamps( return ts_list2 # type: ignore -def roll_time_interval( +def rolled_interval_timestamps( ts_list: list[datetime], from_interval_type: TimeIntervalType, to_interval_type: TimeIntervalType, @@ -125,7 +125,7 @@ def roll_time_interval( ... ).tolist() # period-beginning [Timestamp('2019-01-01 00:00:00'), Timestamp('2019-01-01 01:00:00'), Timestamp('2019-01-01 02:00:00')] - >>> ts_list2 = roll_time_interval( + >>> ts_list2 = rolled_interval_timestamps( ... ts_list, ... TimeIntervalType.PERIOD_ENDING, ... TimeIntervalType.PERIOD_BEGINNING, @@ -133,8 +133,8 @@ def roll_time_interval( ... ) [Timestamp('2019-01-01 02:00:00'), Timestamp('2019-01-01 00:00:00'), Timestamp('2019-01-01 01:00:00')] """ - ts_list2 = shift_time_interval(ts_list, from_interval_type, to_interval_type) - ts_list3 = wrap_timestamps(ts_list2, to_timestamps) + ts_list2 = shifted_interval_timestamps(ts_list, from_interval_type, to_interval_type) + ts_list3 = wrapped_time_timestamps(ts_list2, to_timestamps) return ts_list3 diff --git a/src/chronify/time_zone_converter.py b/src/chronify/time_zone_converter.py index ba98614..115c112 100644 --- a/src/chronify/time_zone_converter.py +++ b/src/chronify/time_zone_converter.py @@ -21,7 +21,7 @@ from chronify.time_range_generator_factory import make_time_range_generator from chronify.sqlalchemy.functions import read_database from chronify.time import TimeType -from chronify.time_utils import wrap_timestamps, get_tzname +from chronify.time_utils import wrapped_time_timestamps, get_tzname def convert_time_zone( @@ -380,7 +380,7 @@ def _create_mapping(self) -> tuple[pd.DataFrame, MappingTableSchema]: if self._wrap_time_allowed: # assume it is being wrapped based on the tz-naive version of the original time data final_time_data = [x.replace(tzinfo=None) for x in from_time_data] - to_time_data = wrap_timestamps(time_data, final_time_data) + to_time_data = wrapped_time_timestamps(time_data, final_time_data) else: to_time_data = time_data df_tz.append( diff --git a/tests/test_mapper_datetime_to_datetime.py b/tests/test_mapper_datetime_to_datetime.py index 9776c87..3d144e9 100644 --- a/tests/test_mapper_datetime_to_datetime.py +++ b/tests/test_mapper_datetime_to_datetime.py @@ -15,7 +15,11 @@ from chronify.time import TimeIntervalType, MeasurementType from chronify.exceptions import ConflictingInputsError, InvalidParameter from chronify.datetime_range_generator import DatetimeRangeGenerator -from chronify.time_utils import shift_time_interval, roll_time_interval, wrap_timestamps +from chronify.time_utils import ( + shifted_interval_timestamps, + rolled_interval_timestamps, + wrapped_time_timestamps, +) def generate_datetime_data(time_config: DatetimeRange) -> 
pd.Series: # type: ignore @@ -134,18 +138,18 @@ def test_roll_time_using_shift_and_wrap() -> None: to_schema = get_datetime_schema(2024, None, TimeIntervalType.PERIOD_BEGINNING, "to_table") data = generate_datetime_data(to_schema.time_config) - df["rolled"] = roll_time_interval( + df["rolled"] = rolled_interval_timestamps( df[from_schema.time_config.time_column].tolist(), from_schema.time_config.interval_type, to_schema.time_config.interval_type, data, ) - df["rolled2"] = shift_time_interval( + df["rolled2"] = shifted_interval_timestamps( df[from_schema.time_config.time_column].tolist(), from_schema.time_config.interval_type, to_schema.time_config.interval_type, ) - df["rolled2"] = wrap_timestamps( + df["rolled2"] = wrapped_time_timestamps( df["rolled2"].tolist(), data, ) diff --git a/tests/test_time_utils.py b/tests/test_time_utils.py index 4cb4aa6..ed52ad1 100644 --- a/tests/test_time_utils.py +++ b/tests/test_time_utils.py @@ -6,9 +6,9 @@ from chronify.time_utils import ( adjust_timestamp_by_dst_offset, - shift_time_interval, - wrap_timestamps, - roll_time_interval, + shifted_interval_timestamps, + wrapped_time_timestamps, + rolled_interval_timestamps, is_prevailing_time_zone, is_standard_time_zone, get_standard_time_zone, @@ -27,9 +27,9 @@ def test_adjust_timestamp_by_dst_offset() -> None: assert res.hour == hour -def test_shift_time_interval_period_beginning_to_ending() -> None: +def test_shifted_interval_timestamps_period_beginning_to_ending() -> None: ser = pd.date_range("2018-12-31 22:00", periods=4, freq="h").tolist() - shifted = shift_time_interval( + shifted = shifted_interval_timestamps( ser, TimeIntervalType.PERIOD_BEGINNING, TimeIntervalType.PERIOD_ENDING, @@ -37,9 +37,9 @@ def test_shift_time_interval_period_beginning_to_ending() -> None: assert all(np.array(shifted) == np.array(ser) + pd.Timedelta(hours=1)) -def test_shift_time_interval_period_ending_to_beginning() -> None: +def test_shifted_interval_timestamps_period_ending_to_beginning() -> None: ser = pd.date_range("2018-12-31 22:00", periods=4, freq="h").tolist() - shifted = shift_time_interval( + shifted = shifted_interval_timestamps( ser, TimeIntervalType.PERIOD_ENDING, TimeIntervalType.PERIOD_BEGINNING, @@ -47,27 +47,27 @@ def test_shift_time_interval_period_ending_to_beginning() -> None: assert all(np.array(shifted) == np.array(ser) - pd.Timedelta(hours=1)) -def test_shift_time_interval_invalid() -> None: +def test_shifted_interval_timestamps_invalid() -> None: ser = pd.date_range("2018-12-31 22:00", periods=4, freq="h").tolist() with pytest.raises(Exception): - shift_time_interval( + shifted_interval_timestamps( ser, TimeIntervalType.PERIOD_BEGINNING, TimeIntervalType.PERIOD_BEGINNING, ) -def test_wrap_timestamps() -> None: +def test_wrapped_time_timestamps() -> None: ser = pd.date_range("2018-12-31 22:00", periods=4, freq="h").tolist() to_timestamps = pd.date_range("2019-01-01 00:00", periods=4, freq="h").tolist() - wrapped = wrap_timestamps(ser, to_timestamps) + wrapped = wrapped_time_timestamps(ser, to_timestamps) assert set(wrapped) <= set(to_timestamps) -def test_roll_time_interval() -> None: +def test_rolled_interval_timestamps() -> None: ser = pd.date_range("2018-12-31 22:00", periods=4, freq="h").tolist() to_timestamps = pd.date_range("2019-01-01 00:00", periods=4, freq="h").tolist() - rolled = roll_time_interval( + rolled = rolled_interval_timestamps( ser, TimeIntervalType.PERIOD_BEGINNING, TimeIntervalType.PERIOD_ENDING, From 5a0b4411b43678bb26099b2a4e15f88996d9c3fb Mon Sep 17 00:00:00 2001 From: lixiliu 
<36629962+lixiliu@users.noreply.github.com> Date: Tue, 18 Nov 2025 12:54:48 -0700 Subject: [PATCH 23/27] potential parquet fix for spark hive --- src/chronify/sqlalchemy/functions.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/chronify/sqlalchemy/functions.py b/src/chronify/sqlalchemy/functions.py index 7203d60..f88899f 100644 --- a/src/chronify/sqlalchemy/functions.py +++ b/src/chronify/sqlalchemy/functions.py @@ -155,10 +155,16 @@ def _write_to_hive( elif isinstance(df2[config.time_column].dtype, DateTime64DType): new_dtype = "datetime64[us]" df2[config.time_column] = df2[config.time_column].astype(new_dtype) # type: ignore + else: + new_dtype = "datetime64[us]" + df2[config.time_column] = pd.to_datetime( + df2[config.time_column], errors="raise" + ).astype(new_dtype) # type: ignore with NamedTemporaryFile(suffix=".parquet", dir=scratch_dir) as f: f.close() output = Path(f.name) + df2.to_parquet(output) atexit.register(lambda: delete_if_exists(output)) select_stmt = f"SELECT * FROM parquet.`{output}`" From b51571e3b8dc21eba15cd8090bf93a0fe864a699 Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Tue, 18 Nov 2025 18:16:04 -0700 Subject: [PATCH 24/27] update sql functions --- src/chronify/sqlalchemy/functions.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/chronify/sqlalchemy/functions.py b/src/chronify/sqlalchemy/functions.py index f88899f..bb64165 100644 --- a/src/chronify/sqlalchemy/functions.py +++ b/src/chronify/sqlalchemy/functions.py @@ -16,7 +16,7 @@ from sqlalchemy import Connection, Engine, Selectable, text from chronify.exceptions import InvalidOperation, InvalidParameter -from chronify.time_configs import DatetimeRange, TimeBaseModel +from chronify.time_configs import DatetimeRangeBase, DatetimeRange, TimeBaseModel from chronify.utils.path_utils import check_overwrite, delete_if_exists, to_path # Copied from Pandas/Polars @@ -72,7 +72,7 @@ def write_database( def _check_one_config_per_datetime_column(configs: Sequence[TimeBaseModel]) -> None: time_col_count = Counter( - [config.time_column for config in configs if isinstance(config, DatetimeRange)] + [config.time_column for config in configs if isinstance(config, DatetimeRangeBase)] ) time_col_dup = {k: v for k, v in time_col_count.items() if v > 1} if len(time_col_dup) > 0: @@ -144,7 +144,7 @@ def _write_to_hive( ) -> None: df2 = df.copy() for config in configs: - if isinstance(config, DatetimeRange): + if isinstance(config, DatetimeRangeBase): if isinstance(df2[config.time_column].dtype, DatetimeTZDtype): # Spark doesn't like ns. That might change in the future. # Pandas might offer a better way to change from ns to us in the future. 
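The _write_to_hive hunks above coerce the time column to microsecond precision before writing parquet because Spark rejects nanosecond timestamps. A rough, stand-alone illustration of that coercion on an assumed throwaway frame (assumes pandas >= 2.0 for non-nanosecond dtypes and a parquet engine such as pyarrow; the file path is illustrative only):

import pandas as pd

# Assumed example frame; the real code operates on the table being written to Hive.
df = pd.DataFrame({"timestamp": pd.date_range("2018-01-01", periods=3, freq="h")})
print(df["timestamp"].dtype)  # datetime64[ns] by default
# Coerce to microsecond resolution so Spark can read the parquet file.
df["timestamp"] = pd.to_datetime(df["timestamp"], errors="raise").astype("datetime64[us]")
print(df["timestamp"].dtype)  # datetime64[us]
df.to_parquet("example.parquet")  # illustrative output path
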
@@ -158,7 +158,7 @@ def _write_to_hive( else: new_dtype = "datetime64[us]" df2[config.time_column] = pd.to_datetime( - df2[config.time_column], errors="raise" + df2[config.time_column], utc=False, errors="raise" ).astype(new_dtype) # type: ignore with NamedTemporaryFile(suffix=".parquet", dir=scratch_dir) as f: From 177cbb38b715d19579af9f09a1dd9a2951fde8a0 Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Wed, 19 Nov 2025 09:32:49 -0700 Subject: [PATCH 25/27] Show error --- src/chronify/sqlalchemy/functions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/chronify/sqlalchemy/functions.py b/src/chronify/sqlalchemy/functions.py index bb64165..853c4d9 100644 --- a/src/chronify/sqlalchemy/functions.py +++ b/src/chronify/sqlalchemy/functions.py @@ -189,6 +189,7 @@ def _read_from_hive( query: Selectable | str, conn: Connection, config: TimeBaseModel, params: Any = None ) -> pd.DataFrame: df = pd.read_sql_query(query, conn, params=params) + assert config.time_column in df.columns, f"No {config.time_column} found, instead {df.columns}" if isinstance(config, DatetimeRange) and not config.start_time_is_tz_naive(): # This is tied to the fact that we set the Spark session to UTC. # Otherwise, there is confusion with the computer's local time zone. From 48f7ef251a0d6f78e30f24e0daea24cfda45492a Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Wed, 19 Nov 2025 15:26:27 -0700 Subject: [PATCH 26/27] temp --- src/chronify/sqlalchemy/functions.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/chronify/sqlalchemy/functions.py b/src/chronify/sqlalchemy/functions.py index 853c4d9..ba96219 100644 --- a/src/chronify/sqlalchemy/functions.py +++ b/src/chronify/sqlalchemy/functions.py @@ -189,7 +189,11 @@ def _read_from_hive( query: Selectable | str, conn: Connection, config: TimeBaseModel, params: Any = None ) -> pd.DataFrame: df = pd.read_sql_query(query, conn, params=params) - assert config.time_column in df.columns, f"No {config.time_column} found, instead {df.columns}" + breakpoint() + if isinstance(config, DatetimeRange): + assert ( + config.time_column in df.columns + ), f"No {config.time_column} found, instead: {df.columns}\n{df}" if isinstance(config, DatetimeRange) and not config.start_time_is_tz_naive(): # This is tied to the fact that we set the Spark session to UTC. # Otherwise, there is confusion with the computer's local time zone. 
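Patches 25 and 26 above guard _read_from_hive and re-localize tz-aware columns: the Spark session is pinned to UTC, so timestamps come back tz-naive and must be tagged as UTC after reading. A small sketch of the re-localization step on an assumed frame standing in for the pd.read_sql_query result:

import pandas as pd

# Assumed frame; timestamps arrive tz-naive but are known to be UTC
# because the Spark session is configured to UTC.
df = pd.DataFrame({"timestamp": pd.date_range("2018-01-01", periods=3, freq="h")})
df["timestamp"] = df["timestamp"].dt.tz_localize("UTC")
print(df["timestamp"].dtype)  # datetime64[ns, UTC]
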
From 15bb2a475fdc1deb9b2f5c5539405d698b92b73a Mon Sep 17 00:00:00 2001 From: lixiliu <36629962+lixiliu@users.noreply.github.com> Date: Wed, 19 Nov 2025 15:47:11 -0700 Subject: [PATCH 27/27] final --- src/chronify/sqlalchemy/functions.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/chronify/sqlalchemy/functions.py b/src/chronify/sqlalchemy/functions.py index ba96219..6539ec8 100644 --- a/src/chronify/sqlalchemy/functions.py +++ b/src/chronify/sqlalchemy/functions.py @@ -189,12 +189,11 @@ def _read_from_hive( query: Selectable | str, conn: Connection, config: TimeBaseModel, params: Any = None ) -> pd.DataFrame: df = pd.read_sql_query(query, conn, params=params) - breakpoint() - if isinstance(config, DatetimeRange): - assert ( - config.time_column in df.columns - ), f"No {config.time_column} found, instead: {df.columns}\n{df}" - if isinstance(config, DatetimeRange) and not config.start_time_is_tz_naive(): + if ( + isinstance(config, DatetimeRange) + and config.time_column in df.columns + and not config.start_time_is_tz_naive() + ): # This is tied to the fact that we set the Spark session to UTC. # Otherwise, there is confusion with the computer's local time zone. df[config.time_column] = df[config.time_column].dt.tz_localize("UTC")
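
Taken together, the helpers renamed in patch 22 compose the way tests/test_mapper_datetime_to_datetime.py asserts: rolling an interval equals shifting it and then wrapping the result into the target range. A short self-contained sketch of that equivalence, assuming the chronify package from this series is importable; it mirrors the hourly examples in the time_utils docstrings:

import pandas as pd

from chronify.time import TimeIntervalType
from chronify.time_utils import (
    rolled_interval_timestamps,
    shifted_interval_timestamps,
    wrapped_time_timestamps,
)

# Period-ending hourly timestamps and the period-beginning range to wrap into.
ts_list = pd.date_range("2019-01-01 00:00", periods=3, freq="h").tolist()
to_timestamps = pd.date_range("2019-01-01 00:00", periods=3, freq="h").tolist()

# Shift back one interval (period-ending -> period-beginning), then wrap into range.
shifted = shifted_interval_timestamps(
    ts_list, TimeIntervalType.PERIOD_ENDING, TimeIntervalType.PERIOD_BEGINNING
)
rolled = rolled_interval_timestamps(
    ts_list, TimeIntervalType.PERIOD_ENDING, TimeIntervalType.PERIOD_BEGINNING, to_timestamps
)
assert rolled == wrapped_time_timestamps(shifted, to_timestamps)
# rolled == [Timestamp('2019-01-01 02:00'), Timestamp('2019-01-01 00:00'), Timestamp('2019-01-01 01:00')]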