diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index b81d655..27abda1 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -29,10 +29,10 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          python -m pip install ".[dev,spark]"
-          wget https://archive.apache.org/dist/spark/spark-3.5.4/spark-3.5.4-bin-hadoop3.tgz
-          tar -xzf spark-3.5.4-bin-hadoop3.tgz
-          export SPARK_HOME=$(pwd)/spark-3.5.4-bin-hadoop3
+          python -m pip install ".[dev]" --group=pyhive
+          wget https://dlcdn.apache.org/spark/spark-4.0.1/spark-4.0.1-bin-hadoop3.tgz
+          tar -xzf spark-4.0.1-bin-hadoop3.tgz
+          export SPARK_HOME=$(pwd)/spark-4.0.1-bin-hadoop3
           export PATH=$SPARK_HOME/sbin:$PATH
           start-thriftserver.sh
       - name: Run pytest with coverage
diff --git a/README.md b/README.md
index 87b54e2..83c9303 100644
--- a/README.md
+++ b/README.md
@@ -11,13 +11,19 @@ modeling packages. It supports validation and mapping across different time conf
 🚧
 
 ## Installation
+To use DuckDB or SQLite as the backend:
 ```
 $ pip install chronify
 ```
 
+To use Apache Spark via Apache Thrift Server as the backend:
+```
+$ pip install chronify --group=pyhive
+```
+
 ## Developer installation
 ```
-$ pip install -e ".[dev]"
+$ pip install -e ".[dev]" --group=pyhive
 ```
 
 Please install `pre-commit` so that your code is checked before making commits.
diff --git a/docs/how_tos/getting_started/installation.md b/docs/how_tos/getting_started/installation.md
index 0c3d17f..ab922a7 100644
--- a/docs/how_tos/getting_started/installation.md
+++ b/docs/how_tos/getting_started/installation.md
@@ -30,8 +30,17 @@ Whenever you are done using chronify, you can deactivate the environment by runn
 
 3. Install the Python package `chronify`.
 
+To use DuckDB or SQLite as the backend:
 ```{eval-rst}
 .. code-block:: console
 
    $ pip install chronify
 ```
+
+To use Apache Spark via Apache Thrift Server as the backend, you must install pyhive.
+
+```{eval-rst}
+.. code-block:: console
+
+   $ pip install chronify --group=pyhive
+```
diff --git a/docs/how_tos/spark_backend.md b/docs/how_tos/spark_backend.md
index 8e57367..1df45c6 100644
--- a/docs/how_tos/spark_backend.md
+++ b/docs/how_tos/spark_backend.md
@@ -4,14 +4,14 @@ scripts for UNIX operating systems (not Windows).
 
 ## Install chronify with Spark support
 ```
-$ pip install chronify[spark]
+$ pip install chronify --group=pyhive
 ```
 
 ## Installation on a development computer
 Installation can be as simple as
 ```
-$ tar -xzf spark-3.5.4-bin-hadoop3.tgz
-$ export SPARK_HOME=$(pwd)/spark-3.5.4-bin-hadoop3
+$ tar -xzf spark-4.0.1-bin-hadoop3.tgz
+$ export SPARK_HOME=$(pwd)/spark-4.0.1-bin-hadoop3
 ```
 Start a Thrift server.
 This allows JDBC clients to send SQL queries to an in-process Spark cluster
diff --git a/pyproject.toml b/pyproject.toml
index fdd3db1..9f23968 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,11 +38,6 @@ dependencies = [
     "tzdata",
 ]
 
 [project.optional-dependencies]
-spark = [
-    "pyhive", # @ git+https://github.com/apache/kyuubi.git#egg=pyhive&subdirectory=python",
-    "thrift",
-    "thrift_sasl",
-]
 dev = [
     "mypy >= 1.15, < 2",
@@ -60,6 +55,13 @@ dev = [
     "sphinx-tabs~=3.4",
 ]
 
+[dependency-groups]
+pyhive = [
+    "pyhive @ git+https://github.com/apache/kyuubi.git@3b205a3924e0e3a75c425de1396089729cf22ee5#subdirectory=python",
+    "thrift",
+    "thrift_sasl",
+]
+
 [project.urls]
 Documentation = "https://github.com/NREL/chronify#readme"
 Issues = "https://github.com/NREL/chronify/issues"
diff --git a/src/chronify/time_series_mapper_column_representative_to_datetime.py b/src/chronify/time_series_mapper_column_representative_to_datetime.py
index ab7fd6b..166c3a2 100644
--- a/src/chronify/time_series_mapper_column_representative_to_datetime.py
+++ b/src/chronify/time_series_mapper_column_representative_to_datetime.py
@@ -232,7 +232,7 @@ def mdh_from_datetime(timestamp: datetime) -> tuple[int, int, int]:
     return timestamp.month, timestamp.day, timestamp.hour + 1
 
 
-def generate_period_mapping(periods: pd.Series) -> pd.DataFrame:  # type: ignore
+def generate_period_mapping(periods: pd.Series) -> pd.DataFrame:
     unique_periods = periods.unique()
     mappings = []
     for period_str in unique_periods:
diff --git a/src/chronify/time_series_mapper_index_time.py b/src/chronify/time_series_mapper_index_time.py
index e45840f..4f664ef 100644
--- a/src/chronify/time_series_mapper_index_time.py
+++ b/src/chronify/time_series_mapper_index_time.py
@@ -2,7 +2,7 @@ from pathlib import Path
 from typing import Optional
 
 import numpy as np
-from datetime import timedelta
+from datetime import datetime, timedelta
 import pandas as pd
 from sqlalchemy import Engine, MetaData, Table, select
 
@@ -165,7 +165,12 @@ def _create_local_time_config(self, time_zone: str) -> DatetimeRange:
             filter(lambda k_v: k_v[0] in DatetimeRange.model_fields, time_kwargs.items())
         )
         time_kwargs["time_type"] = TimeType.DATETIME
-        time_kwargs["start"] = self._from_time_config.start_timestamp
+        if isinstance(self._from_time_config.start_timestamp, datetime):
+            # TODO: this is a hack. datetime is correct but only is present when Hive is used.
+            # The code requires pandas Timestamps.
+            time_kwargs["start"] = pd.Timestamp(self._from_time_config.start_timestamp)
+        else:
+            time_kwargs["start"] = self._from_time_config.start_timestamp
         time_kwargs["time_column"] = "represented_time"
         time_config = DatetimeRange(**time_kwargs)
         assert (