8 changes: 4 additions & 4 deletions .github/workflows/ci.yml
@@ -29,10 +29,10 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          python -m pip install ".[dev,spark]"
-          wget https://archive.apache.org/dist/spark/spark-3.5.4/spark-3.5.4-bin-hadoop3.tgz
-          tar -xzf spark-3.5.4-bin-hadoop3.tgz
-          export SPARK_HOME=$(pwd)/spark-3.5.4-bin-hadoop3
+          python -m pip install ".[dev]" --group=pyhive
+          wget https://dlcdn.apache.org/spark/spark-4.0.1/spark-4.0.1-bin-hadoop3.tgz
+          tar -xzf spark-4.0.1-bin-hadoop3.tgz
+          export SPARK_HOME=$(pwd)/spark-4.0.1-bin-hadoop3
           export PATH=$SPARK_HOME/sbin:$PATH
           start-thriftserver.sh
       - name: Run pytest with coverage
8 changes: 7 additions & 1 deletion README.md
@@ -11,13 +11,19 @@ modeling packages. It supports validation and mapping across different time conf
 🚧
 
 ## Installation
+To use DuckDB or SQLite as the backend:
 ```
 $ pip install chronify
 ```
+
+To use Apache Spark via Apache Thrift Server as the backend:
+```
+$ pip install chronify --group=pyhive
+```
 
 ## Developer installation
 ```
-$ pip install -e ".[dev]"
+$ pip install -e ".[dev]" --group=pyhive
 ```
 
 Please install `pre-commit` so that your code is checked before making commits.
9 changes: 9 additions & 0 deletions docs/how_tos/getting_started/installation.md
@@ -30,8 +30,17 @@ Whenever you are done using chronify, you can deactivate the environment by runn

 3. Install the Python package `chronify`.
 
+To use DuckDB or SQLite as the backend:
 ```{eval-rst}
 .. code-block:: console
 
    $ pip install chronify
 ```
+
+To use Apache Spark via Apache Thrift Server as the backend, you must install pyhive.
+
+```{eval-rst}
+.. code-block:: console
+
+   $ pip install chronify --group=pyhive
+```
6 changes: 3 additions & 3 deletions docs/how_tos/spark_backend.md
@@ -4,14 +4,14 @@ scripts for UNIX operating systems (not Windows).

 ## Install chronify with Spark support
 ```
-$ pip install chronify[spark]
+$ pip install chronify --group=pyhive
 ```
 
 ## Installation on a development computer
 Installation can be as simple as
 ```
-$ tar -xzf spark-3.5.4-bin-hadoop3.tgz
-$ export SPARK_HOME=$(pwd)/spark-3.5.4-bin-hadoop3
+$ tar -xzf spark-4.0.1-bin-hadoop3.tgz
+$ export SPARK_HOME=$(pwd)/spark-4.0.1-bin-hadoop3
 ```
 
 Start a Thrift server. This allows JDBC clients to send SQL queries to an in-process Spark cluster
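For context on what these steps enable (a sketch, not part of the diff): once the Thrift server is running, a client can submit SQL to the in-process Spark cluster through pyhive's DB-API interface. The host and port below are assumptions matching the Thrift server defaults.

```python
from pyhive import hive

# Connect to the local Spark Thrift server started by start-thriftserver.sh.
# localhost:10000 is the default Thrift server address, assumed here.
conn = hive.connect(host="localhost", port=10000)
cursor = conn.cursor()
cursor.execute("SHOW TABLES")
print(cursor.fetchall())
cursor.close()
conn.close()
```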
12 changes: 7 additions & 5 deletions pyproject.toml
@@ -38,11 +38,6 @@ dependencies = [
"tzdata",
]
[project.optional-dependencies]
spark = [
"pyhive", # @ git+https://github.com/apache/kyuubi.git#egg=pyhive&subdirectory=python",
"thrift",
"thrift_sasl",
]

dev = [
"mypy >= 1.15, < 2",
@@ -60,6 +55,13 @@ dev = [
"sphinx-tabs~=3.4",
]

[dependency-groups]
pyhive = [
"pyhive @ git+https://github.com/apache/kyuubi.git@3b205a3924e0e3a75c425de1396089729cf22ee5#subdirectory=python",
"thrift",
"thrift_sasl",
]

[project.urls]
Documentation = "https://github.com/NREL/chronify#readme"
Issues = "https://github.com/NREL/chronify/issues"
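As background for this packaging change (a sketch under assumptions, not part of the diff): pyhive also registers a `hive` dialect with SQLAlchemy, which is how a SQLAlchemy-based library such as chronify can reach the Thrift server once the `pyhive` dependency group is installed; `thrift` and `thrift_sasl` are its transport dependencies. The URL below assumes a local Thrift server on the default port with the `default` database.

```python
from sqlalchemy import create_engine, text

# The "hive" dialect comes from pyhive; the URL is an assumption for a
# local Thrift server with default settings.
engine = create_engine("hive://localhost:10000/default")
with engine.connect() as conn:
    print(conn.execute(text("SELECT 1")).scalar())
```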
@@ -232,7 +232,7 @@ def mdh_from_datetime(timestamp: datetime) -> tuple[int, int, int]:
     return timestamp.month, timestamp.day, timestamp.hour + 1
 
 
-def generate_period_mapping(periods: pd.Series) -> pd.DataFrame: # type: ignore
+def generate_period_mapping(periods: pd.Series) -> pd.DataFrame:
     unique_periods = periods.unique()
     mappings = []
     for period_str in unique_periods:
9 changes: 7 additions & 2 deletions src/chronify/time_series_mapper_index_time.py
@@ -2,7 +2,7 @@
 from pathlib import Path
 from typing import Optional
 import numpy as np
-from datetime import timedelta
+from datetime import datetime, timedelta
 
 import pandas as pd
 from sqlalchemy import Engine, MetaData, Table, select
@@ -165,7 +165,12 @@ def _create_local_time_config(self, time_zone: str) -> DatetimeRange:
             filter(lambda k_v: k_v[0] in DatetimeRange.model_fields, time_kwargs.items())
         )
         time_kwargs["time_type"] = TimeType.DATETIME
-        time_kwargs["start"] = self._from_time_config.start_timestamp
+        if isinstance(self._from_time_config.start_timestamp, datetime):
+            # TODO: this is a hack. datetime is correct but only is present when Hive is used.
+            # The code requires pandas Timestamps.
+            time_kwargs["start"] = pd.Timestamp(self._from_time_config.start_timestamp)
+        else:
+            time_kwargs["start"] = self._from_time_config.start_timestamp
         time_kwargs["time_column"] = "represented_time"
         time_config = DatetimeRange(**time_kwargs)
         assert (
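A minimal sketch of the behavior the new branch relies on (illustrative values, not part of the diff): `pd.Timestamp` accepts a plain `datetime` and round-trips it losslessly, so converting timestamps coming back from the Hive backend is safe. Because `pd.Timestamp` subclasses `datetime`, the `isinstance` check above also matches values that are already Timestamps, in which case the conversion is a no-op wrap.

```python
from datetime import datetime
import pandas as pd

# A plain datetime, as the Hive backend returns, converts losslessly to the
# pandas Timestamp that downstream code expects. Values are illustrative.
start = datetime(2020, 1, 1, 0, 0)
ts = pd.Timestamp(start)
assert isinstance(ts, pd.Timestamp)
assert ts.to_pydatetime() == start
```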