From 7bf171943fc25cd2524f3b804ec32277f6d0a077 Mon Sep 17 00:00:00 2001
From: pesap <pesap@users.noreply.github.com>
Date: Sat, 18 Oct 2025 14:06:18 -0600
Subject: [PATCH 1/3] feat: Adding auto_detect for CSV files

---
 src/chronify/store.py | 86 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 80 insertions(+), 6 deletions(-)

diff --git a/src/chronify/store.py b/src/chronify/store.py
index c5ecc31..2b20b85 100644
--- a/src/chronify/store.py
+++ b/src/chronify/store.py
@@ -1,6 +1,7 @@
 from collections.abc import Iterable
 from pathlib import Path
 import shutil
+import os
 from typing import Any, Optional
 from chronify.utils.sql import make_temp_view_name
 
@@ -39,6 +40,7 @@
     get_duckdb_types_from_pandas,
     get_sqlalchemy_type_from_duckdb,
 )
+from chronify.csv_utils import _should_use_auto_detect, _is_large_file, _get_file_size
 from chronify.sqlalchemy.functions import (
     create_view_from_parquet,
     read_database,
@@ -349,6 +351,7 @@ def ingest_from_csv(
         src_schema: CsvTableSchema,
         dst_schema: TableSchema,
         connection: Optional[Connection] = None,
+        auto_detect: Optional[bool] = None,
     ) -> bool:
         """Ingest data from a CSV file.
 
@@ -362,6 +365,8 @@ def ingest_from_csv(
             Defines the destination table in the database.
         connection
             Optional connection to reuse. Refer to :meth:`ingest_table` for notes.
+        auto_detect
+            Enable auto-detection. If None, checks CHRONIFY_AUTO_DETECT_CSV environment variable.
 
         Returns
         -------
@@ -402,7 +407,12 @@ def ingest_from_csv(
         --------
         ingest_from_csvs
         """
-        return self.ingest_from_csvs((path,), src_schema, dst_schema, connection=connection)
+        if _should_use_auto_detect(auto_detect) and _is_large_file(path):
+            logger.info(f"Processing large file {path} with auto-detection optimizations")
+
+        return self.ingest_from_csvs(
+            (path,), src_schema, dst_schema, connection=connection, auto_detect=auto_detect
+        )
 
     def ingest_from_csvs(
         self,
@@ -410,12 +420,13 @@ def ingest_from_csvs(
         src_schema: CsvTableSchema,
         dst_schema: TableSchema,
         connection: Optional[Connection] = None,
+        auto_detect: Optional[bool] = None,
     ) -> bool:
-        """Ingest data into the table specifed by schema. If the table does not exist,
-        create it. This is faster than calling :meth:`ingest_from_csv` many times.
-        Each file is loaded into memory one at a time.
-        If any error occurs, all added data will be removed and the state of the database will
-        be the same as the original state.
+        """Ingest data from multiple CSV files.
+
+        This is faster than calling :meth:`ingest_from_csv` multiple times.
+        Each file is loaded one at a time. If any error occurs, all added data
+        will be removed and the database state will be unchanged.
 
         Parameters
         ----------
@@ -427,6 +438,8 @@ def ingest_from_csvs(
             Defines the destination table in the database.
         conn
             Optional connection to reuse. Refer to :meth:`ingest_table` for notes.
+        auto_detect
+            Enable auto-detection. If None, checks CHRONIFY_AUTO_DETECT_CSV environment variable.
 
         Returns
         -------
@@ -1234,6 +1247,67 @@ def _handle_sqlite_error_case(self, name: str, connection: Optional[Connection])
             with self._engine.begin() as conn:
                 conn.execute(text(f"DROP TABLE IF EXISTS {name}"))
 
+    # Simple CSV Inspection (Always Available)
+    def inspect_csv(
+        self, path: Path | str, peek_rows: int = 5, auto_detect: Optional[bool] = None
+    ) -> dict[str, Any]:
+        """Inspect CSV file structure and provide recommendations.
+
+        Parameters
+        ----------
+        path
+            Path to CSV file
+        peek_rows
+            Number of rows to sample
+        auto_detect
+            Enable auto-detection. If None, checks CHRONIFY_AUTO_DETECT_CSV environment variable.
+
+        Returns
+        -------
+        dict
+            File information including columns, detected format, size, and recommendations
+        """
+        import duckdb
+
+        file_size_mb = _get_file_size(str(path)) / (1024 * 1024) if os.path.exists(path) else 0
+        is_large = _is_large_file(path)
+
+        try:
+            rel = duckdb.sql(f"SELECT * FROM read_csv('{path}') LIMIT {peek_rows}")
+            columns = rel.columns
+            sample_data = rel.to_df().to_dict("records")
+        except Exception as e:
+            return {"error": f"Could not read CSV: {e}"}
+
+        auto_detect_enabled = _should_use_auto_detect(auto_detect)
+        recommendations = []
+        if is_large:
+            if auto_detect_enabled:
+                recommendations.append("Large file detected - auto-detection optimizations active")
+            else:
+                recommendations.append(
+                    "Consider enabling auto-detection for better large file handling"
+                )
+
+        detected_format = None
+        col_set = {col.lower().strip() for col in columns}
+        if "name" in col_set and "value" in col_set and len(col_set) == 2:
+            detected_format = "name_value"
+        elif "datetime" in col_set or "timestamp" in col_set:
+            detected_format = "datetime_series"
+        elif any(f"m{i:02d}" in col_set for i in range(1, 13)):
+            detected_format = "monthly_data"
+
+        return {
+            "columns": columns,
+            "detected_format": detected_format,
+            "file_size_mb": round(file_size_mb, 2),
+            "is_large_file": is_large,
+            "auto_detect_enabled": auto_detect_enabled,
+            "sample_data": sample_data,
+            "recommendations": recommendations,
+        }
+
 
 def check_columns(
     table_columns: Iterable[str],

From ef842a5b57c641eef4d097244cb41da745638ac8 Mon Sep 17 00:00:00 2001
From: pesap <pesap@users.noreply.github.com>
Date: Sat, 18 Oct 2025 14:08:02 -0600
Subject: [PATCH 2/3] test: Adding test for new feature

---
 src/chronify/csv_utils.py     | 26 ++++++++++++
 tests/test_auto_detect_csv.py | 75 +++++++++++++++++++++++++++++++++++
 2 files changed, 101 insertions(+)
 create mode 100644 src/chronify/csv_utils.py
 create mode 100644 tests/test_auto_detect_csv.py

diff --git a/src/chronify/csv_utils.py b/src/chronify/csv_utils.py
new file mode 100644
index 0000000..88f11f3
--- /dev/null
+++ b/src/chronify/csv_utils.py
@@ -0,0 +1,26 @@
+"""CSV utilities for auto-detection features."""
+
+import os
+from functools import lru_cache
+from pathlib import Path
+from typing import Optional
+
+
+@lru_cache(maxsize=32)
+def _get_file_size(path: str | Path) -> int:
+    try:
+        return os.path.getsize(path)
+    except (OSError, FileNotFoundError):
+        return 0
+
+
+def _is_large_file(path: str | Path, threshold_mb: int = 50) -> bool:
+    size_mb = _get_file_size(path) / (1024 * 1024)
+    return size_mb > threshold_mb
+
+
+def _should_use_auto_detect(auto_detect: Optional[bool] = None) -> bool:
+    """Check if auto-detection should be used: parameter > environment > False."""
+    if auto_detect is not None:
+        return auto_detect
+    return os.environ.get("CHRONIFY_AUTO_DETECT_CSV", "").lower() in ("true", "1", "yes")
diff --git a/tests/test_auto_detect_csv.py b/tests/test_auto_detect_csv.py
new file mode 100644
index 0000000..af83670
--- /dev/null
+++ b/tests/test_auto_detect_csv.py
@@ -0,0 +1,75 @@
+"""
+Test the stateless auto-detect CSV approach.
+No global state - just method parameters and environment variables.
+"""
+
+import os
+import tempfile
+from pathlib import Path
+
+from chronify import Store
+from chronify.csv_utils import _should_use_auto_detect
+
+
+def test_stateless_auto_detect_csv():
+    """Test that auto-detection features work without global state."""
+    
+    # Create a test CSV
+    with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f:
+        f.write("timestamp,device,value\n")
+        f.write("2020-01-01 00:00,A,100\n")
+        f.write("2020-01-01 01:00,A,200\n")
+        csv_path = f.name
+    
+    try:
+        store = Store()
+        
+        # Test 1: Default behavior (no auto-detection)
+        result = store.inspect_csv(csv_path)
+        assert 'error' not in result
+        print("✓ Default inspect_csv works")
+        
+        # Test 2: Explicitly enable auto-detection via parameter
+        result = store.inspect_csv(csv_path, auto_detect=True)
+        assert 'error' not in result
+        print("✓ Auto-detect via parameter works")
+        
+        # Test 3: Environment variable (no persistence needed)
+        os.environ["CHRONIFY_AUTO_DETECT_CSV"] = "true"
+        result = store.inspect_csv(csv_path)  # Should detect from env
+        assert 'error' not in result
+        print("✓ Auto-detect via environment variable works")
+        
+        # Test 4: Parameter overrides environment
+        result = store.inspect_csv(csv_path, auto_detect=False)  # Explicitly disabled
+        assert 'error' not in result
+        print("✓ Parameter override works")
+        
+        # Test 5: Utility function works correctly
+        assert _should_use_auto_detect(None) == True  # From env
+        assert _should_use_auto_detect(True) == True  # Explicit
+        assert _should_use_auto_detect(False) == False  # Explicit override
+        print("✓ Auto-detect detection works correctly")
+        
+        # Clean up environment
+        del os.environ["CHRONIFY_AUTO_DETECT_CSV"]
+        
+        # Test 6: No environment, default to False
+        assert _should_use_auto_detect(None) == False
+        print("✓ Default fallback works")
+        
+        print("\n🎉 All stateless auto-detect CSV tests passed!")
+        print("✅ No global state required")
+        print("✅ Clean method parameters")
+        print("✅ Environment variable support")
+        print("✅ Parameter priority over environment")
+        
+    finally:
+        # Cleanup
+        Path(csv_path).unlink(missing_ok=True)
+        if "CHRONIFY_AUTO_DETECT_CSV" in os.environ:
+            del os.environ["CHRONIFY_AUTO_DETECT_CSV"]
+
+
+if __name__ == "__main__":
+    test_stateless_auto_detect_csv()
\ No newline at end of file

From 189d184d1f6838c022a9047c4b946b12acc706d0 Mon Sep 17 00:00:00 2001
From: pesap <pesap@users.noreply.github.com>
Date: Sat, 18 Oct 2025 14:20:48 -0600
Subject: [PATCH 3/3] fixup! test: Adding test for new feature

---
 tests/test_auto_detect_csv.py | 121 +++++++++++++++++-----------------
 1 file changed, 60 insertions(+), 61 deletions(-)

diff --git a/tests/test_auto_detect_csv.py b/tests/test_auto_detect_csv.py
index af83670..f008dad 100644
--- a/tests/test_auto_detect_csv.py
+++ b/tests/test_auto_detect_csv.py
@@ -1,75 +1,74 @@
-"""
-Test the stateless auto-detect CSV approach.
-No global state - just method parameters and environment variables.
-"""
+"""Test auto-detect CSV functionality."""
 
 import os
 import tempfile
+from collections.abc import Generator
 from pathlib import Path
 
+import pytest
+
 from chronify import Store
 from chronify.csv_utils import _should_use_auto_detect
 
 
-def test_stateless_auto_detect_csv():
-    """Test that auto-detection features work without global state."""
-    
-    # Create a test CSV
-    with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f:
+@pytest.fixture
+def csv_file() -> Generator[str, None, None]:
+    """Create temporary CSV file for testing."""
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f:
         f.write("timestamp,device,value\n")
         f.write("2020-01-01 00:00,A,100\n")
         f.write("2020-01-01 01:00,A,200\n")
         csv_path = f.name
-    
-    try:
-        store = Store()
-        
-        # Test 1: Default behavior (no auto-detection)
-        result = store.inspect_csv(csv_path)
-        assert 'error' not in result
-        print("✓ Default inspect_csv works")
-        
-        # Test 2: Explicitly enable auto-detection via parameter
-        result = store.inspect_csv(csv_path, auto_detect=True)
-        assert 'error' not in result
-        print("✓ Auto-detect via parameter works")
-        
-        # Test 3: Environment variable (no persistence needed)
-        os.environ["CHRONIFY_AUTO_DETECT_CSV"] = "true"
-        result = store.inspect_csv(csv_path)  # Should detect from env
-        assert 'error' not in result
-        print("✓ Auto-detect via environment variable works")
-        
-        # Test 4: Parameter overrides environment
-        result = store.inspect_csv(csv_path, auto_detect=False)  # Explicitly disabled
-        assert 'error' not in result
-        print("✓ Parameter override works")
-        
-        # Test 5: Utility function works correctly
-        assert _should_use_auto_detect(None) == True  # From env
-        assert _should_use_auto_detect(True) == True  # Explicit
-        assert _should_use_auto_detect(False) == False  # Explicit override
-        print("✓ Auto-detect detection works correctly")
-        
-        # Clean up environment
+
+    yield csv_path
+
+    Path(csv_path).unlink(missing_ok=True)
+
+
+@pytest.fixture
+def clean_env() -> Generator[None, None, None]:
+    """Ensure clean environment variable state."""
+    yield
+    if "CHRONIFY_AUTO_DETECT_CSV" in os.environ:
         del os.environ["CHRONIFY_AUTO_DETECT_CSV"]
-        
-        # Test 6: No environment, default to False
-        assert _should_use_auto_detect(None) == False
-        print("✓ Default fallback works")
-        
-        print("\n🎉 All stateless auto-detect CSV tests passed!")
-        print("✅ No global state required")
-        print("✅ Clean method parameters")
-        print("✅ Environment variable support")
-        print("✅ Parameter priority over environment")
-        
-    finally:
-        # Cleanup
-        Path(csv_path).unlink(missing_ok=True)
-        if "CHRONIFY_AUTO_DETECT_CSV" in os.environ:
-            del os.environ["CHRONIFY_AUTO_DETECT_CSV"]
-
-
-if __name__ == "__main__":
-    test_stateless_auto_detect_csv()
\ No newline at end of file
+
+
+def test_inspect_csv_default(csv_file: str) -> None:
+    """Test CSV inspection without auto-detect."""
+    store = Store()
+    result = store.inspect_csv(csv_file)
+    assert "error" not in result
+
+
+def test_inspect_csv_with_parameter(csv_file: str) -> None:
+    """Test CSV inspection with auto-detect parameter."""
+    store = Store()
+    result = store.inspect_csv(csv_file, auto_detect=True)
+    assert "error" not in result
+
+
+def test_inspect_csv_with_env_variable(csv_file: str, clean_env: None) -> None:
+    """Test CSV inspection with environment variable."""
+    os.environ["CHRONIFY_AUTO_DETECT_CSV"] = "true"
+    store = Store()
+    result = store.inspect_csv(csv_file)
+    assert "error" not in result
+
+
+def test_parameter_overrides_env(csv_file: str, clean_env: None) -> None:
+    """Test parameter override of environment variable."""
+    os.environ["CHRONIFY_AUTO_DETECT_CSV"] = "true"
+    store = Store()
+    result = store.inspect_csv(csv_file, auto_detect=False)
+    assert "error" not in result
+
+
+def test_should_use_auto_detect(clean_env: None) -> None:
+    """Test auto-detect priority logic."""
+    assert _should_use_auto_detect(None) is False
+    assert _should_use_auto_detect(True) is True
+    assert _should_use_auto_detect(False) is False
+
+    os.environ["CHRONIFY_AUTO_DETECT_CSV"] = "true"
+    assert _should_use_auto_detect(None) is True
+    assert _should_use_auto_detect(False) is False