Changes from all commits (21 commits)
526dd7a  initial spack dataset scraper (lfquintaz, Jul 16, 2025)
8421d96  added practice message of "meow" (lfquintaz, Jul 16, 2025)
ac16bd8  added methods to create local cache for easy interrupt/proceed from l… (lfquintaz, Jul 19, 2025)
c362a96  added files to capture failed manifest/tarball download (lfquintaz, Aug 11, 2025)
059f205  added atomic save functions for manifest and tarballs (lfquintaz, Aug 11, 2025)
b5e46e0  Add files via upload (lfquintaz, Aug 11, 2025)
601f74a  removed print statement at restart that shows all the fully processed… (lfquintaz, Aug 15, 2025)
9ed218f  merged diverged branch file updates in spack_db.py (lfquintaz, Aug 15, 2025)
670be16  added Create_spack_DB.py (lfquintaz, Aug 15, 2025)
cefc0fc  added test files (lfquintaz, Aug 16, 2025)
81afc13  updated remove_lines_spec_manifest function (lfquintaz, Aug 18, 2025)
f189767  added README.md (lfquintaz, Aug 18, 2025)
b434859  fixed bullet points in Directory Structure and Features (lfquintaz, Aug 18, 2025)
bfa7dbe  fixed bullet points for the layout of the index (lfquintaz, Aug 18, 2025)
c3ba8d5  updated Usage (lfquintaz, Aug 18, 2025)
fe63504  updated README.md with retry manifest/tarball download instruction (lfquintaz, Aug 19, 2025)
19e76fb  updated _to_posix for printing (lfquintaz, Aug 19, 2025)
880f545  removed spec_manifest info from README.md (lfquintaz, Aug 21, 2025)
b2de168  removed 'updated' comments (lfquintaz, Aug 21, 2025)
dc0ad5c  removed 'contributing' from README.md (lfquintaz, Aug 21, 2025)
9ebd7fc  added supporting scripts for testing (lfquintaz, Aug 21, 2025)
143 changes: 143 additions & 0 deletions dataset-generation/spack_db/Create_spack_DB.py
@@ -0,0 +1,143 @@
import os
Collaborator suggested change:
-import os
+# /// script
+# dependencies = [
+#   "dapper-python",
+# ]
+# ///
+import os

Adding inline script metadata to capture the list of dependencies is useful both as documentation and because it makes it possible to use uv or pipx to run the script without having to manually install dependencies.

(https://peps.python.org/pep-0723 has more info on this inline script metadata and its format)
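
With that block in place, for example, `uv run Create_spack_DB.py` should resolve and install `dapper-python` into an ephemeral environment automatically (assuming a reasonably recent uv).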

import json
import sqlite3
import time
from dapper_python.normalize import normalize_file_name

# configuration
INDEX_PATH = "cache/spack.index.db.json"
SQLITE_DB_PATH = "cache/spack-v1.db"

def build_package_filelist_db():
# load index
if not os.path.exists(INDEX_PATH):
print("❌ Index file not found.")
return

with open(INDEX_PATH, "r") as f:
index = json.load(f)

# Create SQLite DB
conn = sqlite3.connect(SQLITE_DB_PATH)
cursor = conn.cursor()

# Create table columns
cursor.execute('''
CREATE TABLE IF NOT EXISTS package_files (
id INTEGER PRIMARY KEY,
file_name TEXT,
normalized_file_name TEXT,
file_path TEXT,
package_name TEXT,
UNIQUE(file_path, package_name)
)
''')

# Create indices for efficient lookups
cursor.execute('CREATE INDEX IF NOT EXISTS idx_file_name ON package_files(file_name)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_normalized_file_name ON package_files(normalized_file_name)')


# Create dataset_version table
cursor.execute('''
CREATE TABLE IF NOT EXISTS dataset_version(
version INTEGER,
format TEXT,
timestamp INTEGER
)
''')

# Clear the dataset_version table
cursor.execute("DELETE FROM dataset_version")

# Record the dataset version
cursor.execute(
"INSERT INTO dataset_version (version, format, timestamp) "
"VALUES (?, ?, ?)",
(1, "Spack", int(time.time()))
)

inserted_packages = 0
inserted_files = 0
for package_hash, entry in index.items():
try:
package_name = entry["name"]
version = entry["version"]
sha256 = entry["sha256"]


tarinfo_path = entry.get("tarinfo_path")
if not tarinfo_path or not os.path.exists(tarinfo_path):
print(f"⚠️ Missing tarinfo for: {package_name}-{version}-{sha256}")
continue

with open(tarinfo_path, "r") as f:
file_list = json.load(f)

package_inserted_or_updated = False

for file_path in file_list:
# skipping .spack/ files
if file_path.startswith(".spack/"):
continue

# Extract file name
file_name = os.path.basename(file_path)

# Normalize the file name
try:
normalized = normalize_file_name(file_name)
normalized_file_name = str(normalized).lower()
except Exception as e:
Collaborator review comment:
I don't think the normalize_file_name function throws any exceptions?

As a general note about exceptions in Python (a linter such as pylint would flag this), it is best to catch the most specific exception type(s) possible rather than the generic base Exception -- such as IndexError or ValueError, depending on what the code in the try block can actually throw.
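
A minimal sketch of the narrower pattern (assuming ValueError is representative of what the call could actually raise):

```python
try:
    normalized = normalize_file_name(file_name)
    normalized_file_name = str(normalized).lower()
except ValueError as e:  # catch only what the call is known to raise
    print(f"Failed to normalize '{file_name}': {e}")
    normalized_file_name = file_name.lower()
```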

print(f"⚠️ Failed to normalize '{file_name}': {e}")
normalized_file_name = file_name.lower()

# Insert into DB
cursor.execute(
'''INSERT OR IGNORE INTO package_files
(file_name, normalized_file_name, file_path, package_name)
VALUES (?, ?, ?, ?)''',
(file_name, normalized_file_name, file_path, package_name)
)

if cursor.rowcount > 0:
inserted_files += 1
package_inserted_or_updated = True # New row added
continue # No need to update - freshly inserted
# Row already exists - check if any values changed
cursor.execute(
''' SELECT file_name, normalized_file_name FROM package_files
WHERE file_path = ? AND package_name = ?''',
(file_path, package_name)
)
result = cursor.fetchone()
if result:
existing_file_name, existing_normalized_name = result
if (existing_file_name != file_name) or (existing_normalized_name != normalized_file_name):
# Something changed - update the row
cursor.execute(
''' UPDATE package_files
SET file_name = ?, normalized_file_name = ?
WHERE file_path = ? AND package_name = ?''',
(file_name, normalized_file_name, file_path, package_name)
)
package_inserted_or_updated = True # A row was updated
if package_inserted_or_updated:
inserted_packages += 1


except Exception as e:
print(f"❌ Failed to insert {package_hash}: {e}")
continue

conn.commit()
conn.close()

print(f"🎉 Done. Inserted {inserted_files} new files from {inserted_packages} packages into {SQLITE_DB_PATH}")

if __name__ == "__main__":
build_package_filelist_db()
61 changes: 61 additions & 0 deletions dataset-generation/spack_db/README.md
@@ -0,0 +1,61 @@
# Spack Build Cache Data Scraper & SQLite Database

This project aims to scrape the Spack build cache by downloading, cleaning, and indexing spec manifests and binary tarballs into a local cache, then convert the data into a Spack SQLite database.
Collaborator suggested change:
-This project aims to scrape the Spack build cache by downloading, cleaning, and indexing spec manifests and binary tarballs into a local cache, then convert the data into a Spack SQLite database.
+This project aims to scrape the Spack build cache by downloading, cleaning, and indexing spec manifests and binary tarballs into a local cache, then convert the data into a SQLite database that maps file names back to the Spack package that contains that file.


The program builds a master index called `spack.index.db.json`; an illustrative entry is sketched after the layout list.
* Index layout:
* spec manifest hash as the unique key
* package name and version
* the package tarball's unique SHA256 hash
* path to the package manifest in the local cache directory
* path to the package tarinfo in the local cache directory
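
A sketch of one index entry (the field names `name`, `version`, `sha256`, and `tarinfo_path` appear in `Create_spack_DB.py`; `manifest_path` and all values here are illustrative assumptions):

```python
index_entry = {
    "<spec-manifest-hash>": {
        "name": "zlib",                                 # package name
        "version": "1.3.1",                             # package version
        "sha256": "<tarball-sha256>",                   # tarball hash
        "manifest_path": "cache/manifest/<hash>.json",  # assumed field name
        "tarinfo_path": "cache/tarinfo/<sha256>.json",
    }
}
```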

The program supports restart/resume if a run is interrupted. Skipped or malformed manifests are recorded, and when both the manifest and tarball information already exist locally, the files are not re-downloaded.

## Directory Structure
* `cache/spack.index.db.json` - master index
* `cache/manifest/` - cleaned spec manifests
* `cache/tarinfo/` - JSON file lists extracted from tarballs
* `cache/binary_packages/` - temporary cache of downloaded tarballs
* `cache/timeouts.txt` - packages that timed out while downloading
* `cache/skipped_manifests.txt` - a list of manifests that could not be downloaded
* `cache/malformed_manifests.txt` - manifests that failed parsing
* `cache/missing_tarballs.txt` - manifests without a tarball hash
* `cache/shared_tarballs.txt` - records multiple manifests that point to the same tarball
* `cache/failed_tarball_downloads.txt` - tarballs that failed to download

## Features
* Retrieves package `.spec.manifest.json` from Spack's binary mirror
* Extracts the valid JSON payload and removes extra characters
* Retrieves binary tarballs and extracts file lists
* Creates and maintains a canonical JSON index that maps package to it's manifest and tarball information
Collaborator suggested change for the two bullets above:
-* Retrieves binary tarballs and extracts file lists
-* Creates and maintains a canonical JSON index that maps package to it's manifest and tarball information
+* Retrieves package binary tarballs and extracts file lists
+* Creates and maintains a canonical JSON index that maps package to its manifest and tarball information

* Contains multiple checkpoints for safe restart/resume of the program
* Records skipped/malformed manifests, missing hashes, failed tarbll downloads
Collaborator suggested change:
-* Records skipped/malformed manifests, missing hashes, failed tarbll downloads
+* Records skipped/malformed manifests, missing hashes, failed tarball downloads

* Stores forward-slash paths in the JSON index for cross-platform use

## Usage
1. Install dependencies
```bash
pip install requests
```
The rest of the modules needed by `spack_db.py` are part of Python's standard library. (`Create_spack_DB.py` additionally requires `dapper-python`.)

2. Provide a database file
Update the `file_name` in `main()` if needed.
Collaborator review comment:
From a UX perspective, adding support for a command line argument that lets a user provide the name of the database file would be nice.
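
A minimal sketch of that idea (the `--db-file` flag and its default are hypothetical; the script currently hardcodes the name in `main()`):

```python
import argparse

def parse_args():
    # Hypothetical CLI for spack_db.py: let the user name the database file.
    parser = argparse.ArgumentParser(description="Scrape the Spack build cache")
    parser.add_argument(
        "--db-file",
        default="cache/spack.index.db.json",
        help="path to the index database file (default: %(default)s)",
    )
    return parser.parse_args()

# main() would then read parse_args().db_file instead of a hardcoded file_name.
```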


3. Run the script
```bash
python spack_db.py
```

4. Resume after interruption
If an interruption occurs, it is safe to re-run the script without losing data already processed.

5. Retry manifests or tarballs
Delete `skipped_manifests.txt`, `malformed_manifests.txt`, and/or `failed_tarball_downloads.txt` to retry the corresponding failed manifest or tarball downloads.

6. Run Create_spack_DB.py to create the SQLite database
```bash
python Create_spack_DB.py
```
The database will include all files extracted from the packages in the Spack build cache.
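
As a quick sanity check, the resulting database can be queried for the packages that ship a given file (table and column names as created by `Create_spack_DB.py`; the file name below is just an example):

```python
import sqlite3

conn = sqlite3.connect("cache/spack-v1.db")
rows = conn.execute(
    "SELECT DISTINCT package_name FROM package_files "
    "WHERE normalized_file_name = ?",
    ("libz.so",),  # example lookup; normalized names are stored lowercased
).fetchall()
print(rows)
conn.close()
```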
110 changes: 110 additions & 0 deletions dataset-generation/spack_db/conftest.py
@@ -0,0 +1,110 @@
# conftest.py
import io
import json
import tarfile
import pytest


import spack_db as pl

@pytest.fixture(autouse=True)
def isolate_fs(tmp_path, monkeypatch):
"""Redirect all cache/config paths to a temp dir per test."""
cache = tmp_path / "cache"
(cache / "manifest").mkdir(parents=True, exist_ok=True)
(cache / "tarinfo").mkdir(parents=True, exist_ok=True)
(cache / "spec_manifests").mkdir(parents=True, exist_ok=True)
(cache / "binary_packages").mkdir(parents=True, exist_ok=True)

monkeypatch.setattr(pl, "INDEX_FILE", str(cache / "spack.index.db.json"), raising=False)
monkeypatch.setattr(pl, "MANIFEST_DIR", str(cache / "manifest"), raising=False)
monkeypatch.setattr(pl, "TARINFO_DIR", str(cache / "tarinfo"), raising=False)
monkeypatch.setattr(pl, "SPEC_CACHE_DIR", str(cache / "spec_manifests"), raising=False)
monkeypatch.setattr(pl, "BINARY_CACHE_DIR", str(cache / "binary_packages"), raising=False)

monkeypatch.setattr(pl, "CHECKPOINT_FILE", str(tmp_path / "progress.txt"), raising=False)
monkeypatch.setattr(pl, "SKIPPED_MANIFESTS_FILE", str(cache / "skipped_manifests.txt"), raising=False)
monkeypatch.setattr(pl, "MALFORMED_MANIFESTS_FILE", str(cache / "malformed_manifests.txt"), raising=False)
monkeypatch.setattr(pl, "TIMEOUT_LOG_FILE", str(cache / "timeouts.txt"), raising=False)
monkeypatch.setattr(pl, "MISSING_TARBALL_HASH_FILE", str(cache / "missing_tarballs.txt"), raising=False)
monkeypatch.setattr(pl, "SHARED_TARBALL_HASH_FILE", str(cache / "shared_tarballs.txt"), raising=False)
monkeypatch.setattr(pl, "FAILED_TARBALL_DOWNLOAD_FILE", str(cache / "failed_tarball_downloads.txt"), raising=False)

# Ensure directories exist for atomic writes
(tmp_path / "cache").mkdir(exist_ok=True)
yield


@pytest.fixture
def sample_manifest_json():
"""
Create the *actual bytes* expected by remove_lines_spec_manifest:
take a valid JSON, then pad 49 bytes in front and 834 bytes at the end.
"""
body = {
"data": [
{"mediaType": "irrelevant/type", "checksum": "abc"},
{"mediaType": "application/vnd.spack.install.v2.tar+gzip",
"checksum": "f4d1969c7a82c76b962ae969c91d7b54cc11e0ce9f1ec9277789990f58aab351"}
]
}
raw = json.dumps(body).encode("utf-8")
return b"x" * 49 + raw + b"y" * 834


@pytest.fixture
def tar_with_placeholder_bytes():
"""
Build a tar in-memory whose members include the __spack_path_placeh segments
and the package-tail folder (e.g., 'compiler-wrapper-1.0-<hash>').
"""
pkg_tail = "compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3"
member_name = (
"home/software/spack/__spack_path_placeholder__/__spack_path_placeholder__/"
"__spack_path_placeholder__/__spack_path_placeh/morepadding/linux-x86_64_v3/"
f"{pkg_tail}/.spack/install_environment.json"
)

buf = io.BytesIO()
with tarfile.open(fileobj=buf, mode="w:gz") as tf:
data = b"{}"
tarinfo = tarfile.TarInfo(name=member_name)
tarinfo.size = len(data)
tf.addfile(tarinfo, io.BytesIO(data))
return buf.getvalue()


class DummyResp:
def __init__(self, status_code=200, content=b""):
self.status_code = status_code
self.content = content


@pytest.fixture
def fake_requests(monkeypatch):
"""
Monkeypatch requests.get with programmable behavior per-URL.
Usage in a test: register URL -> factory entries on the fixture's table, e.g.
    fake_requests.table["https://.../index.json"] = lambda: DummyResp(200, b"...")
A factory may also return an Exception instance to simulate network errors.
"""
table = {}

def _get(url, *args, **kwargs):
if url not in table:
raise AssertionError(f"Unexpected URL requested: {url}")
result = table[url]()
# Allow raising exceptions (e.g., Timeout) from factories
if isinstance(result, Exception):
raise result
return result

# Expose for tests to fill
_get.table = table
monkeypatch.setattr(pl.requests, "get", _get)
return _get
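
A sketch of how a test might exercise these fixtures (hypothetical URL and test name; assumes conftest.py sits next to the tests so it is importable):

```python
# test_spack_db.py (illustrative)
from conftest import DummyResp
import spack_db as pl

def test_fake_requests_routes_urls(fake_requests):
    url = "https://binaries.spack.io/example.spec.manifest.json"  # hypothetical
    fake_requests.table[url] = lambda: DummyResp(200, b"{}")
    # requests.get is monkeypatched, so this hits the table, not the network.
    assert pl.requests.get(url).status_code == 200
```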
35 changes: 35 additions & 0 deletions dataset-generation/spack_db/rename_tarinfo_file.py
@@ -0,0 +1,35 @@
import os
import re

# Path to your tarinfo directory
TARINFO_DIR = "cache/tarinfo"

# Match <prefix>-<64-char-sha256>.json
pattern = re.compile(r"^(.*)-([a-f0-9]{64})\.json$")

# Counter
renamed = 0
skipped = 0

for filename in os.listdir(TARINFO_DIR):
match = pattern.match(filename)
Collaborator review comment:
Instead of using a regular expression with matching groups, you could skip the regex entirely and do the same thing by splitting the filename with rsplit() on the last dash ("-").
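
A rough sketch of that alternative (assuming every candidate name ends in `.json` with the hash after the last dash):

```python
stem = filename.removesuffix(".json")          # "<prefix>-<sha256>"
prefix, _, sha256_hash = stem.rpartition("-")  # split on the last dash
# Treat a 64-char lowercase-hex tail as a match; skip anything else.
if prefix and len(sha256_hash) == 64 and all(c in "0123456789abcdef" for c in sha256_hash):
    new_filename = f"{sha256_hash}.json"
```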

if match:
sha256_hash = match.group(2)
new_filename = f"{sha256_hash}.json"

old_path = os.path.join(TARINFO_DIR, filename)
new_path = os.path.join(TARINFO_DIR, new_filename)

# Skip if target file already exists
if os.path.exists(new_path):
print(f"⚠️ Skipping {filename} (target {new_filename} already exists)")
skipped += 1
continue

os.rename(old_path, new_path)
renamed += 1
else:
print(f"❓ Skipping non-matching file: {filename}")
skipped += 1

print(f"\n✅ Done! Renamed {renamed} files. Skipped {skipped} files.")