Changes from all commits (21 commits)
526dd7a  initial spack dataset scraper (lfquintaz, Jul 16, 2025)
8421d96  added practice message of "meow" (lfquintaz, Jul 16, 2025)
ac16bd8  added methods to create local cache for easy interrupt/proceed from l… (lfquintaz, Jul 19, 2025)
c362a96  added files to capture failed manifest/tarball download (lfquintaz, Aug 11, 2025)
059f205  added atomic save functions for manifest and tarballs (lfquintaz, Aug 11, 2025)
b5e46e0  Add files via upload (lfquintaz, Aug 11, 2025)
601f74a  removed print statement at restart that shows all the fully processed… (lfquintaz, Aug 15, 2025)
9ed218f  merged diverged branch file updates in spack_db.py (lfquintaz, Aug 15, 2025)
670be16  added Create_spack_DB.py (lfquintaz, Aug 15, 2025)
cefc0fc  added test files (lfquintaz, Aug 16, 2025)
81afc13  updated remove_lines_spec_manifest function (lfquintaz, Aug 18, 2025)
f189767  added README.md (lfquintaz, Aug 18, 2025)
b434859  fixed bullet points in Directory Structure and Features (lfquintaz, Aug 18, 2025)
bfa7dbe  fixed bullet points for the layout of the index (lfquintaz, Aug 18, 2025)
c3ba8d5  updated Usage (lfquintaz, Aug 18, 2025)
fe63504  updated README.md with retry manifest/tarball download instruction (lfquintaz, Aug 19, 2025)
19e76fb  updated _to_posix for printing (lfquintaz, Aug 19, 2025)
880f545  removed spec_manifest info from README.md (lfquintaz, Aug 21, 2025)
b2de168  removed 'updated' comments (lfquintaz, Aug 21, 2025)
dc0ad5c  removed 'contributing' from README.md (lfquintaz, Aug 21, 2025)
9ebd7fc  added supporting scripts for testing (lfquintaz, Aug 21, 2025)
143 changes: 143 additions & 0 deletions dataset-generation/spack_db/Create_spack_DB.py
@@ -0,0 +1,143 @@
import os
Collaborator suggested change:
-import os
+# /// script
+# dependencies = [
+#   "dapper-python",
+# ]
+# ///
+import os

Adding inline script metadata to capture the list of dependencies is useful both as documentation and because it makes it possible to use uv or pipx to run the script without having to manually install dependencies.

(https://peps.python.org/pep-0723 has more info on this inline script metadata and its format)
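
With that block in place, for example, `uv run Create_spack_DB.py` should resolve and install `dapper-python` into an ephemeral environment automatically (assuming a reasonably recent uv).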

import json
import sqlite3
import time
from dapper_python.normalize import normalize_file_name

# configuration
INDEX_PATH = "cache/spack.index.db.json"
SQLITE_DB_PATH = "cache/spack-v1.db"

def build_package_filelist_db():
# load index
if not os.path.exists(INDEX_PATH):
print("❌ Index file not found.")
return

with open(INDEX_PATH, "r") as f:
index = json.load(f)

# Create SQLite DB
conn = sqlite3.connect(SQLITE_DB_PATH)
cursor = conn.cursor()

# Create table columns
cursor.execute('''
CREATE TABLE IF NOT EXISTS package_files (
id INTEGER PRIMARY KEY,
file_name TEXT,
normalized_file_name TEXT,
file_path TEXT,
package_name TEXT,
UNIQUE(file_path, package_name)
)
''')

# Create indices for efficient lookups
cursor.execute('CREATE INDEX IF NOT EXISTS idx_file_name ON package_files(file_name)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_normalized_file_name ON package_files(normalized_file_name)')


# Create dataset_version table
cursor.execute('''
CREATE TABLE IF NOT EXISTS dataset_version(
version INTEGER,
format TEXT,
timestamp INTEGER
)
''')

# Clear the dataset_version table
cursor.execute("DELETE FROM dataset_version")

# Record the dataset version
cursor.execute(
"INSERT INTO dataset_version (version, format, timestamp) "
"VALUES (?, ?, ?)",
(1, "Spack", int(time.time()))
)

inserted_packages = 0
inserted_files = 0
for package_hash, entry in index.items():
try:
package_name = entry["name"]
version = entry["version"]
sha256 = entry["sha256"]


tarinfo_path = entry.get("tarinfo_path")
if not tarinfo_path or not os.path.exists(tarinfo_path):
print(f"⚠️ Missing tarinfo for: {package_name}-{version}-{sha256}")
continue

with open(tarinfo_path, "r") as f:
file_list = json.load(f)

package_inserted_or_updated = False

for file_path in file_list:
# skipping .spack/ files
if file_path.startswith(".spack/"):
continue

# Extract file name
file_name = os.path.basename(file_path)

# Normalize the file name
try:
normalized = normalize_file_name(file_name)
normalized_file_name = str(normalized).lower()
except Exception as e:
Collaborator review comment:
I don't think the normalize_file_name function throws any exceptions?

As a general note about exceptions in Python (a linter such as pylint would flag this), it is best to catch the most specific exception type(s) possible rather than the generic base Exception -- such as IndexError or ValueError, depending on what the code in the try block can actually throw.
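
A minimal sketch of the narrower pattern (assuming ValueError is representative of what the call could actually raise):

```python
try:
    normalized = normalize_file_name(file_name)
    normalized_file_name = str(normalized).lower()
except ValueError as e:  # catch only what the call is known to raise
    print(f"Failed to normalize '{file_name}': {e}")
    normalized_file_name = file_name.lower()
```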

print(f"⚠️ Failed to normalize '{file_name}': {e}")
normalized_file_name = file_name.lower()

# Insert into DB
cursor.execute(
'''INSERT OR IGNORE INTO package_files
(file_name, normalized_file_name, file_path, package_name)
VALUES (?, ?, ?, ?)''',
(file_name, normalized_file_name, file_path, package_name)
)

if cursor.rowcount > 0:
inserted_files += 1
package_inserted_or_updated = True # New row added
continue # No need to update - freshly inserted
# Row already exists - check if any values changed
cursor.execute(
''' SELECT file_name, normalized_file_name FROM package_files
WHERE file_path = ? AND package_name = ?''',
(file_path, package_name)
)
result = cursor.fetchone()
if result:
existing_file_name, existing_normalized_name = result
if (existing_file_name != file_name) or (existing_normalized_name != normalized_file_name):
# Something changed - update the row
cursor.execute(
''' UPDATE package_files
SET file_name = ?, normalized_file_name = ?
WHERE file_path = ? AND package_name = ?''',
(file_name, normalized_file_name, file_path, package_name)
)
package_inserted_or_updated = True # A row was updated
if package_inserted_or_updated:
inserted_packages += 1


except Exception as e:
print(f"❌ Failed to insert {package_hash}: {e}")
continue

conn.commit()
conn.close()

print(f"🎉 Done. Inserted {inserted_files} new files from {inserted_packages} packages into {SQLITE_DB_PATH}")

if __name__ == "__main__":
build_package_filelist_db()
61 changes: 61 additions & 0 deletions dataset-generation/spack_db/README.md
@@ -0,0 +1,61 @@
# Spack Build Cache Data Scraper & SQLite Database

This project aims to scrape the Spack build cache by downloading, cleaning, and indexing spec manifests and binary tarballs into a local cache, then convert the data into a Spack SQLite database.
Collaborator suggested change:
-This project aims to scrape the Spack build cache by downloading, cleaning, and indexing spec manifests and binary tarballs into a local cache, then convert the data into a Spack SQLite database.
+This project aims to scrape the Spack build cache by downloading, cleaning, and indexing spec manifests and binary tarballs into a local cache, then convert the data into a SQLite database that maps file names back to the Spack package that contains that file.


The program builds a master index called `spack.index.db.json`; an illustrative entry is sketched after the layout list.
* Index layout:
* spec manifest hash as the unique key
* package name and version
* the package tarball's unique SHA256 hash
* path to the package manifest in the local cache directory
* path to the package tarinfo in the local cache directory
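
A sketch of one index entry (the field names `name`, `version`, `sha256`, and `tarinfo_path` appear in `Create_spack_DB.py`; `manifest_path` and all values here are illustrative assumptions):

```python
index_entry = {
    "<spec-manifest-hash>": {
        "name": "zlib",                                 # package name
        "version": "1.3.1",                             # package version
        "sha256": "<tarball-sha256>",                   # tarball hash
        "manifest_path": "cache/manifest/<hash>.json",  # assumed field name
        "tarinfo_path": "cache/tarinfo/<sha256>.json",
    }
}
```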

The program supports restart/resume if a run is interrupted. Skipped or malformed manifests are recorded, and when both the manifest and tarball information already exist locally, the files are not re-downloaded.

## Directory Structure
* `cache/spack.index.db.json` - master index
* `cache/manifest/` - cleaned spec manifests
* `cache/tarinfo/` - JSON file lists extracted from tarballs
* `cache/binary_packages/` - temporary cache of downloaded tarballs
* `cache/timeouts.txt` - packages that timed out while downloading
* `cache/skipped_manifests.txt` - a list of manifests that could not be downloaded
* `cache/malformed_manifests.txt` - manifests that failed parsing
* `cache/missing_tarballs.txt` - manifests without a tarball hash
* `cache/shared_tarballs.txt` - records multiple manifests that point to the same tarball
* `cache/failed_tarball_downloads.txt` - tarballs that failed to download

## Features
* Retrieves package `.spec.manifest.json` from Spack's binary mirror
* Extracts the valid JSON payload and removes extra characters
* Retrieves binary tarballs and extracts file lists
* Creates and maintains a canonical JSON index that maps package to it's manifest and tarball information
Collaborator suggested change for the two bullets above:
-* Retrieves binary tarballs and extracts file lists
-* Creates and maintains a canonical JSON index that maps package to it's manifest and tarball information
+* Retrieves package binary tarballs and extracts file lists
+* Creates and maintains a canonical JSON index that maps package to its manifest and tarball information

* Contains multiple checkpoints for safe restart/resume of the program
* Records skipped/malformed manifests, missing hashes, failed tarbll downloads
Collaborator suggested change:
-* Records skipped/malformed manifests, missing hashes, failed tarbll downloads
+* Records skipped/malformed manifests, missing hashes, failed tarball downloads

* Stores forward-slash paths in the JSON index for cross-platform use

## Usage
1. Install dependencies
```bash
pip install requests
```
The rest of the modules needed by `spack_db.py` are part of Python's standard library. (`Create_spack_DB.py` additionally requires `dapper-python`.)

2. Provide a database file
Update the `file_name` in `main()` if needed.
Collaborator review comment:
From a UX perspective, adding support for a command line argument that lets a user provide the name of the database file would be nice.
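
A minimal sketch of that idea (the `--db-file` flag and its default are hypothetical; the script currently hardcodes the name in `main()`):

```python
import argparse

def parse_args():
    # Hypothetical CLI for spack_db.py: let the user name the database file.
    parser = argparse.ArgumentParser(description="Scrape the Spack build cache")
    parser.add_argument(
        "--db-file",
        default="cache/spack.index.db.json",
        help="path to the index database file (default: %(default)s)",
    )
    return parser.parse_args()

# main() would then read parse_args().db_file instead of a hardcoded file_name.
```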


3. Run the script
```bash
python spack_db.py
```

4. Resume after interruption
If an interruption occurs, it is safe to re-run the script without losing data already processed.

5. Retry manifests or tarballs
Delete `skipped_manifests.txt`, `malformed_manifests.txt`, and/or `failed_tarball_downloads.txt` to retry the corresponding failed manifest or tarball downloads.

6. Run Create_spack_DB.py to create the SQLite database
```bash
python Create_spack_DB.py
```
The database will include all files extracted from the packages in the Spack build cache.
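
As a quick sanity check, the resulting database can be queried for the packages that ship a given file (table and column names as created by `Create_spack_DB.py`; the file name below is just an example):

```python
import sqlite3

conn = sqlite3.connect("cache/spack-v1.db")
rows = conn.execute(
    "SELECT DISTINCT package_name FROM package_files "
    "WHERE normalized_file_name = ?",
    ("libz.so",),  # example lookup; normalized names are stored lowercased
).fetchall()
print(rows)
conn.close()
```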
110 changes: 110 additions & 0 deletions dataset-generation/spack_db/conftest.py
@@ -0,0 +1,110 @@
# conftest.py
import io
import json
import tarfile
import pytest


import spack_db as pl

@pytest.fixture(autouse=True)
def isolate_fs(tmp_path, monkeypatch):
"""Redirect all cache/config paths to a temp dir per test."""
cache = tmp_path / "cache"
(cache / "manifest").mkdir(parents=True, exist_ok=True)
(cache / "tarinfo").mkdir(parents=True, exist_ok=True)
(cache / "spec_manifests").mkdir(parents=True, exist_ok=True)
(cache / "binary_packages").mkdir(parents=True, exist_ok=True)

monkeypatch.setattr(pl, "INDEX_FILE", str(cache / "spack.index.db.json"), raising=False)
monkeypatch.setattr(pl, "MANIFEST_DIR", str(cache / "manifest"), raising=False)
monkeypatch.setattr(pl, "TARINFO_DIR", str(cache / "tarinfo"), raising=False)
monkeypatch.setattr(pl, "SPEC_CACHE_DIR", str(cache / "spec_manifests"), raising=False)
monkeypatch.setattr(pl, "BINARY_CACHE_DIR", str(cache / "binary_packages"), raising=False)

monkeypatch.setattr(pl, "CHECKPOINT_FILE", str(tmp_path / "progress.txt"), raising=False)
monkeypatch.setattr(pl, "SKIPPED_MANIFESTS_FILE", str(cache / "skipped_manifests.txt"), raising=False)
monkeypatch.setattr(pl, "MALFORMED_MANIFESTS_FILE", str(cache / "malformed_manifests.txt"), raising=False)
monkeypatch.setattr(pl, "TIMEOUT_LOG_FILE", str(cache / "timeouts.txt"), raising=False)
monkeypatch.setattr(pl, "MISSING_TARBALL_HASH_FILE", str(cache / "missing_tarballs.txt"), raising=False)
monkeypatch.setattr(pl, "SHARED_TARBALL_HASH_FILE", str(cache / "shared_tarballs.txt"), raising=False)
monkeypatch.setattr(pl, "FAILED_TARBALL_DOWNLOAD_FILE", str(cache / "failed_tarball_downloads.txt"), raising=False)

# Ensure directories exist for atomic writes
(tmp_path / "cache").mkdir(exist_ok=True)
yield


@pytest.fixture
def sample_manifest_json():
"""
Create the *actual bytes* expected by remove_lines_spec_manifest:
take a valid JSON, then pad 49 bytes in front and 834 bytes at the end.
"""
body = {
"data": [
{"mediaType": "irrelevant/type", "checksum": "abc"},
{"mediaType": "application/vnd.spack.install.v2.tar+gzip",
"checksum": "f4d1969c7a82c76b962ae969c91d7b54cc11e0ce9f1ec9277789990f58aab351"}
]
}
raw = json.dumps(body).encode("utf-8")
return b"x" * 49 + raw + b"y" * 834


@pytest.fixture
def tar_with_placeholder_bytes():
"""
Build a tar in-memory whose members include the __spack_path_placeh segments
and the package-tail folder (e.g., 'compiler-wrapper-1.0-<hash>').
"""
pkg_tail = "compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3"
member_name = (
"home/software/spack/__spack_path_placeholder__/__spack_path_placeholder__/"
"__spack_path_placeholder__/__spack_path_placeh/morepadding/linux-x86_64_v3/"
f"{pkg_tail}/.spack/install_environment.json"
)

buf = io.BytesIO()
with tarfile.open(fileobj=buf, mode="w:gz") as tf:
data = b"{}"
tarinfo = tarfile.TarInfo(name=member_name)
tarinfo.size = len(data)
tf.addfile(tarinfo, io.BytesIO(data))
return buf.getvalue()


class DummyResp:
def __init__(self, status_code=200, content=b""):
self.status_code = status_code
self.content = content


@pytest.fixture
def fake_requests(monkeypatch):
"""
Monkeypatch requests.get with programmable behavior per-URL.
Usage in a test: register URL -> factory entries on the fixture's table, e.g.
    fake_requests.table["https://.../index.json"] = lambda: DummyResp(200, b"...")
A factory may also return an Exception instance to simulate network errors.
"""
table = {}

def _get(url, *args, **kwargs):
if url not in table:
raise AssertionError(f"Unexpected URL requested: {url}")
result = table[url]()
# Allow raising exceptions (e.g., Timeout) from factories
if isinstance(result, Exception):
raise result
return result

# Expose for tests to fill
_get.table = table
monkeypatch.setattr(pl.requests, "get", _get)
return _get
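
A sketch of how a test might exercise these fixtures (hypothetical URL and test name; assumes conftest.py sits next to the tests so it is importable):

```python
# test_spack_db.py (illustrative)
from conftest import DummyResp
import spack_db as pl

def test_fake_requests_routes_urls(fake_requests):
    url = "https://binaries.spack.io/example.spec.manifest.json"  # hypothetical
    fake_requests.table[url] = lambda: DummyResp(200, b"{}")
    # requests.get is monkeypatched, so this hits the table, not the network.
    assert pl.requests.get(url).status_code == 200
```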
35 changes: 35 additions & 0 deletions dataset-generation/spack_db/rename_tarinfo_file.py
@@ -0,0 +1,35 @@
import os
import re

# Path to your tarinfo directory
TARINFO_DIR = "cache/tarinfo"

# Match <prefix>-<64-char-sha256>.json
pattern = re.compile(r"^(.*)-([a-f0-9]{64})\.json$")

# Counter
renamed = 0
skipped = 0

for filename in os.listdir(TARINFO_DIR):
match = pattern.match(filename)
Collaborator review comment:
Instead of using a regular expression with matching groups, you could skip the regex entirely and do the same thing by splitting the filename with rsplit() on the last dash ("-").
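
A rough sketch of that alternative (assuming every candidate name ends in `.json` with the hash after the last dash):

```python
stem = filename.removesuffix(".json")          # "<prefix>-<sha256>"
prefix, _, sha256_hash = stem.rpartition("-")  # split on the last dash
# Treat a 64-char lowercase-hex tail as a match; skip anything else.
if prefix and len(sha256_hash) == 64 and all(c in "0123456789abcdef" for c in sha256_hash):
    new_filename = f"{sha256_hash}.json"
```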

if match:
sha256_hash = match.group(2)
new_filename = f"{sha256_hash}.json"

old_path = os.path.join(TARINFO_DIR, filename)
new_path = os.path.join(TARINFO_DIR, new_filename)

# Skip if target file already exists
if os.path.exists(new_path):
print(f"⚠️ Skipping {filename} (target {new_filename} already exists)")
skipped += 1
continue

os.rename(old_path, new_path)
renamed += 1
else:
print(f"❓ Skipping non-matching file: {filename}")
skipped += 1

print(f"\n✅ Done! Renamed {renamed} files. Skipped {skipped} files.")