Spack scraping lq #154
base: main
Changes from all commits
526dd7a
8421d96
ac16bd8
c362a96
059f205
b5e46e0
601f74a
9ed218f
670be16
cefc0fc
81afc13
f189767
b434859
bfa7dbe
c3ba8d5
fe63504
19e76fb
880f545
b2de168
dc0ad5c
9ebd7fc
@@ -0,0 +1,143 @@

```python
import os
import json
import sqlite3
import time
from dapper_python.normalize import normalize_file_name

# Configuration
INDEX_PATH = "cache/spack.index.db.json"
SQLITE_DB_PATH = "cache/spack-v1.db"


def build_package_filelist_db():
    # Load the index
    if not os.path.exists(INDEX_PATH):
        print("❌ Index file not found.")
        return

    with open(INDEX_PATH, "r") as f:
        index = json.load(f)

    # Create SQLite DB
    conn = sqlite3.connect(SQLITE_DB_PATH)
    cursor = conn.cursor()

    # Create the package_files table
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS package_files (
            id INTEGER PRIMARY KEY,
            file_name TEXT,
            normalized_file_name TEXT,
            file_path TEXT,
            package_name TEXT,
            UNIQUE(file_path, package_name)
        )
    ''')

    # Create indices for efficient lookups
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_file_name ON package_files(file_name)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_normalized_file_name ON package_files(normalized_file_name)')

    # Create dataset_version table
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS dataset_version(
            version INTEGER,
            format TEXT,
            timestamp INTEGER
        )
    ''')

    # Clear the dataset_version table
    cursor.execute("DELETE FROM dataset_version")

    # Record the dataset version, format, and timestamp
    cursor.execute(
        "INSERT INTO dataset_version (version, format, timestamp) "
        "VALUES (?, ?, ?)",
        (1, "Spack", int(time.time()))
    )

    inserted_packages = 0
    inserted_files = 0
    for package_hash, entry in index.items():
        try:
            package_name = entry["name"]
            version = entry["version"]
            sha256 = entry["sha256"]

            tarinfo_path = entry.get("tarinfo_path")
            if not tarinfo_path or not os.path.exists(tarinfo_path):
                print(f"⚠️ Missing tarinfo for: {package_name}-{version}-{sha256}")
                continue

            with open(tarinfo_path, "r") as f:
                file_list = json.load(f)

            package_inserted_or_updated = False

            for file_path in file_list:
                # Skip .spack/ files
                if file_path.startswith(".spack/"):
                    continue

                # Extract the file name
                file_name = os.path.basename(file_path)

                # Normalize the file name
                try:
                    normalized = normalize_file_name(file_name)
                    normalized_file_name = str(normalized).lower()
                except Exception as e:
```
**Collaborator:** I don't think the normalize_file_name function throws any exceptions? As a general note with exceptions in Python (and if we had a linter such as pylint checking the code it would complain about this), it is best to capture the most specific exception type(s) possible, rather than the generic base `Exception` -- such as `IndexError`, `ValueError`, etc., depending on what the code in the try block could actually throw.
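If normalization can in fact raise, a narrower handler along the lines of this suggestion might look like the fragment below (the exception types are illustrative assumptions, since `dapper_python`'s behavior isn't shown here):

```python
# Hypothetical narrowing of the except clause -- the concrete types would have
# to match whatever normalize_file_name can actually raise.
try:
    normalized_file_name = str(normalize_file_name(file_name)).lower()
except (ValueError, AttributeError) as e:
    print(f"⚠️ Failed to normalize '{file_name}': {e}")
    normalized_file_name = file_name.lower()
```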
```python
                    print(f"⚠️ Failed to normalize '{file_name}': {e}")
                    normalized_file_name = file_name.lower()

                # Insert into DB
                cursor.execute(
                    '''INSERT OR IGNORE INTO package_files
                       (file_name, normalized_file_name, file_path, package_name)
                       VALUES (?, ?, ?, ?)''',
                    (file_name, normalized_file_name, file_path, package_name)
                )

                if cursor.rowcount > 0:
                    inserted_files += 1
                    package_inserted_or_updated = True  # New row added
                    continue  # No need to update - freshly inserted

                # Row already exists - check if any values changed
                cursor.execute(
                    '''SELECT file_name, normalized_file_name FROM package_files
                       WHERE file_path = ? AND package_name = ?''',
                    (file_path, package_name)
                )
                result = cursor.fetchone()
                if result:
                    existing_file_name, existing_normalized_name = result
                    if (existing_file_name != file_name) or (existing_normalized_name != normalized_file_name):
                        # Something changed - update the row
                        cursor.execute(
                            '''UPDATE package_files
                               SET file_name = ?, normalized_file_name = ?
                               WHERE file_path = ? AND package_name = ?''',
                            (file_name, normalized_file_name, file_path, package_name)
                        )
                        package_inserted_or_updated = True  # A row was updated

            if package_inserted_or_updated:
                inserted_packages += 1

        except Exception as e:
            print(f"❌ Failed to insert {package_hash}: {e}")
            continue

    conn.commit()
    conn.close()

    print(f"🎉 Done. Inserted {inserted_files} new files from {inserted_packages} packages into {SQLITE_DB_PATH}")


if __name__ == "__main__":
    build_package_filelist_db()
```
@@ -0,0 +1,61 @@

# Spack Build Cache Data Scraper & SQLite Database

This project scrapes the Spack build cache by downloading, cleaning, and indexing spec manifests and binary tarballs into a local cache, and then converts the data into a Spack SQLite database.
The program builds a master index called `spack.index.db.json`.
* Index layout (an illustrative entry is sketched below):
  * spec manifest hash as the unique key
  * package name and version
  * the package tarball's unique SHA256 hash
  * path to the package manifest in the local cache directory
  * path to the package tarinfo in the local cache directory
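For illustration, one entry loaded from the index might look roughly like this (the `name`, `version`, `sha256`, and `tarinfo_path` fields are the ones `Create_spack_DB.py` reads; the example values and the `manifest_path` field name are assumptions):

```python
# Hypothetical shape of one index entry after json.load(); values are made up.
index = {
    "<spec-manifest-hash>": {
        "name": "zlib-ng",
        "version": "2.1.6",
        "sha256": "<tarball-sha256>",
        "manifest_path": "cache/manifest/<spec-manifest-hash>.json",  # assumed field name
        "tarinfo_path": "cache/tarinfo/<tarball-sha256>.json",
    },
}
```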
The program supports restart/resume if a run is interrupted. Skipped or malformed manifests are recorded, and when the information already exists for both the manifest and the tarball, re-downloading files is avoided.
## Directory Structure
* `cache/spack.index.db.json` - master index
* `cache/manifest/` - cleaned spec manifests
* `cache/tarinfo/` - JSON file lists extracted from tarballs
* `cache/binary_packages/` - temporary cache of downloaded tarballs
* `cache/timeouts.txt` - packages that timed out while downloading
* `cache/skipped_manifests.txt` - a list of manifests that could not be downloaded
* `cache/malformed_manifests.txt` - manifests that failed parsing
* `cache/missing_tarballs.txt` - manifests without a tarball hash
* `cache/shared_tarballs.txt` - records multiple manifests that point to the same tarball
* `cache/failed_tarball_downloads.txt` - tarballs that failed to download
## Features
* Retrieves package `.spec.manifest.json` files from Spack's binary mirror
* Extracts the valid JSON payload and removes extra characters
* Retrieves binary tarballs and extracts file lists
* Creates and maintains a canonical JSON index that maps each package to its manifest and tarball information
* Contains multiple checkpoints for safe restart/resume of the program
* Records skipped/malformed manifests, missing hashes, and failed tarball downloads
* Stores forward-slash paths in the JSON index for cross-platform use
## Usage
1. Install dependencies
```bash
pip install requests
```
The rest of the necessary modules are part of Python's standard library.

2. Provide a database file
Update the `file_name` in `main()` if needed.
**Collaborator:** From a UX perspective, adding support for a command line argument that lets a user provide the name of the database file would be nice.
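A minimal sketch of that suggestion (the flag name and default are assumptions, not part of the PR):

```python
import argparse

# Hypothetical command-line flag for the database file name; main() would use
# args.db_file instead of a hard-coded file_name.
parser = argparse.ArgumentParser(description="Scrape the Spack build cache")
parser.add_argument(
    "--db-file",
    default="cache/spack-v1.db",  # assumed default
    help="Name of the SQLite database file to use",
)
args = parser.parse_args()
print(f"Using database file: {args.db_file}")
```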
3. Run the script
```bash
python spack_db.py
```

4. Resume after interruption
If an interruption occurs, it is safe to re-run the script without losing data already processed.

5. Retry manifests or tarballs
Delete the files `skipped_manifests.txt`, `malformed_manifests.txt`, and `failed_tarball_downloads.txt` to retry failed manifest or tarball downloads.
6. Run Create_spack_DB.py to create the SQLite database
```bash
python Create_spack_DB.py
```
The database will include all files extracted from the packages in the Spack build cache.
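The resulting database can then be queried with Python's built-in `sqlite3` module. A minimal sketch (table and column names come from `Create_spack_DB.py`; the file name being looked up is a made-up example):

```python
import sqlite3

conn = sqlite3.connect("cache/spack-v1.db")
cursor = conn.cursor()

# Find which packages ship a given file; normalized names are stored lowercase.
cursor.execute(
    "SELECT DISTINCT package_name FROM package_files WHERE normalized_file_name = ?",
    ("libexample.so",),  # hypothetical file name
)
for (package_name,) in cursor.fetchall():
    print(package_name)

conn.close()
```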
@@ -0,0 +1,110 @@

```python
# conftest.py
import io
import json
import tarfile
import types
import pytest

import spack_db as pl


@pytest.fixture(autouse=True)
def isolate_fs(tmp_path, monkeypatch):
    """Redirect all cache/config paths to a temp dir per test."""
    cache = tmp_path / "cache"
    (cache / "manifest").mkdir(parents=True, exist_ok=True)
    (cache / "tarinfo").mkdir(parents=True, exist_ok=True)
    (cache / "spec_manifests").mkdir(parents=True, exist_ok=True)
    (cache / "binary_packages").mkdir(parents=True, exist_ok=True)

    monkeypatch.setattr(pl, "INDEX_FILE", str(cache / "spack.index.db.json"), raising=False)
    monkeypatch.setattr(pl, "MANIFEST_DIR", str(cache / "manifest"), raising=False)
    monkeypatch.setattr(pl, "TARINFO_DIR", str(cache / "tarinfo"), raising=False)
    monkeypatch.setattr(pl, "SPEC_CACHE_DIR", str(cache / "spec_manifests"), raising=False)
    monkeypatch.setattr(pl, "BINARY_CACHE_DIR", str(cache / "binary_packages"), raising=False)

    monkeypatch.setattr(pl, "CHECKPOINT_FILE", str(tmp_path / "progress.txt"), raising=False)
    monkeypatch.setattr(pl, "SKIPPED_MANIFESTS_FILE", str(cache / "skipped_manifests.txt"), raising=False)
    monkeypatch.setattr(pl, "MALFORMED_MANIFESTS_FILE", str(cache / "malformed_manifests.txt"), raising=False)
    monkeypatch.setattr(pl, "TIMEOUT_LOG_FILE", str(cache / "timeouts.txt"), raising=False)
    monkeypatch.setattr(pl, "MISSING_TARBALL_HASH_FILE", str(cache / "missing_tarballs.txt"), raising=False)
    monkeypatch.setattr(pl, "SHARED_TARBALL_HASH_FILE", str(cache / "shared_tarballs.txt"), raising=False)
    monkeypatch.setattr(pl, "FAILED_TARBALL_DOWNLOAD_FILE", str(cache / "failed_tarball_downloads.txt"), raising=False)

    # Ensure directories exist for atomic writes
    (tmp_path / "cache").mkdir(exist_ok=True)
    yield


@pytest.fixture
def sample_manifest_json():
    """
    Create the *actual bytes* expected by remove_lines_spec_manifest:
    take a valid JSON payload, then pad 49 bytes in front and 834 bytes at the end.
    """
    body = {
        "data": [
            {"mediaType": "irrelevant/type", "checksum": "abc"},
            {"mediaType": "application/vnd.spack.install.v2.tar+gzip",
             "checksum": "f4d1969c7a82c76b962ae969c91d7b54cc11e0ce9f1ec9277789990f58aab351"}
        ]
    }
    raw = json.dumps(body).encode("utf-8")
    return b"x" * 49 + raw + b"y" * 834


@pytest.fixture
def tar_with_placeholder_bytes():
    """
    Build a tar in-memory whose members include the __spack_path_placeh segments
    and the package-tail folder (e.g., 'compiler-wrapper-1.0-<hash>').
    """
    pkg_tail = "compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3"
    member_name = (
        "home/software/spack/__spack_path_placeholder__/__spack_path_placeholder__/"
        "__spack_path_placeholder__/__spack_path_placeh/morepadding/linux-x86_64_v3/"
        f"{pkg_tail}/.spack/install_environment.json"
    )

    buf = io.BytesIO()
    with tarfile.open(fileobj=buf, mode="w:gz") as tf:
        data = b"{}"
        tarinfo = tarfile.TarInfo(name=member_name)
        tarinfo.size = len(data)
        tf.addfile(tarinfo, io.BytesIO(data))
    return buf.getvalue()


class DummyResp:
    def __init__(self, status_code=200, content=b""):
        self.status_code = status_code
        self.content = content


@pytest.fixture
def fake_requests(monkeypatch):
    """
    Monkeypatch requests.get with programmable per-URL behavior.
    Usage (from a test):
        fake_requests.table["...json"] = lambda: DummyResp(200, b"...")
    """
    table = {}

    def _get(url, *args, **kwargs):
        if url not in table:
            raise AssertionError(f"Unexpected URL requested: {url}")
        result = table[url]()
        # Allow raising exceptions (e.g., Timeout) from factories
        if isinstance(result, Exception):
            raise result
        return result

    # Expose for tests to fill
    _get.table = table
    monkeypatch.setattr(pl.requests, "get", _get)
    return _get
```
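For reference, a test built on these fixtures might look roughly like this (the file name, URL, and assertions are hypothetical, purely to show how `fake_requests` and `DummyResp` fit together):

```python
# test_fake_requests_demo.py -- hypothetical usage of the conftest fixtures.
import spack_db as pl
from conftest import DummyResp


def test_requests_get_is_routed_through_the_table(fake_requests):
    url = "https://mirror.example.invalid/spec.manifest.json"  # made-up URL
    fake_requests.table[url] = lambda: DummyResp(200, b"{}")

    # Any spack_db code path that calls requests.get(url) now receives the
    # canned response instead of touching the network.
    resp = pl.requests.get(url)
    assert resp.status_code == 200
    assert resp.content == b"{}"
```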
@@ -0,0 +1,35 @@

```python
import os
import re

# Path to your tarinfo directory
TARINFO_DIR = "cache/tarinfo"

# Updated regex: match <prefix>-<64-char-sha256>.json
pattern = re.compile(r"^(.*)-([a-f0-9]{64})\.json$")

# Counters
renamed = 0
skipped = 0

for filename in os.listdir(TARINFO_DIR):
    match = pattern.match(filename)
```
**Collaborator:** Instead of using a regular expression with matching groups, you could skip the regex entirely and do the same thing by just splitting the filename using `rsplit()` on the first dash ("-") encountered.
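A regex-free variant along those lines might look like this (the helper name is made up; it is not part of the PR):

```python
def hash_only_name(filename):
    """Return '<sha256>.json' for names like '<prefix>-<sha256>.json', else None.

    Hypothetical sketch of the reviewer's rsplit()-based suggestion.
    """
    if not filename.endswith(".json"):
        return None
    parts = filename[:-len(".json")].rsplit("-", 1)
    if len(parts) == 2 and len(parts[1]) == 64 and all(c in "0123456789abcdef" for c in parts[1]):
        return f"{parts[1]}.json"
    return None
```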
```python
    if match:
        sha256_hash = match.group(2)
        new_filename = f"{sha256_hash}.json"

        old_path = os.path.join(TARINFO_DIR, filename)
        new_path = os.path.join(TARINFO_DIR, new_filename)

        # Skip if target file already exists
        if os.path.exists(new_path):
            print(f"⚠️ Skipping {filename} (target {new_filename} already exists)")
            skipped += 1
            continue

        os.rename(old_path, new_path)
        renamed += 1
    else:
        print(f"❓ Skipping non-matching file: {filename}")
        skipped += 1

print(f"\n✅ Done! Renamed {renamed} files. Skipped {skipped} files.")
```
**Collaborator:** Adding inline script metadata that captures the list of dependencies is useful both as documentation and because it makes it possible to use uv or pipx to run the script without having to worry about manually installing dependencies (https://peps.python.org/pep-0723 has more info on this inline script metadata and its format).
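For example, a PEP 723 block at the top of the script would look roughly like this (assuming `requests` is the only third-party dependency, as the README states, and an arbitrary minimum Python version):

```python
# /// script
# requires-python = ">=3.9"
# dependencies = [
#     "requests",
# ]
# ///
import requests  # uv run / pipx run can now install this automatically
```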