diff --git a/dataset-generation/spack_db/Create_spack_DB.py b/dataset-generation/spack_db/Create_spack_DB.py
new file mode 100644
index 0000000..e2ccc8d
--- /dev/null
+++ b/dataset-generation/spack_db/Create_spack_DB.py
@@ -0,0 +1,143 @@
+import os
+import json
+import sqlite3
+import time
+from dapper_python.normalize import normalize_file_name
+
+# Configuration
+INDEX_PATH = "cache/spack.index.db.json"
+SQLITE_DB_PATH = "cache/spack-v1.db"
+
+def build_package_filelist_db():
+    # Load the master index
+    if not os.path.exists(INDEX_PATH):
+        print("❌ Index file not found.")
+        return
+
+    with open(INDEX_PATH, "r") as f:
+        index = json.load(f)
+
+    # Create SQLite DB
+    conn = sqlite3.connect(SQLITE_DB_PATH)
+    cursor = conn.cursor()
+
+    # Create the package_files table
+    cursor.execute('''
+        CREATE TABLE IF NOT EXISTS package_files (
+            id INTEGER PRIMARY KEY,
+            file_name TEXT,
+            normalized_file_name TEXT,
+            file_path TEXT,
+            package_name TEXT,
+            UNIQUE(file_path, package_name)
+        )
+    ''')
+
+    # Create indices for efficient lookups
+    cursor.execute('CREATE INDEX IF NOT EXISTS idx_file_name ON package_files(file_name)')
+    cursor.execute('CREATE INDEX IF NOT EXISTS idx_normalized_file_name ON package_files(normalized_file_name)')
+
+    # Create dataset_version table
+    cursor.execute('''
+        CREATE TABLE IF NOT EXISTS dataset_version(
+            version INTEGER,
+            format TEXT,
+            timestamp INTEGER
+        )
+    ''')
+
+    # Clear the dataset_version table
+    cursor.execute("DELETE FROM dataset_version")
+
+    # Record the current dataset version
+    cursor.execute(
+        "INSERT INTO dataset_version (version, format, timestamp) "
+        "VALUES (?, ?, ?)",
+        (1, "Spack", int(time.time()))
+    )
+
+    inserted_packages = 0
+    inserted_files = 0
+    for package_hash, entry in index.items():
+        try:
+            package_name = entry["name"]
+            version = entry["version"]
+            sha256 = entry["sha256"]
+
+            tarinfo_path = entry.get("tarinfo_path")
+            if not tarinfo_path or not os.path.exists(tarinfo_path):
+                print(f"⚠️ Missing tarinfo for: {package_name}-{version}-{sha256}")
+                continue
+
+            with open(tarinfo_path, "r") as f:
+                file_list = json.load(f)
+
+            package_inserted_or_updated = False
+
+            for file_path in file_list:
+                # Skip .spack/ metadata files
+                if file_path.startswith(".spack/"):
+                    continue
+
+                # Extract the file name
+                file_name = os.path.basename(file_path)
+
+                # Normalize the file name
+                try:
+                    normalized = normalize_file_name(file_name)
+                    normalized_file_name = str(normalized).lower()
+                except Exception as e:
+                    print(f"⚠️ Failed to normalize '{file_name}': {e}")
+                    normalized_file_name = file_name.lower()
+
+                # Insert into DB
+                cursor.execute(
+                    '''INSERT OR IGNORE INTO package_files
+                    (file_name, normalized_file_name, file_path, package_name)
+                    VALUES (?, ?, ?, ?)''',
+                    (file_name, normalized_file_name, file_path, package_name)
+                )
+
+                if cursor.rowcount > 0:
+                    inserted_files += 1
+                    package_inserted_or_updated = True  # New row added
+                    continue  # No need to update - freshly inserted
+
+                # Row already exists - check if any values changed
+                cursor.execute(
+                    '''SELECT file_name, normalized_file_name FROM package_files
+                    WHERE file_path = ? AND package_name = ?''',
+                    (file_path, package_name)
+                )
+                result = cursor.fetchone()
+                if result:
+                    existing_file_name, existing_normalized_name = result
+                    if (existing_file_name != file_name) or (existing_normalized_name != normalized_file_name):
+                        # Something changed - update the row
+                        cursor.execute(
+                            '''UPDATE package_files
+                            SET file_name = ?, normalized_file_name = ?
+                            WHERE file_path = ? AND package_name = ?''',
+                            (file_name, normalized_file_name, file_path, package_name)
+                        )
+                        package_inserted_or_updated = True  # A row was updated
+            if package_inserted_or_updated:
+                inserted_packages += 1
+
+        except Exception as e:
+            print(f"❌ Failed to insert {package_hash}: {e}")
+            continue
+
+    conn.commit()
+    conn.close()
+
+    print(f"🎉 Done. Inserted {inserted_files} new files from {inserted_packages} packages into {SQLITE_DB_PATH}")
+
+if __name__ == "__main__":
+    build_package_filelist_db()
diff --git a/dataset-generation/spack_db/README.md b/dataset-generation/spack_db/README.md
new file mode 100644
index 0000000..3158899
--- /dev/null
+++ b/dataset-generation/spack_db/README.md
@@ -0,0 +1,61 @@
+# Spack Build Cache Data Scraper & SQLite Database
+
+This project scrapes the Spack build cache by downloading, cleaning, and indexing spec manifests and binary tarballs into a local cache, then converts the data into a Spack SQLite database.
+
+The program builds a master index called `spack.index.db.json`.
+* Index layout (see the example entry at the end of this README):
+  * spec manifest hash as the unique key
+  * package name and version
+  * package tarball's unique SHA256 hash
+  * package manifest path in the local cache directory
+  * package tarinfo path in the local cache directory
+
+The program supports restart/resume if a run is interrupted. Skipped or malformed manifests are recorded, and files whose manifest and tarball information both already exist are not re-downloaded.
+
+## Directory Structure
+* `cache/spack.index.db.json` - master index
+* `cache/manifest/` - cleaned spec manifests
+* `cache/tarinfo/` - JSON file lists extracted from tarballs
+* `cache/binary_packages/` - temporary cache of downloaded tarballs
+* `cache/timeouts.txt` - packages that timed out while downloading
+* `cache/skipped_manifests.txt` - a list of manifests that could not be downloaded
+* `cache/malformed_manifests.txt` - manifests that failed parsing
+* `cache/missing_tarballs.txt` - manifests without a tarball hash
+* `cache/shared_tarballs.txt` - records multiple manifests that point to the same tarball
+* `cache/failed_tarball_downloads.txt` - tarballs that failed to download
+
+## Features
+* Retrieves package `.spec.manifest.json` files from Spack's binary mirror
+* Extracts the valid JSON payload and removes extra characters
+* Retrieves binary tarballs and extracts file lists
+* Creates and maintains a canonical JSON index that maps each package to its manifest and tarball information
+* Contains multiple checkpoints for safe restart/resume of the program
+* Records skipped/malformed manifests, missing hashes, and failed tarball downloads
+* Stores forward-slash paths in the JSON index for cross-platform use
+
+## Usage
+1. Install dependencies
+   ```bash
+   pip install requests
+   ```
+   The rest of the necessary modules are part of Python's standard library.
+
+2. Provide a database file
+   Update the `file_name` in `main()` if needed.
+
+3. Run the script
+   ```bash
+   python spack_db.py
+   ```
+
+4. Resume after interruption
+   If an interruption occurs, it is safe to re-run the script without losing data already processed.
+
+5. Retry manifests or tarballs
+   Delete `skipped_manifests.txt`, `malformed_manifests.txt`, or `failed_tarball_downloads.txt` to retry the corresponding failed manifest or tarball downloads.
+
+6. Run Create_spack_DB.py to create the SQLite database
+   ```bash
+   python Create_spack_DB.py
+   ```
+   The database will include all files extracted from the packages in the Spack build cache.
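+
+## Example
+
+For reference, a single entry in `spack.index.db.json` follows the index layout above. The entry below is illustrative, built from the `compiler-wrapper` example used throughout the scripts:
+```json
+{
+  "bsavlbvtqsc7yjtvka3ko3aem4wye2u3": {
+    "name": "compiler-wrapper",
+    "version": "1.0",
+    "sha256": "f4d1969c7a82c76b962ae969c91d7b54cc11e0ce9f1ec9277789990f58aab351",
+    "manifest_path": "cache/manifest/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3.json",
+    "tarinfo_path": "cache/tarinfo/f4d1969c7a82c76b962ae969c91d7b54cc11e0ce9f1ec9277789990f58aab351.json"
+  }
+}
+```
+
+Once `Create_spack_DB.py` has run, the resulting database can be queried directly. This is only a sketch; the file name `libz.so` is a hypothetical example:
+```bash
+sqlite3 cache/spack-v1.db \
+  "SELECT package_name, file_path FROM package_files WHERE file_name = 'libz.so' LIMIT 5;"
+```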
diff --git a/dataset-generation/spack_db/conftest.py b/dataset-generation/spack_db/conftest.py
new file mode 100644
index 0000000..3acaeb6
--- /dev/null
+++ b/dataset-generation/spack_db/conftest.py
@@ -0,0 +1,110 @@
+# conftest.py
+import io
+import json
+import tarfile
+import pytest
+
+
+import spack_db as pl
+
+@pytest.fixture(autouse=True)
+def isolate_fs(tmp_path, monkeypatch):
+    """Redirect all cache/config paths to a temp dir per test."""
+    cache = tmp_path / "cache"
+    (cache / "manifest").mkdir(parents=True, exist_ok=True)
+    (cache / "tarinfo").mkdir(parents=True, exist_ok=True)
+    (cache / "spec_manifests").mkdir(parents=True, exist_ok=True)
+    (cache / "binary_packages").mkdir(parents=True, exist_ok=True)
+
+    monkeypatch.setattr(pl, "INDEX_FILE", str(cache / "spack.index.db.json"), raising=False)
+    monkeypatch.setattr(pl, "MANIFEST_DIR", str(cache / "manifest"), raising=False)
+    monkeypatch.setattr(pl, "TARINFO_DIR", str(cache / "tarinfo"), raising=False)
+    monkeypatch.setattr(pl, "SPEC_CACHE_DIR", str(cache / "spec_manifests"), raising=False)
+    monkeypatch.setattr(pl, "BINARY_CACHE_DIR", str(cache / "binary_packages"), raising=False)
+
+    monkeypatch.setattr(pl, "CHECKPOINT_FILE", str(tmp_path / "progress.txt"), raising=False)
+    monkeypatch.setattr(pl, "SKIPPED_MANIFESTS_FILE", str(cache / "skipped_manifests.txt"), raising=False)
+    monkeypatch.setattr(pl, "MALFORMED_MANIFESTS_FILE", str(cache / "malformed_manifests.txt"), raising=False)
+    monkeypatch.setattr(pl, "TIMEOUT_LOG_FILE", str(cache / "timeouts.txt"), raising=False)
+    monkeypatch.setattr(pl, "MISSING_TARBALL_HASH_FILE", str(cache / "missing_tarballs.txt"), raising=False)
+    monkeypatch.setattr(pl, "SHARED_TARBALL_HASH_FILE", str(cache / "shared_tarballs.txt"), raising=False)
+    monkeypatch.setattr(pl, "FAILED_TARBALL_DOWNLOAD_FILE", str(cache / "failed_tarball_downloads.txt"), raising=False)
+
+    # Ensure directories exist for atomic writes
+    (tmp_path / "cache").mkdir(exist_ok=True)
+    yield
+
+
+@pytest.fixture
+def sample_manifest_json():
+    """
+    Create the *actual bytes* expected by remove_lines_spec_manifest:
+    take a valid JSON, then pad 49 bytes in front and 834 bytes at the end.
+    """
+    body = {
+        "data": [
+            {"mediaType": "irrelevant/type", "checksum": "abc"},
+            {"mediaType": "application/vnd.spack.install.v2.tar+gzip",
+             "checksum": "f4d1969c7a82c76b962ae969c91d7b54cc11e0ce9f1ec9277789990f58aab351"}
+        ]
+    }
+    raw = json.dumps(body).encode("utf-8")
+    return b"x" * 49 + raw + b"y" * 834
+
+
+@pytest.fixture
+def tar_with_placeholder_bytes():
+    """
+    Build a tar in-memory whose members include the __spack_path_placeh segments
+    and the package-tail folder (e.g., 'compiler-wrapper-1.0-<hash>').
+    """
+    pkg_tail = "compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3"
+    member_name = (
+        "home/software/spack/__spack_path_placeholder__/__spack_path_placeholder__/"
+        "__spack_path_placeholder__/__spack_path_placeh/morepadding/linux-x86_64_v3/"
+        f"{pkg_tail}/.spack/install_environment.json"
+    )
+
+    buf = io.BytesIO()
+    with tarfile.open(fileobj=buf, mode="w:gz") as tf:
+        data = b"{}"
+        tarinfo = tarfile.TarInfo(name=member_name)
+        tarinfo.size = len(data)
+        tf.addfile(tarinfo, io.BytesIO(data))
+    return buf.getvalue()
+
+
+class DummyResp:
+    def __init__(self, status_code=200, content=b""):
+        self.status_code = status_code
+        self.content = content
+
+
+@pytest.fixture
+def fake_requests(monkeypatch):
+    """
+    Monkeypatch requests.get with programmable behavior per-URL.
+
+    Usage:
+        fake_requests.table[url] = lambda: DummyResp(200, b"...")
+
+    A factory may return a DummyResp-like object, or an Exception instance,
+    which will be raised instead.
+    """
+    table = {}
+
+    def _get(url, *args, **kwargs):
+        if url not in table:
+            raise AssertionError(f"Unexpected URL requested: {url}")
+        result = table[url]()
+        # Allow raising exceptions (e.g., Timeout) from factories
+        if isinstance(result, Exception):
+            raise result
+        return result
+
+    # Expose for tests to fill
+    _get.table = table
+    monkeypatch.setattr(pl.requests, "get", _get)
+    return _get
\ No newline at end of file
diff --git a/dataset-generation/spack_db/rename_tarinfo_file.py b/dataset-generation/spack_db/rename_tarinfo_file.py
new file mode 100644
index 0000000..3d16af6
--- /dev/null
+++ b/dataset-generation/spack_db/rename_tarinfo_file.py
@@ -0,0 +1,35 @@
+import os
+import re
+
+# Path to the tarinfo directory
+TARINFO_DIR = "cache/tarinfo"
+
+# Regex: match <name>-<64-char-sha256>.json
+pattern = re.compile(r"^(.*)-([a-f0-9]{64})\.json$")
+
+# Counters
+renamed = 0
+skipped = 0
+
+for filename in os.listdir(TARINFO_DIR):
+    match = pattern.match(filename)
+    if match:
+        sha256_hash = match.group(2)
+        new_filename = f"{sha256_hash}.json"
+
+        old_path = os.path.join(TARINFO_DIR, filename)
+        new_path = os.path.join(TARINFO_DIR, new_filename)
+
+        # Skip if the target file already exists
+        if os.path.exists(new_path):
+            print(f"⚠️ Skipping {filename} (target {new_filename} already exists)")
+            skipped += 1
+            continue
+
+        os.rename(old_path, new_path)
+        renamed += 1
+    else:
+        print(f"❓ Skipping non-matching file: {filename}")
+        skipped += 1
+
+print(f"\n✅ Done! Renamed {renamed} files. Skipped {skipped} files.")
diff --git a/dataset-generation/spack_db/spack_db.py b/dataset-generation/spack_db/spack_db.py
new file mode 100644
index 0000000..cea6d15
--- /dev/null
+++ b/dataset-generation/spack_db/spack_db.py
@@ -0,0 +1,633 @@
+# json is used to load JSON files
+import json
+
+# requests is used to download URLs automatically
+import requests
+
+# io for reading and writing streams such as text, binary, and raw data
+import io
+
+# for reading and writing tar archives
+import tarfile
+
+# to quiet warnings
+import warnings
+warnings.filterwarnings("ignore")
+
+# os to create a cache folder - disk-based caching
+import os
+
+# tempfile and shutil for the index temp file
+import tempfile
+import shutil
+
+from pathlib import Path
+
+# Configuration
+
+# spack.index.db.json maps each package back to the package name, version, SHA256 hash,
+# the path to the manifest JSON file, and the path to the tarinfo file list
+INDEX_FILE = "cache/spack.index.db.json"
+
+# MANIFEST_DIR is the source of per-package metadata used in the master index
+MANIFEST_DIR = "cache/manifest"
+
+# TARINFO_DIR contains the extracted list of files from each binary tarball;
+# this is used when building the SQLite database so the tarballs do not have to be reprocessed
+TARINFO_DIR = "cache/tarinfo"
+
+# BINARY_CACHE_DIR temporarily holds the downloaded tarballs;
+# each file is deleted after processing.
+BINARY_CACHE_DIR = "cache/binary_packages"
+TIMEOUT_LOG_FILE = "cache/timeouts.txt"
+
+# checkpoint so the script can be safely stopped at any time;
+# progress.txt saves the spec manifest hash
+CHECKPOINT_FILE = "progress.txt"
+
+# files to track manifests that could not be downloaded or processed
+SKIPPED_MANIFESTS_FILE = "cache/skipped_manifests.txt"
+MALFORMED_MANIFESTS_FILE = "cache/malformed_manifests.txt"
+MISSING_TARBALL_HASH_FILE = "cache/missing_tarballs.txt"
+SHARED_TARBALL_HASH_FILE = "cache/shared_tarballs.txt"
+FAILED_TARBALL_DOWNLOAD_FILE = "cache/failed_tarball_downloads.txt"
+
+# create cache directories
+os.makedirs(MANIFEST_DIR, exist_ok=True)
+os.makedirs(TARINFO_DIR, exist_ok=True)
+os.makedirs(BINARY_CACHE_DIR, exist_ok=True)
+
+# Normalize any filesystem path to a forward-slash (POSIX) string for JSON storage.
+def _to_posix(p: str) -> str:
+    return Path(p).as_posix()
+
+# load the existing index, if any, so info can be added to it
+def load_index():
+    # if the index file exists, read it
+    if os.path.exists(INDEX_FILE):
+        with open(INDEX_FILE, "r") as f:
+            # return as a dictionary for easy manipulation and JSON formatting
+            return json.load(f)
+    # if the index does not exist, an empty dictionary is returned
+    return {}
+
+# save index
+def save_index(index):
+
+    # create a backup of the previous index
+    if os.path.exists(INDEX_FILE):
+        shutil.copy(INDEX_FILE, INDEX_FILE + ".bak")
+
+    # save to a temp file, then move to replace
+    temp_dir = os.path.dirname(INDEX_FILE)
+    with tempfile.NamedTemporaryFile("w", dir=temp_dir, delete=False) as tmp:
+        json.dump(index, tmp, indent=2)
+        temp_name = tmp.name
+
+    shutil.move(temp_name, INDEX_FILE)
+
+# format for entries added to the index
+def update_index_entry(index, package_hash, package_value, package_zip_hash):
+    name = package_value['spec']['name']
+    version = package_value['spec']['version']
+    manifest_filename = f"{name}-{version}-{package_hash}.json"
+    tarinfo_filename = f"{package_zip_hash}.json"
+
+    manifest_path_fs = os.path.join(MANIFEST_DIR, manifest_filename)
+    tarinfo_path_fs = os.path.join(TARINFO_DIR, tarinfo_filename)
+
+    index[package_hash] = {
+        "name": name,
+        "version": version,
+        "sha256": package_zip_hash,
+
+        # store the forward-slash form in JSON for portability
+        "manifest_path": _to_posix(manifest_path_fs),
+        "tarinfo_path": _to_posix(tarinfo_path_fs),
+    }
+
+# load the last saved package hash
+def load_checkpoint():
+    # check if progress.txt exists
+    if os.path.exists(CHECKPOINT_FILE):
+        with open(CHECKPOINT_FILE, "r") as f:
+            # read and return the last processed package_hash;
+            # strip removes trailing newlines or spaces
+            return f.read().strip()
+
+    # if the file does not exist, return None;
+    # if None, start from the beginning
+    return None
+
+
+# save the last processed manifest package_hash to progress.txt;
+# if the program is interrupted, the saved package_hash will be
+# the starting point when rerun
+def save_checkpoint(package_hash):
+    with open(CHECKPOINT_FILE, "w") as f:
+        f.write(package_hash)
+
+
+# read the input file
+def readmyfile(myfile):
+    try:
+        with open(myfile, 'r') as file:
+            # the database is the Spack database JSON from the Spack build cache
+            db = json.load(file)  # ~8.6 seconds to read in the large JSON file
+            return db
+
+    except FileNotFoundError:
+        print(f"Error: The file '{myfile}' was not found.")
+    except Exception as e:
+        print(f"Error occurred in readmyfile: {e}")
+
+
+# build the downloadable spec manifest URL
+def make_spec_manifest_URL(package_hash, package_hash_value):
+    # the goal is to build a URL of the form:
+    # https://binaries.spack.io/develop/v3/manifests/spec/<name>/<name>-<version>-<hash>.spec.manifest.json
+    # example URL for compiler-wrapper:
+    # https://binaries.spack.io/develop/v3/manifests/spec/compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3.spec.manifest.json
+
+    myURL = 'https://binaries.spack.io/develop/v3/manifests/spec/'
+
+    # this accesses the package name (not the package hash that we have been using as package_hash)
+    package_name = package_hash_value['spec']['name']
+
+    # this accesses the package version number
+    package_version = package_hash_value['spec']['version']
+
+    # package_filename is used later when removing placeholder directories from the tarball file paths
+    package_filename = (package_name + '/' + package_name + '-' + package_version
+                        + '-' + package_hash)
+
+    # this updates the URL
+    myURL += (package_filename + '.spec.manifest.json')
+
+    # return the URL for the spec manifest and the package_filename
+    return myURL, package_filename
+
+# automatically download contents from the URL
+def download_from_URL(theURL, package, is_spec=True):
+
+    # build the filename
+    # Example:
+    # "compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3.spec.manifest.json"
+    # is turned into
+    # "compiler-wrapper__compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3.spec.manifest.json"
+    package_name = package.replace('/', '__')
+
+    # if is_spec is True (the file ends with ".spec.manifest.json"),
+    # the file is not saved; the response is returned to remove_lines_spec_manifest() for further manipulation.
+    # if the file ends with .tar.gz, it is saved in BINARY_CACHE_DIR
+    cache_dir = BINARY_CACHE_DIR if not is_spec else None
+
+    # the full file path is then, e.g.:
+    # "cache/binary_packages/compiler-wrapper__compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3"
+    cached_path = os.path.join(cache_dir, package_name) if cache_dir else None
+
+    if is_spec:
+        print(f"downloading manifest for {package_name}")
+    else:
+        print(f"temporary save location: {_to_posix(cached_path)} for {package_name}")
+
+    # if the cache exists, it does not need to be redownloaded
+    if cached_path and os.path.exists(cached_path):
+
+        print(f"Using cached file: {_to_posix(cached_path)}")
+
+        # rb is read binary
+        with open(cached_path, "rb") as f:
+            return f.read()
+
+    try:
+        label = _to_posix(cached_path) if cached_path else f"manifest: {package_name}"
+        print(f"trying download for {label}")
+
+        # time out after 60 seconds
+        response = requests.get(theURL, timeout=60, verify=False)
+
+        # a status code of 200 means the request was successful;
+        # a status code of 404 means the resource does not exist
+        if response.status_code == 200:
+            if cached_path:
+                print(f"download successful for {_to_posix(cached_path)}")
+
+                # save to cache on a successful request
+                # wb is write binary
+                with open(cached_path, "wb") as f:
+                    f.write(response.content)
+            return response.content
+
+        else:
+            # if the URL does not exist, skip and move to the next package
+            print(f"download failed for package: {package_name}\n")
+
+            # return None to stop the process due to the failed download - goes back to run_program
+            return None
+
+    except requests.exceptions.Timeout:
+        print(f"⏰ Timeout: Skipping package that took too long to download: {package_name}")
+        # Append to the log file immediately
+        with open(TIMEOUT_LOG_FILE, "a") as f:
"a") as f: + f.write(f"{package_name}\t{theURL}\n") + return None + + except Exception as e: + print(f"download_from_URL package {package_name}, error: {e}") + + # return None to stop process due to download failing + return None + +# remove unnecessary lines in file +def remove_lines_spec_manifest(myfile): + + # Accept bytes or str; extract between first '{' and last '}' + data = myfile if isinstance(myfile, (bytes, bytearray)) else myfile.encode("utf-8") + + start = data.find(b"{") + end = data.rfind(b"}") + + if start == -1 or end == -1 or end < start: + raise ValueError("Malformed manifest: JSON braces not found") + + return json.loads(data[start:end+1].decode("utf-8")) + +# returns checksum, sha256 hash used to download the binary tarball +def access_spec_manifest_media_type(db): + + try: + # get the value for the key 'data' from the db + # if the key 'data' does not exist, return empty list + # temp is db['data'] + for temp in db.get('data', []): + if temp.get('mediaType') == 'application/vnd.spack.install.v2.tar+gzip': + + # the checksum is the sha256 hash + return temp.get('checksum') + + except Exception as e: + print(f"Error occured in access_spec_manifest_media_type: {e}") + return None + +# uses checksum returned to generate URL for binary tarball download +def make_binary_package_URL(package_zip_hash): +# example URL for compiler-wrapper binary package: +# https://binaries.spack.io/develop/blobs/sha256/f4/f4d1969c7a82c76b962ae969c91d7b54cc11e0ce9f1ec9277789990f58aab351 + + myURL = 'https://binaries.spack.io/develop/blobs/sha256/' + + first_byte = package_zip_hash[:2] + + myURL = myURL + first_byte + '/' + package_zip_hash + + return myURL + +# ensure tarinfo completeness +def write_tarinfo_safely(tarinfo_path, file_list): + temp_dir = os.path.dirname(tarinfo_path) + with tempfile.NamedTemporaryFile("w", dir=temp_dir, delete=False) as tmp: + json.dump(file_list, tmp, indent=2) + temp_name = tmp.name + shutil.move(temp_name, tarinfo_path) + + +# ensure manifest completeness +def write_manifest_safely(manifest_path, manifest_data): + temp_dir = os.path.dirname(manifest_path) + with tempfile.NamedTemporaryFile("w", dir=temp_dir, delete=False) as tmp: + json.dump(manifest_data, tmp, indent=2) + temp_name = tmp.name + shutil.move(temp_name, manifest_path) + +# using python tarfile module io module to list all the files in the downloaded tarball +# myfile is the tar_file response.content and the package is the hash we will split by +# package is the package name and the version +def read_binary_package(myfile, package, package_zip_hash): + file_list = [] + try: + with io.BytesIO(myfile) as tar_buffer: + with tarfile.open(fileobj = tar_buffer, mode="r:*") as tar: + + print(f"Files in the tar archive for {package.split('/')[0]}:") + i = 1 + for member in tar.getmembers(): + if member.isfile(): + # member.name for compiler-wrapper is "home/software/spack/__spack_path_placeholder__/ + # __spack_path_placeholder__/__spack_path_placeholder__/ + # __spack_path_placeholder__/__spack_path_placeholder__/ + # __spack_path_placeholder__/__spack_path_placeholder__/__spack_path_placeholder__/ + # __spack_path_placeh/morepadding/linux-x86_64_v3/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3/ + # .spack/install_environment.json" + # package for compiler-wrapper is "compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3" + clean_path = remove_placeholder_directories(i, member.name, package) + + # this will add the files that are in the package to a clean_list + if clean_path: + 
+                            file_list.append(clean_path)
+                        i += 1
+
+    except tarfile.ReadError as e:
+        print(f"Error reading tar file: {e}")
+        return
+
+    # save the file names to the tarinfo file
+    tarinfo_path = os.path.join(TARINFO_DIR, f"{package_zip_hash}.json")
+    write_tarinfo_safely(tarinfo_path, file_list)
+
+    # remove the tarball once processed
+    tarball_path = os.path.join(BINARY_CACHE_DIR, package.replace('/', '__'))
+    if os.path.exists(tarball_path):
+        os.remove(tarball_path)
+    del myfile
+
+# remove the placeholder directories in the file path
+def remove_placeholder_directories(i, name, package):
+    # i is the counter for file enumeration
+    # name for compiler-wrapper looks like "home/software/spack/__spack_path_placeholder__/.../
+    # __spack_path_placeh/morepadding/linux-x86_64_v3/
+    # compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3/.spack/install_environment.json"
+    # package for compiler-wrapper is "compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3"
+
+    # updatedpackage_list for compiler-wrapper is "['compiler-wrapper', 'compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3']"
+    updatedpackage_list = package.split('/')
+
+    # updatedpackage_name is "compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3"
+    updatedpackage_name = updatedpackage_list[1]
+
+    placeholder = "__spack_path_placeh"
+
+    # split by updatedpackage_name
+    split_list = name.split(updatedpackage_name)
+
+    try:
+        if placeholder not in split_list[0]:
+            print("split_list", split_list)
+
+        # return the file name without the placeholder path
+        elif len(split_list) > 1:
+            updatedname = split_list[1][1:]
+
+            print(f"file {i}: ", updatedname)
+            return updatedname  # return to add to the list of files for the respective package
+
+    except Exception as e:
+        print(f"Error in remove_placeholder_directories: {e}")
+        return None
+
+
+def print_files(package_hash, package_value, index, existing_tarinfo_files, seen_tarball_hashes):
+    name = package_value['spec']['name']
+    version = package_value['spec']['version']
+
+    manifest_filename = f"{name}-{version}-{package_hash}.json"
+    manifest_path = os.path.join(MANIFEST_DIR, manifest_filename)
+
+    # use the existing cleaned manifest if available
+    if os.path.exists(manifest_path):
+        print(f"Using existing cleaned manifest: {_to_posix(manifest_path)}")
+        with open(manifest_path, "r") as f:
+            clean_spec_manifest = json.load(f)
+
+        # returns the URL for the spec manifest file and the package_filename
+        theURL, package_filename = make_spec_manifest_URL(package_hash, package_value)
+
+    else:
+        # download if the manifest does not exist
+
+        # returns the URL for the spec manifest file and the package_filename
+        theURL, package_filename = make_spec_manifest_URL(package_hash, package_value)
+
+        # download the spec manifest JSON for the package of interest
+        temp = download_from_URL(theURL, package_filename, is_spec=True)
+
+        # return if the URL does not exist
+        if temp is None:
+            print(f"Could not download manifest: {package_filename} - recording and skipping.\n")
+
+            # record the failed manifest hash
+            with open(SKIPPED_MANIFESTS_FILE, "a") as f:
+                f.write(f"{package_hash}\n")
+
+            # exit print_files and go back to run_program
+            return
+
+        print("✅ Downloaded spec manifest")
+
+        try:
+            # remove unnecessary bytes from the downloaded spec manifest
+            clean_spec_manifest = remove_lines_spec_manifest(temp)
+        except Exception as e:
+            print(f"Failed to parse manifest {package_filename}: {e}")
+            with open(MALFORMED_MANIFESTS_FILE, "a") as f:
+                f.write(f"{package_hash}\t{theURL}\t{type(e).__name__}: {e}\n")
+            # also add it to the skip file so it is not reprocessed on restart
+            with open(SKIPPED_MANIFESTS_FILE, "a") as f:
+                f.write(f"{package_hash}\n")
+            return
+
+        # write the cleaned manifest information to the manifest file
+        write_manifest_safely(manifest_path, clean_spec_manifest)
+        print(f"✅ Manifest safely written: {_to_posix(manifest_path)}")
+
+    # find the mediaType that contains the hash for the package tarball install
+    package_zip_hash = access_spec_manifest_media_type(clean_spec_manifest)
+    print(f"✅ Extracted zip hash: {package_zip_hash}")
+
+    # if the 'data' key was not found in access_spec_manifest_media_type, None is returned and we go back to run_program
+    if package_zip_hash is None:
+        print(f"No tarball hash found in manifest: {package_filename}")
+
+        # track that this manifest has no downloadable binary tarball
+        with open(MISSING_TARBALL_HASH_FILE, "a") as f:
+            f.write(f"{package_hash}\n")
+        # go back to run_program
+        return
+
+    # track whether the tarball hash has already been processed
+    if package_zip_hash in seen_tarball_hashes:
+        with open(SHARED_TARBALL_HASH_FILE, "a") as f:
+            # if this manifest points to a tarball that has already been seen,
+            # it will not create a new tarinfo entry,
+            # but it will get a new manifest entry
+            f.write(f"{package_hash}\t{package_zip_hash}\n")
+    else:
+        seen_tarball_hashes.add(package_zip_hash)
+
+    expected_tarinfo_hash = package_zip_hash
+
+    if expected_tarinfo_hash in existing_tarinfo_files:
+        print(f"✅ Already have tarinfo for {name}-{version}-{expected_tarinfo_hash}, skipping binary download.")
+        update_index_entry(index, package_hash, package_value, package_zip_hash)
+        save_index(index)
+        print("✅ Saved Index")
+        return
+
+    else:
+        # make the binary package URL for installing the package
+        binary_package_URL = make_binary_package_URL(package_zip_hash)
+        print(f"🔗 Downloading binary: {binary_package_URL}")
+
+        # download the binary package file from the generated URL
+        tempbinary = download_from_URL(binary_package_URL, package_filename, is_spec=False)
+
+        if tempbinary is None:
+            # track the failed tarball download
+            with open(FAILED_TARBALL_DOWNLOAD_FILE, "a") as f:
+                f.write(f"{package_filename}: manifest hash: {package_hash}, tarball hash: {package_zip_hash}\n")
+            return
+
+        print("✅ Binary package downloaded")
+
+        # read the binary package
+        read_binary_package(tempbinary, package_filename, package_zip_hash)
+        print("✅ Finished reading binary package")
+        update_index_entry(index, package_hash, package_value, package_zip_hash)
+        save_index(index)
+        print(f"Updated Index with {package_filename}-{package_zip_hash}")
+        save_checkpoint(package_hash)
+        print("✅ Saved Index")
+
+# program dispatcher
+def run_program(package_hash, database, index, existing_tarinfo_files, seen_tarball_hashes):
+    installs = database['database']['installs']
+
+    # get the installs value for this key: the package name, version, etc.
+    package_value = installs[package_hash]
+    print_files(package_hash, package_value, index, existing_tarinfo_files, seen_tarball_hashes)
+
+
+def main():
+    file_name = "e2a6969c742c8ee33deba2d210ce2243cd3941c6553a3ffc53780ac6463537a9"
+
+    database = readmyfile(file_name)
+
+    # load the list of previously skipped manifest hashes
+    if os.path.exists(SKIPPED_MANIFESTS_FILE):
+        with open(SKIPPED_MANIFESTS_FILE, "r") as f:
+            skipped_hashes = set(line.strip() for line in f)
+    else:
+        skipped_hashes = set()
+
+    # load manifests that are missing binary tarball hashes (e.g. mediaType not found)
+    if os.path.exists(MISSING_TARBALL_HASH_FILE):
+        with open(MISSING_TARBALL_HASH_FILE, "r") as f:
+            missing_tarball_hashes = set(line.strip() for line in f)
+    else:
+        missing_tarball_hashes = set()
+
+    # load tarballs that were not downloadable
+    if os.path.exists(FAILED_TARBALL_DOWNLOAD_FILE):
+        with open(FAILED_TARBALL_DOWNLOAD_FILE, "r") as f:
+            failed_tarball_hashes = set(
+                line.strip().split("tarball hash: ")[-1] for line in f if "tarball hash:" in line
+            )
+    else:
+        failed_tarball_hashes = set()
+
+    # list the install keys
+    install_keys = list(database['database']['installs'])
+    num_keys = len(install_keys)
+
+    # load the last processed package hash from the checkpoint if it exists
+    last_processed = load_checkpoint()
+    index = load_index()
+
+    existing_tarinfo_files = {
+        Path(fname).stem
+        for fname in os.listdir(TARINFO_DIR)
+        if fname.endswith(".json")
+    }
+
+    # track already-processed tarball hashes to find shared ones
+    seen_tarball_hashes = set()
+
+    SAVE_INTERVAL = 50
+
+    print("Starting...Will skip packages already fully processed.")
+
+    try:
+        for i, package_hash in enumerate(install_keys):
+
+            # skip if package_hash is in the skipped manifests file
+            if package_hash in skipped_hashes:
+                continue
+
+            # skip if the manifest had no usable tarball
+            if package_hash in missing_tarball_hashes:
+                continue
+
+            if package_hash in index:
+                entry = index[package_hash]
+                manifest_path = entry.get("manifest_path", "")
+                tarinfo_path = entry.get("tarinfo_path", "")
+                tarball_hash = entry.get("sha256", "")
+
+                manifest_exists = bool(manifest_path) and Path(manifest_path).exists()
+                tarinfo_exists = bool(tarinfo_path) and Path(tarinfo_path).exists()
+
+                if manifest_exists and tarinfo_exists:
+                    continue
+
+                # if the tarball previously failed, skip retrying it
+                if tarball_hash in failed_tarball_hashes:
+                    print(f"🚫 Skipping manifest with previously failed tarball download: {package_hash}")
+                    continue
+
+            print(f"📦 package {i + 1} out of {num_keys} packages\n")
+
+            run_program(package_hash, database, index, existing_tarinfo_files, seen_tarball_hashes)
+
+            # save checkpoint and index every N packages
+            if (i + 1) % SAVE_INTERVAL == 0:
+                save_checkpoint(package_hash)
+                save_index(index)
+                print(f"Saved checkpoint and index at package {i + 1}")
+
+    except KeyboardInterrupt:
+        save_checkpoint(package_hash)
+        save_index(index)
+        print("\n🛑 Interrupted. Progress saved.")
Progress saved.") + + if last_processed: + val = database['database']['installs'].get(last_processed) + if val: + name = val['spec']['name'] + version = val['spec']['version'] + + print(f"Last checkpoint was: {name}-{version}, {last_processed}") + else: + print(f"Checkpoint not found in current file: {last_processed}") + print(f"file may have changed since the last run") + finally: + save_checkpoint(package_hash) + save_index(index) + print("\n🎊 Complete (or safely stopped). Script will resume where it left off.") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/dataset-generation/spack_db/spack_index_length.py b/dataset-generation/spack_db/spack_index_length.py new file mode 100644 index 0000000..0111f73 --- /dev/null +++ b/dataset-generation/spack_db/spack_index_length.py @@ -0,0 +1,19 @@ +import json + +# Path to your index file +INDEX_FILE = "cache/spack.index.db.json" + +def main(): + try: + with open(INDEX_FILE, "r") as f: + index = json.load(f) + print(f"✅ Number of entries in index: {len(index)}") + except FileNotFoundError: + print(f"❌ File not found: {INDEX_FILE}") + except json.JSONDecodeError: + print(f"❌ Failed to parse JSON. The file may be corrupted.") + except Exception as e: + print(f"❌ Unexpected error: {e}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/dataset-generation/spack_db/test_spack_db.py b/dataset-generation/spack_db/test_spack_db.py new file mode 100644 index 0000000..dcf3181 --- /dev/null +++ b/dataset-generation/spack_db/test_spack_db.py @@ -0,0 +1,356 @@ +# test_spack_db.py +import json +import os +import io +import tarfile +import pytest +import spack_db as pl +from requests import exceptions as req_exc + + +def test_update_index_entry_sets_paths(tmp_path): + idx = {} + pkg_hash = "deadbeef" + value = {"spec": {"name": "foo", "version": "1.2.3"}} + tar_hash = "abc123" + pl.update_index_entry(idx, pkg_hash, value, tar_hash) + assert pkg_hash in idx + entry = idx[pkg_hash] + assert entry["name"] == "foo" + assert entry["version"] == "1.2.3" + assert entry["sha256"] == "abc123" + assert entry["manifest_path"].endswith(f"manifest/foo-1.2.3-deadbeef.json") + assert entry["tarinfo_path"].endswith(f"tarinfo/abc123.json") + + +def test_index_save_and_backup(tmp_path): + pl.save_index({"a": 1}) + # First save: no .bak yet + assert os.path.exists(pl.INDEX_FILE) + assert not os.path.exists(pl.INDEX_FILE + ".bak") + + # Second save should write a .bak of previous + pl.save_index({"a": 2}) + assert os.path.exists(pl.INDEX_FILE + ".bak") + with open(pl.INDEX_FILE) as f: + assert json.load(f)["a"] == 2 + with open(pl.INDEX_FILE + ".bak") as f: + assert json.load(f)["a"] == 1 + + +def test_checkpoint_roundtrip(tmp_path): + pl.save_checkpoint("pkg-hash-123") + assert pl.load_checkpoint() == "pkg-hash-123" + + +def test_remove_lines_spec_manifest_and_extract_hash(sample_manifest_json): + cleaned = pl.remove_lines_spec_manifest(sample_manifest_json) + # ensure we got dict and our access function finds the right tarball hash + assert isinstance(cleaned, dict) + h = pl.access_spec_manifest_media_type(cleaned) + assert h == "f4d1969c7a82c76b962ae969c91d7b54cc11e0ce9f1ec9277789990f58aab351" + +def test_malformed_manifest_is_logged(fake_requests, tmp_path): + # Set up a fake package hash and minimal database entry + pkg_hash = "badbadbadbadbadbadbadbadbadbadba" + name = "broken-pkg" + ver = "1.0" + db_entry = {"spec": {"name": name, "version": ver}} + + # Manifest URL that print_files will request + manifest_url = 
f"https://binaries.spack.io/develop/v3/manifests/spec/{name}/{name}-{ver}-{pkg_hash}.spec.manifest.json" + + # Make fake_requests return invalid bytes (invalid JSON) + fake_requests.table[manifest_url] = lambda: type( + "R", (), {"status_code": 200, "content": b"xxxx"} # bad data + )() + + # Run print_files, which should try to parse the manifest and fail + index = {} + existing_tarinfo_files = set() + seen_tarball_hashes = set() + pl.print_files(pkg_hash, db_entry, index, existing_tarinfo_files, seen_tarball_hashes) + + # Check: malformed_manifests.txt exists and has the hash + URL + error + assert os.path.exists(pl.MALFORMED_MANIFESTS_FILE) + with open(pl.MALFORMED_MANIFESTS_FILE, "r") as f: + log_content = f.read() + assert pkg_hash in log_content + assert manifest_url in log_content + assert "ValueError" in log_content or "JSON" in log_content + + # Check: skipped_manifests.txt + assert os.path.exists(pl.SKIPPED_MANIFESTS_FILE) + with open(pl.SKIPPED_MANIFESTS_FILE, "r") as f: + skipped_content = f.read() + assert pkg_hash in skipped_content + +def test_make_urls(): + tar_hash = "f4d1969c7a82c76b962ae969c91d7b54cc11e0ce9f1ec9277789990f58aab351" + url = pl.make_binary_package_URL(tar_hash) + + # Basic correctness + assert url.endswith("/" + tar_hash) + assert f"/{tar_hash[:2]}/" in url # path uses first two hex chars as subdir + + # manifest URL + pkg_hash = "bsavlbvtqsc7yjtvka3ko3aem4wye2u3" + pkg_val = {"spec": {"name": "compiler-wrapper", "version": "1.0"}} + murl, package_filename = pl.make_spec_manifest_URL(pkg_hash, pkg_val) + assert "manifests/spec/compiler-wrapper/" in murl + assert murl.endswith(f"compiler-wrapper-1.0-{pkg_hash}.spec.manifest.json") + assert package_filename == f"compiler-wrapper/compiler-wrapper-1.0-{pkg_hash}" + + + +def test_download_from_URL_spec_does_not_persist(fake_requests, tmp_path): + # For is_spec=True, function returns bytes but should NOT save a file to SPEC_CACHE_DIR + url = "https://example.com/x.spec.manifest.json" + content = b"hello" + fake_requests.table[url] = lambda: type("R", (), {"status_code": 200, "content": content})() + + out = pl.download_from_URL(url, "compiler/x-1.0-abc", is_spec=True) + # Returned content + assert out == content + # Not persisted + cached_path = os.path.join(pl.SPEC_CACHE_DIR, "compiler__x-1.0-abc") + assert not os.path.exists(cached_path) + + +def test_download_from_URL_binary_persists(fake_requests, tmp_path): + url = "https://example.com/blob.tar.gz" + content = b"tarbytes" + fake_requests.table[url] = lambda: type("R", (), {"status_code": 200, "content": content})() + + out = pl.download_from_URL(url, "compiler/x-1.0-abc", is_spec=False) + assert out == content + cached_path = os.path.join(pl.BINARY_CACHE_DIR, "compiler__x-1.0-abc") + assert os.path.exists(cached_path) + with open(cached_path, "rb") as f: + assert f.read() == content + + +def test_download_timeout_logs(fake_requests, tmp_path): + url = "https://timeout.test/blob.tar.gz" + fake_requests.table[url] = lambda: req_exc.Timeout() + + out = pl.download_from_URL(url, "compiler/x-1.0-abc", is_spec=False) + assert out is None + with open(pl.TIMEOUT_LOG_FILE, "r") as f: + txt = f.read() + assert "compiler__x-1.0-abc" in txt + assert url in txt + + +def test_read_binary_package_extracts_and_cleans_paths(tar_with_placeholder_bytes, tmp_path): + # Prepare the "downloaded" tar path that read_binary_package will delete afterward + pkg = "compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3" + tar_hash = 
"f4d1969c7a82c76b962ae969c91d7b54cc11e0ce9f1ec9277789990f58aab351" + dl_path = os.path.join(pl.BINARY_CACHE_DIR, pkg.replace("/", "__")) + with open(dl_path, "wb") as f: + f.write(b"placeholder") + + # Run + pl.read_binary_package(tar_with_placeholder_bytes, pkg, tar_hash) + + # Verify cleaned tarinfo file written atomically + tarinfo_path = os.path.join(pl.TARINFO_DIR, f"{tar_hash}.json") + assert os.path.exists(tarinfo_path) + with open(tarinfo_path) as f: + items = json.load(f) + # Path should start *after* the 'pkg-tail/' segment due to remove_placeholder_directories + assert any(p.endswith(".spack/install_environment.json") for p in items) + + # tarball removed and buffer freed + assert not os.path.exists(dl_path) + + +def test_print_files_happy_path(fake_requests, sample_manifest_json, tar_with_placeholder_bytes, tmp_path): + # minimal database with one install + pkg_hash = "bsavlbvtqsc7yjtvka3ko3aem4wye2u3" + db = { + "database": { + "installs": { + pkg_hash: {"spec": {"name": "compiler-wrapper", "version": "1.0"}} + } + } + } + index = {} + existing_tarinfo_files = set() # no prior tarinfo + seen_tarball_hashes = set() + + # Wire URLs expected by print_files flow + manifest_url = ( + "https://binaries.spack.io/develop/v3/manifests/spec/" + f"compiler-wrapper/compiler-wrapper-1.0-{pkg_hash}.spec.manifest.json" + ) + tar_hash = "f4d1969c7a82c76b962ae969c91d7b54cc11e0ce9f1ec9277789990f58aab351" + blob_url = f"https://binaries.spack.io/develop/blobs/sha256/{tar_hash[:2]}/{tar_hash}" + + fake_requests.table[manifest_url] = lambda: type("R", (), {"status_code": 200, "content": sample_manifest_json})() + fake_requests.table[blob_url] = lambda: type("R", (), {"status_code": 200, "content": tar_with_placeholder_bytes})() + + # Act + pl.print_files(pkg_hash, db["database"]["installs"][pkg_hash], index, existing_tarinfo_files, seen_tarball_hashes) + + # Assert manifest saved safely + manifest_path = os.path.join(pl.MANIFEST_DIR, f"compiler-wrapper-1.0-{pkg_hash}.json") + assert os.path.exists(manifest_path) + # Assert tarinfo created + tarinfo_path = os.path.join(pl.TARINFO_DIR, f"{tar_hash}.json") + assert os.path.exists(tarinfo_path) + # Index updated & saved + assert pkg_hash in index + assert index[pkg_hash]["sha256"] == tar_hash + assert os.path.exists(pl.INDEX_FILE) + + +def test_print_files_skips_when_tarinfo_exists(sample_manifest_json, tmp_path): + # Prepare: existing tarinfo means no binary download + pkg_hash = "zzz111" + spec = {"spec": {"name": "foo", "version": "9.9"}} + index = {} + + # Pre-create tarinfo + prehash = "abcdead00face" + existing_tarinfo_files = {prehash} + seen_tarball_hashes = set() + + # Monkeypatch access_spec_manifest_media_type to return prehash so it matches existing_tarinfo_files + orig = pl.access_spec_manifest_media_type + pl.access_spec_manifest_media_type = lambda _db: prehash + + # Also ensure manifest path is considered existing so we don't try to download it + manifest_path = os.path.join(pl.MANIFEST_DIR, f"foo-9.9-{pkg_hash}.json") + with open(manifest_path, "w") as f: + json.dump({"dummy": True}, f) + + try: + pl.print_files(pkg_hash, spec, index, existing_tarinfo_files, seen_tarball_hashes) + finally: + pl.access_spec_manifest_media_type = orig + + # Should just update index and save, no download attempted + assert pkg_hash in index + assert index[pkg_hash]["sha256"] == prehash + with open(pl.INDEX_FILE) as f: + data = json.load(f) + assert pkg_hash in data + + +def test_download_404_returns_none(fake_requests): + url = 
"https://example.com/notfound" + fake_requests.table[url] = lambda: type("R", (), {"status_code": 404, "content": b""})() + assert pl.download_from_URL(url, "some/pkg-1.0-deadbeef", is_spec=True) is None + +def _mk_manifest_bytes_with_hash(tar_hash: str): + """Helper: build bytes that match remove_lines_spec_manifest's slicing contract.""" + body = {"data": [ + {"mediaType": "x/ignored", "checksum": "zzz"}, + {"mediaType": "application/vnd.spack.install.v2.tar+gzip", "checksum": tar_hash}, + ]} + raw = json.dumps(body).encode("utf-8") + return b"x" * 49 + raw + b"y" * 834 + + +def test_shared_tarball_logs_and_skips_second_download(fake_requests, tmp_path): + """ + Two different package hashes point to the same tarball hash. + We expect: + - First run downloads tarball and writes tarinfo. + - Second run logs to SHARED_TARBALL_HASH_FILE and (since we update + existing_tarinfo_files to include the first tarinfo) skips re-download. + """ + # Common tarball hash & blob URL + tar_hash = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcd" + blob_url = f"https://binaries.spack.io/develop/blobs/sha256/{tar_hash[:2]}/{tar_hash}" + + # Package A + pkg_hash_a = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" + name = "compiler-wrapper" + ver = "1.0" + man_url_a = f"https://binaries.spack.io/develop/v3/manifests/spec/{name}/{name}-{ver}-{pkg_hash_a}.spec.manifest.json" + + # Package B (different spec hash, same tarball) + pkg_hash_b = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" + man_url_b = f"https://binaries.spack.io/develop/v3/manifests/spec/{name}/{name}-{ver}-{pkg_hash_b}.spec.manifest.json" + + # Route manifests to same tarball hash, and the blob to some tar bytes + tar_bytes = b"\x1f\x8b" + b"tar" * 100 # not actually parsed; we won't open it here + # For the first call we want a real tar.gz + # We'll just reuse download + skip path by creating a minimal valid tar.gz: + import io, tarfile + buf = io.BytesIO() + with tarfile.open(fileobj=buf, mode="w:gz") as tf: + ti = tarfile.TarInfo(name=f"{name}-{ver}-{pkg_hash_a}/.spack/install_environment.json") + data = b"{}" + ti.size = len(data) + tf.addfile(ti, io.BytesIO(data)) + tar_bytes = buf.getvalue() + + fake_requests.table[man_url_a] = lambda: type("R", (), {"status_code": 200, "content": _mk_manifest_bytes_with_hash(tar_hash)})() + fake_requests.table[man_url_b] = lambda: type("R", (), {"status_code": 200, "content": _mk_manifest_bytes_with_hash(tar_hash)})() + fake_requests.table[blob_url] = lambda: type("R", (), {"status_code": 200, "content": tar_bytes})() + + index = {} + existing_tarinfo_files = set() + seen_tarball_hashes = set() + + # Run A (creates tarinfo and index) + pl.print_files(pkg_hash_a, {"spec": {"name": name, "version": ver}}, index, existing_tarinfo_files, seen_tarball_hashes) + tarinfo_path = os.path.join(pl.TARINFO_DIR, f"{tar_hash}.json") + assert os.path.exists(tarinfo_path) + + # Emulate the main loop behavior: keep using *the same* existing_tarinfo_files set, + # but update it to reflect that we've now created tarinfo. + existing_tarinfo_files.add(tar_hash) + + # Guard: if the second call tries to re-download, we'd need another blob mapping. + # We purposely *don't* add one here—so if it tries, the test will fail. 
+
+    # Run B (should log as shared and skip the binary download due to existing_tarinfo_files)
+    pl.print_files(pkg_hash_b, {"spec": {"name": name, "version": ver}}, index, existing_tarinfo_files, seen_tarball_hashes)
+
+    # Check that the shared log file captured the second spec hash + the shared tar hash
+    assert os.path.exists(pl.SHARED_TARBALL_HASH_FILE)
+    with open(pl.SHARED_TARBALL_HASH_FILE, "r") as f:
+        shared_log = f.read()
+    assert f"{pkg_hash_b}\t{tar_hash}" in shared_log
+
+    # Index should contain both manifests, with the same sha256
+    assert index[pkg_hash_a]["sha256"] == tar_hash
+    assert index[pkg_hash_b]["sha256"] == tar_hash
+
+
+def test_failed_tarball_download_is_logged(fake_requests, tmp_path):
+    """
+    If the blob download fails (404 or None), we should append to FAILED_TARBALL_DOWNLOAD_FILE
+    and not produce a tarinfo file.
+    """
+    name = "foo"
+    ver = "9.9"
+    pkg_hash = "cccccccccccccccccccccccccccccccc"
+    tar_hash = "dddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"
+    man_url = f"https://binaries.spack.io/develop/v3/manifests/spec/{name}/{name}-{ver}-{pkg_hash}.spec.manifest.json"
+    blob_url = f"https://binaries.spack.io/develop/blobs/sha256/{tar_hash[:2]}/{tar_hash}"
+
+    # Manifest OK, blob 404
+    fake_requests.table[man_url] = lambda: type("R", (), {"status_code": 200, "content": _mk_manifest_bytes_with_hash(tar_hash)})()
+    fake_requests.table[blob_url] = lambda: type("R", (), {"status_code": 404, "content": b""})()
+
+    index = {}
+    existing_tarinfo_files = set()
+    seen_tarball_hashes = set()
+
+    pl.print_files(pkg_hash, {"spec": {"name": name, "version": ver}}, index, existing_tarinfo_files, seen_tarball_hashes)
+
+    # No tarinfo created
+    tarinfo_path = os.path.join(pl.TARINFO_DIR, f"{tar_hash}.json")
+    assert not os.path.exists(tarinfo_path)
+
+    # Log entry created with the package filename, manifest hash, and tarball hash
+    assert os.path.exists(pl.FAILED_TARBALL_DOWNLOAD_FILE)
+    with open(pl.FAILED_TARBALL_DOWNLOAD_FILE, "r") as f:
+        log = f.read()
+    # Contains the manifest hash + tarball hash; also includes the package_filename prefix
+    assert f"manifest hash: {pkg_hash}, tarball hash: {tar_hash}" in log