diff --git a/dataset-generation/spack_db/Create_spack_DB.py b/dataset-generation/spack_db/Create_spack_DB.py
new file mode 100644
index 0000000..e2ccc8d
--- /dev/null
+++ b/dataset-generation/spack_db/Create_spack_DB.py
@@ -0,0 +1,143 @@
+import os
+import json
+import sqlite3
+import time
+from dapper_python.normalize import normalize_file_name
+
+# Configuration
+INDEX_PATH = "cache/spack.index.db.json"
+SQLITE_DB_PATH = "cache/spack-v1.db"
+
+def build_package_filelist_db():
+    # Load the master index
+    if not os.path.exists(INDEX_PATH):
+        print("❌ Index file not found.")
+        return
+
+    with open(INDEX_PATH, "r") as f:
+        index = json.load(f)
+
+    # Create SQLite DB
+    conn = sqlite3.connect(SQLITE_DB_PATH)
+    cursor = conn.cursor()
+
+    # Create the package_files table
+    cursor.execute('''
+        CREATE TABLE IF NOT EXISTS package_files (
+            id INTEGER PRIMARY KEY,
+            file_name TEXT,
+            normalized_file_name TEXT,
+            file_path TEXT,
+            package_name TEXT,
+            UNIQUE(file_path, package_name)
+        )
+    ''')
+
+    # Create indices for efficient lookups
+    cursor.execute('CREATE INDEX IF NOT EXISTS idx_file_name ON package_files(file_name)')
+    cursor.execute('CREATE INDEX IF NOT EXISTS idx_normalized_file_name ON package_files(normalized_file_name)')
+
+    # Create dataset_version table
+    cursor.execute('''
+        CREATE TABLE IF NOT EXISTS dataset_version(
+            version INTEGER,
+            format TEXT,
+            timestamp INTEGER
+        )
+    ''')
+
+    # Clear the dataset_version table
+    cursor.execute("DELETE FROM dataset_version")
+
+    # Record the current dataset version
+    cursor.execute(
+        "INSERT INTO dataset_version (version, format, timestamp) "
+        "VALUES (?, ?, ?)",
+        (1, "Spack", int(time.time()))
+    )
+
+    inserted_packages = 0
+    inserted_files = 0
+    for package_hash, entry in index.items():
+        try:
+            package_name = entry["name"]
+            version = entry["version"]
+            sha256 = entry["sha256"]
+
+            tarinfo_path = entry.get("tarinfo_path")
+            if not tarinfo_path or not os.path.exists(tarinfo_path):
+                print(f"⚠️ Missing tarinfo for: {package_name}-{version}-{sha256}")
+                continue
+
+            with open(tarinfo_path, "r") as f:
+                file_list = json.load(f)
+
+            package_inserted_or_updated = False
+
+            for file_path in file_list:
+                # Skip .spack/ metadata files
+                if file_path.startswith(".spack/"):
+                    continue
+
+                # Extract the file name
+                file_name = os.path.basename(file_path)
+
+                # Normalize the file name
+                try:
+                    normalized = normalize_file_name(file_name)
+                    normalized_file_name = str(normalized).lower()
+                except Exception as e:
+                    print(f"⚠️ Failed to normalize '{file_name}': {e}")
+                    normalized_file_name = file_name.lower()
+
+                # Insert into DB
+                cursor.execute(
+                    '''INSERT OR IGNORE INTO package_files
+                    (file_name, normalized_file_name, file_path, package_name)
+                    VALUES (?, ?, ?, ?)''',
+                    (file_name, normalized_file_name, file_path, package_name)
+                )
+
+                if cursor.rowcount > 0:
+                    inserted_files += 1
+                    package_inserted_or_updated = True  # New row added
+                    continue  # No need to update - freshly inserted
+
+                # Row already exists - check if any values changed
+                cursor.execute(
+                    '''SELECT file_name, normalized_file_name FROM package_files
+                    WHERE file_path = ? AND package_name = ?''',
+                    (file_path, package_name)
+                )
+                result = cursor.fetchone()
+                if result:
+                    existing_file_name, existing_normalized_name = result
+                    if (existing_file_name != file_name) or (existing_normalized_name != normalized_file_name):
+                        # Something changed - update the row
+                        cursor.execute(
+                            '''UPDATE package_files
+                            SET file_name = ?, normalized_file_name = ?
+                            WHERE file_path = ? AND package_name = ?''',
+                            (file_name, normalized_file_name, file_path, package_name)
+                        )
+                        package_inserted_or_updated = True  # A row was updated
+            if package_inserted_or_updated:
+                inserted_packages += 1
+
+        except Exception as e:
+            print(f"❌ Failed to insert {package_hash}: {e}")
+            continue
+
+    conn.commit()
+    conn.close()
+
+    print(f"🎉 Done. Inserted {inserted_files} new files from {inserted_packages} packages into {SQLITE_DB_PATH}")
+
+if __name__ == "__main__":
+    build_package_filelist_db()
diff --git a/dataset-generation/spack_db/README.md b/dataset-generation/spack_db/README.md
new file mode 100644
index 0000000..3158899
--- /dev/null
+++ b/dataset-generation/spack_db/README.md
@@ -0,0 +1,61 @@
+# Spack Build Cache Data Scraper & SQLite Database
+
+This project scrapes the Spack build cache by downloading, cleaning, and indexing spec manifests and binary tarballs into a local cache, then converts the data into a Spack SQLite database.
+
+The program builds a master index called `spack.index.db.json`.
+* Index layout (see the example entry at the end of this README):
+  * spec manifest hash as the unique key
+  * package name and version
+  * package tarball's unique SHA256 hash
+  * package manifest path in the local cache directory
+  * package tarinfo path in the local cache directory
+
+The program supports restart/resume if a run is interrupted. Skipped or malformed manifests are recorded, and files whose manifest and tarball information both already exist are not re-downloaded.
+
+## Directory Structure
+* `cache/spack.index.db.json` - master index
+* `cache/manifest/` - cleaned spec manifests
+* `cache/tarinfo/` - JSON file lists extracted from tarballs
+* `cache/binary_packages/` - temporary cache of downloaded tarballs
+* `cache/timeouts.txt` - packages that timed out while downloading
+* `cache/skipped_manifests.txt` - a list of manifests that could not be downloaded
+* `cache/malformed_manifests.txt` - manifests that failed parsing
+* `cache/missing_tarballs.txt` - manifests without a tarball hash
+* `cache/shared_tarballs.txt` - records multiple manifests that point to the same tarball
+* `cache/failed_tarball_downloads.txt` - tarballs that failed to download
+
+## Features
+* Retrieves package `.spec.manifest.json` files from Spack's binary mirror
+* Extracts the valid JSON payload and removes extra characters
+* Retrieves binary tarballs and extracts file lists
+* Creates and maintains a canonical JSON index that maps each package to its manifest and tarball information
+* Contains multiple checkpoints for safe restart/resume of the program
+* Records skipped/malformed manifests, missing hashes, and failed tarball downloads
+* Stores forward-slash paths in the JSON index for cross-platform use
+
+## Usage
+1. Install dependencies
+   ```bash
+   pip install requests
+   ```
+   The rest of the necessary modules are part of Python's standard library.
+
+2. Provide a database file
+   Update the `file_name` in `main()` if needed.
+
+3. Run the script
+   ```bash
+   python spack_db.py
+   ```
+
+4. Resume after interruption
+   If an interruption occurs, it is safe to re-run the script without losing data already processed.
+
+5. Retry manifests or tarballs
+   Delete `skipped_manifests.txt`, `malformed_manifests.txt`, or `failed_tarball_downloads.txt` to retry the corresponding failed manifest or tarball downloads.
+
+6. Run Create_spack_DB.py to create the SQLite database
+   ```bash
+   python Create_spack_DB.py
+   ```
+   The database will include all files extracted from the packages in the Spack build cache.
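+
+## Example
+
+For reference, a single entry in `spack.index.db.json` follows the index layout above. The entry below is illustrative, built from the `compiler-wrapper` example used throughout the scripts:
+```json
+{
+  "bsavlbvtqsc7yjtvka3ko3aem4wye2u3": {
+    "name": "compiler-wrapper",
+    "version": "1.0",
+    "sha256": "f4d1969c7a82c76b962ae969c91d7b54cc11e0ce9f1ec9277789990f58aab351",
+    "manifest_path": "cache/manifest/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3.json",
+    "tarinfo_path": "cache/tarinfo/f4d1969c7a82c76b962ae969c91d7b54cc11e0ce9f1ec9277789990f58aab351.json"
+  }
+}
+```
+
+Once `Create_spack_DB.py` has run, the resulting database can be queried directly. This is only a sketch; the file name `libz.so` is a hypothetical example:
+```bash
+sqlite3 cache/spack-v1.db \
+  "SELECT package_name, file_path FROM package_files WHERE file_name = 'libz.so' LIMIT 5;"
+```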
diff --git a/dataset-generation/spack_db/conftest.py b/dataset-generation/spack_db/conftest.py
new file mode 100644
index 0000000..3acaeb6
--- /dev/null
+++ b/dataset-generation/spack_db/conftest.py
@@ -0,0 +1,110 @@
+# conftest.py
+import io
+import json
+import tarfile
+import pytest
+
+
+import spack_db as pl
+
+@pytest.fixture(autouse=True)
+def isolate_fs(tmp_path, monkeypatch):
+    """Redirect all cache/config paths to a temp dir per test."""
+    cache = tmp_path / "cache"
+    (cache / "manifest").mkdir(parents=True, exist_ok=True)
+    (cache / "tarinfo").mkdir(parents=True, exist_ok=True)
+    (cache / "spec_manifests").mkdir(parents=True, exist_ok=True)
+    (cache / "binary_packages").mkdir(parents=True, exist_ok=True)
+
+    monkeypatch.setattr(pl, "INDEX_FILE", str(cache / "spack.index.db.json"), raising=False)
+    monkeypatch.setattr(pl, "MANIFEST_DIR", str(cache / "manifest"), raising=False)
+    monkeypatch.setattr(pl, "TARINFO_DIR", str(cache / "tarinfo"), raising=False)
+    monkeypatch.setattr(pl, "SPEC_CACHE_DIR", str(cache / "spec_manifests"), raising=False)
+    monkeypatch.setattr(pl, "BINARY_CACHE_DIR", str(cache / "binary_packages"), raising=False)
+
+    monkeypatch.setattr(pl, "CHECKPOINT_FILE", str(tmp_path / "progress.txt"), raising=False)
+    monkeypatch.setattr(pl, "SKIPPED_MANIFESTS_FILE", str(cache / "skipped_manifests.txt"), raising=False)
+    monkeypatch.setattr(pl, "MALFORMED_MANIFESTS_FILE", str(cache / "malformed_manifests.txt"), raising=False)
+    monkeypatch.setattr(pl, "TIMEOUT_LOG_FILE", str(cache / "timeouts.txt"), raising=False)
+    monkeypatch.setattr(pl, "MISSING_TARBALL_HASH_FILE", str(cache / "missing_tarballs.txt"), raising=False)
+    monkeypatch.setattr(pl, "SHARED_TARBALL_HASH_FILE", str(cache / "shared_tarballs.txt"), raising=False)
+    monkeypatch.setattr(pl, "FAILED_TARBALL_DOWNLOAD_FILE", str(cache / "failed_tarball_downloads.txt"), raising=False)
+
+    # Ensure directories exist for atomic writes
+    (tmp_path / "cache").mkdir(exist_ok=True)
+    yield
+
+
+@pytest.fixture
+def sample_manifest_json():
+    """
+    Create the *actual bytes* expected by remove_lines_spec_manifest:
+    take a valid JSON, then pad 49 bytes in front and 834 bytes at the end.
+    """
+    body = {
+        "data": [
+            {"mediaType": "irrelevant/type", "checksum": "abc"},
+            {"mediaType": "application/vnd.spack.install.v2.tar+gzip",
+             "checksum": "f4d1969c7a82c76b962ae969c91d7b54cc11e0ce9f1ec9277789990f58aab351"}
+        ]
+    }
+    raw = json.dumps(body).encode("utf-8")
+    return b"x" * 49 + raw + b"y" * 834
+
+
+@pytest.fixture
+def tar_with_placeholder_bytes():
+    """
+    Build a tar in-memory whose members include the __spack_path_placeh segments
+    and the package-tail folder (e.g., 'compiler-wrapper-1.0-<hash>').
+    """
+    pkg_tail = "compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3"
+    member_name = (
+        "home/software/spack/__spack_path_placeholder__/__spack_path_placeholder__/"
+        "__spack_path_placeholder__/__spack_path_placeh/morepadding/linux-x86_64_v3/"
+        f"{pkg_tail}/.spack/install_environment.json"
+    )
+
+    buf = io.BytesIO()
+    with tarfile.open(fileobj=buf, mode="w:gz") as tf:
+        data = b"{}"
+        tarinfo = tarfile.TarInfo(name=member_name)
+        tarinfo.size = len(data)
+        tf.addfile(tarinfo, io.BytesIO(data))
+    return buf.getvalue()
+
+
+class DummyResp:
+    def __init__(self, status_code=200, content=b""):
+        self.status_code = status_code
+        self.content = content
+
+
+@pytest.fixture
+def fake_requests(monkeypatch):
+    """
+    Monkeypatch requests.get with programmable behavior per-URL.
+
+    Usage:
+        fake_requests.table[url] = lambda: DummyResp(200, b"...")
+
+    A factory may return a DummyResp-like object, or an Exception instance,
+    which will be raised instead.
+    """
+    table = {}
+
+    def _get(url, *args, **kwargs):
+        if url not in table:
+            raise AssertionError(f"Unexpected URL requested: {url}")
+        result = table[url]()
+        # Allow raising exceptions (e.g., Timeout) from factories
+        if isinstance(result, Exception):
+            raise result
+        return result
+
+    # Expose for tests to fill
+    _get.table = table
+    monkeypatch.setattr(pl.requests, "get", _get)
+    return _get
\ No newline at end of file
diff --git a/dataset-generation/spack_db/rename_tarinfo_file.py b/dataset-generation/spack_db/rename_tarinfo_file.py
new file mode 100644
index 0000000..3d16af6
--- /dev/null
+++ b/dataset-generation/spack_db/rename_tarinfo_file.py
@@ -0,0 +1,35 @@
+import os
+import re
+
+# Path to the tarinfo directory
+TARINFO_DIR = "cache/tarinfo"
+
+# Regex: match <name>-<64-char-sha256>.json
+pattern = re.compile(r"^(.*)-([a-f0-9]{64})\.json$")
+
+# Counters
+renamed = 0
+skipped = 0
+
+for filename in os.listdir(TARINFO_DIR):
+    match = pattern.match(filename)
+    if match:
+        sha256_hash = match.group(2)
+        new_filename = f"{sha256_hash}.json"
+
+        old_path = os.path.join(TARINFO_DIR, filename)
+        new_path = os.path.join(TARINFO_DIR, new_filename)
+
+        # Skip if the target file already exists
+        if os.path.exists(new_path):
+            print(f"⚠️ Skipping {filename} (target {new_filename} already exists)")
+            skipped += 1
+            continue
+
+        os.rename(old_path, new_path)
+        renamed += 1
+    else:
+        print(f"❓ Skipping non-matching file: {filename}")
+        skipped += 1
+
+print(f"\n✅ Done! Renamed {renamed} files. Skipped {skipped} files.")
diff --git a/dataset-generation/spack_db/spack_db.py b/dataset-generation/spack_db/spack_db.py
new file mode 100644
index 0000000..cea6d15
--- /dev/null
+++ b/dataset-generation/spack_db/spack_db.py
@@ -0,0 +1,633 @@
+# json is used to load JSON files
+import json
+
+# requests is used to download URLs automatically
+import requests
+
+# io for reading and writing streams such as text, binary, and raw data
+import io
+
+# for reading and writing tar archives
+import tarfile
+
+# to quiet warnings
+import warnings
+warnings.filterwarnings("ignore")
+
+# os to create a cache folder - disk-based caching
+import os
+
+# tempfile and shutil for the index temp file
+import tempfile
+import shutil
+
+from pathlib import Path
+
+# Configuration
+
+# spack.index.db.json maps each package back to the package name, version, SHA256 hash,
+# the path to the manifest JSON file, and the path to the tarinfo file list
+INDEX_FILE = "cache/spack.index.db.json"
+
+# MANIFEST_DIR is the source of per-package metadata used in the master index
+MANIFEST_DIR = "cache/manifest"
+
+# TARINFO_DIR contains the extracted list of files from each binary tarball;
+# this is used when building the SQLite database so the tarballs do not have to be reprocessed
+TARINFO_DIR = "cache/tarinfo"
+
+# BINARY_CACHE_DIR temporarily holds the downloaded tarballs;
+# each file is deleted after processing.
+BINARY_CACHE_DIR = "cache/binary_packages"
+TIMEOUT_LOG_FILE = "cache/timeouts.txt"
+
+# checkpoint so the script can be safely stopped at any time;
+# progress.txt saves the spec manifest hash
+CHECKPOINT_FILE = "progress.txt"
+
+# files to track manifests that could not be downloaded or processed
+SKIPPED_MANIFESTS_FILE = "cache/skipped_manifests.txt"
+MALFORMED_MANIFESTS_FILE = "cache/malformed_manifests.txt"
+MISSING_TARBALL_HASH_FILE = "cache/missing_tarballs.txt"
+SHARED_TARBALL_HASH_FILE = "cache/shared_tarballs.txt"
+FAILED_TARBALL_DOWNLOAD_FILE = "cache/failed_tarball_downloads.txt"
+
+# create cache directories
+os.makedirs(MANIFEST_DIR, exist_ok=True)
+os.makedirs(TARINFO_DIR, exist_ok=True)
+os.makedirs(BINARY_CACHE_DIR, exist_ok=True)
+
+# Normalize any filesystem path to a forward-slash (POSIX) string for JSON storage.
+def _to_posix(p: str) -> str:
+    return Path(p).as_posix()
+
+# load the existing index, if any, so info can be added to it
+def load_index():
+    # if the index file exists, read it
+    if os.path.exists(INDEX_FILE):
+        with open(INDEX_FILE, "r") as f:
+            # return as a dictionary for easy manipulation and JSON formatting
+            return json.load(f)
+    # if the index does not exist, an empty dictionary is returned
+    return {}
+
+# save index
+def save_index(index):
+
+    # create a backup of the previous index
+    if os.path.exists(INDEX_FILE):
+        shutil.copy(INDEX_FILE, INDEX_FILE + ".bak")
+
+    # save to a temp file, then move to replace
+    temp_dir = os.path.dirname(INDEX_FILE)
+    with tempfile.NamedTemporaryFile("w", dir=temp_dir, delete=False) as tmp:
+        json.dump(index, tmp, indent=2)
+        temp_name = tmp.name
+
+    shutil.move(temp_name, INDEX_FILE)
+
+# format for entries added to the index
+def update_index_entry(index, package_hash, package_value, package_zip_hash):
+    name = package_value['spec']['name']
+    version = package_value['spec']['version']
+    manifest_filename = f"{name}-{version}-{package_hash}.json"
+    tarinfo_filename = f"{package_zip_hash}.json"
+
+    manifest_path_fs = os.path.join(MANIFEST_DIR, manifest_filename)
+    tarinfo_path_fs = os.path.join(TARINFO_DIR, tarinfo_filename)
+
+    index[package_hash] = {
+        "name": name,
+        "version": version,
+        "sha256": package_zip_hash,
+
+        # store the forward-slash form in JSON for portability
+        "manifest_path": _to_posix(manifest_path_fs),
+        "tarinfo_path": _to_posix(tarinfo_path_fs),
+    }
+
+# load the last saved package hash
+def load_checkpoint():
+    # check if progress.txt exists
+    if os.path.exists(CHECKPOINT_FILE):
+        with open(CHECKPOINT_FILE, "r") as f:
+            # read and return the last processed package_hash;
+            # strip removes trailing newlines or spaces
+            return f.read().strip()
+
+    # if the file does not exist, return None;
+    # if None, start from the beginning
+    return None
+
+
+# save the last processed manifest package_hash to progress.txt;
+# if the program is interrupted, the saved package_hash will be
+# the starting point when rerun
+def save_checkpoint(package_hash):
+    with open(CHECKPOINT_FILE, "w") as f:
+        f.write(package_hash)
+
+
+# read the input file
+def readmyfile(myfile):
+    try:
+        with open(myfile, 'r') as file:
+            # the database is the Spack database JSON from the Spack build cache
+            db = json.load(file)  # ~8.6 seconds to read in the large JSON file
+            return db
+
+    except FileNotFoundError:
+        print(f"Error: The file '{myfile}' was not found.")
+    except Exception as e:
+        print(f"Error occurred in readmyfile: {e}")
+
+
+# build the downloadable spec manifest URL
+def make_spec_manifest_URL(package_hash, package_hash_value):
+    # the goal is to build a URL of the form:
+    # https://binaries.spack.io/develop/v3/manifests/spec/<name>/<name>-<version>-<hash>.spec.manifest.json
+    # example URL for compiler-wrapper:
+    # https://binaries.spack.io/develop/v3/manifests/spec/compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3.spec.manifest.json
+
+    myURL = 'https://binaries.spack.io/develop/v3/manifests/spec/'
+
+    # this accesses the package name (not the package hash that we have been using as package_hash)
+    package_name = package_hash_value['spec']['name']
+
+    # this accesses the package version number
+    package_version = package_hash_value['spec']['version']
+
+    # package_filename is used later when removing placeholder directories from the tarball file paths
+    package_filename = (package_name + '/' + package_name + '-' + package_version
+                        + '-' + package_hash)
+
+    # this updates the URL
+    myURL += (package_filename + '.spec.manifest.json')
+
+    # return the URL for the spec manifest and the package_filename
+    return myURL, package_filename
+
+# automatically download contents from the URL
+def download_from_URL(theURL, package, is_spec=True):
+
+    # build the filename
+    # Example:
+    # "compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3.spec.manifest.json"
+    # is turned into
+    # "compiler-wrapper__compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3.spec.manifest.json"
+    package_name = package.replace('/', '__')
+
+    # if is_spec is True (the file ends with ".spec.manifest.json"),
+    # the file is not saved; the response is returned to remove_lines_spec_manifest() for further manipulation.
+    # if the file ends with .tar.gz, it is saved in BINARY_CACHE_DIR
+    cache_dir = BINARY_CACHE_DIR if not is_spec else None
+
+    # the full file path is then, e.g.:
+    # "cache/binary_packages/compiler-wrapper__compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3"
+    cached_path = os.path.join(cache_dir, package_name) if cache_dir else None
+
+    if is_spec:
+        print(f"downloading manifest for {package_name}")
+    else:
+        print(f"temporary save location: {_to_posix(cached_path)} for {package_name}")
+
+    # if the cache exists, it does not need to be redownloaded
+    if cached_path and os.path.exists(cached_path):
+
+        print(f"Using cached file: {_to_posix(cached_path)}")
+
+        # rb is read binary
+        with open(cached_path, "rb") as f:
+            return f.read()
+
+    try:
+        label = _to_posix(cached_path) if cached_path else f"manifest: {package_name}"
+        print(f"trying download for {label}")
+
+        # time out after 60 seconds
+        response = requests.get(theURL, timeout=60, verify=False)
+
+        # a status code of 200 means the request was successful;
+        # a status code of 404 means the resource does not exist
+        if response.status_code == 200:
+            if cached_path:
+                print(f"download successful for {_to_posix(cached_path)}")
+
+                # save to cache on a successful request
+                # wb is write binary
+                with open(cached_path, "wb") as f:
+                    f.write(response.content)
+            return response.content
+
+        else:
+            # if the URL does not exist, skip and move to the next package
+            print(f"download failed for package: {package_name}\n")
+
+            # return None to stop the process due to the failed download - goes back to run_program
+            return None
+
+    except requests.exceptions.Timeout:
+        print(f"⏰ Timeout: Skipping package that took too long to download: {package_name}")
+        # Append to the log file immediately
+        with open(TIMEOUT_LOG_FILE, "a") as f:
"a") as f: + f.write(f"{package_name}\t{theURL}\n") + return None + + except Exception as e: + print(f"download_from_URL package {package_name}, error: {e}") + + # return None to stop process due to download failing + return None + +# remove unnecessary lines in file +def remove_lines_spec_manifest(myfile): + + # Accept bytes or str; extract between first '{' and last '}' + data = myfile if isinstance(myfile, (bytes, bytearray)) else myfile.encode("utf-8") + + start = data.find(b"{") + end = data.rfind(b"}") + + if start == -1 or end == -1 or end < start: + raise ValueError("Malformed manifest: JSON braces not found") + + return json.loads(data[start:end+1].decode("utf-8")) + +# returns checksum, sha256 hash used to download the binary tarball +def access_spec_manifest_media_type(db): + + try: + # get the value for the key 'data' from the db + # if the key 'data' does not exist, return empty list + # temp is db['data'] + for temp in db.get('data', []): + if temp.get('mediaType') == 'application/vnd.spack.install.v2.tar+gzip': + + # the checksum is the sha256 hash + return temp.get('checksum') + + except Exception as e: + print(f"Error occured in access_spec_manifest_media_type: {e}") + return None + +# uses checksum returned to generate URL for binary tarball download +def make_binary_package_URL(package_zip_hash): +# example URL for compiler-wrapper binary package: +# https://binaries.spack.io/develop/blobs/sha256/f4/f4d1969c7a82c76b962ae969c91d7b54cc11e0ce9f1ec9277789990f58aab351 + + myURL = 'https://binaries.spack.io/develop/blobs/sha256/' + + first_byte = package_zip_hash[:2] + + myURL = myURL + first_byte + '/' + package_zip_hash + + return myURL + +# ensure tarinfo completeness +def write_tarinfo_safely(tarinfo_path, file_list): + temp_dir = os.path.dirname(tarinfo_path) + with tempfile.NamedTemporaryFile("w", dir=temp_dir, delete=False) as tmp: + json.dump(file_list, tmp, indent=2) + temp_name = tmp.name + shutil.move(temp_name, tarinfo_path) + + +# ensure manifest completeness +def write_manifest_safely(manifest_path, manifest_data): + temp_dir = os.path.dirname(manifest_path) + with tempfile.NamedTemporaryFile("w", dir=temp_dir, delete=False) as tmp: + json.dump(manifest_data, tmp, indent=2) + temp_name = tmp.name + shutil.move(temp_name, manifest_path) + +# using python tarfile module io module to list all the files in the downloaded tarball +# myfile is the tar_file response.content and the package is the hash we will split by +# package is the package name and the version +def read_binary_package(myfile, package, package_zip_hash): + file_list = [] + try: + with io.BytesIO(myfile) as tar_buffer: + with tarfile.open(fileobj = tar_buffer, mode="r:*") as tar: + + print(f"Files in the tar archive for {package.split('/')[0]}:") + i = 1 + for member in tar.getmembers(): + if member.isfile(): + # member.name for compiler-wrapper is "home/software/spack/__spack_path_placeholder__/ + # __spack_path_placeholder__/__spack_path_placeholder__/ + # __spack_path_placeholder__/__spack_path_placeholder__/ + # __spack_path_placeholder__/__spack_path_placeholder__/__spack_path_placeholder__/ + # __spack_path_placeh/morepadding/linux-x86_64_v3/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3/ + # .spack/install_environment.json" + # package for compiler-wrapper is "compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3" + clean_path = remove_placeholder_directories(i, member.name, package) + + # this will add the files that are in the package to a clean_list + if clean_path: + 
+                            file_list.append(clean_path)
+                        i += 1
+
+    except tarfile.ReadError as e:
+        print(f"Error reading tar file: {e}")
+        return
+
+    # save the file names to the tarinfo file
+    tarinfo_path = os.path.join(TARINFO_DIR, f"{package_zip_hash}.json")
+    write_tarinfo_safely(tarinfo_path, file_list)
+
+    # remove the tarball once processed
+    tarball_path = os.path.join(BINARY_CACHE_DIR, package.replace('/', '__'))
+    if os.path.exists(tarball_path):
+        os.remove(tarball_path)
+    del myfile
+
+# remove the placeholder directories in the file path
+def remove_placeholder_directories(i, name, package):
+    # i is the counter for file enumeration
+    # name for compiler-wrapper looks like "home/software/spack/__spack_path_placeholder__/.../
+    # __spack_path_placeh/morepadding/linux-x86_64_v3/
+    # compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3/.spack/install_environment.json"
+    # package for compiler-wrapper is "compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3"
+
+    # updatedpackage_list for compiler-wrapper is "['compiler-wrapper', 'compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3']"
+    updatedpackage_list = package.split('/')
+
+    # updatedpackage_name is "compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3"
+    updatedpackage_name = updatedpackage_list[1]
+
+    placeholder = "__spack_path_placeh"
+
+    # split by updatedpackage_name
+    split_list = name.split(updatedpackage_name)
+
+    try:
+        if placeholder not in split_list[0]:
+            print("split_list", split_list)
+
+        # return the file name without the placeholder path
+        elif len(split_list) > 1:
+            updatedname = split_list[1][1:]
+
+            print(f"file {i}: ", updatedname)
+            return updatedname  # return to add to the list of files for the respective package
+
+    except Exception as e:
+        print(f"Error in remove_placeholder_directories: {e}")
+        return None
+
+
+def print_files(package_hash, package_value, index, existing_tarinfo_files, seen_tarball_hashes):
+    name = package_value['spec']['name']
+    version = package_value['spec']['version']
+
+    manifest_filename = f"{name}-{version}-{package_hash}.json"
+    manifest_path = os.path.join(MANIFEST_DIR, manifest_filename)
+
+    # use the existing cleaned manifest if available
+    if os.path.exists(manifest_path):
+        print(f"Using existing cleaned manifest: {_to_posix(manifest_path)}")
+        with open(manifest_path, "r") as f:
+            clean_spec_manifest = json.load(f)
+
+        # returns the URL for the spec manifest file and the package_filename
+        theURL, package_filename = make_spec_manifest_URL(package_hash, package_value)
+
+    else:
+        # download if the manifest does not exist
+
+        # returns the URL for the spec manifest file and the package_filename
+        theURL, package_filename = make_spec_manifest_URL(package_hash, package_value)
+
+        # download the spec manifest JSON for the package of interest
+        temp = download_from_URL(theURL, package_filename, is_spec=True)
+
+        # return if the URL does not exist
+        if temp is None:
+            print(f"Could not download manifest: {package_filename} - recording and skipping.\n")
+
+            # record the failed manifest hash
+            with open(SKIPPED_MANIFESTS_FILE, "a") as f:
+                f.write(f"{package_hash}\n")
+
+            # exit print_files and go back to run_program
+            return
+
+        print("✅ Downloaded spec manifest")
+
+        try:
+            # remove unnecessary bytes from the downloaded spec manifest
+            clean_spec_manifest = remove_lines_spec_manifest(temp)
+        except Exception as e:
+            print(f"Failed to parse manifest {package_filename}: {e}")
+            with open(MALFORMED_MANIFESTS_FILE, "a") as f:
+                f.write(f"{package_hash}\t{theURL}\t{type(e).__name__}: {e}\n")
+            # also add it to the skip file so it is not reprocessed on restart
+            with open(SKIPPED_MANIFESTS_FILE, "a") as f:
+                f.write(f"{package_hash}\n")
+            return
+
+        # write the cleaned manifest information to the manifest file
+        write_manifest_safely(manifest_path, clean_spec_manifest)
+        print(f"✅ Manifest safely written: {_to_posix(manifest_path)}")
+
+    # find the mediaType that contains the hash for the package tarball install
+    package_zip_hash = access_spec_manifest_media_type(clean_spec_manifest)
+    print(f"✅ Extracted zip hash: {package_zip_hash}")
+
+    # if the 'data' key was not found in access_spec_manifest_media_type, None is returned and we go back to run_program
+    if package_zip_hash is None:
+        print(f"No tarball hash found in manifest: {package_filename}")
+
+        # track that this manifest has no downloadable binary tarball
+        with open(MISSING_TARBALL_HASH_FILE, "a") as f:
+            f.write(f"{package_hash}\n")
+        # go back to run_program
+        return
+
+    # track whether the tarball hash has already been processed
+    if package_zip_hash in seen_tarball_hashes:
+        with open(SHARED_TARBALL_HASH_FILE, "a") as f:
+            # if this manifest points to a tarball that has already been seen,
+            # it will not create a new tarinfo entry,
+            # but it will get a new manifest entry
+            f.write(f"{package_hash}\t{package_zip_hash}\n")
+    else:
+        seen_tarball_hashes.add(package_zip_hash)
+
+    expected_tarinfo_hash = package_zip_hash
+
+    if expected_tarinfo_hash in existing_tarinfo_files:
+        print(f"✅ Already have tarinfo for {name}-{version}-{expected_tarinfo_hash}, skipping binary download.")
+        update_index_entry(index, package_hash, package_value, package_zip_hash)
+        save_index(index)
+        print("✅ Saved Index")
+        return
+
+    else:
+        # make the binary package URL for installing the package
+        binary_package_URL = make_binary_package_URL(package_zip_hash)
+        print(f"🔗 Downloading binary: {binary_package_URL}")
+
+        # download the binary package file from the generated URL
+        tempbinary = download_from_URL(binary_package_URL, package_filename, is_spec=False)
+
+        if tempbinary is None:
+            # track the failed tarball download
+            with open(FAILED_TARBALL_DOWNLOAD_FILE, "a") as f:
+                f.write(f"{package_filename}: manifest hash: {package_hash}, tarball hash: {package_zip_hash}\n")
+            return
+
+        print("✅ Binary package downloaded")
+
+        # read the binary package
+        read_binary_package(tempbinary, package_filename, package_zip_hash)
+        print("✅ Finished reading binary package")
+        update_index_entry(index, package_hash, package_value, package_zip_hash)
+        save_index(index)
+        print(f"Updated Index with {package_filename}-{package_zip_hash}")
+        save_checkpoint(package_hash)
+        print("✅ Saved Index")
+
+# program dispatcher
+def run_program(package_hash, database, index, existing_tarinfo_files, seen_tarball_hashes):
+    installs = database['database']['installs']
+
+    # get the installs value for this key: the package name, version, etc.
+    package_value = installs[package_hash]
+    print_files(package_hash, package_value, index, existing_tarinfo_files, seen_tarball_hashes)
+
+
+def main():
+    file_name = "e2a6969c742c8ee33deba2d210ce2243cd3941c6553a3ffc53780ac6463537a9"
+
+    database = readmyfile(file_name)
+
+    # load the list of previously skipped manifest hashes
+    if os.path.exists(SKIPPED_MANIFESTS_FILE):
+        with open(SKIPPED_MANIFESTS_FILE, "r") as f:
+            skipped_hashes = set(line.strip() for line in f)
+    else:
+        skipped_hashes = set()
+
+    # load manifests that are missing binary tarball hashes (e.g. mediaType not found)
+    if os.path.exists(MISSING_TARBALL_HASH_FILE):
+        with open(MISSING_TARBALL_HASH_FILE, "r") as f:
+            missing_tarball_hashes = set(line.strip() for line in f)
+    else:
+        missing_tarball_hashes = set()
+
+    # load tarballs that were not downloadable
+    if os.path.exists(FAILED_TARBALL_DOWNLOAD_FILE):
+        with open(FAILED_TARBALL_DOWNLOAD_FILE, "r") as f:
+            failed_tarball_hashes = set(
+                line.strip().split("tarball hash: ")[-1] for line in f if "tarball hash:" in line
+            )
+    else:
+        failed_tarball_hashes = set()
+
+    # list the install keys
+    install_keys = list(database['database']['installs'])
+    num_keys = len(install_keys)
+
+    # load the last processed package hash from the checkpoint if it exists
+    last_processed = load_checkpoint()
+    index = load_index()
+
+    existing_tarinfo_files = {
+        Path(fname).stem
+        for fname in os.listdir(TARINFO_DIR)
+        if fname.endswith(".json")
+    }
+
+    # track already-processed tarball hashes to find shared ones
+    seen_tarball_hashes = set()
+
+    SAVE_INTERVAL = 50
+
+    print("Starting...Will skip packages already fully processed.")
+
+    try:
+        for i, package_hash in enumerate(install_keys):
+
+            # skip if package_hash is in the skipped manifests file
+            if package_hash in skipped_hashes:
+                continue
+
+            # skip if the manifest had no usable tarball
+            if package_hash in missing_tarball_hashes:
+                continue
+
+            if package_hash in index:
+                entry = index[package_hash]
+                manifest_path = entry.get("manifest_path", "")
+                tarinfo_path = entry.get("tarinfo_path", "")
+                tarball_hash = entry.get("sha256", "")
+
+                manifest_exists = bool(manifest_path) and Path(manifest_path).exists()
+                tarinfo_exists = bool(tarinfo_path) and Path(tarinfo_path).exists()
+
+                if manifest_exists and tarinfo_exists:
+                    continue
+
+                # if the tarball previously failed, skip retrying it
+                if tarball_hash in failed_tarball_hashes:
+                    print(f"🚫 Skipping manifest with previously failed tarball download: {package_hash}")
+                    continue
+
+            print(f"📦 package {i + 1} out of {num_keys} packages\n")
+
+            run_program(package_hash, database, index, existing_tarinfo_files, seen_tarball_hashes)
+
+            # save checkpoint and index every N packages
+            if (i + 1) % SAVE_INTERVAL == 0:
+                save_checkpoint(package_hash)
+                save_index(index)
+                print(f"Saved checkpoint and index at package {i + 1}")
+
+    except KeyboardInterrupt:
+        save_checkpoint(package_hash)
+        save_index(index)
+        print("\n🛑 Interrupted. Progress saved.")
Progress saved.") + + if last_processed: + val = database['database']['installs'].get(last_processed) + if val: + name = val['spec']['name'] + version = val['spec']['version'] + + print(f"Last checkpoint was: {name}-{version}, {last_processed}") + else: + print(f"Checkpoint not found in current file: {last_processed}") + print(f"file may have changed since the last run") + finally: + save_checkpoint(package_hash) + save_index(index) + print("\n🎊 Complete (or safely stopped). Script will resume where it left off.") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/dataset-generation/spack_db/spack_index_length.py b/dataset-generation/spack_db/spack_index_length.py new file mode 100644 index 0000000..0111f73 --- /dev/null +++ b/dataset-generation/spack_db/spack_index_length.py @@ -0,0 +1,19 @@ +import json + +# Path to your index file +INDEX_FILE = "cache/spack.index.db.json" + +def main(): + try: + with open(INDEX_FILE, "r") as f: + index = json.load(f) + print(f"✅ Number of entries in index: {len(index)}") + except FileNotFoundError: + print(f"❌ File not found: {INDEX_FILE}") + except json.JSONDecodeError: + print(f"❌ Failed to parse JSON. The file may be corrupted.") + except Exception as e: + print(f"❌ Unexpected error: {e}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/dataset-generation/spack_db/test_spack_db.py b/dataset-generation/spack_db/test_spack_db.py new file mode 100644 index 0000000..dcf3181 --- /dev/null +++ b/dataset-generation/spack_db/test_spack_db.py @@ -0,0 +1,356 @@ +# test_spack_db.py +import json +import os +import io +import tarfile +import pytest +import spack_db as pl +from requests import exceptions as req_exc + + +def test_update_index_entry_sets_paths(tmp_path): + idx = {} + pkg_hash = "deadbeef" + value = {"spec": {"name": "foo", "version": "1.2.3"}} + tar_hash = "abc123" + pl.update_index_entry(idx, pkg_hash, value, tar_hash) + assert pkg_hash in idx + entry = idx[pkg_hash] + assert entry["name"] == "foo" + assert entry["version"] == "1.2.3" + assert entry["sha256"] == "abc123" + assert entry["manifest_path"].endswith(f"manifest/foo-1.2.3-deadbeef.json") + assert entry["tarinfo_path"].endswith(f"tarinfo/abc123.json") + + +def test_index_save_and_backup(tmp_path): + pl.save_index({"a": 1}) + # First save: no .bak yet + assert os.path.exists(pl.INDEX_FILE) + assert not os.path.exists(pl.INDEX_FILE + ".bak") + + # Second save should write a .bak of previous + pl.save_index({"a": 2}) + assert os.path.exists(pl.INDEX_FILE + ".bak") + with open(pl.INDEX_FILE) as f: + assert json.load(f)["a"] == 2 + with open(pl.INDEX_FILE + ".bak") as f: + assert json.load(f)["a"] == 1 + + +def test_checkpoint_roundtrip(tmp_path): + pl.save_checkpoint("pkg-hash-123") + assert pl.load_checkpoint() == "pkg-hash-123" + + +def test_remove_lines_spec_manifest_and_extract_hash(sample_manifest_json): + cleaned = pl.remove_lines_spec_manifest(sample_manifest_json) + # ensure we got dict and our access function finds the right tarball hash + assert isinstance(cleaned, dict) + h = pl.access_spec_manifest_media_type(cleaned) + assert h == "f4d1969c7a82c76b962ae969c91d7b54cc11e0ce9f1ec9277789990f58aab351" + +def test_malformed_manifest_is_logged(fake_requests, tmp_path): + # Set up a fake package hash and minimal database entry + pkg_hash = "badbadbadbadbadbadbadbadbadbadba" + name = "broken-pkg" + ver = "1.0" + db_entry = {"spec": {"name": name, "version": ver}} + + # Manifest URL that print_files will request + manifest_url = 
f"https://binaries.spack.io/develop/v3/manifests/spec/{name}/{name}-{ver}-{pkg_hash}.spec.manifest.json" + + # Make fake_requests return invalid bytes (invalid JSON) + fake_requests.table[manifest_url] = lambda: type( + "R", (), {"status_code": 200, "content": b"xxxx"} # bad data + )() + + # Run print_files, which should try to parse the manifest and fail + index = {} + existing_tarinfo_files = set() + seen_tarball_hashes = set() + pl.print_files(pkg_hash, db_entry, index, existing_tarinfo_files, seen_tarball_hashes) + + # Check: malformed_manifests.txt exists and has the hash + URL + error + assert os.path.exists(pl.MALFORMED_MANIFESTS_FILE) + with open(pl.MALFORMED_MANIFESTS_FILE, "r") as f: + log_content = f.read() + assert pkg_hash in log_content + assert manifest_url in log_content + assert "ValueError" in log_content or "JSON" in log_content + + # Check: skipped_manifests.txt + assert os.path.exists(pl.SKIPPED_MANIFESTS_FILE) + with open(pl.SKIPPED_MANIFESTS_FILE, "r") as f: + skipped_content = f.read() + assert pkg_hash in skipped_content + +def test_make_urls(): + tar_hash = "f4d1969c7a82c76b962ae969c91d7b54cc11e0ce9f1ec9277789990f58aab351" + url = pl.make_binary_package_URL(tar_hash) + + # Basic correctness + assert url.endswith("/" + tar_hash) + assert f"/{tar_hash[:2]}/" in url # path uses first two hex chars as subdir + + # manifest URL + pkg_hash = "bsavlbvtqsc7yjtvka3ko3aem4wye2u3" + pkg_val = {"spec": {"name": "compiler-wrapper", "version": "1.0"}} + murl, package_filename = pl.make_spec_manifest_URL(pkg_hash, pkg_val) + assert "manifests/spec/compiler-wrapper/" in murl + assert murl.endswith(f"compiler-wrapper-1.0-{pkg_hash}.spec.manifest.json") + assert package_filename == f"compiler-wrapper/compiler-wrapper-1.0-{pkg_hash}" + + + +def test_download_from_URL_spec_does_not_persist(fake_requests, tmp_path): + # For is_spec=True, function returns bytes but should NOT save a file to SPEC_CACHE_DIR + url = "https://example.com/x.spec.manifest.json" + content = b"hello" + fake_requests.table[url] = lambda: type("R", (), {"status_code": 200, "content": content})() + + out = pl.download_from_URL(url, "compiler/x-1.0-abc", is_spec=True) + # Returned content + assert out == content + # Not persisted + cached_path = os.path.join(pl.SPEC_CACHE_DIR, "compiler__x-1.0-abc") + assert not os.path.exists(cached_path) + + +def test_download_from_URL_binary_persists(fake_requests, tmp_path): + url = "https://example.com/blob.tar.gz" + content = b"tarbytes" + fake_requests.table[url] = lambda: type("R", (), {"status_code": 200, "content": content})() + + out = pl.download_from_URL(url, "compiler/x-1.0-abc", is_spec=False) + assert out == content + cached_path = os.path.join(pl.BINARY_CACHE_DIR, "compiler__x-1.0-abc") + assert os.path.exists(cached_path) + with open(cached_path, "rb") as f: + assert f.read() == content + + +def test_download_timeout_logs(fake_requests, tmp_path): + url = "https://timeout.test/blob.tar.gz" + fake_requests.table[url] = lambda: req_exc.Timeout() + + out = pl.download_from_URL(url, "compiler/x-1.0-abc", is_spec=False) + assert out is None + with open(pl.TIMEOUT_LOG_FILE, "r") as f: + txt = f.read() + assert "compiler__x-1.0-abc" in txt + assert url in txt + + +def test_read_binary_package_extracts_and_cleans_paths(tar_with_placeholder_bytes, tmp_path): + # Prepare the "downloaded" tar path that read_binary_package will delete afterward + pkg = "compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3" + tar_hash = 
"f4d1969c7a82c76b962ae969c91d7b54cc11e0ce9f1ec9277789990f58aab351" + dl_path = os.path.join(pl.BINARY_CACHE_DIR, pkg.replace("/", "__")) + with open(dl_path, "wb") as f: + f.write(b"placeholder") + + # Run + pl.read_binary_package(tar_with_placeholder_bytes, pkg, tar_hash) + + # Verify cleaned tarinfo file written atomically + tarinfo_path = os.path.join(pl.TARINFO_DIR, f"{tar_hash}.json") + assert os.path.exists(tarinfo_path) + with open(tarinfo_path) as f: + items = json.load(f) + # Path should start *after* the 'pkg-tail/' segment due to remove_placeholder_directories + assert any(p.endswith(".spack/install_environment.json") for p in items) + + # tarball removed and buffer freed + assert not os.path.exists(dl_path) + + +def test_print_files_happy_path(fake_requests, sample_manifest_json, tar_with_placeholder_bytes, tmp_path): + # minimal database with one install + pkg_hash = "bsavlbvtqsc7yjtvka3ko3aem4wye2u3" + db = { + "database": { + "installs": { + pkg_hash: {"spec": {"name": "compiler-wrapper", "version": "1.0"}} + } + } + } + index = {} + existing_tarinfo_files = set() # no prior tarinfo + seen_tarball_hashes = set() + + # Wire URLs expected by print_files flow + manifest_url = ( + "https://binaries.spack.io/develop/v3/manifests/spec/" + f"compiler-wrapper/compiler-wrapper-1.0-{pkg_hash}.spec.manifest.json" + ) + tar_hash = "f4d1969c7a82c76b962ae969c91d7b54cc11e0ce9f1ec9277789990f58aab351" + blob_url = f"https://binaries.spack.io/develop/blobs/sha256/{tar_hash[:2]}/{tar_hash}" + + fake_requests.table[manifest_url] = lambda: type("R", (), {"status_code": 200, "content": sample_manifest_json})() + fake_requests.table[blob_url] = lambda: type("R", (), {"status_code": 200, "content": tar_with_placeholder_bytes})() + + # Act + pl.print_files(pkg_hash, db["database"]["installs"][pkg_hash], index, existing_tarinfo_files, seen_tarball_hashes) + + # Assert manifest saved safely + manifest_path = os.path.join(pl.MANIFEST_DIR, f"compiler-wrapper-1.0-{pkg_hash}.json") + assert os.path.exists(manifest_path) + # Assert tarinfo created + tarinfo_path = os.path.join(pl.TARINFO_DIR, f"{tar_hash}.json") + assert os.path.exists(tarinfo_path) + # Index updated & saved + assert pkg_hash in index + assert index[pkg_hash]["sha256"] == tar_hash + assert os.path.exists(pl.INDEX_FILE) + + +def test_print_files_skips_when_tarinfo_exists(sample_manifest_json, tmp_path): + # Prepare: existing tarinfo means no binary download + pkg_hash = "zzz111" + spec = {"spec": {"name": "foo", "version": "9.9"}} + index = {} + + # Pre-create tarinfo + prehash = "abcdead00face" + existing_tarinfo_files = {prehash} + seen_tarball_hashes = set() + + # Monkeypatch access_spec_manifest_media_type to return prehash so it matches existing_tarinfo_files + orig = pl.access_spec_manifest_media_type + pl.access_spec_manifest_media_type = lambda _db: prehash + + # Also ensure manifest path is considered existing so we don't try to download it + manifest_path = os.path.join(pl.MANIFEST_DIR, f"foo-9.9-{pkg_hash}.json") + with open(manifest_path, "w") as f: + json.dump({"dummy": True}, f) + + try: + pl.print_files(pkg_hash, spec, index, existing_tarinfo_files, seen_tarball_hashes) + finally: + pl.access_spec_manifest_media_type = orig + + # Should just update index and save, no download attempted + assert pkg_hash in index + assert index[pkg_hash]["sha256"] == prehash + with open(pl.INDEX_FILE) as f: + data = json.load(f) + assert pkg_hash in data + + +def test_download_404_returns_none(fake_requests): + url = 
"https://example.com/notfound" + fake_requests.table[url] = lambda: type("R", (), {"status_code": 404, "content": b""})() + assert pl.download_from_URL(url, "some/pkg-1.0-deadbeef", is_spec=True) is None + +def _mk_manifest_bytes_with_hash(tar_hash: str): + """Helper: build bytes that match remove_lines_spec_manifest's slicing contract.""" + body = {"data": [ + {"mediaType": "x/ignored", "checksum": "zzz"}, + {"mediaType": "application/vnd.spack.install.v2.tar+gzip", "checksum": tar_hash}, + ]} + raw = json.dumps(body).encode("utf-8") + return b"x" * 49 + raw + b"y" * 834 + + +def test_shared_tarball_logs_and_skips_second_download(fake_requests, tmp_path): + """ + Two different package hashes point to the same tarball hash. + We expect: + - First run downloads tarball and writes tarinfo. + - Second run logs to SHARED_TARBALL_HASH_FILE and (since we update + existing_tarinfo_files to include the first tarinfo) skips re-download. + """ + # Common tarball hash & blob URL + tar_hash = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcd" + blob_url = f"https://binaries.spack.io/develop/blobs/sha256/{tar_hash[:2]}/{tar_hash}" + + # Package A + pkg_hash_a = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" + name = "compiler-wrapper" + ver = "1.0" + man_url_a = f"https://binaries.spack.io/develop/v3/manifests/spec/{name}/{name}-{ver}-{pkg_hash_a}.spec.manifest.json" + + # Package B (different spec hash, same tarball) + pkg_hash_b = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" + man_url_b = f"https://binaries.spack.io/develop/v3/manifests/spec/{name}/{name}-{ver}-{pkg_hash_b}.spec.manifest.json" + + # Route manifests to same tarball hash, and the blob to some tar bytes + tar_bytes = b"\x1f\x8b" + b"tar" * 100 # not actually parsed; we won't open it here + # For the first call we want a real tar.gz + # We'll just reuse download + skip path by creating a minimal valid tar.gz: + import io, tarfile + buf = io.BytesIO() + with tarfile.open(fileobj=buf, mode="w:gz") as tf: + ti = tarfile.TarInfo(name=f"{name}-{ver}-{pkg_hash_a}/.spack/install_environment.json") + data = b"{}" + ti.size = len(data) + tf.addfile(ti, io.BytesIO(data)) + tar_bytes = buf.getvalue() + + fake_requests.table[man_url_a] = lambda: type("R", (), {"status_code": 200, "content": _mk_manifest_bytes_with_hash(tar_hash)})() + fake_requests.table[man_url_b] = lambda: type("R", (), {"status_code": 200, "content": _mk_manifest_bytes_with_hash(tar_hash)})() + fake_requests.table[blob_url] = lambda: type("R", (), {"status_code": 200, "content": tar_bytes})() + + index = {} + existing_tarinfo_files = set() + seen_tarball_hashes = set() + + # Run A (creates tarinfo and index) + pl.print_files(pkg_hash_a, {"spec": {"name": name, "version": ver}}, index, existing_tarinfo_files, seen_tarball_hashes) + tarinfo_path = os.path.join(pl.TARINFO_DIR, f"{tar_hash}.json") + assert os.path.exists(tarinfo_path) + + # Emulate the main loop behavior: keep using *the same* existing_tarinfo_files set, + # but update it to reflect that we've now created tarinfo. + existing_tarinfo_files.add(tar_hash) + + # Guard: if the second call tries to re-download, we'd need another blob mapping. + # We purposely *don't* add one here—so if it tries, the test will fail. 
+
+    # Run B (should log as shared and skip the binary download due to existing_tarinfo_files)
+    pl.print_files(pkg_hash_b, {"spec": {"name": name, "version": ver}}, index, existing_tarinfo_files, seen_tarball_hashes)
+
+    # Check that the shared log file captured the second spec hash + the shared tar hash
+    assert os.path.exists(pl.SHARED_TARBALL_HASH_FILE)
+    with open(pl.SHARED_TARBALL_HASH_FILE, "r") as f:
+        shared_log = f.read()
+    assert f"{pkg_hash_b}\t{tar_hash}" in shared_log
+
+    # Index should contain both manifests, with the same sha256
+    assert index[pkg_hash_a]["sha256"] == tar_hash
+    assert index[pkg_hash_b]["sha256"] == tar_hash
+
+
+def test_failed_tarball_download_is_logged(fake_requests, tmp_path):
+    """
+    If the blob download fails (404 or None), we should append to FAILED_TARBALL_DOWNLOAD_FILE
+    and not produce a tarinfo file.
+    """
+    name = "foo"
+    ver = "9.9"
+    pkg_hash = "cccccccccccccccccccccccccccccccc"
+    tar_hash = "dddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"
+    man_url = f"https://binaries.spack.io/develop/v3/manifests/spec/{name}/{name}-{ver}-{pkg_hash}.spec.manifest.json"
+    blob_url = f"https://binaries.spack.io/develop/blobs/sha256/{tar_hash[:2]}/{tar_hash}"
+
+    # Manifest OK, blob 404
+    fake_requests.table[man_url] = lambda: type("R", (), {"status_code": 200, "content": _mk_manifest_bytes_with_hash(tar_hash)})()
+    fake_requests.table[blob_url] = lambda: type("R", (), {"status_code": 404, "content": b""})()
+
+    index = {}
+    existing_tarinfo_files = set()
+    seen_tarball_hashes = set()
+
+    pl.print_files(pkg_hash, {"spec": {"name": name, "version": ver}}, index, existing_tarinfo_files, seen_tarball_hashes)
+
+    # No tarinfo created
+    tarinfo_path = os.path.join(pl.TARINFO_DIR, f"{tar_hash}.json")
+    assert not os.path.exists(tarinfo_path)
+
+    # Log entry created with the package filename, manifest hash, and tarball hash
+    assert os.path.exists(pl.FAILED_TARBALL_DOWNLOAD_FILE)
+    with open(pl.FAILED_TARBALL_DOWNLOAD_FILE, "r") as f:
+        log = f.read()
+    # Contains the manifest hash + tarball hash; also includes the package_filename prefix
+    assert f"manifest hash: {pkg_hash}, tarball hash: {tar_hash}" in log