From 526dd7a76d67ebe11fa3bb04c96408524b2c5dc7 Mon Sep 17 00:00:00 2001 From: Quintanilla-Zarinan Date: Wed, 16 Jul 2025 11:33:38 -0700 Subject: [PATCH 01/20] initial spack dataset scraper --- dataset-generation/spack_db/spack_db.py | 311 ++++++++++++++++++++++++ 1 file changed, 311 insertions(+) create mode 100644 dataset-generation/spack_db/spack_db.py diff --git a/dataset-generation/spack_db/spack_db.py b/dataset-generation/spack_db/spack_db.py new file mode 100644 index 0000000..889ae37 --- /dev/null +++ b/dataset-generation/spack_db/spack_db.py @@ -0,0 +1,311 @@ +# json library used to load json file +import json + +# requests used to download URL automatically +import requests + +# COME BACK TO THIS LATER +# certifi and urllib3 used for certificate verfication +#import certifi +#import urllib3 + +# io for reading and writing streams such as text, binary, and raw data +import io + +# for reading and writing tar archives +import tarfile + +# to quiet warnings +import warnings +warnings.filterwarnings("ignore") + +# reading file +def readmyfile(myfile): + try: + with open(myfile, 'r') as file: + # database is the spack database json, within the spack build cache + db = json.load(file) # 8.6 seconds to read in large json file + + # returns database + return db + + except FileNotFoundError: + print(f"Error: The file '{myfile}' not found.") + except Exception as e: + print(f"Error occured in readmyfile: {e}") + +# getting package hash which is the key for the package info +# first argument is the index of the package by number +# second argument is the database that was loaded in the readmyfile function +def get_package(num, db): + + # db and temp variables are type dict + temp = db['database']['installs'] + + #this accesses the install package of choice by index + package_hash = list(temp)[num] + #NOTE TO SELF: turning it into a list may be slower + + return temp, package_hash + +# get the package info from the package hash key +def get_package_value(temp, package_hash): + + # for each key in the database, return the value if the key matches the package hash of interest + for k in temp: + if k == package_hash: + #this will return the values of the package package_hash + return temp[k] + +# make the spec manifest downloadable URL +def make_spec_manifest_URL(package_hash, package_hash_value): +# goal is to make a URL that looks like this -> +# https://binaries.spack.io/develop/v3/manifests/spec//--.spec.manifest.json +# example URL for compiler-wrapper: +# https://binaries.spack.io/develop/v3/manifests/spec/compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3.spec.manifest.json + + myURL = 'https://binaries.spack.io/develop/v3/manifests/spec/' + + # this accesses the package name (not the package hash that we have been using as the package_hash) + package_name = package_hash_value['spec']['name'] + + # this accesses the package name version number + package_version = package_hash_value['spec']['version'] + + # package_filename for use later in removal of placeholder directories in the filepath of the tarball + package_filename = (package_name + '/' + package_name + '-' + package_version + + '-' + package_hash) + + # this updates the URL + myURL += (package_filename + '.spec.manifest.json') + + # returns the URL for the spec manifest and package_filename + return myURL, package_filename + +# automatically download contents from the URL +def download_from_URL(theURL, package, num, db, num_keys): + + # splits package_name by '/' character + package_name = package.split('/')[0] 
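# A minimal sketch of the URL pattern make_spec_manifest_URL assembles, using the
# compiler-wrapper name/version/hash quoted in the example comments above
# (illustration only):
def example_spec_manifest_url(name, version, pkg_hash):
    base = "https://binaries.spack.io/develop/v3/manifests/spec/"
    return f"{base}{name}/{name}-{version}-{pkg_hash}.spec.manifest.json"

# example_spec_manifest_url("compiler-wrapper", "1.0",
#                           "bsavlbvtqsc7yjtvka3ko3aem4wye2u3")
# -> https://binaries.spack.io/develop/v3/manifests/spec/compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3.spec.manifest.json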
+ + try: + response = requests.get(theURL, verify=False) + + # a response status code is 200 then the request was successful + # response.status_code of 404 means does not exist + if response.status_code == 200: + + return response.content + + else: + # if URL does not exist, skip and move to next package + print(f"download failed for package: {package_name}\n") + + # return None to stop process due to download failing - goes back to run_program function + return None + + except Exception as e: + print(f"download_from_URL package {package_name}, error: {e}") + + # return None to stop process due to download failing + return None + + +def remove_lines_spec_manifest(myfile): + + # removes unnecessary bytes + removed_ends = myfile[49:-834] + + # converts bytes to dictionary + database = json.loads(removed_ends) + + return database + + +def access_spec_manifest_media_type(db): + + try: + # get the value for the key 'data' from the db + # if the key 'data' does not exist, return empty list + # temp is db['data'] + for temp in db.get('data', []): + if temp.get('mediaType') == 'application/vnd.spack.install.v2.tar+gzip': + + # the checksum is the sha256 hash + return temp.get('checksum') + + except Exception as e: + print(f"Error occured in access_spec_manifest_media_type: {e}") + +def make_binary_package_URL(package_zip_hash): +# example URL for compiler-wrapper binary package: +# https://binaries.spack.io/develop/blobs/sha256/f4/f4d1969c7a82c76b962ae969c91d7b54cc11e0ce9f1ec9277789990f58aab351 + + myURL = 'https://binaries.spack.io/develop/blobs/sha256/' + + first_byte = package_zip_hash[:2] + + myURL = myURL + first_byte + '/' + package_zip_hash + + return myURL + +# using python tarfile module io module to list all the files in the downloaded tarball +# myfile is the tar_file response.content and the package is the hash we will split by +def read_binary_package(myfile, package): + + try: + with io.BytesIO(myfile) as tar_buffer: + with tarfile.open(fileobj = tar_buffer, mode="r") as tar: + + print(f"Files in the tar archive for {package.split('/')[0]}:") + i = 1 + for member in tar.getmembers(): + if member.isfile(): + #breakpoint() + #print(f"{i}: {member.name}") + #i += 1 + + # member.name for compiler-wrapper is "home/software/spack/__spack_path_placeholder__/ + # __spack_path_placeholder__/__spack_path_placeholder__/ + # __spack_path_placeholder__/__spack_path_placeholder__/ + # __spack_path_placeholder__/__spack_path_placeholder__/__spack_path_placeholder__/ + # __spack_path_placeh/morepadding/linux-x86_64_v3/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3/ + # .spack/install_environment.json" + # package for compiler-wrapper is "compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3" + remove_placeholder_directories(i, member.name, package) + # instead of just printing, we'll add this to a list (also save a copy of this list to the cache directory) - if we have to restart the program, then trying to get tarball from the list, we can skip the the + # # remove_placeholder function because we will already have a copy of the list of the files in that package + # # to make it easier to add to sqlite database + i += 1 + + except tarfile.ReadError as e: + print(f"Error reading tar file: {e}") + +# removing the placeholder directories in the file path +def remove_placeholder_directories(i, name, package): + # i is the counter for file enumeration + # name for compiler-wrapper is "home/software/spack/__spack_path_placeholder__/__spack_path_placeholder__/ + # 
__spack_path_placeholder__/__spack_path_placeholder__/__spack_path_placeholder__/__spack_path_placeholder__/ + # __spack_path_placeholder__/__spack_path_placeholder__/__spack_path_placeh/morepadding/linux-x86_64_v3/ + # compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3/.spack/install_environment.json" + # package for compiler-wrapper is "compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3" + + # updatedpackage_list for compiler-wrapper is "['compiler-wrapper', 'compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3']" + updatedpackage_list = package.split('/') + + # updatedpackage_name is "compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3" + updatedpackage_name = updatedpackage_list[1] + + placeholder = "__spack_path_placeh" + + # split by updatedpackage_name + split_list = name.split(updatedpackage_name) + + try: + if placeholder not in split_list[0]: + print("split_list", split_list) + elif len(split_list) > 1: + updatedname = split_list[1][1:] + + print(f"file {i}: ", updatedname) + + except Exception as e: + print(f"Error in remove_placeholder_directories: {e}") + + +def print_files(package_number, database, num_keys): + + # installs here are all the packages in the input database + # package_name is the package_hash of the package + installs, package_name = get_package(package_number, database) + + # package_value is the value of the key, the key is the package_name + package_value = get_package_value(installs, package_name) + + # returns the URL for the spec manifest file and the package_filename + theURL, package_filename = make_spec_manifest_URL(package_name, package_value) + + # download the spec manifest json for the package of interest + #temp = download_from_URL(theURL, package_filename) + temp = download_from_URL(theURL, package_filename, package_number, database, num_keys) + + + # return if URL does not exist + if temp is None: + print(f"Skipping package {package_filename} due to download failure. 
\n") + + # exit print_files function and goes back to run_program function + return + + # remove unneccessary lines from downloaded spec manifest + spec_manifest_database = remove_lines_spec_manifest(temp) + + # find the mediaType that contains the hash for the package install + package_zip_hash = access_spec_manifest_media_type(spec_manifest_database) + + # if 'data' key was not found in access_spec_manifest_media_type, None is returned and we go back to run_program function + if package_zip_hash is None: + print("mediaType not found - skipping this package.") + # go back to run_program function + return + + # make the binary package URL for installing the package + binary_package_URL = make_binary_package_URL(package_zip_hash) + + # download the binary package file from the generated URL + tempbinary = download_from_URL(binary_package_URL, package_filename, package_number, database, num_keys) + + # read the binary package + read_binary_package(tempbinary, package_filename) + +# program dispatcher +def run_program(package_number, database, num_keys): + + if package_number < num_keys: + + print_files(package_number, database, num_keys) + + +def main(): + file_name = "myMedjson.json" + # file_name = "myjson.json" + # file_name = 'Med_w_compilerwrapper_packages_at_end.json' + # file_name = "e2a6969c742c8ee33deba2d210ce2243cd3941c6553a3ffc53780ac6463537a9" + + package_number = 0 + # package_number2 = 2000 + # package_number3 = 20 + # package_number4 = 35 + # package_number5 = 127000 + database = readmyfile(file_name) + + num_keys = len(database['database']['installs']) + + for package_number in range(num_keys): + + print(f"package {package_number+1} out of {num_keys} packages\n") + + run_program(package_number, database, num_keys) + + print("\nComplete") + +if __name__ == "__main__": + main() + +### WHAT I NEED TO DO ### +# interested in adding a function that gives us how much cpu is being used for the program +## ways to make it faster - running it, then stopping it and then later proceeding from where it left off + ## Steven and Monwen - scrapers, whenever there is a network request, save a copy of the file locally (currently doing this from inputting the file) but only use it + ## when the program gets restarted - (if the file didn't exist - then we could have a directory where we have a copy of the stuff we downloaded ) +# can I also add the time it takes to run? +# can I have a calculation for how long it would take if I wanted to run it for 130464 files? 
- we're fetching the spec manifest files for each of these packages - 130thousean network requests - downloading the tarball is slow +## dont save the entire tarball - in the cache(local folder ) save the list of files - the tarball may take up a lot of space (may not want to save it) +# look into timescaling pyramid - memory hierarchy (the peak is faster, bottom level is slowest - downloading stuff from the internet is very slow) +# create a branch - commit files to branch on dapper +## FROM RYAN ## +# next step is play with the Python sqlite module and try to figure out how to +# 1) create a new sqlite db +# 2) add a new entry to it (the PyPI db scraping script that Steven made could be helpful for seeing how to work with sqlite) + + + From 8421d96196f5e6f2f008d77f8bf6896cb62ee850 Mon Sep 17 00:00:00 2001 From: leslie q Date: Wed, 16 Jul 2025 11:41:46 -0700 Subject: [PATCH 02/20] added practice message of "meow" --- dataset-generation/spack_db/spack_db.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataset-generation/spack_db/spack_db.py b/dataset-generation/spack_db/spack_db.py index 889ae37..47ac1ce 100644 --- a/dataset-generation/spack_db/spack_db.py +++ b/dataset-generation/spack_db/spack_db.py @@ -307,5 +307,5 @@ def main(): # 1) create a new sqlite db # 2) add a new entry to it (the PyPI db scraping script that Steven made could be helpful for seeing how to work with sqlite) - +#meow From ac16bd8857ade654b21ce5cfcc85c6204b543174 Mon Sep 17 00:00:00 2001 From: leslie q Date: Fri, 18 Jul 2025 18:00:35 -0700 Subject: [PATCH 03/20] added methods to create local cache for easy interrupt/proceed from last processed package --- dataset-generation/spack_db/spack_db.py | 171 ++++++++++++++++-------- 1 file changed, 117 insertions(+), 54 deletions(-) diff --git a/dataset-generation/spack_db/spack_db.py b/dataset-generation/spack_db/spack_db.py index 47ac1ce..a8e7ee2 100644 --- a/dataset-generation/spack_db/spack_db.py +++ b/dataset-generation/spack_db/spack_db.py @@ -19,6 +19,21 @@ import warnings warnings.filterwarnings("ignore") +# os to create a cache folder - disk-based caching +import os + +# Configuration +SPEC_CACHE_DIR = "cache/spec_manifests" +BINARY_CACHE_DIR = "cache/binary_packages" + +# checkpoint to safely stop the script at any time +CHECKPOINT_FILE = "progress.txt" + +# create cache directories for faster download +os.makedirs(SPEC_CACHE_DIR, exist_ok = True) +os.makedirs(BINARY_CACHE_DIR, exist_ok = True) + + # reading file def readmyfile(myfile): try: @@ -34,28 +49,28 @@ def readmyfile(myfile): except Exception as e: print(f"Error occured in readmyfile: {e}") -# getting package hash which is the key for the package info -# first argument is the index of the package by number -# second argument is the database that was loaded in the readmyfile function -def get_package(num, db): +# load that last saved package hash +def load_checkpoint(): + # checks if progress.txt exists + if os.path.exists(CHECKPOINT_FILE): + with open(CHECKPOINT_FILE, "r") as f: - # db and temp variables are type dict - temp = db['database']['installs'] - - #this accesses the install package of choice by index - package_hash = list(temp)[num] - #NOTE TO SELF: turning it into a list may be slower + # read and return last processed package_hash + # strip removes trailing newline or spaces + return f.read().strip() + + # if the file does not exist, return None + # if None, start from the beginning + return None - return temp, package_hash -# get the package info from the package hash 
key -def get_package_value(temp, package_hash): +# saves the last processed package_hash to progress.txt +# if the program is interrupted, the saved package_hash will be + # the starting point when rerun +def save_checkpoint(package_hash): + with open(CHECKPOINT_FILE, "w") as f: + f.write(package_hash) - # for each key in the database, return the value if the key matches the package hash of interest - for k in temp: - if k == package_hash: - #this will return the values of the package package_hash - return temp[k] # make the spec manifest downloadable URL def make_spec_manifest_URL(package_hash, package_hash_value): @@ -83,10 +98,31 @@ def make_spec_manifest_URL(package_hash, package_hash_value): return myURL, package_filename # automatically download contents from the URL -def download_from_URL(theURL, package, num, db, num_keys): +def download_from_URL(theURL, package, is_spec=True): + + # makes filename + # Example: + # This -> "compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3.spec.manifest.json" + # is turned into this -> "compiler-wrapper__compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3.spec.manifest.json" + package_name = package.replace('/', '__') + + # if is_spec is true, meaning the file ends with ".spec.manifest.json", + # then the file is saved in SPEC_CACHE_DIR + # if the file ends with .tar.gz + # then the file is saved in BINARY_CACHE_DIR + cache_dir = SPEC_CACHE_DIR if is_spec else BINARY_CACHE_DIR + + # full file path then is: + # "cache/spec_manifests/compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3" + cached_path = os.path.join(cache_dir, package_name) - # splits package_name by '/' character - package_name = package.split('/')[0] + #if cache exists, it does not need to be redownloaded + if os.path.exists(cached_path): + print(f"Using cached file: {cached_path}") + + # rb is read binary + with open(cached_path, "rb") as f: + return f.read() try: response = requests.get(theURL, verify=False) @@ -95,6 +131,11 @@ def download_from_URL(theURL, package, num, db, num_keys): # response.status_code of 404 means does not exist if response.status_code == 200: + # saves to cache if request is successful + # wb is write binary + with open(cached_path, "wb") as f: + f.write(response.content) + return response.content else: @@ -110,7 +151,7 @@ def download_from_URL(theURL, package, num, db, num_keys): # return None to stop process due to download failing return None - +# remove unnecessary lines in file def remove_lines_spec_manifest(myfile): # removes unnecessary bytes @@ -121,7 +162,7 @@ def remove_lines_spec_manifest(myfile): return database - +# returns checksum, sha256 hash used to download the binary tarball def access_spec_manifest_media_type(db): try: @@ -137,6 +178,7 @@ def access_spec_manifest_media_type(db): except Exception as e: print(f"Error occured in access_spec_manifest_media_type: {e}") +# uses checksum returned to generate URL for binary tarball download def make_binary_package_URL(package_zip_hash): # example URL for compiler-wrapper binary package: # https://binaries.spack.io/develop/blobs/sha256/f4/f4d1969c7a82c76b962ae969c91d7b54cc11e0ce9f1ec9277789990f58aab351 @@ -213,26 +255,19 @@ def remove_placeholder_directories(i, name, package): print(f"Error in remove_placeholder_directories: {e}") -def print_files(package_number, database, num_keys): - - # installs here are all the packages in the input database - # package_name is the package_hash of the package - installs, package_name = get_package(package_number, 
database) - - # package_value is the value of the key, the key is the package_name - package_value = get_package_value(installs, package_name) +def print_files(package_hash, package_value): # returns the URL for the spec manifest file and the package_filename - theURL, package_filename = make_spec_manifest_URL(package_name, package_value) + theURL, package_filename = make_spec_manifest_URL(package_hash, package_value) # download the spec manifest json for the package of interest #temp = download_from_URL(theURL, package_filename) - temp = download_from_URL(theURL, package_filename, package_number, database, num_keys) + temp = download_from_URL(theURL, package_filename, is_spec = True) # return if URL does not exist if temp is None: - print(f"Skipping package {package_filename} due to download failure. \n") + print(f"Skipping package {package_filename}\n") # exit print_files function and goes back to run_program function return @@ -253,41 +288,68 @@ def print_files(package_number, database, num_keys): binary_package_URL = make_binary_package_URL(package_zip_hash) # download the binary package file from the generated URL - tempbinary = download_from_URL(binary_package_URL, package_filename, package_number, database, num_keys) + tempbinary = download_from_URL(binary_package_URL, package_filename, is_spec = False) # read the binary package read_binary_package(tempbinary, package_filename) # program dispatcher -def run_program(package_number, database, num_keys): - - if package_number < num_keys: +def run_program(package_hash, database): + installs = database['database']['installs'] - print_files(package_number, database, num_keys) + # gets installs key value, aka name of package, version, etc. + package_value = installs[package_hash] + print_files(package_hash, package_value) def main(): - file_name = "myMedjson.json" + # file_name = "myMedjson.json" # file_name = "myjson.json" # file_name = 'Med_w_compilerwrapper_packages_at_end.json' - # file_name = "e2a6969c742c8ee33deba2d210ce2243cd3941c6553a3ffc53780ac6463537a9" + file_name = "e2a6969c742c8ee33deba2d210ce2243cd3941c6553a3ffc53780ac6463537a9" - package_number = 0 - # package_number2 = 2000 - # package_number3 = 20 - # package_number4 = 35 - # package_number5 = 127000 database = readmyfile(file_name) - num_keys = len(database['database']['installs']) + # lists install keys + install_keys = list(database['database']['installs']) + num_keys = len(install_keys) - for package_number in range(num_keys): - - print(f"package {package_number+1} out of {num_keys} packages\n") + # load last processed package hash from checkpoint from cache if it exists + last_processed = load_checkpoint() + skip = True if last_processed else False - run_program(package_number, database, num_keys) - - print("\nComplete") + try: + for i, package_hash in enumerate(install_keys): + + # Skip previously completed packages until the last processed one + if skip: + print(f"Skipping {package_hash}") + if package_hash == last_processed: + # start processing after the last one + skip = False + continue + + print(f"πŸ“¦ package {i + 1} out of {num_keys} packages\n") + + run_program(package_hash, database) + + save_checkpoint(package_hash) + + except KeyboardInterrupt: + print("\nπŸ›‘ Interrupted by user. 
Progress saved.") + + if last_processed: + installs = database['database']['installs'] + if last_processed in installs: + package_value = installs[last_processed] + package_name = package_value['spec']['name'] + package_version = package_value['spec']['version'] + print(f"Last checkpoint was: {package_name}-{package_version}, {last_processed}") + else: + print(f"Last checkpoint was: unknown, {last_processed}") + print(f"file may have changed since the last run") + finally: + print("\n🎊 Complete (or safely stopped). Script will resume where it left off.") if __name__ == "__main__": main() @@ -298,7 +360,8 @@ def main(): ## Steven and Monwen - scrapers, whenever there is a network request, save a copy of the file locally (currently doing this from inputting the file) but only use it ## when the program gets restarted - (if the file didn't exist - then we could have a directory where we have a copy of the stuff we downloaded ) # can I also add the time it takes to run? -# can I have a calculation for how long it would take if I wanted to run it for 130464 files? - we're fetching the spec manifest files for each of these packages - 130thousean network requests - downloading the tarball is slow +# can I have a calculation for how long it would take if I wanted to run it for 130464 files? +### - we're fetching the spec manifest files for each of these packages - 130thousean network requests - downloading the tarball is slow ## dont save the entire tarball - in the cache(local folder ) save the list of files - the tarball may take up a lot of space (may not want to save it) # look into timescaling pyramid - memory hierarchy (the peak is faster, bottom level is slowest - downloading stuff from the internet is very slow) # create a branch - commit files to branch on dapper From c362a96fba88e5149e414dcfa39085881675f99b Mon Sep 17 00:00:00 2001 From: lfquintaz <158081978+lfquintaz@users.noreply.github.com> Date: Sun, 10 Aug 2025 18:10:42 -0700 Subject: [PATCH 04/20] added files to capture failed manifest/tarball download --- dataset-generation/spack_db/spack_db.py | 457 ++++++++++++++++++------ 1 file changed, 351 insertions(+), 106 deletions(-) diff --git a/dataset-generation/spack_db/spack_db.py b/dataset-generation/spack_db/spack_db.py index a8e7ee2..961c657 100644 --- a/dataset-generation/spack_db/spack_db.py +++ b/dataset-generation/spack_db/spack_db.py @@ -4,11 +4,6 @@ # requests used to download URL automatically import requests -# COME BACK TO THIS LATER -# certifi and urllib3 used for certificate verfication -#import certifi -#import urllib3 - # io for reading and writing streams such as text, binary, and raw data import io @@ -22,35 +17,93 @@ # os to create a cache folder - disk-based caching import os +# tempfile and shutil for index temp file +import tempfile +import shutil + # Configuration -SPEC_CACHE_DIR = "cache/spec_manifests" -BINARY_CACHE_DIR = "cache/binary_packages" + +# spack.index.db.json maps each package back to the packagename, version, SHA256hash, + # path to manifest json file, path to tarinfo file list +INDEX_FILE = "cache/DEMO_spack.index.db.json" + +# MANIFEST_DIR is the source of metadata per package used in master index +MANIFEST_DIR = "cache/DEMO_manifest" + +# TARINFO_DIR contains the extracted list of files from each binary tarball +# this will be used in making the SQLite database without having to reprocess the tarballs again +TARINFO_DIR = "cache/DEMO_tarinfo" + +# SPEC_CACHE_DIR is meant to be a temporary cache of raw spec manifest files downloaded from the 
internet +# a clean copy is meant to be placed in MANIFEST_DIR +# SPEC_CACHE_DIR avoids redownloading if the script is restarted. +SPEC_CACHE_DIR = "cache/DEMO_spec_manifests" + +# BINARY_CACHE_DIR contains the downloaded tarballs temporarily +# the file is deleted after processing. +BINARY_CACHE_DIR = "cache/DEMO_binary_packages" +TIMEOUT_LOG_FILE = "cache/DEMO_timeouts.txt" # checkpoint to safely stop the script at any time -CHECKPOINT_FILE = "progress.txt" +# progress.txt saves the spec_manifest hash +CHECKPOINT_FILE = "DEMO_progress.txt" + +# file to track all the manifest files that were unable to download +SKIPPED_MANIFESTS_FILE = "cache/DEMO_skipped_manifests.txt" +MISSING_TARBALL_HASH_FILE = "cache/DEMO_missing_tarballs.txt" +SHARED_TARBALL_HASH_FILE = "cache/DEMO_shared_tarballs.txt" +FAILED_TARBALL_DOWNLOAD_FILE = "cache/DEMO_failed_tarball_downloads.txt" # create cache directories for faster download +# FIX ME: Remove SPEC_CACHE_DIR +os.makedirs(MANIFEST_DIR, exist_ok=True) +os.makedirs(TARINFO_DIR, exist_ok = True) os.makedirs(SPEC_CACHE_DIR, exist_ok = True) os.makedirs(BINARY_CACHE_DIR, exist_ok = True) +# look for index if it exists to add info to +def load_index(): + # if the index_file exists, read it + if os.path.exists(INDEX_FILE): + with open(INDEX_FILE, "r") as f: + # return as a dictionary for easy manipulation and JSON formatting + return json.load(f) + # if the index does not exist, an empty dictionary is returned + return {} + +# save index +def save_index(index): + + # create a backup of the previous index + if os.path.exists(INDEX_FILE): + shutil.copy(INDEX_FILE, INDEX_FILE + ".bak") + + # Save to a temp file, then move to replace + temp_dir = os.path.dirname(INDEX_FILE) + with tempfile.NamedTemporaryFile("w", dir=temp_dir, delete=False) as tmp: + json.dump(index, tmp, indent=2) + temp_name = tmp.name + + shutil.move(temp_name, INDEX_FILE) + +# format for entried added to index +def update_index_entry(index, package_hash, package_value, package_zip_hash): + name = package_value['spec']['name'] + version = package_value['spec']['version'] + manifest_filename = f"{name}-{version}-{package_hash}.json" + tarinfo_filename = f"{package_zip_hash}.json" + + index[package_hash] = { + "name": name, + "version": version, + "sha256": package_zip_hash, + "manifest_path": os.path.join(MANIFEST_DIR, manifest_filename), + "tarinfo_path": os.path.join(TARINFO_DIR, tarinfo_filename) + } -# reading file -def readmyfile(myfile): - try: - with open(myfile, 'r') as file: - # database is the spack database json, within the spack build cache - db = json.load(file) # 8.6 seconds to read in large json file - - # returns database - return db - - except FileNotFoundError: - print(f"Error: The file '{myfile}' not found.") - except Exception as e: - print(f"Error occured in readmyfile: {e}") # load that last saved package hash -def load_checkpoint(): +def load_checkpoint(): # # checks if progress.txt exists if os.path.exists(CHECKPOINT_FILE): with open(CHECKPOINT_FILE, "r") as f: @@ -64,16 +117,32 @@ def load_checkpoint(): return None -# saves the last processed package_hash to progress.txt +# saves the last processed manifest package_hash to progress.txt # if the program is interrupted, the saved package_hash will be # the starting point when rerun -def save_checkpoint(package_hash): +def save_checkpoint(package_hash): # with open(CHECKPOINT_FILE, "w") as f: f.write(package_hash) +# reading file +def readmyfile(myfile): + try: + with open(myfile, 'r') as file: + # database is the spack 
database json, within the spack build cache + db = json.load(file) # 8.6 seconds to read in large json file + + # returns database + return db + + except FileNotFoundError: + print(f"Error: The file '{myfile}' not found.") + except Exception as e: + print(f"Error occured in readmyfile: {e}") + + # make the spec manifest downloadable URL -def make_spec_manifest_URL(package_hash, package_hash_value): +def make_spec_manifest_URL(package_hash, package_hash_value): # goal is to make a URL that looks like this -> # https://binaries.spack.io/develop/v3/manifests/spec//--.spec.manifest.json # example URL for compiler-wrapper: @@ -98,8 +167,9 @@ def make_spec_manifest_URL(package_hash, package_hash_value): return myURL, package_filename # automatically download contents from the URL -def download_from_URL(theURL, package, is_spec=True): +def download_from_URL(theURL, package, is_spec=True): + # makes filename # Example: # This -> "compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3.spec.manifest.json" @@ -107,14 +177,19 @@ def download_from_URL(theURL, package, is_spec=True): package_name = package.replace('/', '__') # if is_spec is true, meaning the file ends with ".spec.manifest.json", - # then the file is saved in SPEC_CACHE_DIR + # then the file is not saved, but the reponse is returned to remove_lines_spec_manifest() for further manipulation # if the file ends with .tar.gz # then the file is saved in BINARY_CACHE_DIR cache_dir = SPEC_CACHE_DIR if is_spec else BINARY_CACHE_DIR + print(f"location to be saved in {cache_dir} for {package_name}") + + # full file path then is: # "cache/spec_manifests/compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3" + #cache/DEMO_manifest\\compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3.json cached_path = os.path.join(cache_dir, package_name) + print(f"this is the cached_path {cached_path}") #if cache exists, it does not need to be redownloaded if os.path.exists(cached_path): @@ -125,18 +200,23 @@ def download_from_URL(theURL, package, is_spec=True): return f.read() try: - response = requests.get(theURL, verify=False) - + print("in try block for download_from_URL") + # adding timeout for 60 seconds + response = requests.get(theURL, timeout=60, verify=False) + print(f"trying download for {cached_path} ") # a response status code is 200 then the request was successful # response.status_code of 404 means does not exist if response.status_code == 200: - + print(f"download successful for {cached_path}") # saves to cache if request is successful # wb is write binary - with open(cached_path, "wb") as f: - f.write(response.content) + if is_spec == False: + with open(cached_path, "wb") as f: + f.write(response.content) - return response.content + return response.content + else: + return response.content else: # if URL does not exist, skip and move to next package @@ -145,6 +225,13 @@ def download_from_URL(theURL, package, is_spec=True): # return None to stop process due to download failing - goes back to run_program function return None + except requests.exceptions.Timeout: + print(f"⏰ Timeout: Skipping package that took too long to download: {package_name}") + # Append to file immediately + with open(TIMEOUT_LOG_FILE, "a") as f: + f.write(f"{package_name}\t{theURL}\n") + return None + except Exception as e: print(f"download_from_URL package {package_name}, error: {e}") @@ -152,18 +239,18 @@ def download_from_URL(theURL, package, is_spec=True): return None # remove unnecessary lines in file -def 
remove_lines_spec_manifest(myfile): +def remove_lines_spec_manifest(myfile): # removes unnecessary bytes removed_ends = myfile[49:-834] # converts bytes to dictionary database = json.loads(removed_ends) - + return database # returns checksum, sha256 hash used to download the binary tarball -def access_spec_manifest_media_type(db): +def access_spec_manifest_media_type(db): try: # get the value for the key 'data' from the db @@ -177,9 +264,10 @@ def access_spec_manifest_media_type(db): except Exception as e: print(f"Error occured in access_spec_manifest_media_type: {e}") + return None # uses checksum returned to generate URL for binary tarball download -def make_binary_package_URL(package_zip_hash): +def make_binary_package_URL(package_zip_hash): # example URL for compiler-wrapper binary package: # https://binaries.spack.io/develop/blobs/sha256/f4/f4d1969c7a82c76b962ae969c91d7b54cc11e0ce9f1ec9277789990f58aab351 @@ -193,9 +281,11 @@ def make_binary_package_URL(package_zip_hash): # using python tarfile module io module to list all the files in the downloaded tarball # myfile is the tar_file response.content and the package is the hash we will split by -def read_binary_package(myfile, package): - +# package is the package name and the version +def read_binary_package(myfile, package, package_zip_hash): + file_list = [] try: + # with io.BytesIO(myfile) as tar_buffer: with tarfile.open(fileobj = tar_buffer, mode="r") as tar: @@ -203,7 +293,6 @@ def read_binary_package(myfile, package): i = 1 for member in tar.getmembers(): if member.isfile(): - #breakpoint() #print(f"{i}: {member.name}") #i += 1 @@ -214,17 +303,36 @@ def read_binary_package(myfile, package): # __spack_path_placeh/morepadding/linux-x86_64_v3/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3/ # .spack/install_environment.json" # package for compiler-wrapper is "compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3" - remove_placeholder_directories(i, member.name, package) - # instead of just printing, we'll add this to a list (also save a copy of this list to the cache directory) - if we have to restart the program, then trying to get tarball from the list, we can skip the the + clean_path = remove_placeholder_directories(i, member.name, package) + # ??? 
instead of just printing, we'll add this to a list (also save a copy of this list to the cache directory) - if we have to restart the program, then trying to get tarball from the list, we can skip the the # # remove_placeholder function because we will already have a copy of the list of the files in that package # # to make it easier to add to sqlite database + + # this will add the files that are in the package to a clean_list + if clean_path: + file_list.append(clean_path) i += 1 except tarfile.ReadError as e: print(f"Error reading tar file: {e}") + return + + name = package.split('/')[0] + version = package.split('/')[1].split('-')[1] + + # saves file names to the tarinfo file + tarinfo_path = os.path.join(TARINFO_DIR, f"{package_zip_hash}.json") + with open(tarinfo_path, "w") as f: + json.dump(file_list, f, indent=2) + + # removes tarball once processed + tarball_path = os.path.join(BINARY_CACHE_DIR, package.replace('/','__')) + if os.path.exists(tarball_path): + os.remove(tarball_path) + del myfile # removing the placeholder directories in the file path -def remove_placeholder_directories(i, name, package): +def remove_placeholder_directories(i, name, package): # i is the counter for file enumeration # name for compiler-wrapper is "home/software/spack/__spack_path_placeholder__/__spack_path_placeholder__/ # __spack_path_placeholder__/__spack_path_placeholder__/__spack_path_placeholder__/__spack_path_placeholder__/ @@ -246,129 +354,266 @@ def remove_placeholder_directories(i, name, package): try: if placeholder not in split_list[0]: print("split_list", split_list) + + # returns file name without the placeholder path elif len(split_list) > 1: updatedname = split_list[1][1:] print(f"file {i}: ", updatedname) + return updatedname # return to add to list of files for respective package except Exception as e: print(f"Error in remove_placeholder_directories: {e}") + return None -def print_files(package_hash, package_value): +def print_files(package_hash, package_value, index, existing_tarinfo_files, seen_tarball_hashes): + name = package_value['spec']['name'] + version = package_value['spec']['version'] - # returns the URL for the spec manifest file and the package_filename - theURL, package_filename = make_spec_manifest_URL(package_hash, package_value) + ############ updated 8/10 ############ + manifest_filename = f"{name}-{version}-{package_hash}.json" + manifest_path = os.path.join(MANIFEST_DIR, manifest_filename) + + # use existing cleaned manifest if available + if os.path.exists(manifest_path): + print(f"Using existing cleaned manifest: {manifest_path}") + with open(manifest_path, "r") as f: + clean_spec_manifest = json.load(f) + + # returns the URL for the spec manifest file and the package_filename + theURL, package_filename = make_spec_manifest_URL(package_hash, package_value) + + ## and indented theURL to print("βœ… Cleaned and parsed spec manifest") + else: + # download if manifest does not exist + - # download the spec manifest json for the package of interest - #temp = download_from_URL(theURL, package_filename) - temp = download_from_URL(theURL, package_filename, is_spec = True) + # returns the URL for the spec manifest file and the package_filename + theURL, package_filename = make_spec_manifest_URL(package_hash, package_value) + + # download the spec manifest json for the package of interest + temp = download_from_URL(theURL, package_filename, is_spec = True) + + + # return if URL does not exist + if temp is None: + print(f"Could not download manifest: {package_filename} - 
recording and skipping.\n") + + # recording the failed manifest filename and hash + with open(SKIPPED_MANIFESTS_FILE, "a") as f: + f.write(f"{package_hash}\n") + + # exit print_files function and goes back to run_program function + return - # return if URL does not exist - if temp is None: - print(f"Skipping package {package_filename}\n") + print("βœ… Loaded cached spec manifest") - # exit print_files function and goes back to run_program function - return + # remove unneccessary lines from downloaded spec manifest + clean_spec_manifest = remove_lines_spec_manifest(temp) + + # writes cleaned manifest information to manifest file + with open(manifest_path, "w") as f: + json.dump(clean_spec_manifest, f, indent=2) + print("βœ… Cleaned and parsed spec manifest") - # remove unneccessary lines from downloaded spec manifest - spec_manifest_database = remove_lines_spec_manifest(temp) + + + # writes cleaned manifest information to manifest file + ##manifest_file = os.path.join(MANIFEST_DIR, f"{name}-{version}-{package_hash}.json") + ##with open(manifest_file, "w") as f: + ##json.dump(clean_spec_manifest, f, indent=2) + + ###################################### - # find the mediaType that contains the hash for the package install - package_zip_hash = access_spec_manifest_media_type(spec_manifest_database) + # find the mediaType that contains the hash for the package tarball install + package_zip_hash = access_spec_manifest_media_type(clean_spec_manifest) + print(f"βœ… Extracted zip hash: {package_zip_hash}") # if 'data' key was not found in access_spec_manifest_media_type, None is returned and we go back to run_program function if package_zip_hash is None: - print("mediaType not found - skipping this package.") + print(f"No Tarball hash found in manifest: {package_filename}") + + # Track taht this manifest has no downloadable binary tarball + with open(MISSING_TARBALL_HASH_FILE, "a") as f: + f.write(f"{package_hash}\n") # go back to run_program function return + + # track if the tarball hash has already been processed + if package_zip_hash in seen_tarball_hashes: + with open(SHARED_TARBALL_HASH_FILE, "a") as f: + # if this manifest points to a tarball that has already been seen, + # it will not create a new tarinfo entry + # it will have a new manifest entry + f.write(f"{package_hash}\t{package_zip_hash}\n") + else: + seen_tarball_hashes.add(package_zip_hash) + - # make the binary package URL for installing the package - binary_package_URL = make_binary_package_URL(package_zip_hash) + expected_tarinfo_hash = package_zip_hash - # download the binary package file from the generated URL - tempbinary = download_from_URL(binary_package_URL, package_filename, is_spec = False) + if expected_tarinfo_hash in existing_tarinfo_files: + print(f"βœ… Already have tarinfo for {name}-{version}-{expected_tarinfo_hash}, skipping binary download.") + update_index_entry(index, package_hash, package_value, package_zip_hash) + save_index(index) + print(f"βœ… Saved Index") + return + + else: + + # make the binary package URL for installing the package + binary_package_URL = make_binary_package_URL(package_zip_hash) + print(f"πŸ”— Downloading binary: {binary_package_URL}") + + # download the binary package file from the generated URL + tempbinary = download_from_URL(binary_package_URL, package_filename, is_spec = False) + print("βœ… Binary package downloaded") + + if tempbinary is None: + + # Track failed tarball download + with open(FAILED_TARBALL_DOWNLOAD_FILE, "a") as f: + f.write(f"{package_filename}: manifest hash: 
{package_hash}, tarball hash: {package_zip_hash}\n") + + return - # read the binary package - read_binary_package(tempbinary, package_filename) + # read the binary package + read_binary_package(tempbinary, package_filename, package_zip_hash) + print("βœ… Finished reading binary package") + update_index_entry(index, package_hash, package_value, package_zip_hash) + save_index(index) + print(f"Updated Index with {package_filename}-{package_zip_hash}") + save_checkpoint(package_hash) + print(f"βœ… Saved Index") # program dispatcher -def run_program(package_hash, database): +def run_program(package_hash, database, index, existing_tarinfo_files, seen_tarball_hashes): installs = database['database']['installs'] # gets installs key value, aka name of package, version, etc. package_value = installs[package_hash] - print_files(package_hash, package_value) + print_files(package_hash, package_value, index, existing_tarinfo_files, seen_tarball_hashes) def main(): - # file_name = "myMedjson.json" + #file_name = "myMedjson_DEMO.json" # file_name = "myjson.json" # file_name = 'Med_w_compilerwrapper_packages_at_end.json' file_name = "e2a6969c742c8ee33deba2d210ce2243cd3941c6553a3ffc53780ac6463537a9" database = readmyfile(file_name) + # load list of previously skipped manifest hashes + if os.path.exists(SKIPPED_MANIFESTS_FILE): + with open(SKIPPED_MANIFESTS_FILE, "r") as f: + skipped_hashes = set(line.strip() for line in f) + + else: + skipped_hashes = set() + + # load manifests that are missing binary tarball hashes (e.g. mediaType not found) + if os.path.exists(MISSING_TARBALL_HASH_FILE): + with open(MISSING_TARBALL_HASH_FILE, "r") as f: + missing_tarball_hashes = set(line.strip() for line in f) + + else: + missing_tarball_hashes = set() + + # load tarballs that were not downloadable + if os.path.exists(FAILED_TARBALL_DOWNLOAD_FILE): + with open(FAILED_TARBALL_DOWNLOAD_FILE, "r") as f: + failed_tarball_hashes = set( + line.strip().split("tarball hash: ")[-1] for line in f if "tarball hash:" in line + ) + else: + failed_tarball_hashes = set() + + # lists install keys install_keys = list(database['database']['installs']) num_keys = len(install_keys) # load last processed package hash from checkpoint from cache if it exists last_processed = load_checkpoint() + index = load_index() skip = True if last_processed else False + existing_tarinfo_files = set(os.listdir(TARINFO_DIR)) + existing_tarinfo_hashes = { + fname.rsplit("-", 1)[-1].replace(".json", "") for fname in existing_tarinfo_files + } + existing_tarinfo_files = existing_tarinfo_hashes + + + + # track already-processed tarball hashes to find shared ones + seen_tarball_hashes = set() + + SAVE_INTERVAL = 50 + + print("Starting...Will skip packages already fully processed.") + try: for i, package_hash in enumerate(install_keys): - # Skip previously completed packages until the last processed one - if skip: - print(f"Skipping {package_hash}") - if package_hash == last_processed: - # start processing after the last one - skip = False - continue + # skip if package_hash is in the skipped manifests file + if package_hash in skipped_hashes: + continue + + # skip if manifest had no usable tarball + if package_hash in missing_tarball_hashes: + continue + + if package_hash in index: + entry = index[package_hash] + manifest_path = entry.get("manifest_path", "") + tarinfo_path = entry.get("tarinfo_path", "") + tarball_hash = entry.get("sha256", "") + + manifest_exists = manifest_path and os.path.exists(manifest_path) + tarinfo_exists = tarinfo_path and 
os.path.exists(tarinfo_path) + + if manifest_exists and tarinfo_exists: + print(f"Skipping fully processed package: {package_hash}") + continue + + # if tarball previously failed, skip retrying it + if tarball_hash in failed_tarball_hashes: + print(f"🚫 Skipping manifest with previously failed tarball download: {package_hash}") + continue + print(f"πŸ“¦ package {i + 1} out of {num_keys} packages\n") - run_program(package_hash, database) + run_program(package_hash, database, index, existing_tarinfo_files, seen_tarball_hashes) - save_checkpoint(package_hash) + # Save checkpoint and index every N packages + if (i + 1) % SAVE_INTERVAL == 0: + save_checkpoint(package_hash) + save_index(index) + print(f"Saved checkpoint and index at package {i + 1}") except KeyboardInterrupt: - print("\nπŸ›‘ Interrupted by user. Progress saved.") + save_checkpoint(package_hash) + save_index(index) + print("\nπŸ›‘ Interrupted. Progress saved.") if last_processed: - installs = database['database']['installs'] - if last_processed in installs: - package_value = installs[last_processed] - package_name = package_value['spec']['name'] - package_version = package_value['spec']['version'] - print(f"Last checkpoint was: {package_name}-{package_version}, {last_processed}") + val = database['database']['installs'].get(last_processed) + if val: + name = val['spec']['name'] + version = val['spec']['version'] + + print(f"Last checkpoint was: {name}-{version}, {last_processed}") else: - print(f"Last checkpoint was: unknown, {last_processed}") + print(f"Checkpoint not found in current file: {last_processed}") print(f"file may have changed since the last run") finally: + save_checkpoint(package_hash) + save_index(index) print("\n🎊 Complete (or safely stopped). Script will resume where it left off.") if __name__ == "__main__": - main() - -### WHAT I NEED TO DO ### -# interested in adding a function that gives us how much cpu is being used for the program -## ways to make it faster - running it, then stopping it and then later proceeding from where it left off - ## Steven and Monwen - scrapers, whenever there is a network request, save a copy of the file locally (currently doing this from inputting the file) but only use it - ## when the program gets restarted - (if the file didn't exist - then we could have a directory where we have a copy of the stuff we downloaded ) -# can I also add the time it takes to run? -# can I have a calculation for how long it would take if I wanted to run it for 130464 files? 
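# Hedged sketch answering the note above about the 130464-package run: time a small
# sample, take the average seconds per package, and project the total (the sample
# numbers in the usage comment are made up for illustration):
def estimate_total_hours(sample_seconds, sample_count, total_packages=130464):
    per_package = sample_seconds / max(sample_count, 1)
    return total_packages * per_package / 3600.0

# estimate_total_hours(120, 50) -> roughly 87 hours at 2.4 s per package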
-### - we're fetching the spec manifest files for each of these packages - 130thousean network requests - downloading the tarball is slow -## dont save the entire tarball - in the cache(local folder ) save the list of files - the tarball may take up a lot of space (may not want to save it) -# look into timescaling pyramid - memory hierarchy (the peak is faster, bottom level is slowest - downloading stuff from the internet is very slow) -# create a branch - commit files to branch on dapper -## FROM RYAN ## -# next step is play with the Python sqlite module and try to figure out how to -# 1) create a new sqlite db -# 2) add a new entry to it (the PyPI db scraping script that Steven made could be helpful for seeing how to work with sqlite) - -#meow - + main() \ No newline at end of file From 059f205d5c4d916725c2202025a4b1379f78b258 Mon Sep 17 00:00:00 2001 From: lfquintaz <158081978+lfquintaz@users.noreply.github.com> Date: Sun, 10 Aug 2025 18:31:50 -0700 Subject: [PATCH 05/20] added atomic save functions for manifest and tarballs --- dataset-generation/spack_db/spack_db.py | 36 ++++++++++++++++++++----- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/dataset-generation/spack_db/spack_db.py b/dataset-generation/spack_db/spack_db.py index 961c657..dbe1212 100644 --- a/dataset-generation/spack_db/spack_db.py +++ b/dataset-generation/spack_db/spack_db.py @@ -279,6 +279,25 @@ def make_binary_package_URL(package_zip_hash): return myURL +############ updated 8/10 ############ +# ensure tarinfo completeness +def write_tarinfo_safely(tarinfo_path, file_list): + temp_dir = os.path.dirname(tarinfo_path) + with tempfile.NamedTemporaryFile("w", dir=temp_dir, delete=False) as tmp: + json.dump(file_list, tmp, indent=2) + temp_name = tmp.name + shutil.move(temp_name, tarinfo_path) + + +# ensure manifest completeness +def write_manifest_safely(manifest_path, manifest_data): + temp_dir = os.path.dirname(manifest_path) + with tempfile.NamedTemporaryFile("w", dir=temp_dir, delete=False) as tmp: + json.dump(manifest_data, tmp, indent=2) + temp_name = tmp.name + shutil.move(temp_name, manifest_path) +##################################### + # using python tarfile module io module to list all the files in the downloaded tarball # myfile is the tar_file response.content and the package is the hash we will split by # package is the package name and the version @@ -320,10 +339,14 @@ def read_binary_package(myfile, package, package_zip_hash): name = package.split('/')[0] version = package.split('/')[1].split('-')[1] + ############ updated 8/10 ############ # saves file names to the tarinfo file + ##tarinfo_path = os.path.join(TARINFO_DIR, f"{package_zip_hash}.json") + ##with open(tarinfo_path, "w") as f: + ##json.dump(file_list, f, indent=2) tarinfo_path = os.path.join(TARINFO_DIR, f"{package_zip_hash}.json") - with open(tarinfo_path, "w") as f: - json.dump(file_list, f, indent=2) + write_tarinfo_safely(tarinfo_path, file_list) + ##################################### # removes tarball once processed tarball_path = os.path.join(BINARY_CACHE_DIR, package.replace('/','__')) @@ -388,8 +411,6 @@ def print_files(package_hash, package_value, index, existing_tarinfo_files, seen else: # download if manifest does not exist - - # returns the URL for the spec manifest file and the package_filename theURL, package_filename = make_spec_manifest_URL(package_hash, package_value) @@ -414,9 +435,10 @@ def print_files(package_hash, package_value, index, existing_tarinfo_files, seen clean_spec_manifest = 
remove_lines_spec_manifest(temp) # writes cleaned manifest information to manifest file - with open(manifest_path, "w") as f: - json.dump(clean_spec_manifest, f, indent=2) - print("βœ… Cleaned and parsed spec manifest") + ##with open(manifest_path, "w") as f: + ##json.dump(clean_spec_manifest, f, indent=2) + write_manifest_safely(manifest_path, clean_spec_manifest) + print("βœ… Manifest safely written: {manifest_path}") From b5e46e0487c6e2baf3fe535a2aa82ad10733dfd4 Mon Sep 17 00:00:00 2001 From: lfquintaz <158081978+lfquintaz@users.noreply.github.com> Date: Mon, 11 Aug 2025 10:02:05 -0700 Subject: [PATCH 06/20] Add files via upload From 601f74ac0f691fac68374a5722fe4a588a7ccd13 Mon Sep 17 00:00:00 2001 From: leslie q Date: Fri, 15 Aug 2025 11:03:29 -0700 Subject: [PATCH 07/20] removed print statement at restart that shows all the fully processed tarball hashes --- dataset-generation/spack_db/spack_db.py | 481 ++++++++++++++++++------ 1 file changed, 374 insertions(+), 107 deletions(-) diff --git a/dataset-generation/spack_db/spack_db.py b/dataset-generation/spack_db/spack_db.py index a8e7ee2..ba69d3e 100644 --- a/dataset-generation/spack_db/spack_db.py +++ b/dataset-generation/spack_db/spack_db.py @@ -4,11 +4,6 @@ # requests used to download URL automatically import requests -# COME BACK TO THIS LATER -# certifi and urllib3 used for certificate verfication -#import certifi -#import urllib3 - # io for reading and writing streams such as text, binary, and raw data import io @@ -22,35 +17,93 @@ # os to create a cache folder - disk-based caching import os +# tempfile and shutil for index temp file +import tempfile +import shutil + # Configuration -SPEC_CACHE_DIR = "cache/spec_manifests" -BINARY_CACHE_DIR = "cache/binary_packages" + +# spack.index.db.json maps each package back to the packagename, version, SHA256hash, + # path to manifest json file, path to tarinfo file list +INDEX_FILE = "cache/spack.index.db.json" + +# MANIFEST_DIR is the source of metadata per package used in master index +MANIFEST_DIR = "cache/manifest" + +# TARINFO_DIR contains the extracted list of files from each binary tarball +# this will be used in making the SQLite database without having to reprocess the tarballs again +TARINFO_DIR = "cache/tarinfo" + +# SPEC_CACHE_DIR is meant to be a temporary cache of raw spec manifest files downloaded from the internet +# a clean copy is meant to be placed in MANIFEST_DIR +# SPEC_CACHE_DIR avoids redownloading if the script is restarted. +SPEC_CACHE_DIR = "cache/spec_manifests" + +# BINARY_CACHE_DIR contains the downloaded tarballs temporarily +# the file is deleted after processing. 
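# For reference, a hedged sketch of the on-disk layout this configuration produces
# (paths as defined in this block; the tracking-file names follow below):
#
#   cache/
#     spack.index.db.json   package_hash -> name, version, sha256, manifest/tarinfo paths
#     manifest/             cleaned spec manifests, <name>-<version>-<hash>.json
#     tarinfo/              per-tarball file lists, <sha256>.json
#     spec_manifests/       raw spec manifest cache (slated for removal, see FIX ME)
#     binary_packages/      tarballs kept only while being processed
#     timeouts.txt, skipped_manifests.txt, missing_tarballs.txt,
#     shared_tarballs.txt, failed_tarball_downloads.txt
#   progress.txt            checkpoint: last processed package hash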
+BINARY_CACHE_DIR = "cache/binary_packages" +TIMEOUT_LOG_FILE = "cache/timeouts.txt" # checkpoint to safely stop the script at any time -CHECKPOINT_FILE = "progress.txt" +# progress.txt saves the spec_manifest hash +CHECKPOINT_FILE = "progress.txt" + +# file to track all the manifest files that were unable to download +SKIPPED_MANIFESTS_FILE = "cache/skipped_manifests.txt" +MISSING_TARBALL_HASH_FILE = "cache/missing_tarballs.txt" +SHARED_TARBALL_HASH_FILE = "cache/shared_tarballs.txt" +FAILED_TARBALL_DOWNLOAD_FILE = "cache/failed_tarball_downloads.txt" # create cache directories for faster download +# FIX ME: Remove SPEC_CACHE_DIR +os.makedirs(MANIFEST_DIR, exist_ok=True) +os.makedirs(TARINFO_DIR, exist_ok = True) os.makedirs(SPEC_CACHE_DIR, exist_ok = True) os.makedirs(BINARY_CACHE_DIR, exist_ok = True) +# look for index if it exists to add info to +def load_index(): + # if the index_file exists, read it + if os.path.exists(INDEX_FILE): + with open(INDEX_FILE, "r") as f: + # return as a dictionary for easy manipulation and JSON formatting + return json.load(f) + # if the index does not exist, an empty dictionary is returned + return {} + +# save index +def save_index(index): + + # create a backup of the previous index + if os.path.exists(INDEX_FILE): + shutil.copy(INDEX_FILE, INDEX_FILE + ".bak") + + # Save to a temp file, then move to replace + temp_dir = os.path.dirname(INDEX_FILE) + with tempfile.NamedTemporaryFile("w", dir=temp_dir, delete=False) as tmp: + json.dump(index, tmp, indent=2) + temp_name = tmp.name + + shutil.move(temp_name, INDEX_FILE) + +# format for entried added to index +def update_index_entry(index, package_hash, package_value, package_zip_hash): + name = package_value['spec']['name'] + version = package_value['spec']['version'] + manifest_filename = f"{name}-{version}-{package_hash}.json" + tarinfo_filename = f"{package_zip_hash}.json" + + index[package_hash] = { + "name": name, + "version": version, + "sha256": package_zip_hash, + "manifest_path": os.path.join(MANIFEST_DIR, manifest_filename), + "tarinfo_path": os.path.join(TARINFO_DIR, tarinfo_filename) + } -# reading file -def readmyfile(myfile): - try: - with open(myfile, 'r') as file: - # database is the spack database json, within the spack build cache - db = json.load(file) # 8.6 seconds to read in large json file - - # returns database - return db - - except FileNotFoundError: - print(f"Error: The file '{myfile}' not found.") - except Exception as e: - print(f"Error occured in readmyfile: {e}") # load that last saved package hash -def load_checkpoint(): +def load_checkpoint(): # # checks if progress.txt exists if os.path.exists(CHECKPOINT_FILE): with open(CHECKPOINT_FILE, "r") as f: @@ -64,16 +117,32 @@ def load_checkpoint(): return None -# saves the last processed package_hash to progress.txt +# saves the last processed manifest package_hash to progress.txt # if the program is interrupted, the saved package_hash will be # the starting point when rerun -def save_checkpoint(package_hash): +def save_checkpoint(package_hash): # with open(CHECKPOINT_FILE, "w") as f: f.write(package_hash) +# reading file +def readmyfile(myfile): + try: + with open(myfile, 'r') as file: + # database is the spack database json, within the spack build cache + db = json.load(file) # 8.6 seconds to read in large json file + + # returns database + return db + + except FileNotFoundError: + print(f"Error: The file '{myfile}' not found.") + except Exception as e: + print(f"Error occured in readmyfile: {e}") + + # make the spec manifest 
downloadable URL -def make_spec_manifest_URL(package_hash, package_hash_value): +def make_spec_manifest_URL(package_hash, package_hash_value): # goal is to make a URL that looks like this -> # https://binaries.spack.io/develop/v3/manifests/spec//--.spec.manifest.json # example URL for compiler-wrapper: @@ -98,8 +167,9 @@ def make_spec_manifest_URL(package_hash, package_hash_value): return myURL, package_filename # automatically download contents from the URL -def download_from_URL(theURL, package, is_spec=True): +def download_from_URL(theURL, package, is_spec=True): + # makes filename # Example: # This -> "compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3.spec.manifest.json" @@ -107,14 +177,19 @@ def download_from_URL(theURL, package, is_spec=True): package_name = package.replace('/', '__') # if is_spec is true, meaning the file ends with ".spec.manifest.json", - # then the file is saved in SPEC_CACHE_DIR + # then the file is not saved, but the reponse is returned to remove_lines_spec_manifest() for further manipulation # if the file ends with .tar.gz # then the file is saved in BINARY_CACHE_DIR cache_dir = SPEC_CACHE_DIR if is_spec else BINARY_CACHE_DIR + print(f"location to be saved in {cache_dir} for {package_name}") + + # full file path then is: # "cache/spec_manifests/compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3" + #cache/manifest\\compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3.json cached_path = os.path.join(cache_dir, package_name) + print(f"this is the cached_path {cached_path}") #if cache exists, it does not need to be redownloaded if os.path.exists(cached_path): @@ -125,18 +200,23 @@ def download_from_URL(theURL, package, is_spec=True): return f.read() try: - response = requests.get(theURL, verify=False) - + print("in try block for download_from_URL") + # adding timeout for 60 seconds + response = requests.get(theURL, timeout=60, verify=False) + print(f"trying download for {cached_path} ") # a response status code is 200 then the request was successful # response.status_code of 404 means does not exist if response.status_code == 200: - + print(f"download successful for {cached_path}") # saves to cache if request is successful # wb is write binary - with open(cached_path, "wb") as f: - f.write(response.content) + if is_spec == False: + with open(cached_path, "wb") as f: + f.write(response.content) - return response.content + return response.content + else: + return response.content else: # if URL does not exist, skip and move to next package @@ -145,6 +225,13 @@ def download_from_URL(theURL, package, is_spec=True): # return None to stop process due to download failing - goes back to run_program function return None + except requests.exceptions.Timeout: + print(f"⏰ Timeout: Skipping package that took too long to download: {package_name}") + # Append to file immediately + with open(TIMEOUT_LOG_FILE, "a") as f: + f.write(f"{package_name}\t{theURL}\n") + return None + except Exception as e: print(f"download_from_URL package {package_name}, error: {e}") @@ -152,18 +239,18 @@ def download_from_URL(theURL, package, is_spec=True): return None # remove unnecessary lines in file -def remove_lines_spec_manifest(myfile): +def remove_lines_spec_manifest(myfile): # removes unnecessary bytes removed_ends = myfile[49:-834] # converts bytes to dictionary database = json.loads(removed_ends) - + return database # returns checksum, sha256 hash used to download the binary tarball -def access_spec_manifest_media_type(db): +def 
access_spec_manifest_media_type(db): try: # get the value for the key 'data' from the db @@ -177,9 +264,10 @@ def access_spec_manifest_media_type(db): except Exception as e: print(f"Error occured in access_spec_manifest_media_type: {e}") + return None # uses checksum returned to generate URL for binary tarball download -def make_binary_package_URL(package_zip_hash): +def make_binary_package_URL(package_zip_hash): # example URL for compiler-wrapper binary package: # https://binaries.spack.io/develop/blobs/sha256/f4/f4d1969c7a82c76b962ae969c91d7b54cc11e0ce9f1ec9277789990f58aab351 @@ -191,11 +279,32 @@ def make_binary_package_URL(package_zip_hash): return myURL +############ updated 8/10 ############ +# ensure tarinfo completeness +def write_tarinfo_safely(tarinfo_path, file_list): + temp_dir = os.path.dirname(tarinfo_path) + with tempfile.NamedTemporaryFile("w", dir=temp_dir, delete=False) as tmp: + json.dump(file_list, tmp, indent=2) + temp_name = tmp.name + shutil.move(temp_name, tarinfo_path) + + +# ensure manifest completeness +def write_manifest_safely(manifest_path, manifest_data): + temp_dir = os.path.dirname(manifest_path) + with tempfile.NamedTemporaryFile("w", dir=temp_dir, delete=False) as tmp: + json.dump(manifest_data, tmp, indent=2) + temp_name = tmp.name + shutil.move(temp_name, manifest_path) +##################################### + # using python tarfile module io module to list all the files in the downloaded tarball # myfile is the tar_file response.content and the package is the hash we will split by -def read_binary_package(myfile, package): - +# package is the package name and the version +def read_binary_package(myfile, package, package_zip_hash): + file_list = [] try: + # with io.BytesIO(myfile) as tar_buffer: with tarfile.open(fileobj = tar_buffer, mode="r") as tar: @@ -203,7 +312,6 @@ def read_binary_package(myfile, package): i = 1 for member in tar.getmembers(): if member.isfile(): - #breakpoint() #print(f"{i}: {member.name}") #i += 1 @@ -214,17 +322,40 @@ def read_binary_package(myfile, package): # __spack_path_placeh/morepadding/linux-x86_64_v3/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3/ # .spack/install_environment.json" # package for compiler-wrapper is "compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3" - remove_placeholder_directories(i, member.name, package) - # instead of just printing, we'll add this to a list (also save a copy of this list to the cache directory) - if we have to restart the program, then trying to get tarball from the list, we can skip the the + clean_path = remove_placeholder_directories(i, member.name, package) + # ??? 
instead of just printing, we'll add this to a list (also save a copy of this list to the cache directory) - if we have to restart the program, then trying to get tarball from the list, we can skip the the # # remove_placeholder function because we will already have a copy of the list of the files in that package # # to make it easier to add to sqlite database + + # this will add the files that are in the package to a clean_list + if clean_path: + file_list.append(clean_path) i += 1 except tarfile.ReadError as e: print(f"Error reading tar file: {e}") + return + + name = package.split('/')[0] + version = package.split('/')[1].split('-')[1] + + ############ updated 8/10 ############ + # saves file names to the tarinfo file + ##tarinfo_path = os.path.join(TARINFO_DIR, f"{package_zip_hash}.json") + ##with open(tarinfo_path, "w") as f: + ##json.dump(file_list, f, indent=2) + tarinfo_path = os.path.join(TARINFO_DIR, f"{package_zip_hash}.json") + write_tarinfo_safely(tarinfo_path, file_list) + ##################################### + + # removes tarball once processed + tarball_path = os.path.join(BINARY_CACHE_DIR, package.replace('/','__')) + if os.path.exists(tarball_path): + os.remove(tarball_path) + del myfile # removing the placeholder directories in the file path -def remove_placeholder_directories(i, name, package): +def remove_placeholder_directories(i, name, package): # i is the counter for file enumeration # name for compiler-wrapper is "home/software/spack/__spack_path_placeholder__/__spack_path_placeholder__/ # __spack_path_placeholder__/__spack_path_placeholder__/__spack_path_placeholder__/__spack_path_placeholder__/ @@ -246,129 +377,265 @@ def remove_placeholder_directories(i, name, package): try: if placeholder not in split_list[0]: print("split_list", split_list) + + # returns file name without the placeholder path elif len(split_list) > 1: updatedname = split_list[1][1:] print(f"file {i}: ", updatedname) + return updatedname # return to add to list of files for respective package except Exception as e: print(f"Error in remove_placeholder_directories: {e}") + return None -def print_files(package_hash, package_value): +def print_files(package_hash, package_value, index, existing_tarinfo_files, seen_tarball_hashes): + name = package_value['spec']['name'] + version = package_value['spec']['version'] - # returns the URL for the spec manifest file and the package_filename - theURL, package_filename = make_spec_manifest_URL(package_hash, package_value) - - # download the spec manifest json for the package of interest - #temp = download_from_URL(theURL, package_filename) - temp = download_from_URL(theURL, package_filename, is_spec = True) + ############ updated 8/10 ############ + manifest_filename = f"{name}-{version}-{package_hash}.json" + manifest_path = os.path.join(MANIFEST_DIR, manifest_filename) + + # use existing cleaned manifest if available + if os.path.exists(manifest_path): + print(f"Using existing cleaned manifest: {manifest_path}") + with open(manifest_path, "r") as f: + clean_spec_manifest = json.load(f) + + # returns the URL for the spec manifest file and the package_filename + theURL, package_filename = make_spec_manifest_URL(package_hash, package_value) + ## and indented theURL to print("βœ… Cleaned and parsed spec manifest") + else: + # download if manifest does not exist - # return if URL does not exist - if temp is None: - print(f"Skipping package {package_filename}\n") + # returns the URL for the spec manifest file and the package_filename + theURL, package_filename = 
make_spec_manifest_URL(package_hash, package_value) - # exit print_files function and goes back to run_program function - return + # download the spec manifest json for the package of interest + temp = download_from_URL(theURL, package_filename, is_spec = True) + - # remove unneccessary lines from downloaded spec manifest - spec_manifest_database = remove_lines_spec_manifest(temp) + # return if URL does not exist + if temp is None: + print(f"Could not download manifest: {package_filename} - recording and skipping.\n") + + # recording the failed manifest filename and hash + with open(SKIPPED_MANIFESTS_FILE, "a") as f: + f.write(f"{package_hash}\n") + + # exit print_files function and goes back to run_program function + return + + print("βœ… Loaded cached spec manifest") + + # remove unneccessary lines from downloaded spec manifest + clean_spec_manifest = remove_lines_spec_manifest(temp) + + # writes cleaned manifest information to manifest file + ##with open(manifest_path, "w") as f: + ##json.dump(clean_spec_manifest, f, indent=2) + write_manifest_safely(manifest_path, clean_spec_manifest) + print(f"βœ… Manifest safely written: {manifest_path}") + + + + # writes cleaned manifest information to manifest file + ##manifest_file = os.path.join(MANIFEST_DIR, f"{name}-{version}-{package_hash}.json") + ##with open(manifest_file, "w") as f: + ##json.dump(clean_spec_manifest, f, indent=2) + + ###################################### - # find the mediaType that contains the hash for the package install - package_zip_hash = access_spec_manifest_media_type(spec_manifest_database) + # find the mediaType that contains the hash for the package tarball install + package_zip_hash = access_spec_manifest_media_type(clean_spec_manifest) + print(f"βœ… Extracted zip hash: {package_zip_hash}") # if 'data' key was not found in access_spec_manifest_media_type, None is returned and we go back to run_program function if package_zip_hash is None: - print("mediaType not found - skipping this package.") + print(f"No Tarball hash found in manifest: {package_filename}") + + # Track taht this manifest has no downloadable binary tarball + with open(MISSING_TARBALL_HASH_FILE, "a") as f: + f.write(f"{package_hash}\n") # go back to run_program function return + + # track if the tarball hash has already been processed + if package_zip_hash in seen_tarball_hashes: + with open(SHARED_TARBALL_HASH_FILE, "a") as f: + # if this manifest points to a tarball that has already been seen, + # it will not create a new tarinfo entry + # it will have a new manifest entry + f.write(f"{package_hash}\t{package_zip_hash}\n") + else: + seen_tarball_hashes.add(package_zip_hash) + - # make the binary package URL for installing the package - binary_package_URL = make_binary_package_URL(package_zip_hash) + expected_tarinfo_hash = package_zip_hash - # download the binary package file from the generated URL - tempbinary = download_from_URL(binary_package_URL, package_filename, is_spec = False) + if expected_tarinfo_hash in existing_tarinfo_files: + print(f"βœ… Already have tarinfo for {name}-{version}-{expected_tarinfo_hash}, skipping binary download.") + update_index_entry(index, package_hash, package_value, package_zip_hash) + save_index(index) + print(f"βœ… Saved Index") + return + + else: + + # make the binary package URL for installing the package + binary_package_URL = make_binary_package_URL(package_zip_hash) + print(f"πŸ”— Downloading binary: {binary_package_URL}") + + # download the binary package file from the generated URL + tempbinary = 
download_from_URL(binary_package_URL, package_filename, is_spec = False) + print("βœ… Binary package downloaded") + + if tempbinary is None: + + # Track failed tarball download + with open(FAILED_TARBALL_DOWNLOAD_FILE, "a") as f: + f.write(f"{package_filename}: manifest hash: {package_hash}, tarball hash: {package_zip_hash}\n") + + return - # read the binary package - read_binary_package(tempbinary, package_filename) + # read the binary package + read_binary_package(tempbinary, package_filename, package_zip_hash) + print("βœ… Finished reading binary package") + update_index_entry(index, package_hash, package_value, package_zip_hash) + save_index(index) + print(f"Updated Index with {package_filename}-{package_zip_hash}") + save_checkpoint(package_hash) + print(f"βœ… Saved Index") # program dispatcher -def run_program(package_hash, database): +def run_program(package_hash, database, index, existing_tarinfo_files, seen_tarball_hashes): installs = database['database']['installs'] # gets installs key value, aka name of package, version, etc. package_value = installs[package_hash] - print_files(package_hash, package_value) + print_files(package_hash, package_value, index, existing_tarinfo_files, seen_tarball_hashes) def main(): - # file_name = "myMedjson.json" + #file_name = "myMedjson.json" # file_name = "myjson.json" # file_name = 'Med_w_compilerwrapper_packages_at_end.json' file_name = "e2a6969c742c8ee33deba2d210ce2243cd3941c6553a3ffc53780ac6463537a9" database = readmyfile(file_name) + # load list of previously skipped manifest hashes + if os.path.exists(SKIPPED_MANIFESTS_FILE): + with open(SKIPPED_MANIFESTS_FILE, "r") as f: + skipped_hashes = set(line.strip() for line in f) + + else: + skipped_hashes = set() + + # load manifests that are missing binary tarball hashes (e.g. 
mediaType not found) + if os.path.exists(MISSING_TARBALL_HASH_FILE): + with open(MISSING_TARBALL_HASH_FILE, "r") as f: + missing_tarball_hashes = set(line.strip() for line in f) + + else: + missing_tarball_hashes = set() + + # load tarballs that were not downloadable + if os.path.exists(FAILED_TARBALL_DOWNLOAD_FILE): + with open(FAILED_TARBALL_DOWNLOAD_FILE, "r") as f: + failed_tarball_hashes = set( + line.strip().split("tarball hash: ")[-1] for line in f if "tarball hash:" in line + ) + else: + failed_tarball_hashes = set() + + # lists install keys install_keys = list(database['database']['installs']) num_keys = len(install_keys) # load last processed package hash from checkpoint from cache if it exists last_processed = load_checkpoint() + index = load_index() skip = True if last_processed else False + existing_tarinfo_files = set(os.listdir(TARINFO_DIR)) + existing_tarinfo_hashes = { + fname.rsplit("-", 1)[-1].replace(".json", "") for fname in existing_tarinfo_files + } + existing_tarinfo_files = existing_tarinfo_hashes + + + + # track already-processed tarball hashes to find shared ones + seen_tarball_hashes = set() + + SAVE_INTERVAL = 50 + + print("Starting...Will skip packages already fully processed.") + try: for i, package_hash in enumerate(install_keys): - # Skip previously completed packages until the last processed one - if skip: - print(f"Skipping {package_hash}") - if package_hash == last_processed: - # start processing after the last one - skip = False - continue + # skip if package_hash is in the skipped manifests file + if package_hash in skipped_hashes: + continue + + # skip if manifest had no usable tarball + if package_hash in missing_tarball_hashes: + continue + + if package_hash in index: + entry = index[package_hash] + manifest_path = entry.get("manifest_path", "") + tarinfo_path = entry.get("tarinfo_path", "") + tarball_hash = entry.get("sha256", "") + + manifest_exists = manifest_path and os.path.exists(manifest_path) + tarinfo_exists = tarinfo_path and os.path.exists(tarinfo_path) + + if manifest_exists and tarinfo_exists: + #print(f"Skipping fully processed package: {package_hash}") + continue + + # if tarball previously failed, skip retrying it + if tarball_hash in failed_tarball_hashes: + print(f"🚫 Skipping manifest with previously failed tarball download: {package_hash}") + continue + print(f"πŸ“¦ package {i + 1} out of {num_keys} packages\n") - run_program(package_hash, database) + run_program(package_hash, database, index, existing_tarinfo_files, seen_tarball_hashes) - save_checkpoint(package_hash) + # Save checkpoint and index every N packages + if (i + 1) % SAVE_INTERVAL == 0: + save_checkpoint(package_hash) + save_index(index) + print(f"Saved checkpoint and index at package {i + 1}") except KeyboardInterrupt: - print("\nπŸ›‘ Interrupted by user. Progress saved.") + save_checkpoint(package_hash) + save_index(index) + print("\nπŸ›‘ Interrupted. 
Progress saved.") if last_processed: - installs = database['database']['installs'] - if last_processed in installs: - package_value = installs[last_processed] - package_name = package_value['spec']['name'] - package_version = package_value['spec']['version'] - print(f"Last checkpoint was: {package_name}-{package_version}, {last_processed}") + val = database['database']['installs'].get(last_processed) + if val: + name = val['spec']['name'] + version = val['spec']['version'] + + print(f"Last checkpoint was: {name}-{version}, {last_processed}") else: - print(f"Last checkpoint was: unknown, {last_processed}") + print(f"Checkpoint not found in current file: {last_processed}") print(f"file may have changed since the last run") finally: + save_checkpoint(package_hash) + save_index(index) print("\n🎊 Complete (or safely stopped). Script will resume where it left off.") if __name__ == "__main__": - main() - -### WHAT I NEED TO DO ### -# interested in adding a function that gives us how much cpu is being used for the program -## ways to make it faster - running it, then stopping it and then later proceeding from where it left off - ## Steven and Monwen - scrapers, whenever there is a network request, save a copy of the file locally (currently doing this from inputting the file) but only use it - ## when the program gets restarted - (if the file didn't exist - then we could have a directory where we have a copy of the stuff we downloaded ) -# can I also add the time it takes to run? -# can I have a calculation for how long it would take if I wanted to run it for 130464 files? -### - we're fetching the spec manifest files for each of these packages - 130thousean network requests - downloading the tarball is slow -## dont save the entire tarball - in the cache(local folder ) save the list of files - the tarball may take up a lot of space (may not want to save it) -# look into timescaling pyramid - memory hierarchy (the peak is faster, bottom level is slowest - downloading stuff from the internet is very slow) -# create a branch - commit files to branch on dapper -## FROM RYAN ## -# next step is play with the Python sqlite module and try to figure out how to -# 1) create a new sqlite db -# 2) add a new entry to it (the PyPI db scraping script that Steven made could be helpful for seeing how to work with sqlite) - -#meow - + main() \ No newline at end of file From 670be1605116e4ca5673ebbac06443cc1eabbe54 Mon Sep 17 00:00:00 2001 From: leslie q Date: Fri, 15 Aug 2025 14:33:42 -0700 Subject: [PATCH 08/20] added Create_spack_DB.py --- .../spack_db/Create_spack_DB.py | 143 ++++++++++++++++++ dataset-generation/spack_db/spack_db.py | 46 ------ 2 files changed, 143 insertions(+), 46 deletions(-) create mode 100644 dataset-generation/spack_db/Create_spack_DB.py diff --git a/dataset-generation/spack_db/Create_spack_DB.py b/dataset-generation/spack_db/Create_spack_DB.py new file mode 100644 index 0000000..e2ccc8d --- /dev/null +++ b/dataset-generation/spack_db/Create_spack_DB.py @@ -0,0 +1,143 @@ +import os +import json +import sqlite3 +import time +from dapper_python.normalize import normalize_file_name + +# configuration +INDEX_PATH = "cache/spack.index.db.json" +SQLITE_DB_PATH = "cache/spack-v1.db" + +def build_package_filelist_db(): + # load index + if not os.path.exists(INDEX_PATH): + print("❌ Index file not found.") + return + + with open(INDEX_PATH, "r") as f: + index = json.load(f) + + # Create SQLite DB + conn = sqlite3.connect(SQLITE_DB_PATH) + cursor = conn.cursor() + + # Create table columns + 
cursor.execute(''' + CREATE TABLE IF NOT EXISTS package_files ( + id INTEGER PRIMARY KEY, + file_name TEXT, + normalized_file_name TEXT, + file_path TEXT, + package_name TEXT, + UNIQUE(file_path, package_name) + ) + ''') + + # Create indices for efficient lookups + cursor.execute('CREATE INDEX IF NOT EXISTS idx_file_name ON package_files(file_name)') + cursor.execute('CREATE INDEX IF NOT EXISTS idx_normalized_file_name ON package_files(normalized_file_name)') + + + # Create dataset_version table + cursor.execute(''' + CREATE TABLE IF NOT EXISTS dataset_version( + version INTEGER, + format TEXT, + timestamp INTEGER + ) + ''') + + # Clear the dataset_version table + cursor.execute("DELETE FROM dataset_version") + + # Create table columns + cursor.execute( + "INSERT INTO dataset_version (version, format, timestamp)" \ + "VALUES (?, ?, ?)", + (1, "Spack", int(time.time())) + ) + + inserted_packages = 0 + inserted_files = 0 + for package_hash, entry in index.items(): + try: + package_name = entry["name"] + version = entry["version"] + sha256 = entry["sha256"] + + + tarinfo_path = entry.get("tarinfo_path") + if not tarinfo_path or not os.path.exists(tarinfo_path): + print(f"⚠️ Missing tarinfo for: {package_name}-{version}-{sha256}") + continue + + with open(tarinfo_path, "r") as f: + file_list = json.load(f) + + package_inserted_or_updated = False + + for file_path in file_list: + # skipping .spack/ files + if file_path.startswith(".spack/"): + continue + + # Extract file name + file_name = os.path.basename(file_path) + + # Normalize the file name + try: + normalized = normalize_file_name(file_name) + normalized_file_name = str(normalized).lower() + except Exception as e: + print(f"⚠️ Failed to normalize '{file_name}': {e}") + normalized_file_name = file_name.lower() + + # Insert into DB + cursor.execute( + '''INSERT OR IGNORE INTO package_files + (file_name, normalized_file_name, file_path, package_name) + VALUES (?, ?, ?, ?)''', + (file_name, normalized_file_name, file_path, package_name) + ) + + if cursor.rowcount > 0: + inserted_files += 1 + package_inserted_or_updated = True # New row added + continue # No need to update - freshly inserted + #breakpoint() + # Row already exists - check if any values changed + cursor.execute( + ''' SELECT file_name, normalized_file_name FROM package_files + WHERE file_path = ? AND package_name = ?''', + (file_path, package_name) + ) + result = cursor.fetchone() + if result: + existing_file_name, existing_normalized_name = result + if (existing_file_name != file_name) or (existing_normalized_name != normalized_file_name): + # Something changed - update + + + # Update the row + cursor.execute( + ''' UPDATE package_files + SET file_name = ?, normalized_file_name = ? + WHERE file_path = ? AND package_name = ?''', + (file_name, normalized_file_name, file_path, package_name) + ) + package_inserted_or_updated = True # A row was updated + if package_inserted_or_updated: + inserted_packages += 1 + + + except Exception as e: + print(f"❌ Failed to insert {package_hash}: {e}") + continue + + conn.commit() + conn.close() + + print(f"πŸŽ‰ Done. 
Inserted {inserted_files} new files from {inserted_packages} packages into {SQLITE_DB_PATH}") + +if __name__ == "__main__": + build_package_filelist_db() diff --git a/dataset-generation/spack_db/spack_db.py b/dataset-generation/spack_db/spack_db.py index 0cccc00..ba69d3e 100644 --- a/dataset-generation/spack_db/spack_db.py +++ b/dataset-generation/spack_db/spack_db.py @@ -25,7 +25,6 @@ # spack.index.db.json maps each package back to the packagename, version, SHA256hash, # path to manifest json file, path to tarinfo file list -<<<<<<< HEAD INDEX_FILE = "cache/spack.index.db.json" # MANIFEST_DIR is the source of metadata per package used in master index @@ -34,21 +33,10 @@ # TARINFO_DIR contains the extracted list of files from each binary tarball # this will be used in making the SQLite database without having to reprocess the tarballs again TARINFO_DIR = "cache/tarinfo" -======= -INDEX_FILE = "cache/DEMO_spack.index.db.json" - -# MANIFEST_DIR is the source of metadata per package used in master index -MANIFEST_DIR = "cache/DEMO_manifest" - -# TARINFO_DIR contains the extracted list of files from each binary tarball -# this will be used in making the SQLite database without having to reprocess the tarballs again -TARINFO_DIR = "cache/DEMO_tarinfo" ->>>>>>> origin/spack_scraping_LQ # SPEC_CACHE_DIR is meant to be a temporary cache of raw spec manifest files downloaded from the internet # a clean copy is meant to be placed in MANIFEST_DIR # SPEC_CACHE_DIR avoids redownloading if the script is restarted. -<<<<<<< HEAD SPEC_CACHE_DIR = "cache/spec_manifests" # BINARY_CACHE_DIR contains the downloaded tarballs temporarily @@ -65,24 +53,6 @@ MISSING_TARBALL_HASH_FILE = "cache/missing_tarballs.txt" SHARED_TARBALL_HASH_FILE = "cache/shared_tarballs.txt" FAILED_TARBALL_DOWNLOAD_FILE = "cache/failed_tarball_downloads.txt" -======= -SPEC_CACHE_DIR = "cache/DEMO_spec_manifests" - -# BINARY_CACHE_DIR contains the downloaded tarballs temporarily -# the file is deleted after processing. 
-BINARY_CACHE_DIR = "cache/DEMO_binary_packages" -TIMEOUT_LOG_FILE = "cache/DEMO_timeouts.txt" - -# checkpoint to safely stop the script at any time -# progress.txt saves the spec_manifest hash -CHECKPOINT_FILE = "DEMO_progress.txt" - -# file to track all the manifest files that were unable to download -SKIPPED_MANIFESTS_FILE = "cache/DEMO_skipped_manifests.txt" -MISSING_TARBALL_HASH_FILE = "cache/DEMO_missing_tarballs.txt" -SHARED_TARBALL_HASH_FILE = "cache/DEMO_shared_tarballs.txt" -FAILED_TARBALL_DOWNLOAD_FILE = "cache/DEMO_failed_tarball_downloads.txt" ->>>>>>> origin/spack_scraping_LQ # create cache directories for faster download # FIX ME: Remove SPEC_CACHE_DIR @@ -217,11 +187,7 @@ def download_from_URL(theURL, package, is_spec=True): # full file path then is: # "cache/spec_manifests/compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3" -<<<<<<< HEAD #cache/manifest\\compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3.json -======= - #cache/DEMO_manifest\\compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3.json ->>>>>>> origin/spack_scraping_LQ cached_path = os.path.join(cache_dir, package_name) print(f"this is the cached_path {cached_path}") @@ -472,11 +438,7 @@ def print_files(package_hash, package_value, index, existing_tarinfo_files, seen ##with open(manifest_path, "w") as f: ##json.dump(clean_spec_manifest, f, indent=2) write_manifest_safely(manifest_path, clean_spec_manifest) -<<<<<<< HEAD print(f"βœ… Manifest safely written: {manifest_path}") -======= - print("βœ… Manifest safely written: {manifest_path}") ->>>>>>> origin/spack_scraping_LQ @@ -558,11 +520,7 @@ def run_program(package_hash, database, index, existing_tarinfo_files, seen_tarb def main(): -<<<<<<< HEAD #file_name = "myMedjson.json" -======= - #file_name = "myMedjson_DEMO.json" ->>>>>>> origin/spack_scraping_LQ # file_name = "myjson.json" # file_name = 'Med_w_compilerwrapper_packages_at_end.json' file_name = "e2a6969c742c8ee33deba2d210ce2243cd3941c6553a3ffc53780ac6463537a9" @@ -640,11 +598,7 @@ def main(): tarinfo_exists = tarinfo_path and os.path.exists(tarinfo_path) if manifest_exists and tarinfo_exists: -<<<<<<< HEAD #print(f"Skipping fully processed package: {package_hash}") -======= - print(f"Skipping fully processed package: {package_hash}") ->>>>>>> origin/spack_scraping_LQ continue # if tarball previously failed, skip retrying it From cefc0fccf399b52c4ef246ea49aa53117a52a29a Mon Sep 17 00:00:00 2001 From: leslie q Date: Fri, 15 Aug 2025 17:16:43 -0700 Subject: [PATCH 09/20] added test files --- dataset-generation/spack_db/conftest.py | 110 ++++++ dataset-generation/spack_db/test_spack_db.py | 356 +++++++++++++++++++ 2 files changed, 466 insertions(+) create mode 100644 dataset-generation/spack_db/conftest.py create mode 100644 dataset-generation/spack_db/test_spack_db.py diff --git a/dataset-generation/spack_db/conftest.py b/dataset-generation/spack_db/conftest.py new file mode 100644 index 0000000..3acaeb6 --- /dev/null +++ b/dataset-generation/spack_db/conftest.py @@ -0,0 +1,110 @@ +# conftest.py +import io +import json +import tarfile +import types +import pytest + + +import spack_db as pl + +@pytest.fixture(autouse=True) +def isolate_fs(tmp_path, monkeypatch): + """Redirect all cache/config paths to a temp dir per test.""" + cache = tmp_path / "cache" + (cache / "manifest").mkdir(parents=True, exist_ok=True) + (cache / "tarinfo").mkdir(parents=True, exist_ok=True) + (cache / "spec_manifests").mkdir(parents=True, exist_ok=True) + (cache / "binary_packages").mkdir(parents=True, 
exist_ok=True) + + monkeypatch.setattr(pl, "INDEX_FILE", str(cache / "spack.index.db.json"), raising=False) + monkeypatch.setattr(pl, "MANIFEST_DIR", str(cache / "manifest"), raising=False) + monkeypatch.setattr(pl, "TARINFO_DIR", str(cache / "tarinfo"), raising=False) + monkeypatch.setattr(pl, "SPEC_CACHE_DIR", str(cache / "spec_manifests"), raising=False) + monkeypatch.setattr(pl, "BINARY_CACHE_DIR", str(cache / "binary_packages"), raising=False) + + monkeypatch.setattr(pl, "CHECKPOINT_FILE", str(tmp_path / "progress.txt"), raising=False) + monkeypatch.setattr(pl, "SKIPPED_MANIFESTS_FILE", str(cache / "skipped_manifests.txt"), raising=False) + monkeypatch.setattr(pl, "MALFORMED_MANIFESTS_FILE", str(cache / "malformed_manifests.txt"), raising=False) + monkeypatch.setattr(pl, "TIMEOUT_LOG_FILE", str(cache / "timeouts.txt"), raising=False) + monkeypatch.setattr(pl, "MISSING_TARBALL_HASH_FILE", str(cache / "missing_tarballs.txt"), raising=False) + monkeypatch.setattr(pl, "SHARED_TARBALL_HASH_FILE", str(cache / "shared_tarballs.txt"), raising=False) + monkeypatch.setattr(pl, "FAILED_TARBALL_DOWNLOAD_FILE", str(cache / "failed_tarball_downloads.txt"), raising=False) + + # Ensure directories exist for atomic writes + (tmp_path / "cache").mkdir(exist_ok=True) + yield + + +@pytest.fixture +def sample_manifest_json(): + """ + Create the *actual bytes* expected by remove_lines_spec_manifest: + take a valid JSON, then pad 49 bytes in front and 834 bytes at the end. + """ + body = { + "data": [ + {"mediaType": "irrelevant/type", "checksum": "abc"}, + {"mediaType": "application/vnd.spack.install.v2.tar+gzip", + "checksum": "f4d1969c7a82c76b962ae969c91d7b54cc11e0ce9f1ec9277789990f58aab351"} + ] + } + raw = json.dumps(body).encode("utf-8") + return b"x" * 49 + raw + b"y" * 834 + + +@pytest.fixture +def tar_with_placeholder_bytes(): + """ + Build a tar in-memory whose members include the __spack_path_placeh segments + and the package-tail folder (e.g., 'compiler-wrapper-1.0-'). + """ + pkg_tail = "compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3" + member_name = ( + "home/software/spack/__spack_path_placeholder__/__spack_path_placeholder__/" + "__spack_path_placeholder__/__spack_path_placeh/morepadding/linux-x86_64_v3/" + f"{pkg_tail}/.spack/install_environment.json" + ) + + buf = io.BytesIO() + with tarfile.open(fileobj=buf, mode="w:gz") as tf: + data = b"{}" + tarinfo = tarfile.TarInfo(name=member_name) + tarinfo.size = len(data) + tf.addfile(tarinfo, io.BytesIO(data)) + return buf.getvalue() + + +class DummyResp: + def __init__(self, status_code=200, content=b""): + self.status_code = status_code + self.content = content + + +@pytest.fixture +def fake_requests(monkeypatch): + """ + Monkeypatch requests.get with programmable behavior per-URL. 
+ Usage: + table = {} + def _route(url, *a, **kw): return table[url]() + fake = fake_requests + fake.route = _route + monkeypatch.setattr(pl.requests, "get", _route) + table["...json"] = lambda: DummyResp(200, b"...") + """ + table = {} + + def _get(url, *args, **kwargs): + if url not in table: + raise AssertionError(f"Unexpected URL requested: {url}") + result = table[url]() + # Allow raising exceptions (e.g., Timeout) from factories + if isinstance(result, Exception): + raise result + return result + + # Expose for tests to fill + _get.table = table + monkeypatch.setattr(pl.requests, "get", _get) + return _get \ No newline at end of file diff --git a/dataset-generation/spack_db/test_spack_db.py b/dataset-generation/spack_db/test_spack_db.py new file mode 100644 index 0000000..dcf3181 --- /dev/null +++ b/dataset-generation/spack_db/test_spack_db.py @@ -0,0 +1,356 @@ +# test_spack_db.py +import json +import os +import io +import tarfile +import pytest +import spack_db as pl +from requests import exceptions as req_exc + + +def test_update_index_entry_sets_paths(tmp_path): + idx = {} + pkg_hash = "deadbeef" + value = {"spec": {"name": "foo", "version": "1.2.3"}} + tar_hash = "abc123" + pl.update_index_entry(idx, pkg_hash, value, tar_hash) + assert pkg_hash in idx + entry = idx[pkg_hash] + assert entry["name"] == "foo" + assert entry["version"] == "1.2.3" + assert entry["sha256"] == "abc123" + assert entry["manifest_path"].endswith(f"manifest/foo-1.2.3-deadbeef.json") + assert entry["tarinfo_path"].endswith(f"tarinfo/abc123.json") + + +def test_index_save_and_backup(tmp_path): + pl.save_index({"a": 1}) + # First save: no .bak yet + assert os.path.exists(pl.INDEX_FILE) + assert not os.path.exists(pl.INDEX_FILE + ".bak") + + # Second save should write a .bak of previous + pl.save_index({"a": 2}) + assert os.path.exists(pl.INDEX_FILE + ".bak") + with open(pl.INDEX_FILE) as f: + assert json.load(f)["a"] == 2 + with open(pl.INDEX_FILE + ".bak") as f: + assert json.load(f)["a"] == 1 + + +def test_checkpoint_roundtrip(tmp_path): + pl.save_checkpoint("pkg-hash-123") + assert pl.load_checkpoint() == "pkg-hash-123" + + +def test_remove_lines_spec_manifest_and_extract_hash(sample_manifest_json): + cleaned = pl.remove_lines_spec_manifest(sample_manifest_json) + # ensure we got dict and our access function finds the right tarball hash + assert isinstance(cleaned, dict) + h = pl.access_spec_manifest_media_type(cleaned) + assert h == "f4d1969c7a82c76b962ae969c91d7b54cc11e0ce9f1ec9277789990f58aab351" + +def test_malformed_manifest_is_logged(fake_requests, tmp_path): + # Set up a fake package hash and minimal database entry + pkg_hash = "badbadbadbadbadbadbadbadbadbadba" + name = "broken-pkg" + ver = "1.0" + db_entry = {"spec": {"name": name, "version": ver}} + + # Manifest URL that print_files will request + manifest_url = f"https://binaries.spack.io/develop/v3/manifests/spec/{name}/{name}-{ver}-{pkg_hash}.spec.manifest.json" + + # Make fake_requests return invalid bytes (invalid JSON) + fake_requests.table[manifest_url] = lambda: type( + "R", (), {"status_code": 200, "content": b"xxxx"} # bad data + )() + + # Run print_files, which should try to parse the manifest and fail + index = {} + existing_tarinfo_files = set() + seen_tarball_hashes = set() + pl.print_files(pkg_hash, db_entry, index, existing_tarinfo_files, seen_tarball_hashes) + + # Check: malformed_manifests.txt exists and has the hash + URL + error + assert os.path.exists(pl.MALFORMED_MANIFESTS_FILE) + with open(pl.MALFORMED_MANIFESTS_FILE, "r") 
as f: + log_content = f.read() + assert pkg_hash in log_content + assert manifest_url in log_content + assert "ValueError" in log_content or "JSON" in log_content + + # Check: skipped_manifests.txt + assert os.path.exists(pl.SKIPPED_MANIFESTS_FILE) + with open(pl.SKIPPED_MANIFESTS_FILE, "r") as f: + skipped_content = f.read() + assert pkg_hash in skipped_content + +def test_make_urls(): + tar_hash = "f4d1969c7a82c76b962ae969c91d7b54cc11e0ce9f1ec9277789990f58aab351" + url = pl.make_binary_package_URL(tar_hash) + + # Basic correctness + assert url.endswith("/" + tar_hash) + assert f"/{tar_hash[:2]}/" in url # path uses first two hex chars as subdir + + # manifest URL + pkg_hash = "bsavlbvtqsc7yjtvka3ko3aem4wye2u3" + pkg_val = {"spec": {"name": "compiler-wrapper", "version": "1.0"}} + murl, package_filename = pl.make_spec_manifest_URL(pkg_hash, pkg_val) + assert "manifests/spec/compiler-wrapper/" in murl + assert murl.endswith(f"compiler-wrapper-1.0-{pkg_hash}.spec.manifest.json") + assert package_filename == f"compiler-wrapper/compiler-wrapper-1.0-{pkg_hash}" + + + +def test_download_from_URL_spec_does_not_persist(fake_requests, tmp_path): + # For is_spec=True, function returns bytes but should NOT save a file to SPEC_CACHE_DIR + url = "https://example.com/x.spec.manifest.json" + content = b"hello" + fake_requests.table[url] = lambda: type("R", (), {"status_code": 200, "content": content})() + + out = pl.download_from_URL(url, "compiler/x-1.0-abc", is_spec=True) + # Returned content + assert out == content + # Not persisted + cached_path = os.path.join(pl.SPEC_CACHE_DIR, "compiler__x-1.0-abc") + assert not os.path.exists(cached_path) + + +def test_download_from_URL_binary_persists(fake_requests, tmp_path): + url = "https://example.com/blob.tar.gz" + content = b"tarbytes" + fake_requests.table[url] = lambda: type("R", (), {"status_code": 200, "content": content})() + + out = pl.download_from_URL(url, "compiler/x-1.0-abc", is_spec=False) + assert out == content + cached_path = os.path.join(pl.BINARY_CACHE_DIR, "compiler__x-1.0-abc") + assert os.path.exists(cached_path) + with open(cached_path, "rb") as f: + assert f.read() == content + + +def test_download_timeout_logs(fake_requests, tmp_path): + url = "https://timeout.test/blob.tar.gz" + fake_requests.table[url] = lambda: req_exc.Timeout() + + out = pl.download_from_URL(url, "compiler/x-1.0-abc", is_spec=False) + assert out is None + with open(pl.TIMEOUT_LOG_FILE, "r") as f: + txt = f.read() + assert "compiler__x-1.0-abc" in txt + assert url in txt + + +def test_read_binary_package_extracts_and_cleans_paths(tar_with_placeholder_bytes, tmp_path): + # Prepare the "downloaded" tar path that read_binary_package will delete afterward + pkg = "compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3" + tar_hash = "f4d1969c7a82c76b962ae969c91d7b54cc11e0ce9f1ec9277789990f58aab351" + dl_path = os.path.join(pl.BINARY_CACHE_DIR, pkg.replace("/", "__")) + with open(dl_path, "wb") as f: + f.write(b"placeholder") + + # Run + pl.read_binary_package(tar_with_placeholder_bytes, pkg, tar_hash) + + # Verify cleaned tarinfo file written atomically + tarinfo_path = os.path.join(pl.TARINFO_DIR, f"{tar_hash}.json") + assert os.path.exists(tarinfo_path) + with open(tarinfo_path) as f: + items = json.load(f) + # Path should start *after* the 'pkg-tail/' segment due to remove_placeholder_directories + assert any(p.endswith(".spack/install_environment.json") for p in items) + + # tarball removed and buffer freed + assert not os.path.exists(dl_path) + + 
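The tests above exercise `remove_placeholder_directories` only indirectly through `read_binary_package`. A minimal direct test sketch, reusing the member layout from the `tar_with_placeholder_bytes` fixture and assuming the same cleaning behavior those tests rely on, might look like this:

```python
def test_remove_placeholder_directories_strips_prefix():
    # Sketch only: same package string and member name as the tar_with_placeholder_bytes fixture.
    pkg = "compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3"
    member_name = (
        "home/software/spack/__spack_path_placeholder__/__spack_path_placeholder__/"
        "__spack_path_placeholder__/__spack_path_placeh/morepadding/linux-x86_64_v3/"
        "compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3/.spack/install_environment.json"
    )
    cleaned = pl.remove_placeholder_directories(1, member_name, pkg)
    # The placeholder prefix should be stripped, leaving the path inside the package.
    assert cleaned is not None
    assert cleaned.endswith(".spack/install_environment.json")
```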
+def test_print_files_happy_path(fake_requests, sample_manifest_json, tar_with_placeholder_bytes, tmp_path): + # minimal database with one install + pkg_hash = "bsavlbvtqsc7yjtvka3ko3aem4wye2u3" + db = { + "database": { + "installs": { + pkg_hash: {"spec": {"name": "compiler-wrapper", "version": "1.0"}} + } + } + } + index = {} + existing_tarinfo_files = set() # no prior tarinfo + seen_tarball_hashes = set() + + # Wire URLs expected by print_files flow + manifest_url = ( + "https://binaries.spack.io/develop/v3/manifests/spec/" + f"compiler-wrapper/compiler-wrapper-1.0-{pkg_hash}.spec.manifest.json" + ) + tar_hash = "f4d1969c7a82c76b962ae969c91d7b54cc11e0ce9f1ec9277789990f58aab351" + blob_url = f"https://binaries.spack.io/develop/blobs/sha256/{tar_hash[:2]}/{tar_hash}" + + fake_requests.table[manifest_url] = lambda: type("R", (), {"status_code": 200, "content": sample_manifest_json})() + fake_requests.table[blob_url] = lambda: type("R", (), {"status_code": 200, "content": tar_with_placeholder_bytes})() + + # Act + pl.print_files(pkg_hash, db["database"]["installs"][pkg_hash], index, existing_tarinfo_files, seen_tarball_hashes) + + # Assert manifest saved safely + manifest_path = os.path.join(pl.MANIFEST_DIR, f"compiler-wrapper-1.0-{pkg_hash}.json") + assert os.path.exists(manifest_path) + # Assert tarinfo created + tarinfo_path = os.path.join(pl.TARINFO_DIR, f"{tar_hash}.json") + assert os.path.exists(tarinfo_path) + # Index updated & saved + assert pkg_hash in index + assert index[pkg_hash]["sha256"] == tar_hash + assert os.path.exists(pl.INDEX_FILE) + + +def test_print_files_skips_when_tarinfo_exists(sample_manifest_json, tmp_path): + # Prepare: existing tarinfo means no binary download + pkg_hash = "zzz111" + spec = {"spec": {"name": "foo", "version": "9.9"}} + index = {} + + # Pre-create tarinfo + prehash = "abcdead00face" + existing_tarinfo_files = {prehash} + seen_tarball_hashes = set() + + # Monkeypatch access_spec_manifest_media_type to return prehash so it matches existing_tarinfo_files + orig = pl.access_spec_manifest_media_type + pl.access_spec_manifest_media_type = lambda _db: prehash + + # Also ensure manifest path is considered existing so we don't try to download it + manifest_path = os.path.join(pl.MANIFEST_DIR, f"foo-9.9-{pkg_hash}.json") + with open(manifest_path, "w") as f: + json.dump({"dummy": True}, f) + + try: + pl.print_files(pkg_hash, spec, index, existing_tarinfo_files, seen_tarball_hashes) + finally: + pl.access_spec_manifest_media_type = orig + + # Should just update index and save, no download attempted + assert pkg_hash in index + assert index[pkg_hash]["sha256"] == prehash + with open(pl.INDEX_FILE) as f: + data = json.load(f) + assert pkg_hash in data + + +def test_download_404_returns_none(fake_requests): + url = "https://example.com/notfound" + fake_requests.table[url] = lambda: type("R", (), {"status_code": 404, "content": b""})() + assert pl.download_from_URL(url, "some/pkg-1.0-deadbeef", is_spec=True) is None + +def _mk_manifest_bytes_with_hash(tar_hash: str): + """Helper: build bytes that match remove_lines_spec_manifest's slicing contract.""" + body = {"data": [ + {"mediaType": "x/ignored", "checksum": "zzz"}, + {"mediaType": "application/vnd.spack.install.v2.tar+gzip", "checksum": tar_hash}, + ]} + raw = json.dumps(body).encode("utf-8") + return b"x" * 49 + raw + b"y" * 834 + + +def test_shared_tarball_logs_and_skips_second_download(fake_requests, tmp_path): + """ + Two different package hashes point to the same tarball hash. 
+ We expect: + - First run downloads tarball and writes tarinfo. + - Second run logs to SHARED_TARBALL_HASH_FILE and (since we update + existing_tarinfo_files to include the first tarinfo) skips re-download. + """ + # Common tarball hash & blob URL + tar_hash = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcd" + blob_url = f"https://binaries.spack.io/develop/blobs/sha256/{tar_hash[:2]}/{tar_hash}" + + # Package A + pkg_hash_a = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" + name = "compiler-wrapper" + ver = "1.0" + man_url_a = f"https://binaries.spack.io/develop/v3/manifests/spec/{name}/{name}-{ver}-{pkg_hash_a}.spec.manifest.json" + + # Package B (different spec hash, same tarball) + pkg_hash_b = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" + man_url_b = f"https://binaries.spack.io/develop/v3/manifests/spec/{name}/{name}-{ver}-{pkg_hash_b}.spec.manifest.json" + + # Route manifests to same tarball hash, and the blob to some tar bytes + tar_bytes = b"\x1f\x8b" + b"tar" * 100 # not actually parsed; we won't open it here + # For the first call we want a real tar.gz + # We'll just reuse download + skip path by creating a minimal valid tar.gz: + import io, tarfile + buf = io.BytesIO() + with tarfile.open(fileobj=buf, mode="w:gz") as tf: + ti = tarfile.TarInfo(name=f"{name}-{ver}-{pkg_hash_a}/.spack/install_environment.json") + data = b"{}" + ti.size = len(data) + tf.addfile(ti, io.BytesIO(data)) + tar_bytes = buf.getvalue() + + fake_requests.table[man_url_a] = lambda: type("R", (), {"status_code": 200, "content": _mk_manifest_bytes_with_hash(tar_hash)})() + fake_requests.table[man_url_b] = lambda: type("R", (), {"status_code": 200, "content": _mk_manifest_bytes_with_hash(tar_hash)})() + fake_requests.table[blob_url] = lambda: type("R", (), {"status_code": 200, "content": tar_bytes})() + + index = {} + existing_tarinfo_files = set() + seen_tarball_hashes = set() + + # Run A (creates tarinfo and index) + pl.print_files(pkg_hash_a, {"spec": {"name": name, "version": ver}}, index, existing_tarinfo_files, seen_tarball_hashes) + tarinfo_path = os.path.join(pl.TARINFO_DIR, f"{tar_hash}.json") + assert os.path.exists(tarinfo_path) + + # Emulate the main loop behavior: keep using *the same* existing_tarinfo_files set, + # but update it to reflect that we've now created tarinfo. + existing_tarinfo_files.add(tar_hash) + + # Guard: if the second call tries to re-download, we'd need another blob mapping. + # We purposely *don't* add one hereβ€”so if it tries, the test will fail. + + # Run B (should log as shared and skip binary download due to existing_tarinfo_files) + pl.print_files(pkg_hash_b, {"spec": {"name": name, "version": ver}}, index, existing_tarinfo_files, seen_tarball_hashes) + + # Check shared log file captured the second spec hash + the shared tar hash + assert os.path.exists(pl.SHARED_TARBALL_HASH_FILE) + with open(pl.SHARED_TARBALL_HASH_FILE, "r") as f: + shared_log = f.read() + assert f"{pkg_hash_b}\t{tar_hash}" in shared_log + + # Index should contain both manifests, same sha256 + assert index[pkg_hash_a]["sha256"] == tar_hash + assert index[pkg_hash_b]["sha256"] == tar_hash + + +def test_failed_tarball_download_is_logged(fake_requests, tmp_path): + """ + If the blob download fails (404 or None), we should append to FAILED_TARBALL_DOWNLOAD_FILE + and not produce a tarinfo file. 
+ """ + name = "foo" + ver = "9.9" + pkg_hash = "cccccccccccccccccccccccccccccccc" + tar_hash = "dddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd" + man_url = f"https://binaries.spack.io/develop/v3/manifests/spec/{name}/{name}-{ver}-{pkg_hash}.spec.manifest.json" + blob_url = f"https://binaries.spack.io/develop/blobs/sha256/{tar_hash[:2]}/{tar_hash}" + + # Manifest OK, blob 404 + fake_requests.table[man_url] = lambda: type("R", (), {"status_code": 200, "content": _mk_manifest_bytes_with_hash(tar_hash)})() + fake_requests.table[blob_url] = lambda: type("R", (), {"status_code": 404, "content": b""})() + + index = {} + existing_tarinfo_files = set() + seen_tarball_hashes = set() + + pl.print_files(pkg_hash, {"spec": {"name": name, "version": ver}}, index, existing_tarinfo_files, seen_tarball_hashes) + + # No tarinfo created + tarinfo_path = os.path.join(pl.TARINFO_DIR, f"{tar_hash}.json") + assert not os.path.exists(tarinfo_path) + + # Log entry created with package filename, manifest hash, and tarball hash + assert os.path.exists(pl.FAILED_TARBALL_DOWNLOAD_FILE) + with open(pl.FAILED_TARBALL_DOWNLOAD_FILE, "r") as f: + log = f.read() + # Contains the manifest hash + tarball hash; also includes the package_filename prefix + assert f"manifest hash: {pkg_hash}, tarball hash: {tar_hash}" in log From 81afc13ece50311e069dcfe9bc5dd573c66f0b93 Mon Sep 17 00:00:00 2001 From: leslie q Date: Mon, 18 Aug 2025 11:30:51 -0700 Subject: [PATCH 10/20] updated remove_lines_spec_manifest function --- dataset-generation/spack_db/spack_db.py | 127 ++++++++++++++++++------ 1 file changed, 95 insertions(+), 32 deletions(-) diff --git a/dataset-generation/spack_db/spack_db.py b/dataset-generation/spack_db/spack_db.py index ba69d3e..1f89395 100644 --- a/dataset-generation/spack_db/spack_db.py +++ b/dataset-generation/spack_db/spack_db.py @@ -21,6 +21,8 @@ import tempfile import shutil +from pathlib import Path + # Configuration # spack.index.db.json maps each package back to the packagename, version, SHA256hash, @@ -50,6 +52,9 @@ # file to track all the manifest files that were unable to download SKIPPED_MANIFESTS_FILE = "cache/skipped_manifests.txt" +############ updated 8/15 ############ +MALFORMED_MANIFESTS_FILE = "cache/malformed_manifests.txt" +###################################### MISSING_TARBALL_HASH_FILE = "cache/missing_tarballs.txt" SHARED_TARBALL_HASH_FILE = "cache/shared_tarballs.txt" FAILED_TARBALL_DOWNLOAD_FILE = "cache/failed_tarball_downloads.txt" @@ -61,6 +66,37 @@ os.makedirs(SPEC_CACHE_DIR, exist_ok = True) os.makedirs(BINARY_CACHE_DIR, exist_ok = True) +# purge truncated binaries in the binary_package directory on restart/resume +# def purge_truncated_binaries(): + +# if not os.path.isdir(BINARY_CACHE_DIR): +# return +# removed = 0 +# for fname in os.listdir(BINARY_CACHE_DIR): +# fpath = os.path.join(BINARY_CACHE_DIR, fname) +# if not os.path.isfile(fpath): +# continue +# try: +# # Autodetect compression; iterate to EOF to ensure integrity +# with tarfile.open(fpath, mode="r:*") as tar: +# for _ in tar: +# pass +# except (tarfile.ReadError, EOFError, OSError) as e: +# print(f"🧹 Removing corrupt binary cache: {fpath} ({e})") +# try: +# os.remove(fpath) +# removed += 1 +# except OSError as oe: +# print(f"⚠️ Could not remove {fpath}: {oe}") +# if removed: +# print(f"βœ… Purged {removed} corrupt/partial tarball(s).") + +############ updated 8/15 ############ +# Normalize any filesystem path to a forward-slash (POSIX) string for JSON storage. 
+def _to_posix(p:str) -> str: + return Path(p).as_posix() +###################################### + # look for index if it exists to add info to def load_index(): # if the index_file exists, read it @@ -93,12 +129,25 @@ def update_index_entry(index, package_hash, package_value, package_zip_hash): manifest_filename = f"{name}-{version}-{package_hash}.json" tarinfo_filename = f"{package_zip_hash}.json" + ############ updated 8/15 ############ + manifest_path_fs = os.path.join(MANIFEST_DIR, manifest_filename) + tarinfo_path_fs = os.path.join(TARINFO_DIR, tarinfo_filename) + ###################################### + index[package_hash] = { "name": name, "version": version, "sha256": package_zip_hash, - "manifest_path": os.path.join(MANIFEST_DIR, manifest_filename), - "tarinfo_path": os.path.join(TARINFO_DIR, tarinfo_filename) + ############ updated 8/15 ############ + ##"manifest_path": os.path.join(MANIFEST_DIR, manifest_filename), + ##"tarinfo_path": os.path.join(TARINFO_DIR, tarinfo_filename) + + # store forward-slash form in JSON for portability + "manifest_path": _to_posix(manifest_path_fs), + "tarinfo_path": _to_posix(tarinfo_path_fs), + + ###################################### + } @@ -240,14 +289,27 @@ def download_from_URL(theURL, package, is_spec=True): # remove unnecessary lines in file def remove_lines_spec_manifest(myfile): - + ############ updated 8/15 ############ # removes unnecessary bytes - removed_ends = myfile[49:-834] + # removed_ends = myfile[49:-834] - # converts bytes to dictionary - database = json.loads(removed_ends) + # # converts bytes to dictionary + # database = json.loads(removed_ends) - return database + # return database + + # Accept bytes or str; extract between first '{' and last '}' + data = myfile if isinstance(myfile, (bytes, bytearray)) else myfile.encode("utf-8") + + start = data.find(b"{") + end = data.rfind(b"}") + + if start == -1 or end == -1 or end < start: + raise ValueError("Malformed manifest: JSON braces not found") + + return json.loads(data[start:end+1].decode("utf-8")) + ###################################### + # returns checksum, sha256 hash used to download the binary tarball def access_spec_manifest_media_type(db): @@ -279,7 +341,6 @@ def make_binary_package_URL(package_zip_hash): return myURL -############ updated 8/10 ############ # ensure tarinfo completeness def write_tarinfo_safely(tarinfo_path, file_list): temp_dir = os.path.dirname(tarinfo_path) @@ -296,7 +357,6 @@ def write_manifest_safely(manifest_path, manifest_data): json.dump(manifest_data, tmp, indent=2) temp_name = tmp.name shutil.move(temp_name, manifest_path) -##################################### # using python tarfile module io module to list all the files in the downloaded tarball # myfile is the tar_file response.content and the package is the hash we will split by @@ -339,14 +399,9 @@ def read_binary_package(myfile, package, package_zip_hash): name = package.split('/')[0] version = package.split('/')[1].split('-')[1] - ############ updated 8/10 ############ # saves file names to the tarinfo file - ##tarinfo_path = os.path.join(TARINFO_DIR, f"{package_zip_hash}.json") - ##with open(tarinfo_path, "w") as f: - ##json.dump(file_list, f, indent=2) tarinfo_path = os.path.join(TARINFO_DIR, f"{package_zip_hash}.json") write_tarinfo_safely(tarinfo_path, file_list) - ##################################### # removes tarball once processed tarball_path = os.path.join(BINARY_CACHE_DIR, package.replace('/','__')) @@ -394,7 +449,6 @@ def print_files(package_hash, package_value, index, 
existing_tarinfo_files, seen name = package_value['spec']['name'] version = package_value['spec']['version'] - ############ updated 8/10 ############ manifest_filename = f"{name}-{version}-{package_hash}.json" manifest_path = os.path.join(MANIFEST_DIR, manifest_filename) @@ -431,24 +485,27 @@ def print_files(package_hash, package_value, index, existing_tarinfo_files, seen print("βœ… Loaded cached spec manifest") - # remove unneccessary lines from downloaded spec manifest - clean_spec_manifest = remove_lines_spec_manifest(temp) + + ############ updated 8/15 ############ + + try: + # remove unneccessary lines from downloaded spec manifest + clean_spec_manifest = remove_lines_spec_manifest(temp) + except Exception as e: + print(f"Failed to parse manifest {package_filename}: {e}") + with open(MALFORMED_MANIFESTS_FILE, "a") as f: + f.write(f"{package_hash}\t{theURL}\t{type(e).__name__}: {e}\n") + # also adding it to file for skipping processing at restart + with open(SKIPPED_MANIFESTS_FILE, "a") as f: + f.write(f"{package_hash}\n") + return + + ###################################### # writes cleaned manifest information to manifest file - ##with open(manifest_path, "w") as f: - ##json.dump(clean_spec_manifest, f, indent=2) write_manifest_safely(manifest_path, clean_spec_manifest) print(f"βœ… Manifest safely written: {manifest_path}") - - - # writes cleaned manifest information to manifest file - ##manifest_file = os.path.join(MANIFEST_DIR, f"{name}-{version}-{package_hash}.json") - ##with open(manifest_file, "w") as f: - ##json.dump(clean_spec_manifest, f, indent=2) - - ###################################### - # find the mediaType that contains the hash for the package tarball install package_zip_hash = access_spec_manifest_media_type(clean_spec_manifest) print(f"βœ… Extracted zip hash: {package_zip_hash}") @@ -520,6 +577,9 @@ def run_program(package_hash, database, index, existing_tarinfo_files, seen_tarb def main(): + + # remove any corrupt tarballs if they exist before resuming program + ##purge_truncated_binaries() #file_name = "myMedjson.json" # file_name = "myjson.json" # file_name = 'Med_w_compilerwrapper_packages_at_end.json' @@ -593,10 +653,13 @@ def main(): manifest_path = entry.get("manifest_path", "") tarinfo_path = entry.get("tarinfo_path", "") tarball_hash = entry.get("sha256", "") - - manifest_exists = manifest_path and os.path.exists(manifest_path) - tarinfo_exists = tarinfo_path and os.path.exists(tarinfo_path) - + + ############ updated 8/15 ############ + ##manifest_exists = manifest_path and os.path.exists(manifest_path) + ##tarinfo_exists = tarinfo_path and os.path.exists(tarinfo_path) + manifest_exists = bool(manifest_path) and Path(manifest_path).exists() + tarinfo_exists = bool(tarinfo_path) and Path(tarinfo_path).exists() + ###################################### if manifest_exists and tarinfo_exists: #print(f"Skipping fully processed package: {package_hash}") continue From f18976708f2f28b8408668b871a7066877f7deff Mon Sep 17 00:00:00 2001 From: leslie q Date: Mon, 18 Aug 2025 15:39:07 -0700 Subject: [PATCH 11/20] added README.md --- dataset-generation/spack_db/README.md | 70 +++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 dataset-generation/spack_db/README.md diff --git a/dataset-generation/spack_db/README.md b/dataset-generation/spack_db/README.md new file mode 100644 index 0000000..4c40155 --- /dev/null +++ b/dataset-generation/spack_db/README.md @@ -0,0 +1,70 @@ +# Spack Build Cache Data Scraper + +This project aims to scrape the Spack 
build cache by downloading, cleaning, and indexing spec manifests and binary tarballs into a local cache, then converted into a Spack SQLite database.
+
+The program builds a master index called spack.index.db.json. The layout of the index:
+ * spec manifest hash as the unique key
+ * package name and version
+ * package tarball unique SHA256 hash
+ * package manifest path to the local cache directory
+ * package tarinfo path to the local cache directory
+
+The program allows for restart/resume in case there are program run interruptions. Skipped or malformed manifests are recorded and if the information exists for both manifest and tarball, re-downloading files is avoided.
+
+## Directory Structure
+ * `cache/spack.index.db.json` - master index
+ * `cache/manifest/` - cleaned spec manifests
+ * `cache/tarinfo/` - JSON file lists extracted from tarballs
+ * `cache/spec_manifests/` - temporary cache of raw manifests before clean up
+ * 'cache/binary_packages/` - temporary cache of downloaded tarballs
+ * `cache/timeouts.txt` - packages that timed out while downloading
+ * `cache/skipped_manifests.txt` - a list of manifests that could not be downloaded
+ * `cache/malformed_manifests.txt` - manifests that failed parsing
+ * `cache/missing_tarballs.txt` - manifests without a tarball hash
+ * `cache/shared_tarballs.txt` - records multiple manifests that point to the same tarball
+ * `cache/failed_tarball_downloads.txt` - tarballs that failed to download
+
+## Features
+ * Retrieves package `.spec.manifest.json` from Spack's binary mirror
+ * Extracts the valid JSON payload and removes extra characters
+ * Retrieves binary tarballs and extracts file lists
+ * Creates and maintains a canonical JSON index that maps each package to its manifest and tarball information
+ * Contains multiple checkpoints for safe restart/resume of the program
+ * Records skipped/malformed manifests, missing hashes, failed tarball downloads
+ * Stores forward-slash paths in JSON index for cross-platform use
+
+## Usage
+1. Install dependencies
+```bash
+pip install requests
+```
+The rest of the necessary modules are part of Python's standard library.
+
+2. Provide a database file
+Update the file_name in `main()` if needed
+
+3. Run the script
+```bash
+python spack_db.py
+```
+
+4. Resume after interruption
+If an interruption occurs, it is safe to re-run the script without losing data already processed.
+
+5. Run Create_spack_DB.py to create SQLite database
+```bash
+python Create_spack_DB.py
+```
+Database will include all files extracted from the packages from the Spack build cache.
+
+## Contributing
+
+Pull requests are welcome. For major changes, please open an issue first to discuss what you would like to change.
+
+Please make sure to update tests as appropriate.
+
+For more information on contributing see the [CONTRIBUTING](./CONTRIBUTING.md) file.
+
+## License
+
+MIT license
\ No newline at end of file

From b434859d0b42d60fff1c205c8f15dac1dbe927d1 Mon Sep 17 00:00:00 2001
From: leslie q
Date: Mon, 18 Aug 2025 15:42:51 -0700
Subject: [PATCH 12/20] fixed bullet points in Directory Structure and Features

---
 dataset-generation/spack_db/README.md | 36 +++++++++++++--------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/dataset-generation/spack_db/README.md b/dataset-generation/spack_db/README.md
index 4c40155..86e937a 100644
--- a/dataset-generation/spack_db/README.md
+++ b/dataset-generation/spack_db/README.md
@@ -12,26 +12,26 @@ The program builds a master index called spack.index.db.json. The layout of the
 The program allows for restart/resume in case there are program run interruptions. Skipped or malformed manifests are recorded and if the information exists for both manifest and tarball, re-downloading files is avoided.
 
 ## Directory Structure
- * `cache/spack.index.db.json` - master index
- * `cache/manifest/` - cleaned spec manifests
- * `cache/tarinfo/` - JSON file lists extracted from tarballs
- * `cache/spec_manifests/` - temporary cache of raw manifests before clean up
- * 'cache/binary_packages/` - temporary cache of downloaded tarballs
- * `cache/timeouts.txt` - packages that timed out while downloading
- * `cache/skipped_manifests.txt` - a list of manifests that could not be downloaded
- * `cache/malformed_manifests.txt` - manifests that failed parsing
- * `cache/missing_tarballs.txt` - manifests without a tarball hash
- * `cache/shared_tarballs.txt` - records multiple manifests that point to the same tarball
- * `cache/failed_tarball_downloads.txt` - tarballs that failed to download
+* `cache/spack.index.db.json` - master index
+* `cache/manifest/` - cleaned spec manifests
+* `cache/tarinfo/` - JSON file lists extracted from tarballs
+* `cache/spec_manifests/` - temporary cache of raw manifests before clean up
+* `cache/binary_packages/` - temporary cache of downloaded tarballs
+* `cache/timeouts.txt` - packages that timed out while downloading
+* `cache/skipped_manifests.txt` - a list of manifests that could not be downloaded
+* `cache/malformed_manifests.txt` - manifests that failed parsing
+* `cache/missing_tarballs.txt` - manifests without a tarball hash
+* `cache/shared_tarballs.txt` - records multiple manifests that point to the same tarball
+* `cache/failed_tarball_downloads.txt` - tarballs that failed to download
 
 ## Features
- * Retrieves package `.spec.manifest.json` from Spack's binary mirror
- * Extracts the valid JSON payload and removes extra characters
- * Retrieves binary tarballs and extracts file lists
- * Creates and maintains a canonical JSON index that maps each package to its manifest and tarball information
- * Contains multiple checkpoints for safe restart/resume of the program
- * Records skipped/malformed manifests, missing hashes, failed tarball downloads
- * Stores forward-slash paths in JSON index for cross-platform use
+* Retrieves package `.spec.manifest.json` from Spack's binary mirror
+* Extracts the valid JSON payload and removes extra characters
+* Retrieves binary tarballs and extracts file lists
+* Creates and maintains a canonical JSON index that maps each package to its manifest and tarball information
+* Contains multiple checkpoints for safe restart/resume of the program
+* Records skipped/malformed manifests, missing hashes, failed tarball downloads
+* Stores forward-slash paths in JSON index for cross-platform use
 
 ## Usage
 1.
Install dependencies From bfa7dbe8acdae4e09ca459d725a28b57fefdc5e1 Mon Sep 17 00:00:00 2001 From: leslie q Date: Mon, 18 Aug 2025 15:45:52 -0700 Subject: [PATCH 13/20] fixed bullet points for the layout of the index --- dataset-generation/spack_db/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dataset-generation/spack_db/README.md b/dataset-generation/spack_db/README.md index 86e937a..42c4e5d 100644 --- a/dataset-generation/spack_db/README.md +++ b/dataset-generation/spack_db/README.md @@ -3,11 +3,11 @@ This project aims to scrape the Spack build cache by downloading, cleaning, and indexing spec manifests and binary tarballs into a local cache, then converted into a Spack SQLite database. The program builds a master index called spack.index.db.json. The layout of the index: - * spec manifest hash as the unique key - * package name and version - * package tarball unique SHA256 hash - * package manifest path to the local cache directory - * package tarinfo path to the local cache directory +* spec manifest hash as the unique key +* package name and version +* package tarball unique SHA256 hash +* package manifest path to the local cache directory +* package tarinfo path to the local cache directory The program allows for restart/resume in case there are program run interruptions. Skipped or malformed manifests are recorded and if the information exists for both manifest and tarball, re-downloading files is avoided. From c3ba8d55654ffb7ff294c9ad4c5f971f30499afa Mon Sep 17 00:00:00 2001 From: leslie q Date: Mon, 18 Aug 2025 15:51:46 -0700 Subject: [PATCH 14/20] updated Usage --- dataset-generation/spack_db/README.md | 41 ++++++++++++++------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/dataset-generation/spack_db/README.md b/dataset-generation/spack_db/README.md index 42c4e5d..39b027e 100644 --- a/dataset-generation/spack_db/README.md +++ b/dataset-generation/spack_db/README.md @@ -1,13 +1,14 @@ # Spack Build Cache Data Scraper -This project aims to scrape the Spack build cache by downloading, cleaning, and indexing spec manifests and binary tarballs into a local cache, then converted into a Spack SQLite database. +This project aims to scrape the Spack build cache by downloading, cleaning, and indexing spec manifests and binary tarballs into a local cache, then convert the data into a Spack SQLite database. -The program builds a master index called spack.index.db.json. The layout of the index: -* spec manifest hash as the unique key -* package name and version -* package tarball unique SHA256 hash -* package manifest path to the local cache directory -* package tarinfo path to the local cache directory +The program builds a master index called `spack.index.db.json`. +* Index layout: + * spec manifest hash as the unique key + * package name and version + * package tarball unique SHA256 hash + * package manifest path to the local cache directory + * package tarinfo path to the local cache directory The program allows for restart/resume in case there are program run interruptions. Skipped or malformed manifests are recorded and if the information exists for both manifest and tarball, re-downloading files is avoided. @@ -35,27 +36,27 @@ The program allows for restart/resume in case there are program run interruption ## Usage 1. Install dependencies -```bash -pip install requests -``` -The rest of the necessary modules are part of Python's standard library. 
+ ```bash + pip install requests + ``` + The rest of the necessary modules are part of Python's standard library. 2. Provide a database file -Update the file_name in `main()` if needed + Update the file_name in `main()` if needed 3. Run the script -```bash -python spack_db.py -``` + ```bash + python spack_db.py + ``` 4. Resume after interruption -If an interruption occurs, it is safe to re-run the script without losing data already processed. + If an interruption occurs, it is safe to re-run the script without losing data already processed. 5. Run Create_spack_DB.py to create SQLite database -```bash -python Create_spack_DB.py -``` -Database will include all files extracted from the packages from the Spack build cache. + ```bash + python Create_spack_DB.py + ``` + Database will include all files extracted from the packages from the Spack build cache. ## Contributing From fe63504609e84697775d2ee8db0780d52220e5e1 Mon Sep 17 00:00:00 2001 From: leslie q Date: Tue, 19 Aug 2025 13:53:03 -0700 Subject: [PATCH 15/20] updated README.md with retry manifest/tarball download instruction --- dataset-generation/spack_db/README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/dataset-generation/spack_db/README.md b/dataset-generation/spack_db/README.md index 39b027e..86d4c01 100644 --- a/dataset-generation/spack_db/README.md +++ b/dataset-generation/spack_db/README.md @@ -1,4 +1,4 @@ -# Spack Build Cache Data Scraper +# Spack Build Cache Data Scraper & SQLite Database This project aims to scrape the Spack build cache by downloading, cleaning, and indexing spec manifests and binary tarballs into a local cache, then convert the data into a Spack SQLite database. @@ -52,7 +52,10 @@ The program allows for restart/resume in case there are program run interruption 4. Resume after interruption If an interruption occurs, it is safe to re-run the script without losing data already processed. -5. Run Create_spack_DB.py to create SQLite database +5. Retry manifests or tarballs + Delete the files `skipped_manifests.txt`, `malformed_manifests.txt`, `failed_tarball_downloads.txt`, to retry failed manifest or tarball downloads. + +6. Run Create_spack_DB.py to create SQLite database ```bash python Create_spack_DB.py ``` From 19e76fb1684c606cf2a7fbc8988a4934892b8519 Mon Sep 17 00:00:00 2001 From: leslie q Date: Tue, 19 Aug 2025 16:10:35 -0700 Subject: [PATCH 16/20] updated _to_posix for printing --- dataset-generation/spack_db/spack_db.py | 150 +++++++----------------- 1 file changed, 42 insertions(+), 108 deletions(-) diff --git a/dataset-generation/spack_db/spack_db.py b/dataset-generation/spack_db/spack_db.py index 1f89395..2cf7fca 100644 --- a/dataset-generation/spack_db/spack_db.py +++ b/dataset-generation/spack_db/spack_db.py @@ -36,11 +36,6 @@ # this will be used in making the SQLite database without having to reprocess the tarballs again TARINFO_DIR = "cache/tarinfo" -# SPEC_CACHE_DIR is meant to be a temporary cache of raw spec manifest files downloaded from the internet -# a clean copy is meant to be placed in MANIFEST_DIR -# SPEC_CACHE_DIR avoids redownloading if the script is restarted. -SPEC_CACHE_DIR = "cache/spec_manifests" - # BINARY_CACHE_DIR contains the downloaded tarballs temporarily # the file is deleted after processing. 
BINARY_CACHE_DIR = "cache/binary_packages" @@ -52,50 +47,24 @@ # file to track all the manifest files that were unable to download SKIPPED_MANIFESTS_FILE = "cache/skipped_manifests.txt" -############ updated 8/15 ############ MALFORMED_MANIFESTS_FILE = "cache/malformed_manifests.txt" -###################################### MISSING_TARBALL_HASH_FILE = "cache/missing_tarballs.txt" SHARED_TARBALL_HASH_FILE = "cache/shared_tarballs.txt" FAILED_TARBALL_DOWNLOAD_FILE = "cache/failed_tarball_downloads.txt" # create cache directories for faster download -# FIX ME: Remove SPEC_CACHE_DIR os.makedirs(MANIFEST_DIR, exist_ok=True) os.makedirs(TARINFO_DIR, exist_ok = True) -os.makedirs(SPEC_CACHE_DIR, exist_ok = True) os.makedirs(BINARY_CACHE_DIR, exist_ok = True) -# purge truncated binaries in the binary_package directory on restart/resume -# def purge_truncated_binaries(): - -# if not os.path.isdir(BINARY_CACHE_DIR): -# return -# removed = 0 -# for fname in os.listdir(BINARY_CACHE_DIR): -# fpath = os.path.join(BINARY_CACHE_DIR, fname) -# if not os.path.isfile(fpath): -# continue -# try: -# # Autodetect compression; iterate to EOF to ensure integrity -# with tarfile.open(fpath, mode="r:*") as tar: -# for _ in tar: -# pass -# except (tarfile.ReadError, EOFError, OSError) as e: -# print(f"🧹 Removing corrupt binary cache: {fpath} ({e})") -# try: -# os.remove(fpath) -# removed += 1 -# except OSError as oe: -# print(f"⚠️ Could not remove {fpath}: {oe}") -# if removed: -# print(f"βœ… Purged {removed} corrupt/partial tarball(s).") - -############ updated 8/15 ############ # Normalize any filesystem path to a forward-slash (POSIX) string for JSON storage. def _to_posix(p:str) -> str: return Path(p).as_posix() -###################################### + +########### UPDATED 8/19 ########### +def _to_posix(p: str) -> str: + return Path(p).as_posix() +#################################### # look for index if it exists to add info to def load_index(): @@ -129,28 +98,19 @@ def update_index_entry(index, package_hash, package_value, package_zip_hash): manifest_filename = f"{name}-{version}-{package_hash}.json" tarinfo_filename = f"{package_zip_hash}.json" - ############ updated 8/15 ############ manifest_path_fs = os.path.join(MANIFEST_DIR, manifest_filename) tarinfo_path_fs = os.path.join(TARINFO_DIR, tarinfo_filename) - ###################################### index[package_hash] = { "name": name, "version": version, "sha256": package_zip_hash, - ############ updated 8/15 ############ - ##"manifest_path": os.path.join(MANIFEST_DIR, manifest_filename), - ##"tarinfo_path": os.path.join(TARINFO_DIR, tarinfo_filename) # store forward-slash form in JSON for portability "manifest_path": _to_posix(manifest_path_fs), "tarinfo_path": _to_posix(tarinfo_path_fs), - - ###################################### - } - # load that last saved package hash def load_checkpoint(): # # checks if progress.txt exists @@ -223,49 +183,54 @@ def download_from_URL(theURL, package, is_spec=True): # Example: # This -> "compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3.spec.manifest.json" # is turned into this -> "compiler-wrapper__compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3.spec.manifest.json" - package_name = package.replace('/', '__') + package_name = package.replace('/','__') # if is_spec is true, meaning the file ends with ".spec.manifest.json", # then the file is not saved, but the reponse is returned to remove_lines_spec_manifest() for further manipulation # if the file ends with .tar.gz # then the file is saved in 
BINARY_CACHE_DIR - cache_dir = SPEC_CACHE_DIR if is_spec else BINARY_CACHE_DIR - - print(f"location to be saved in {cache_dir} for {package_name}") - + cache_dir = BINARY_CACHE_DIR if not is_spec else None # full file path then is: # "cache/spec_manifests/compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3" #cache/manifest\\compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3.json - cached_path = os.path.join(cache_dir, package_name) - print(f"this is the cached_path {cached_path}") + cached_path = os.path.join(cache_dir, package_name) if cache_dir else None - #if cache exists, it does not need to be redownloaded - if os.path.exists(cached_path): - print(f"Using cached file: {cached_path}") + if is_spec: + print(f"downloading manifest for {package_name}") + else: + ########### UPDATED 8/19 ########### + print(f"temporary save location: {_to_posix(cached_path)} for {package_name}") + #################################### + #if cache exists, it does not need to be redownloaded + if cached_path and os.path.exists(cached_path): + ########### UPDATED 8/19 ########### + print(f"Using cached file: {_to_posix(cached_path)}") + #################################### # rb is read binary with open(cached_path, "rb") as f: return f.read() try: - print("in try block for download_from_URL") - # adding timeout for 60 seconds + ########### UPDATED 8/19 ########### + label = _to_posix(cached_path) if cached_path else f"manifest: {package_name}" + print(f"trying download for {label}") + #################################### + # timeout for 60 seconds response = requests.get(theURL, timeout=60, verify=False) - print(f"trying download for {cached_path} ") + # a response status code is 200 then the request was successful # response.status_code of 404 means does not exist if response.status_code == 200: - print(f"download successful for {cached_path}") - # saves to cache if request is successful - # wb is write binary - if is_spec == False: + if cached_path: + print(f"download successful for {_to_posix(cached_path)}") + #################################### + # saves to cache if request is successful + # wb is write binary with open(cached_path, "wb") as f: f.write(response.content) - - return response.content - else: - return response.content + return response.content else: # if URL does not exist, skip and move to next package @@ -289,14 +254,6 @@ def download_from_URL(theURL, package, is_spec=True): # remove unnecessary lines in file def remove_lines_spec_manifest(myfile): - ############ updated 8/15 ############ - # removes unnecessary bytes - # removed_ends = myfile[49:-834] - - # # converts bytes to dictionary - # database = json.loads(removed_ends) - - # return database # Accept bytes or str; extract between first '{' and last '}' data = myfile if isinstance(myfile, (bytes, bytearray)) else myfile.encode("utf-8") @@ -308,8 +265,6 @@ def remove_lines_spec_manifest(myfile): raise ValueError("Malformed manifest: JSON braces not found") return json.loads(data[start:end+1].decode("utf-8")) - ###################################### - # returns checksum, sha256 hash used to download the binary tarball def access_spec_manifest_media_type(db): @@ -364,17 +319,13 @@ def write_manifest_safely(manifest_path, manifest_data): def read_binary_package(myfile, package, package_zip_hash): file_list = [] try: - # with io.BytesIO(myfile) as tar_buffer: - with tarfile.open(fileobj = tar_buffer, mode="r") as tar: - + with tarfile.open(fileobj = tar_buffer, mode="r:*") as tar: + print(f"Files in the tar archive 
for {package.split('/')[0]}:") i = 1 for member in tar.getmembers(): if member.isfile(): - #print(f"{i}: {member.name}") - #i += 1 - # member.name for compiler-wrapper is "home/software/spack/__spack_path_placeholder__/ # __spack_path_placeholder__/__spack_path_placeholder__/ # __spack_path_placeholder__/__spack_path_placeholder__/ @@ -383,9 +334,6 @@ def read_binary_package(myfile, package, package_zip_hash): # .spack/install_environment.json" # package for compiler-wrapper is "compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3" clean_path = remove_placeholder_directories(i, member.name, package) - # ??? instead of just printing, we'll add this to a list (also save a copy of this list to the cache directory) - if we have to restart the program, then trying to get tarball from the list, we can skip the the - # # remove_placeholder function because we will already have a copy of the list of the files in that package - # # to make it easier to add to sqlite database # this will add the files that are in the package to a clean_list if clean_path: @@ -454,14 +402,13 @@ def print_files(package_hash, package_value, index, existing_tarinfo_files, seen # use existing cleaned manifest if available if os.path.exists(manifest_path): - print(f"Using existing cleaned manifest: {manifest_path}") + print(f"Using existing cleaned manifest: {_to_posix(manifest_path)}") with open(manifest_path, "r") as f: clean_spec_manifest = json.load(f) # returns the URL for the spec manifest file and the package_filename theURL, package_filename = make_spec_manifest_URL(package_hash, package_value) - ## and indented theURL to print("βœ… Cleaned and parsed spec manifest") else: # download if manifest does not exist @@ -484,9 +431,6 @@ def print_files(package_hash, package_value, index, existing_tarinfo_files, seen return print("βœ… Loaded cached spec manifest") - - - ############ updated 8/15 ############ try: # remove unneccessary lines from downloaded spec manifest @@ -499,12 +443,10 @@ def print_files(package_hash, package_value, index, existing_tarinfo_files, seen with open(SKIPPED_MANIFESTS_FILE, "a") as f: f.write(f"{package_hash}\n") return - - ###################################### # writes cleaned manifest information to manifest file write_manifest_safely(manifest_path, clean_spec_manifest) - print(f"βœ… Manifest safely written: {manifest_path}") + print(f"βœ… Manifest safely written: {_to_posix(manifest_path)}") # find the mediaType that contains the hash for the package tarball install package_zip_hash = access_spec_manifest_media_type(clean_spec_manifest) @@ -577,9 +519,6 @@ def run_program(package_hash, database, index, existing_tarinfo_files, seen_tarb def main(): - - # remove any corrupt tarballs if they exist before resuming program - ##purge_truncated_binaries() #file_name = "myMedjson.json" # file_name = "myjson.json" # file_name = 'Med_w_compilerwrapper_packages_at_end.json' @@ -622,13 +561,11 @@ def main(): index = load_index() skip = True if last_processed else False - existing_tarinfo_files = set(os.listdir(TARINFO_DIR)) - existing_tarinfo_hashes = { - fname.rsplit("-", 1)[-1].replace(".json", "") for fname in existing_tarinfo_files + existing_tarinfo_files = { + Path(fname).stem + for fname in os.listdir(TARINFO_DIR) + if fname.endswith(".json") } - existing_tarinfo_files = existing_tarinfo_hashes - - # track already-processed tarball hashes to find shared ones seen_tarball_hashes = set() @@ -653,15 +590,12 @@ def main(): manifest_path = entry.get("manifest_path", "") 
tarinfo_path = entry.get("tarinfo_path", "") tarball_hash = entry.get("sha256", "") - - ############ updated 8/15 ############ - ##manifest_exists = manifest_path and os.path.exists(manifest_path) - ##tarinfo_exists = tarinfo_path and os.path.exists(tarinfo_path) + manifest_exists = bool(manifest_path) and Path(manifest_path).exists() tarinfo_exists = bool(tarinfo_path) and Path(tarinfo_path).exists() - ###################################### + if manifest_exists and tarinfo_exists: - #print(f"Skipping fully processed package: {package_hash}") + continue # if tarball previously failed, skip retrying it From 880f545e69fa8cdb285385c229b92ba194dd8fad Mon Sep 17 00:00:00 2001 From: leslie q Date: Thu, 21 Aug 2025 13:54:13 -0700 Subject: [PATCH 17/20] removed spec_manifest info from README.md --- dataset-generation/spack_db/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/dataset-generation/spack_db/README.md b/dataset-generation/spack_db/README.md index 86d4c01..01bbd9e 100644 --- a/dataset-generation/spack_db/README.md +++ b/dataset-generation/spack_db/README.md @@ -16,7 +16,6 @@ The program allows for restart/resume in case there are program run interruption * `cache/spack.index.db.json` - master index * `cache/manifest/` - cleaned spec manifests * `cache/tarinfo/` - JSON file lists extracted from tarballs -* `cache/spec_manifests/` - temporary cache of raw manifests before clean up * `cache/binary_packages/` - temporary cache of downloaded tarballs * `cache/timeouts.txt` - packages that timed out while downloading * `cache/skipped_manifests.txt` - a list of manifests that could not be downloaded From b2de168d3f875a529af9bd8d4f7247be4d0ce2d9 Mon Sep 17 00:00:00 2001 From: leslie q Date: Thu, 21 Aug 2025 13:59:52 -0700 Subject: [PATCH 18/20] removed 'updated' comments --- dataset-generation/spack_db/spack_db.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/dataset-generation/spack_db/spack_db.py b/dataset-generation/spack_db/spack_db.py index 2cf7fca..cea6d15 100644 --- a/dataset-generation/spack_db/spack_db.py +++ b/dataset-generation/spack_db/spack_db.py @@ -61,10 +61,8 @@ def _to_posix(p:str) -> str: return Path(p).as_posix() -########### UPDATED 8/19 ########### def _to_posix(p: str) -> str: return Path(p).as_posix() -#################################### # look for index if it exists to add info to def load_index(): @@ -199,24 +197,21 @@ def download_from_URL(theURL, package, is_spec=True): if is_spec: print(f"downloading manifest for {package_name}") else: - ########### UPDATED 8/19 ########### print(f"temporary save location: {_to_posix(cached_path)} for {package_name}") - #################################### #if cache exists, it does not need to be redownloaded if cached_path and os.path.exists(cached_path): - ########### UPDATED 8/19 ########### + print(f"Using cached file: {_to_posix(cached_path)}") - #################################### + # rb is read binary with open(cached_path, "rb") as f: return f.read() try: - ########### UPDATED 8/19 ########### label = _to_posix(cached_path) if cached_path else f"manifest: {package_name}" print(f"trying download for {label}") - #################################### + # timeout for 60 seconds response = requests.get(theURL, timeout=60, verify=False) @@ -225,7 +220,7 @@ def download_from_URL(theURL, package, is_spec=True): if response.status_code == 200: if cached_path: print(f"download successful for {_to_posix(cached_path)}") - #################################### + # saves to cache if request is 
successful # wb is write binary with open(cached_path, "wb") as f: From dc0ad5cedf923a2775355028c90c62d6b1762dea Mon Sep 17 00:00:00 2001 From: leslie q Date: Thu, 21 Aug 2025 14:43:41 -0700 Subject: [PATCH 19/20] removed 'contributing' from README.md --- dataset-generation/spack_db/README.md | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/dataset-generation/spack_db/README.md b/dataset-generation/spack_db/README.md index 01bbd9e..3158899 100644 --- a/dataset-generation/spack_db/README.md +++ b/dataset-generation/spack_db/README.md @@ -59,15 +59,3 @@ The program allows for restart/resume in case there are program run interruption python Create_spack_DB.py ``` Database will include all files extracted from the packages from the Spack build cache. - -## Contributing - -Pull requests are welcome. For major changes, please open an issue first to discuss what you would like to change. - -Please make sure to update tests as appropriate. - -For more information on contributing see the [CONTRIBUTING](./CONTRIBUTING.md) file. - -## License - -MIT license \ No newline at end of file From 9ebd7fc2ca80e8dd2309dd6c1113e3eef0abe0a5 Mon Sep 17 00:00:00 2001 From: leslie q Date: Thu, 21 Aug 2025 15:09:33 -0700 Subject: [PATCH 20/20] added supporting scripts for testing --- .../spack_db/rename_tarinfo_file.py | 35 +++++++++++++++++++ .../spack_db/spack_index_length.py | 19 ++++++++++ 2 files changed, 54 insertions(+) create mode 100644 dataset-generation/spack_db/rename_tarinfo_file.py create mode 100644 dataset-generation/spack_db/spack_index_length.py diff --git a/dataset-generation/spack_db/rename_tarinfo_file.py b/dataset-generation/spack_db/rename_tarinfo_file.py new file mode 100644 index 0000000..3d16af6 --- /dev/null +++ b/dataset-generation/spack_db/rename_tarinfo_file.py @@ -0,0 +1,35 @@ +import os +import re + +# Path to your tarinfo directory +TARINFO_DIR = "cache/tarinfo" + +# Updated regex: match -<64-char-sha256>.json +pattern = re.compile(r"^(.*)-([a-f0-9]{64})\.json$") + +# Counter +renamed = 0 +skipped = 0 + +for filename in os.listdir(TARINFO_DIR): + match = pattern.match(filename) + if match: + sha256_hash = match.group(2) + new_filename = f"{sha256_hash}.json" + + old_path = os.path.join(TARINFO_DIR, filename) + new_path = os.path.join(TARINFO_DIR, new_filename) + + # Skip if target file already exists + if os.path.exists(new_path): + print(f"⚠️ Skipping {filename} (target {new_filename} already exists)") + skipped += 1 + continue + + os.rename(old_path, new_path) + renamed += 1 + else: + print(f"❓ Skipping non-matching file: {filename}") + skipped += 1 + +print(f"\nβœ… Done! Renamed {renamed} files. Skipped {skipped} files.") diff --git a/dataset-generation/spack_db/spack_index_length.py b/dataset-generation/spack_db/spack_index_length.py new file mode 100644 index 0000000..0111f73 --- /dev/null +++ b/dataset-generation/spack_db/spack_index_length.py @@ -0,0 +1,19 @@ +import json + +# Path to your index file +INDEX_FILE = "cache/spack.index.db.json" + +def main(): + try: + with open(INDEX_FILE, "r") as f: + index = json.load(f) + print(f"βœ… Number of entries in index: {len(index)}") + except FileNotFoundError: + print(f"❌ File not found: {INDEX_FILE}") + except json.JSONDecodeError: + print(f"❌ Failed to parse JSON. The file may be corrupted.") + except Exception as e: + print(f"❌ Unexpected error: {e}") + +if __name__ == "__main__": + main() \ No newline at end of file
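
The README above ends by telling the user to run `Create_spack_DB.py`, but that script is not included in this patch series. Below is a minimal, illustrative sketch of how the master index and the per-tarball file lists could be loaded into SQLite, assuming the index layout described in the README (`name`, `version`, `sha256`, `manifest_path`, `tarinfo_path`) and assuming each `cache/tarinfo/<sha256>.json` file is a flat JSON list of file paths. The output filename and the `packages`/`files` table schema are assumptions for illustration, not the project's actual `Create_spack_DB.py`.

```python
# Illustrative sketch only: load cache/spack.index.db.json and the per-tarball
# file lists in cache/tarinfo/ into a SQLite database. Table names, schema,
# and the output filename are assumptions, not the project's Create_spack_DB.py.
import json
import os
import sqlite3

INDEX_FILE = "cache/spack.index.db.json"
DB_FILE = "spack.db"  # assumed output name

def build_db():
    with open(INDEX_FILE, "r") as f:
        index = json.load(f)

    conn = sqlite3.connect(DB_FILE)
    cur = conn.cursor()
    cur.execute("""CREATE TABLE IF NOT EXISTS packages (
                       package_hash TEXT PRIMARY KEY,
                       name TEXT,
                       version TEXT,
                       sha256 TEXT)""")
    cur.execute("""CREATE TABLE IF NOT EXISTS files (
                       package_hash TEXT,
                       path TEXT)""")

    for package_hash, entry in index.items():
        # one row per package, keyed by the spec manifest hash
        cur.execute("INSERT OR REPLACE INTO packages VALUES (?, ?, ?, ?)",
                    (package_hash, entry.get("name"), entry.get("version"),
                     entry.get("sha256")))

        # the tarinfo JSON is assumed to be a flat list of file paths
        tarinfo_path = entry.get("tarinfo_path", "")
        if tarinfo_path and os.path.exists(tarinfo_path):
            with open(tarinfo_path, "r") as tf:
                file_list = json.load(tf)
            cur.executemany("INSERT INTO files VALUES (?, ?)",
                            [(package_hash, p) for p in file_list])

    conn.commit()
    conn.close()

if __name__ == "__main__":
    build_db()
```

With a layout like this, a join such as `SELECT p.name, p.version FROM packages p JOIN files f ON f.package_hash = p.package_hash WHERE f.path LIKE '%bin/cmake%'` would answer the typical question of which cached packages ship a given file.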
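
Because `rename_tarinfo_file.py` renames cached tarinfo files to `<sha256>.json` while the master index keeps storing explicit `manifest_path` and `tarinfo_path` values, a quick consistency pass over the index can catch entries whose files were renamed, moved, or deleted. Here is a small sketch along those lines, assuming only the index layout shown above (this helper is not part of the patch series):

```python
# Hypothetical helper: list index entries whose recorded manifest or tarinfo
# file no longer exists on disk (e.g. after renaming or pruning the cache).
import json
from pathlib import Path

INDEX_FILE = "cache/spack.index.db.json"

def main():
    with open(INDEX_FILE, "r") as f:
        index = json.load(f)

    missing = 0
    for package_hash, entry in index.items():
        for key in ("manifest_path", "tarinfo_path"):
            path = entry.get(key, "")
            if path and not Path(path).exists():
                print(f"MISSING {key} for {package_hash}: {path}")
                missing += 1

    print(f"Checked {len(index)} entries, {missing} missing file(s).")

if __name__ == "__main__":
    main()
```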