From 526dd7a76d67ebe11fa3bb04c96408524b2c5dc7 Mon Sep 17 00:00:00 2001 From: Quintanilla-Zarinan Date: Wed, 16 Jul 2025 11:33:38 -0700 Subject: [PATCH 01/20] initial spack dataset scraper --- dataset-generation/spack_db/spack_db.py | 311 ++++++++++++++++++++++++ 1 file changed, 311 insertions(+) create mode 100644 dataset-generation/spack_db/spack_db.py diff --git a/dataset-generation/spack_db/spack_db.py b/dataset-generation/spack_db/spack_db.py new file mode 100644 index 0000000..889ae37 --- /dev/null +++ b/dataset-generation/spack_db/spack_db.py @@ -0,0 +1,311 @@ +# json library used to load json file +import json + +# requests used to download URL automatically +import requests + +# COME BACK TO THIS LATER +# certifi and urllib3 used for certificate verfication +#import certifi +#import urllib3 + +# io for reading and writing streams such as text, binary, and raw data +import io + +# for reading and writing tar archives +import tarfile + +# to quiet warnings +import warnings +warnings.filterwarnings("ignore") + +# reading file +def readmyfile(myfile): + try: + with open(myfile, 'r') as file: + # database is the spack database json, within the spack build cache + db = json.load(file) # 8.6 seconds to read in large json file + + # returns database + return db + + except FileNotFoundError: + print(f"Error: The file '{myfile}' not found.") + except Exception as e: + print(f"Error occured in readmyfile: {e}") + +# getting package hash which is the key for the package info +# first argument is the index of the package by number +# second argument is the database that was loaded in the readmyfile function +def get_package(num, db): + + # db and temp variables are type dict + temp = db['database']['installs'] + + #this accesses the install package of choice by index + package_hash = list(temp)[num] + #NOTE TO SELF: turning it into a list may be slower + + return temp, package_hash + +# get the package info from the package hash key +def get_package_value(temp, package_hash): + + # for each key in the database, return the value if the key matches the package hash of interest + for k in temp: + if k == package_hash: + #this will return the values of the package package_hash + return temp[k] + +# make the spec manifest downloadable URL +def make_spec_manifest_URL(package_hash, package_hash_value): +# goal is to make a URL that looks like this -> +# https://binaries.spack.io/develop/v3/manifests/spec//--.spec.manifest.json +# example URL for compiler-wrapper: +# https://binaries.spack.io/develop/v3/manifests/spec/compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3.spec.manifest.json + + myURL = 'https://binaries.spack.io/develop/v3/manifests/spec/' + + # this accesses the package name (not the package hash that we have been using as the package_hash) + package_name = package_hash_value['spec']['name'] + + # this accesses the package name version number + package_version = package_hash_value['spec']['version'] + + # package_filename for use later in removal of placeholder directories in the filepath of the tarball + package_filename = (package_name + '/' + package_name + '-' + package_version + + '-' + package_hash) + + # this updates the URL + myURL += (package_filename + '.spec.manifest.json') + + # returns the URL for the spec manifest and package_filename + return myURL, package_filename + +# automatically download contents from the URL +def download_from_URL(theURL, package, num, db, num_keys): + + # splits package_name by '/' character + package_name = package.split('/')[0] 
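# A minimal sketch of the URL pattern make_spec_manifest_URL assembles, using the
# compiler-wrapper name/version/hash quoted in the example comments above
# (illustration only):
def example_spec_manifest_url(name, version, pkg_hash):
    base = "https://binaries.spack.io/develop/v3/manifests/spec/"
    return f"{base}{name}/{name}-{version}-{pkg_hash}.spec.manifest.json"

# example_spec_manifest_url("compiler-wrapper", "1.0",
#                           "bsavlbvtqsc7yjtvka3ko3aem4wye2u3")
# -> https://binaries.spack.io/develop/v3/manifests/spec/compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3.spec.manifest.json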
+ + try: + response = requests.get(theURL, verify=False) + + # a response status code is 200 then the request was successful + # response.status_code of 404 means does not exist + if response.status_code == 200: + + return response.content + + else: + # if URL does not exist, skip and move to next package + print(f"download failed for package: {package_name}\n") + + # return None to stop process due to download failing - goes back to run_program function + return None + + except Exception as e: + print(f"download_from_URL package {package_name}, error: {e}") + + # return None to stop process due to download failing + return None + + +def remove_lines_spec_manifest(myfile): + + # removes unnecessary bytes + removed_ends = myfile[49:-834] + + # converts bytes to dictionary + database = json.loads(removed_ends) + + return database + + +def access_spec_manifest_media_type(db): + + try: + # get the value for the key 'data' from the db + # if the key 'data' does not exist, return empty list + # temp is db['data'] + for temp in db.get('data', []): + if temp.get('mediaType') == 'application/vnd.spack.install.v2.tar+gzip': + + # the checksum is the sha256 hash + return temp.get('checksum') + + except Exception as e: + print(f"Error occured in access_spec_manifest_media_type: {e}") + +def make_binary_package_URL(package_zip_hash): +# example URL for compiler-wrapper binary package: +# https://binaries.spack.io/develop/blobs/sha256/f4/f4d1969c7a82c76b962ae969c91d7b54cc11e0ce9f1ec9277789990f58aab351 + + myURL = 'https://binaries.spack.io/develop/blobs/sha256/' + + first_byte = package_zip_hash[:2] + + myURL = myURL + first_byte + '/' + package_zip_hash + + return myURL + +# using python tarfile module io module to list all the files in the downloaded tarball +# myfile is the tar_file response.content and the package is the hash we will split by +def read_binary_package(myfile, package): + + try: + with io.BytesIO(myfile) as tar_buffer: + with tarfile.open(fileobj = tar_buffer, mode="r") as tar: + + print(f"Files in the tar archive for {package.split('/')[0]}:") + i = 1 + for member in tar.getmembers(): + if member.isfile(): + #breakpoint() + #print(f"{i}: {member.name}") + #i += 1 + + # member.name for compiler-wrapper is "home/software/spack/__spack_path_placeholder__/ + # __spack_path_placeholder__/__spack_path_placeholder__/ + # __spack_path_placeholder__/__spack_path_placeholder__/ + # __spack_path_placeholder__/__spack_path_placeholder__/__spack_path_placeholder__/ + # __spack_path_placeh/morepadding/linux-x86_64_v3/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3/ + # .spack/install_environment.json" + # package for compiler-wrapper is "compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3" + remove_placeholder_directories(i, member.name, package) + # instead of just printing, we'll add this to a list (also save a copy of this list to the cache directory) - if we have to restart the program, then trying to get tarball from the list, we can skip the the + # # remove_placeholder function because we will already have a copy of the list of the files in that package + # # to make it easier to add to sqlite database + i += 1 + + except tarfile.ReadError as e: + print(f"Error reading tar file: {e}") + +# removing the placeholder directories in the file path +def remove_placeholder_directories(i, name, package): + # i is the counter for file enumeration + # name for compiler-wrapper is "home/software/spack/__spack_path_placeholder__/__spack_path_placeholder__/ + # 
__spack_path_placeholder__/__spack_path_placeholder__/__spack_path_placeholder__/__spack_path_placeholder__/ + # __spack_path_placeholder__/__spack_path_placeholder__/__spack_path_placeh/morepadding/linux-x86_64_v3/ + # compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3/.spack/install_environment.json" + # package for compiler-wrapper is "compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3" + + # updatedpackage_list for compiler-wrapper is "['compiler-wrapper', 'compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3']" + updatedpackage_list = package.split('/') + + # updatedpackage_name is "compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3" + updatedpackage_name = updatedpackage_list[1] + + placeholder = "__spack_path_placeh" + + # split by updatedpackage_name + split_list = name.split(updatedpackage_name) + + try: + if placeholder not in split_list[0]: + print("split_list", split_list) + elif len(split_list) > 1: + updatedname = split_list[1][1:] + + print(f"file {i}: ", updatedname) + + except Exception as e: + print(f"Error in remove_placeholder_directories: {e}") + + +def print_files(package_number, database, num_keys): + + # installs here are all the packages in the input database + # package_name is the package_hash of the package + installs, package_name = get_package(package_number, database) + + # package_value is the value of the key, the key is the package_name + package_value = get_package_value(installs, package_name) + + # returns the URL for the spec manifest file and the package_filename + theURL, package_filename = make_spec_manifest_URL(package_name, package_value) + + # download the spec manifest json for the package of interest + #temp = download_from_URL(theURL, package_filename) + temp = download_from_URL(theURL, package_filename, package_number, database, num_keys) + + + # return if URL does not exist + if temp is None: + print(f"Skipping package {package_filename} due to download failure. 
\n") + + # exit print_files function and goes back to run_program function + return + + # remove unneccessary lines from downloaded spec manifest + spec_manifest_database = remove_lines_spec_manifest(temp) + + # find the mediaType that contains the hash for the package install + package_zip_hash = access_spec_manifest_media_type(spec_manifest_database) + + # if 'data' key was not found in access_spec_manifest_media_type, None is returned and we go back to run_program function + if package_zip_hash is None: + print("mediaType not found - skipping this package.") + # go back to run_program function + return + + # make the binary package URL for installing the package + binary_package_URL = make_binary_package_URL(package_zip_hash) + + # download the binary package file from the generated URL + tempbinary = download_from_URL(binary_package_URL, package_filename, package_number, database, num_keys) + + # read the binary package + read_binary_package(tempbinary, package_filename) + +# program dispatcher +def run_program(package_number, database, num_keys): + + if package_number < num_keys: + + print_files(package_number, database, num_keys) + + +def main(): + file_name = "myMedjson.json" + # file_name = "myjson.json" + # file_name = 'Med_w_compilerwrapper_packages_at_end.json' + # file_name = "e2a6969c742c8ee33deba2d210ce2243cd3941c6553a3ffc53780ac6463537a9" + + package_number = 0 + # package_number2 = 2000 + # package_number3 = 20 + # package_number4 = 35 + # package_number5 = 127000 + database = readmyfile(file_name) + + num_keys = len(database['database']['installs']) + + for package_number in range(num_keys): + + print(f"package {package_number+1} out of {num_keys} packages\n") + + run_program(package_number, database, num_keys) + + print("\nComplete") + +if __name__ == "__main__": + main() + +### WHAT I NEED TO DO ### +# interested in adding a function that gives us how much cpu is being used for the program +## ways to make it faster - running it, then stopping it and then later proceeding from where it left off + ## Steven and Monwen - scrapers, whenever there is a network request, save a copy of the file locally (currently doing this from inputting the file) but only use it + ## when the program gets restarted - (if the file didn't exist - then we could have a directory where we have a copy of the stuff we downloaded ) +# can I also add the time it takes to run? +# can I have a calculation for how long it would take if I wanted to run it for 130464 files? 
- we're fetching the spec manifest files for each of these packages - 130thousean network requests - downloading the tarball is slow +## dont save the entire tarball - in the cache(local folder ) save the list of files - the tarball may take up a lot of space (may not want to save it) +# look into timescaling pyramid - memory hierarchy (the peak is faster, bottom level is slowest - downloading stuff from the internet is very slow) +# create a branch - commit files to branch on dapper +## FROM RYAN ## +# next step is play with the Python sqlite module and try to figure out how to +# 1) create a new sqlite db +# 2) add a new entry to it (the PyPI db scraping script that Steven made could be helpful for seeing how to work with sqlite) + + + From 8421d96196f5e6f2f008d77f8bf6896cb62ee850 Mon Sep 17 00:00:00 2001 From: leslie q Date: Wed, 16 Jul 2025 11:41:46 -0700 Subject: [PATCH 02/20] added practice message of "meow" --- dataset-generation/spack_db/spack_db.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataset-generation/spack_db/spack_db.py b/dataset-generation/spack_db/spack_db.py index 889ae37..47ac1ce 100644 --- a/dataset-generation/spack_db/spack_db.py +++ b/dataset-generation/spack_db/spack_db.py @@ -307,5 +307,5 @@ def main(): # 1) create a new sqlite db # 2) add a new entry to it (the PyPI db scraping script that Steven made could be helpful for seeing how to work with sqlite) - +#meow From ac16bd8857ade654b21ce5cfcc85c6204b543174 Mon Sep 17 00:00:00 2001 From: leslie q Date: Fri, 18 Jul 2025 18:00:35 -0700 Subject: [PATCH 03/20] added methods to create local cache for easy interrupt/proceed from last processed package --- dataset-generation/spack_db/spack_db.py | 171 ++++++++++++++++-------- 1 file changed, 117 insertions(+), 54 deletions(-) diff --git a/dataset-generation/spack_db/spack_db.py b/dataset-generation/spack_db/spack_db.py index 47ac1ce..a8e7ee2 100644 --- a/dataset-generation/spack_db/spack_db.py +++ b/dataset-generation/spack_db/spack_db.py @@ -19,6 +19,21 @@ import warnings warnings.filterwarnings("ignore") +# os to create a cache folder - disk-based caching +import os + +# Configuration +SPEC_CACHE_DIR = "cache/spec_manifests" +BINARY_CACHE_DIR = "cache/binary_packages" + +# checkpoint to safely stop the script at any time +CHECKPOINT_FILE = "progress.txt" + +# create cache directories for faster download +os.makedirs(SPEC_CACHE_DIR, exist_ok = True) +os.makedirs(BINARY_CACHE_DIR, exist_ok = True) + + # reading file def readmyfile(myfile): try: @@ -34,28 +49,28 @@ def readmyfile(myfile): except Exception as e: print(f"Error occured in readmyfile: {e}") -# getting package hash which is the key for the package info -# first argument is the index of the package by number -# second argument is the database that was loaded in the readmyfile function -def get_package(num, db): +# load that last saved package hash +def load_checkpoint(): + # checks if progress.txt exists + if os.path.exists(CHECKPOINT_FILE): + with open(CHECKPOINT_FILE, "r") as f: - # db and temp variables are type dict - temp = db['database']['installs'] - - #this accesses the install package of choice by index - package_hash = list(temp)[num] - #NOTE TO SELF: turning it into a list may be slower + # read and return last processed package_hash + # strip removes trailing newline or spaces + return f.read().strip() + + # if the file does not exist, return None + # if None, start from the beginning + return None - return temp, package_hash -# get the package info from the package hash 
key -def get_package_value(temp, package_hash): +# saves the last processed package_hash to progress.txt +# if the program is interrupted, the saved package_hash will be + # the starting point when rerun +def save_checkpoint(package_hash): + with open(CHECKPOINT_FILE, "w") as f: + f.write(package_hash) - # for each key in the database, return the value if the key matches the package hash of interest - for k in temp: - if k == package_hash: - #this will return the values of the package package_hash - return temp[k] # make the spec manifest downloadable URL def make_spec_manifest_URL(package_hash, package_hash_value): @@ -83,10 +98,31 @@ def make_spec_manifest_URL(package_hash, package_hash_value): return myURL, package_filename # automatically download contents from the URL -def download_from_URL(theURL, package, num, db, num_keys): +def download_from_URL(theURL, package, is_spec=True): + + # makes filename + # Example: + # This -> "compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3.spec.manifest.json" + # is turned into this -> "compiler-wrapper__compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3.spec.manifest.json" + package_name = package.replace('/', '__') + + # if is_spec is true, meaning the file ends with ".spec.manifest.json", + # then the file is saved in SPEC_CACHE_DIR + # if the file ends with .tar.gz + # then the file is saved in BINARY_CACHE_DIR + cache_dir = SPEC_CACHE_DIR if is_spec else BINARY_CACHE_DIR + + # full file path then is: + # "cache/spec_manifests/compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3" + cached_path = os.path.join(cache_dir, package_name) - # splits package_name by '/' character - package_name = package.split('/')[0] + #if cache exists, it does not need to be redownloaded + if os.path.exists(cached_path): + print(f"Using cached file: {cached_path}") + + # rb is read binary + with open(cached_path, "rb") as f: + return f.read() try: response = requests.get(theURL, verify=False) @@ -95,6 +131,11 @@ def download_from_URL(theURL, package, num, db, num_keys): # response.status_code of 404 means does not exist if response.status_code == 200: + # saves to cache if request is successful + # wb is write binary + with open(cached_path, "wb") as f: + f.write(response.content) + return response.content else: @@ -110,7 +151,7 @@ def download_from_URL(theURL, package, num, db, num_keys): # return None to stop process due to download failing return None - +# remove unnecessary lines in file def remove_lines_spec_manifest(myfile): # removes unnecessary bytes @@ -121,7 +162,7 @@ def remove_lines_spec_manifest(myfile): return database - +# returns checksum, sha256 hash used to download the binary tarball def access_spec_manifest_media_type(db): try: @@ -137,6 +178,7 @@ def access_spec_manifest_media_type(db): except Exception as e: print(f"Error occured in access_spec_manifest_media_type: {e}") +# uses checksum returned to generate URL for binary tarball download def make_binary_package_URL(package_zip_hash): # example URL for compiler-wrapper binary package: # https://binaries.spack.io/develop/blobs/sha256/f4/f4d1969c7a82c76b962ae969c91d7b54cc11e0ce9f1ec9277789990f58aab351 @@ -213,26 +255,19 @@ def remove_placeholder_directories(i, name, package): print(f"Error in remove_placeholder_directories: {e}") -def print_files(package_number, database, num_keys): - - # installs here are all the packages in the input database - # package_name is the package_hash of the package - installs, package_name = get_package(package_number, 
database) - - # package_value is the value of the key, the key is the package_name - package_value = get_package_value(installs, package_name) +def print_files(package_hash, package_value): # returns the URL for the spec manifest file and the package_filename - theURL, package_filename = make_spec_manifest_URL(package_name, package_value) + theURL, package_filename = make_spec_manifest_URL(package_hash, package_value) # download the spec manifest json for the package of interest #temp = download_from_URL(theURL, package_filename) - temp = download_from_URL(theURL, package_filename, package_number, database, num_keys) + temp = download_from_URL(theURL, package_filename, is_spec = True) # return if URL does not exist if temp is None: - print(f"Skipping package {package_filename} due to download failure. \n") + print(f"Skipping package {package_filename}\n") # exit print_files function and goes back to run_program function return @@ -253,41 +288,68 @@ def print_files(package_number, database, num_keys): binary_package_URL = make_binary_package_URL(package_zip_hash) # download the binary package file from the generated URL - tempbinary = download_from_URL(binary_package_URL, package_filename, package_number, database, num_keys) + tempbinary = download_from_URL(binary_package_URL, package_filename, is_spec = False) # read the binary package read_binary_package(tempbinary, package_filename) # program dispatcher -def run_program(package_number, database, num_keys): - - if package_number < num_keys: +def run_program(package_hash, database): + installs = database['database']['installs'] - print_files(package_number, database, num_keys) + # gets installs key value, aka name of package, version, etc. + package_value = installs[package_hash] + print_files(package_hash, package_value) def main(): - file_name = "myMedjson.json" + # file_name = "myMedjson.json" # file_name = "myjson.json" # file_name = 'Med_w_compilerwrapper_packages_at_end.json' - # file_name = "e2a6969c742c8ee33deba2d210ce2243cd3941c6553a3ffc53780ac6463537a9" + file_name = "e2a6969c742c8ee33deba2d210ce2243cd3941c6553a3ffc53780ac6463537a9" - package_number = 0 - # package_number2 = 2000 - # package_number3 = 20 - # package_number4 = 35 - # package_number5 = 127000 database = readmyfile(file_name) - num_keys = len(database['database']['installs']) + # lists install keys + install_keys = list(database['database']['installs']) + num_keys = len(install_keys) - for package_number in range(num_keys): - - print(f"package {package_number+1} out of {num_keys} packages\n") + # load last processed package hash from checkpoint from cache if it exists + last_processed = load_checkpoint() + skip = True if last_processed else False - run_program(package_number, database, num_keys) - - print("\nComplete") + try: + for i, package_hash in enumerate(install_keys): + + # Skip previously completed packages until the last processed one + if skip: + print(f"Skipping {package_hash}") + if package_hash == last_processed: + # start processing after the last one + skip = False + continue + + print(f"πŸ“¦ package {i + 1} out of {num_keys} packages\n") + + run_program(package_hash, database) + + save_checkpoint(package_hash) + + except KeyboardInterrupt: + print("\nπŸ›‘ Interrupted by user. 
Progress saved.") + + if last_processed: + installs = database['database']['installs'] + if last_processed in installs: + package_value = installs[last_processed] + package_name = package_value['spec']['name'] + package_version = package_value['spec']['version'] + print(f"Last checkpoint was: {package_name}-{package_version}, {last_processed}") + else: + print(f"Last checkpoint was: unknown, {last_processed}") + print(f"file may have changed since the last run") + finally: + print("\n🎊 Complete (or safely stopped). Script will resume where it left off.") if __name__ == "__main__": main() @@ -298,7 +360,8 @@ def main(): ## Steven and Monwen - scrapers, whenever there is a network request, save a copy of the file locally (currently doing this from inputting the file) but only use it ## when the program gets restarted - (if the file didn't exist - then we could have a directory where we have a copy of the stuff we downloaded ) # can I also add the time it takes to run? -# can I have a calculation for how long it would take if I wanted to run it for 130464 files? - we're fetching the spec manifest files for each of these packages - 130thousean network requests - downloading the tarball is slow +# can I have a calculation for how long it would take if I wanted to run it for 130464 files? +### - we're fetching the spec manifest files for each of these packages - 130thousean network requests - downloading the tarball is slow ## dont save the entire tarball - in the cache(local folder ) save the list of files - the tarball may take up a lot of space (may not want to save it) # look into timescaling pyramid - memory hierarchy (the peak is faster, bottom level is slowest - downloading stuff from the internet is very slow) # create a branch - commit files to branch on dapper From c362a96fba88e5149e414dcfa39085881675f99b Mon Sep 17 00:00:00 2001 From: lfquintaz <158081978+lfquintaz@users.noreply.github.com> Date: Sun, 10 Aug 2025 18:10:42 -0700 Subject: [PATCH 04/20] added files to capture failed manifest/tarball download --- dataset-generation/spack_db/spack_db.py | 457 ++++++++++++++++++------ 1 file changed, 351 insertions(+), 106 deletions(-) diff --git a/dataset-generation/spack_db/spack_db.py b/dataset-generation/spack_db/spack_db.py index a8e7ee2..961c657 100644 --- a/dataset-generation/spack_db/spack_db.py +++ b/dataset-generation/spack_db/spack_db.py @@ -4,11 +4,6 @@ # requests used to download URL automatically import requests -# COME BACK TO THIS LATER -# certifi and urllib3 used for certificate verfication -#import certifi -#import urllib3 - # io for reading and writing streams such as text, binary, and raw data import io @@ -22,35 +17,93 @@ # os to create a cache folder - disk-based caching import os +# tempfile and shutil for index temp file +import tempfile +import shutil + # Configuration -SPEC_CACHE_DIR = "cache/spec_manifests" -BINARY_CACHE_DIR = "cache/binary_packages" + +# spack.index.db.json maps each package back to the packagename, version, SHA256hash, + # path to manifest json file, path to tarinfo file list +INDEX_FILE = "cache/DEMO_spack.index.db.json" + +# MANIFEST_DIR is the source of metadata per package used in master index +MANIFEST_DIR = "cache/DEMO_manifest" + +# TARINFO_DIR contains the extracted list of files from each binary tarball +# this will be used in making the SQLite database without having to reprocess the tarballs again +TARINFO_DIR = "cache/DEMO_tarinfo" + +# SPEC_CACHE_DIR is meant to be a temporary cache of raw spec manifest files downloaded from the 
internet +# a clean copy is meant to be placed in MANIFEST_DIR +# SPEC_CACHE_DIR avoids redownloading if the script is restarted. +SPEC_CACHE_DIR = "cache/DEMO_spec_manifests" + +# BINARY_CACHE_DIR contains the downloaded tarballs temporarily +# the file is deleted after processing. +BINARY_CACHE_DIR = "cache/DEMO_binary_packages" +TIMEOUT_LOG_FILE = "cache/DEMO_timeouts.txt" # checkpoint to safely stop the script at any time -CHECKPOINT_FILE = "progress.txt" +# progress.txt saves the spec_manifest hash +CHECKPOINT_FILE = "DEMO_progress.txt" + +# file to track all the manifest files that were unable to download +SKIPPED_MANIFESTS_FILE = "cache/DEMO_skipped_manifests.txt" +MISSING_TARBALL_HASH_FILE = "cache/DEMO_missing_tarballs.txt" +SHARED_TARBALL_HASH_FILE = "cache/DEMO_shared_tarballs.txt" +FAILED_TARBALL_DOWNLOAD_FILE = "cache/DEMO_failed_tarball_downloads.txt" # create cache directories for faster download +# FIX ME: Remove SPEC_CACHE_DIR +os.makedirs(MANIFEST_DIR, exist_ok=True) +os.makedirs(TARINFO_DIR, exist_ok = True) os.makedirs(SPEC_CACHE_DIR, exist_ok = True) os.makedirs(BINARY_CACHE_DIR, exist_ok = True) +# look for index if it exists to add info to +def load_index(): + # if the index_file exists, read it + if os.path.exists(INDEX_FILE): + with open(INDEX_FILE, "r") as f: + # return as a dictionary for easy manipulation and JSON formatting + return json.load(f) + # if the index does not exist, an empty dictionary is returned + return {} + +# save index +def save_index(index): + + # create a backup of the previous index + if os.path.exists(INDEX_FILE): + shutil.copy(INDEX_FILE, INDEX_FILE + ".bak") + + # Save to a temp file, then move to replace + temp_dir = os.path.dirname(INDEX_FILE) + with tempfile.NamedTemporaryFile("w", dir=temp_dir, delete=False) as tmp: + json.dump(index, tmp, indent=2) + temp_name = tmp.name + + shutil.move(temp_name, INDEX_FILE) + +# format for entried added to index +def update_index_entry(index, package_hash, package_value, package_zip_hash): + name = package_value['spec']['name'] + version = package_value['spec']['version'] + manifest_filename = f"{name}-{version}-{package_hash}.json" + tarinfo_filename = f"{package_zip_hash}.json" + + index[package_hash] = { + "name": name, + "version": version, + "sha256": package_zip_hash, + "manifest_path": os.path.join(MANIFEST_DIR, manifest_filename), + "tarinfo_path": os.path.join(TARINFO_DIR, tarinfo_filename) + } -# reading file -def readmyfile(myfile): - try: - with open(myfile, 'r') as file: - # database is the spack database json, within the spack build cache - db = json.load(file) # 8.6 seconds to read in large json file - - # returns database - return db - - except FileNotFoundError: - print(f"Error: The file '{myfile}' not found.") - except Exception as e: - print(f"Error occured in readmyfile: {e}") # load that last saved package hash -def load_checkpoint(): +def load_checkpoint(): # # checks if progress.txt exists if os.path.exists(CHECKPOINT_FILE): with open(CHECKPOINT_FILE, "r") as f: @@ -64,16 +117,32 @@ def load_checkpoint(): return None -# saves the last processed package_hash to progress.txt +# saves the last processed manifest package_hash to progress.txt # if the program is interrupted, the saved package_hash will be # the starting point when rerun -def save_checkpoint(package_hash): +def save_checkpoint(package_hash): # with open(CHECKPOINT_FILE, "w") as f: f.write(package_hash) +# reading file +def readmyfile(myfile): + try: + with open(myfile, 'r') as file: + # database is the spack 
database json, within the spack build cache + db = json.load(file) # 8.6 seconds to read in large json file + + # returns database + return db + + except FileNotFoundError: + print(f"Error: The file '{myfile}' not found.") + except Exception as e: + print(f"Error occured in readmyfile: {e}") + + # make the spec manifest downloadable URL -def make_spec_manifest_URL(package_hash, package_hash_value): +def make_spec_manifest_URL(package_hash, package_hash_value): # goal is to make a URL that looks like this -> # https://binaries.spack.io/develop/v3/manifests/spec//--.spec.manifest.json # example URL for compiler-wrapper: @@ -98,8 +167,9 @@ def make_spec_manifest_URL(package_hash, package_hash_value): return myURL, package_filename # automatically download contents from the URL -def download_from_URL(theURL, package, is_spec=True): +def download_from_URL(theURL, package, is_spec=True): + # makes filename # Example: # This -> "compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3.spec.manifest.json" @@ -107,14 +177,19 @@ def download_from_URL(theURL, package, is_spec=True): package_name = package.replace('/', '__') # if is_spec is true, meaning the file ends with ".spec.manifest.json", - # then the file is saved in SPEC_CACHE_DIR + # then the file is not saved, but the reponse is returned to remove_lines_spec_manifest() for further manipulation # if the file ends with .tar.gz # then the file is saved in BINARY_CACHE_DIR cache_dir = SPEC_CACHE_DIR if is_spec else BINARY_CACHE_DIR + print(f"location to be saved in {cache_dir} for {package_name}") + + # full file path then is: # "cache/spec_manifests/compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3" + #cache/DEMO_manifest\\compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3.json cached_path = os.path.join(cache_dir, package_name) + print(f"this is the cached_path {cached_path}") #if cache exists, it does not need to be redownloaded if os.path.exists(cached_path): @@ -125,18 +200,23 @@ def download_from_URL(theURL, package, is_spec=True): return f.read() try: - response = requests.get(theURL, verify=False) - + print("in try block for download_from_URL") + # adding timeout for 60 seconds + response = requests.get(theURL, timeout=60, verify=False) + print(f"trying download for {cached_path} ") # a response status code is 200 then the request was successful # response.status_code of 404 means does not exist if response.status_code == 200: - + print(f"download successful for {cached_path}") # saves to cache if request is successful # wb is write binary - with open(cached_path, "wb") as f: - f.write(response.content) + if is_spec == False: + with open(cached_path, "wb") as f: + f.write(response.content) - return response.content + return response.content + else: + return response.content else: # if URL does not exist, skip and move to next package @@ -145,6 +225,13 @@ def download_from_URL(theURL, package, is_spec=True): # return None to stop process due to download failing - goes back to run_program function return None + except requests.exceptions.Timeout: + print(f"⏰ Timeout: Skipping package that took too long to download: {package_name}") + # Append to file immediately + with open(TIMEOUT_LOG_FILE, "a") as f: + f.write(f"{package_name}\t{theURL}\n") + return None + except Exception as e: print(f"download_from_URL package {package_name}, error: {e}") @@ -152,18 +239,18 @@ def download_from_URL(theURL, package, is_spec=True): return None # remove unnecessary lines in file -def 
remove_lines_spec_manifest(myfile): +def remove_lines_spec_manifest(myfile): # removes unnecessary bytes removed_ends = myfile[49:-834] # converts bytes to dictionary database = json.loads(removed_ends) - + return database # returns checksum, sha256 hash used to download the binary tarball -def access_spec_manifest_media_type(db): +def access_spec_manifest_media_type(db): try: # get the value for the key 'data' from the db @@ -177,9 +264,10 @@ def access_spec_manifest_media_type(db): except Exception as e: print(f"Error occured in access_spec_manifest_media_type: {e}") + return None # uses checksum returned to generate URL for binary tarball download -def make_binary_package_URL(package_zip_hash): +def make_binary_package_URL(package_zip_hash): # example URL for compiler-wrapper binary package: # https://binaries.spack.io/develop/blobs/sha256/f4/f4d1969c7a82c76b962ae969c91d7b54cc11e0ce9f1ec9277789990f58aab351 @@ -193,9 +281,11 @@ def make_binary_package_URL(package_zip_hash): # using python tarfile module io module to list all the files in the downloaded tarball # myfile is the tar_file response.content and the package is the hash we will split by -def read_binary_package(myfile, package): - +# package is the package name and the version +def read_binary_package(myfile, package, package_zip_hash): + file_list = [] try: + # with io.BytesIO(myfile) as tar_buffer: with tarfile.open(fileobj = tar_buffer, mode="r") as tar: @@ -203,7 +293,6 @@ def read_binary_package(myfile, package): i = 1 for member in tar.getmembers(): if member.isfile(): - #breakpoint() #print(f"{i}: {member.name}") #i += 1 @@ -214,17 +303,36 @@ def read_binary_package(myfile, package): # __spack_path_placeh/morepadding/linux-x86_64_v3/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3/ # .spack/install_environment.json" # package for compiler-wrapper is "compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3" - remove_placeholder_directories(i, member.name, package) - # instead of just printing, we'll add this to a list (also save a copy of this list to the cache directory) - if we have to restart the program, then trying to get tarball from the list, we can skip the the + clean_path = remove_placeholder_directories(i, member.name, package) + # ??? 
instead of just printing, we'll add this to a list (also save a copy of this list to the cache directory) - if we have to restart the program, then trying to get tarball from the list, we can skip the the # # remove_placeholder function because we will already have a copy of the list of the files in that package # # to make it easier to add to sqlite database + + # this will add the files that are in the package to a clean_list + if clean_path: + file_list.append(clean_path) i += 1 except tarfile.ReadError as e: print(f"Error reading tar file: {e}") + return + + name = package.split('/')[0] + version = package.split('/')[1].split('-')[1] + + # saves file names to the tarinfo file + tarinfo_path = os.path.join(TARINFO_DIR, f"{package_zip_hash}.json") + with open(tarinfo_path, "w") as f: + json.dump(file_list, f, indent=2) + + # removes tarball once processed + tarball_path = os.path.join(BINARY_CACHE_DIR, package.replace('/','__')) + if os.path.exists(tarball_path): + os.remove(tarball_path) + del myfile # removing the placeholder directories in the file path -def remove_placeholder_directories(i, name, package): +def remove_placeholder_directories(i, name, package): # i is the counter for file enumeration # name for compiler-wrapper is "home/software/spack/__spack_path_placeholder__/__spack_path_placeholder__/ # __spack_path_placeholder__/__spack_path_placeholder__/__spack_path_placeholder__/__spack_path_placeholder__/ @@ -246,129 +354,266 @@ def remove_placeholder_directories(i, name, package): try: if placeholder not in split_list[0]: print("split_list", split_list) + + # returns file name without the placeholder path elif len(split_list) > 1: updatedname = split_list[1][1:] print(f"file {i}: ", updatedname) + return updatedname # return to add to list of files for respective package except Exception as e: print(f"Error in remove_placeholder_directories: {e}") + return None -def print_files(package_hash, package_value): +def print_files(package_hash, package_value, index, existing_tarinfo_files, seen_tarball_hashes): + name = package_value['spec']['name'] + version = package_value['spec']['version'] - # returns the URL for the spec manifest file and the package_filename - theURL, package_filename = make_spec_manifest_URL(package_hash, package_value) + ############ updated 8/10 ############ + manifest_filename = f"{name}-{version}-{package_hash}.json" + manifest_path = os.path.join(MANIFEST_DIR, manifest_filename) + + # use existing cleaned manifest if available + if os.path.exists(manifest_path): + print(f"Using existing cleaned manifest: {manifest_path}") + with open(manifest_path, "r") as f: + clean_spec_manifest = json.load(f) + + # returns the URL for the spec manifest file and the package_filename + theURL, package_filename = make_spec_manifest_URL(package_hash, package_value) + + ## and indented theURL to print("βœ… Cleaned and parsed spec manifest") + else: + # download if manifest does not exist + - # download the spec manifest json for the package of interest - #temp = download_from_URL(theURL, package_filename) - temp = download_from_URL(theURL, package_filename, is_spec = True) + # returns the URL for the spec manifest file and the package_filename + theURL, package_filename = make_spec_manifest_URL(package_hash, package_value) + + # download the spec manifest json for the package of interest + temp = download_from_URL(theURL, package_filename, is_spec = True) + + + # return if URL does not exist + if temp is None: + print(f"Could not download manifest: {package_filename} - 
recording and skipping.\n") + + # recording the failed manifest filename and hash + with open(SKIPPED_MANIFESTS_FILE, "a") as f: + f.write(f"{package_hash}\n") + + # exit print_files function and goes back to run_program function + return - # return if URL does not exist - if temp is None: - print(f"Skipping package {package_filename}\n") + print("βœ… Loaded cached spec manifest") - # exit print_files function and goes back to run_program function - return + # remove unneccessary lines from downloaded spec manifest + clean_spec_manifest = remove_lines_spec_manifest(temp) + + # writes cleaned manifest information to manifest file + with open(manifest_path, "w") as f: + json.dump(clean_spec_manifest, f, indent=2) + print("βœ… Cleaned and parsed spec manifest") - # remove unneccessary lines from downloaded spec manifest - spec_manifest_database = remove_lines_spec_manifest(temp) + + + # writes cleaned manifest information to manifest file + ##manifest_file = os.path.join(MANIFEST_DIR, f"{name}-{version}-{package_hash}.json") + ##with open(manifest_file, "w") as f: + ##json.dump(clean_spec_manifest, f, indent=2) + + ###################################### - # find the mediaType that contains the hash for the package install - package_zip_hash = access_spec_manifest_media_type(spec_manifest_database) + # find the mediaType that contains the hash for the package tarball install + package_zip_hash = access_spec_manifest_media_type(clean_spec_manifest) + print(f"βœ… Extracted zip hash: {package_zip_hash}") # if 'data' key was not found in access_spec_manifest_media_type, None is returned and we go back to run_program function if package_zip_hash is None: - print("mediaType not found - skipping this package.") + print(f"No Tarball hash found in manifest: {package_filename}") + + # Track taht this manifest has no downloadable binary tarball + with open(MISSING_TARBALL_HASH_FILE, "a") as f: + f.write(f"{package_hash}\n") # go back to run_program function return + + # track if the tarball hash has already been processed + if package_zip_hash in seen_tarball_hashes: + with open(SHARED_TARBALL_HASH_FILE, "a") as f: + # if this manifest points to a tarball that has already been seen, + # it will not create a new tarinfo entry + # it will have a new manifest entry + f.write(f"{package_hash}\t{package_zip_hash}\n") + else: + seen_tarball_hashes.add(package_zip_hash) + - # make the binary package URL for installing the package - binary_package_URL = make_binary_package_URL(package_zip_hash) + expected_tarinfo_hash = package_zip_hash - # download the binary package file from the generated URL - tempbinary = download_from_URL(binary_package_URL, package_filename, is_spec = False) + if expected_tarinfo_hash in existing_tarinfo_files: + print(f"βœ… Already have tarinfo for {name}-{version}-{expected_tarinfo_hash}, skipping binary download.") + update_index_entry(index, package_hash, package_value, package_zip_hash) + save_index(index) + print(f"βœ… Saved Index") + return + + else: + + # make the binary package URL for installing the package + binary_package_URL = make_binary_package_URL(package_zip_hash) + print(f"πŸ”— Downloading binary: {binary_package_URL}") + + # download the binary package file from the generated URL + tempbinary = download_from_URL(binary_package_URL, package_filename, is_spec = False) + print("βœ… Binary package downloaded") + + if tempbinary is None: + + # Track failed tarball download + with open(FAILED_TARBALL_DOWNLOAD_FILE, "a") as f: + f.write(f"{package_filename}: manifest hash: 
{package_hash}, tarball hash: {package_zip_hash}\n") + + return - # read the binary package - read_binary_package(tempbinary, package_filename) + # read the binary package + read_binary_package(tempbinary, package_filename, package_zip_hash) + print("βœ… Finished reading binary package") + update_index_entry(index, package_hash, package_value, package_zip_hash) + save_index(index) + print(f"Updated Index with {package_filename}-{package_zip_hash}") + save_checkpoint(package_hash) + print(f"βœ… Saved Index") # program dispatcher -def run_program(package_hash, database): +def run_program(package_hash, database, index, existing_tarinfo_files, seen_tarball_hashes): installs = database['database']['installs'] # gets installs key value, aka name of package, version, etc. package_value = installs[package_hash] - print_files(package_hash, package_value) + print_files(package_hash, package_value, index, existing_tarinfo_files, seen_tarball_hashes) def main(): - # file_name = "myMedjson.json" + #file_name = "myMedjson_DEMO.json" # file_name = "myjson.json" # file_name = 'Med_w_compilerwrapper_packages_at_end.json' file_name = "e2a6969c742c8ee33deba2d210ce2243cd3941c6553a3ffc53780ac6463537a9" database = readmyfile(file_name) + # load list of previously skipped manifest hashes + if os.path.exists(SKIPPED_MANIFESTS_FILE): + with open(SKIPPED_MANIFESTS_FILE, "r") as f: + skipped_hashes = set(line.strip() for line in f) + + else: + skipped_hashes = set() + + # load manifests that are missing binary tarball hashes (e.g. mediaType not found) + if os.path.exists(MISSING_TARBALL_HASH_FILE): + with open(MISSING_TARBALL_HASH_FILE, "r") as f: + missing_tarball_hashes = set(line.strip() for line in f) + + else: + missing_tarball_hashes = set() + + # load tarballs that were not downloadable + if os.path.exists(FAILED_TARBALL_DOWNLOAD_FILE): + with open(FAILED_TARBALL_DOWNLOAD_FILE, "r") as f: + failed_tarball_hashes = set( + line.strip().split("tarball hash: ")[-1] for line in f if "tarball hash:" in line + ) + else: + failed_tarball_hashes = set() + + # lists install keys install_keys = list(database['database']['installs']) num_keys = len(install_keys) # load last processed package hash from checkpoint from cache if it exists last_processed = load_checkpoint() + index = load_index() skip = True if last_processed else False + existing_tarinfo_files = set(os.listdir(TARINFO_DIR)) + existing_tarinfo_hashes = { + fname.rsplit("-", 1)[-1].replace(".json", "") for fname in existing_tarinfo_files + } + existing_tarinfo_files = existing_tarinfo_hashes + + + + # track already-processed tarball hashes to find shared ones + seen_tarball_hashes = set() + + SAVE_INTERVAL = 50 + + print("Starting...Will skip packages already fully processed.") + try: for i, package_hash in enumerate(install_keys): - # Skip previously completed packages until the last processed one - if skip: - print(f"Skipping {package_hash}") - if package_hash == last_processed: - # start processing after the last one - skip = False - continue + # skip if package_hash is in the skipped manifests file + if package_hash in skipped_hashes: + continue + + # skip if manifest had no usable tarball + if package_hash in missing_tarball_hashes: + continue + + if package_hash in index: + entry = index[package_hash] + manifest_path = entry.get("manifest_path", "") + tarinfo_path = entry.get("tarinfo_path", "") + tarball_hash = entry.get("sha256", "") + + manifest_exists = manifest_path and os.path.exists(manifest_path) + tarinfo_exists = tarinfo_path and 
os.path.exists(tarinfo_path) + + if manifest_exists and tarinfo_exists: + print(f"Skipping fully processed package: {package_hash}") + continue + + # if tarball previously failed, skip retrying it + if tarball_hash in failed_tarball_hashes: + print(f"🚫 Skipping manifest with previously failed tarball download: {package_hash}") + continue + print(f"πŸ“¦ package {i + 1} out of {num_keys} packages\n") - run_program(package_hash, database) + run_program(package_hash, database, index, existing_tarinfo_files, seen_tarball_hashes) - save_checkpoint(package_hash) + # Save checkpoint and index every N packages + if (i + 1) % SAVE_INTERVAL == 0: + save_checkpoint(package_hash) + save_index(index) + print(f"Saved checkpoint and index at package {i + 1}") except KeyboardInterrupt: - print("\nπŸ›‘ Interrupted by user. Progress saved.") + save_checkpoint(package_hash) + save_index(index) + print("\nπŸ›‘ Interrupted. Progress saved.") if last_processed: - installs = database['database']['installs'] - if last_processed in installs: - package_value = installs[last_processed] - package_name = package_value['spec']['name'] - package_version = package_value['spec']['version'] - print(f"Last checkpoint was: {package_name}-{package_version}, {last_processed}") + val = database['database']['installs'].get(last_processed) + if val: + name = val['spec']['name'] + version = val['spec']['version'] + + print(f"Last checkpoint was: {name}-{version}, {last_processed}") else: - print(f"Last checkpoint was: unknown, {last_processed}") + print(f"Checkpoint not found in current file: {last_processed}") print(f"file may have changed since the last run") finally: + save_checkpoint(package_hash) + save_index(index) print("\n🎊 Complete (or safely stopped). Script will resume where it left off.") if __name__ == "__main__": - main() - -### WHAT I NEED TO DO ### -# interested in adding a function that gives us how much cpu is being used for the program -## ways to make it faster - running it, then stopping it and then later proceeding from where it left off - ## Steven and Monwen - scrapers, whenever there is a network request, save a copy of the file locally (currently doing this from inputting the file) but only use it - ## when the program gets restarted - (if the file didn't exist - then we could have a directory where we have a copy of the stuff we downloaded ) -# can I also add the time it takes to run? -# can I have a calculation for how long it would take if I wanted to run it for 130464 files? 
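# Hedged sketch answering the note above about the 130464-package run: time a small
# sample, take the average seconds per package, and project the total (the sample
# numbers in the usage comment are made up for illustration):
def estimate_total_hours(sample_seconds, sample_count, total_packages=130464):
    per_package = sample_seconds / max(sample_count, 1)
    return total_packages * per_package / 3600.0

# estimate_total_hours(120, 50) -> roughly 87 hours at 2.4 s per package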
-### - we're fetching the spec manifest files for each of these packages - 130thousean network requests - downloading the tarball is slow -## dont save the entire tarball - in the cache(local folder ) save the list of files - the tarball may take up a lot of space (may not want to save it) -# look into timescaling pyramid - memory hierarchy (the peak is faster, bottom level is slowest - downloading stuff from the internet is very slow) -# create a branch - commit files to branch on dapper -## FROM RYAN ## -# next step is play with the Python sqlite module and try to figure out how to -# 1) create a new sqlite db -# 2) add a new entry to it (the PyPI db scraping script that Steven made could be helpful for seeing how to work with sqlite) - -#meow - + main() \ No newline at end of file From 059f205d5c4d916725c2202025a4b1379f78b258 Mon Sep 17 00:00:00 2001 From: lfquintaz <158081978+lfquintaz@users.noreply.github.com> Date: Sun, 10 Aug 2025 18:31:50 -0700 Subject: [PATCH 05/20] added atomic save functions for manifest and tarballs --- dataset-generation/spack_db/spack_db.py | 36 ++++++++++++++++++++----- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/dataset-generation/spack_db/spack_db.py b/dataset-generation/spack_db/spack_db.py index 961c657..dbe1212 100644 --- a/dataset-generation/spack_db/spack_db.py +++ b/dataset-generation/spack_db/spack_db.py @@ -279,6 +279,25 @@ def make_binary_package_URL(package_zip_hash): return myURL +############ updated 8/10 ############ +# ensure tarinfo completeness +def write_tarinfo_safely(tarinfo_path, file_list): + temp_dir = os.path.dirname(tarinfo_path) + with tempfile.NamedTemporaryFile("w", dir=temp_dir, delete=False) as tmp: + json.dump(file_list, tmp, indent=2) + temp_name = tmp.name + shutil.move(temp_name, tarinfo_path) + + +# ensure manifest completeness +def write_manifest_safely(manifest_path, manifest_data): + temp_dir = os.path.dirname(manifest_path) + with tempfile.NamedTemporaryFile("w", dir=temp_dir, delete=False) as tmp: + json.dump(manifest_data, tmp, indent=2) + temp_name = tmp.name + shutil.move(temp_name, manifest_path) +##################################### + # using python tarfile module io module to list all the files in the downloaded tarball # myfile is the tar_file response.content and the package is the hash we will split by # package is the package name and the version @@ -320,10 +339,14 @@ def read_binary_package(myfile, package, package_zip_hash): name = package.split('/')[0] version = package.split('/')[1].split('-')[1] + ############ updated 8/10 ############ # saves file names to the tarinfo file + ##tarinfo_path = os.path.join(TARINFO_DIR, f"{package_zip_hash}.json") + ##with open(tarinfo_path, "w") as f: + ##json.dump(file_list, f, indent=2) tarinfo_path = os.path.join(TARINFO_DIR, f"{package_zip_hash}.json") - with open(tarinfo_path, "w") as f: - json.dump(file_list, f, indent=2) + write_tarinfo_safely(tarinfo_path, file_list) + ##################################### # removes tarball once processed tarball_path = os.path.join(BINARY_CACHE_DIR, package.replace('/','__')) @@ -388,8 +411,6 @@ def print_files(package_hash, package_value, index, existing_tarinfo_files, seen else: # download if manifest does not exist - - # returns the URL for the spec manifest file and the package_filename theURL, package_filename = make_spec_manifest_URL(package_hash, package_value) @@ -414,9 +435,10 @@ def print_files(package_hash, package_value, index, existing_tarinfo_files, seen clean_spec_manifest = 
remove_lines_spec_manifest(temp) # writes cleaned manifest information to manifest file - with open(manifest_path, "w") as f: - json.dump(clean_spec_manifest, f, indent=2) - print("βœ… Cleaned and parsed spec manifest") + ##with open(manifest_path, "w") as f: + ##json.dump(clean_spec_manifest, f, indent=2) + write_manifest_safely(manifest_path, clean_spec_manifest) + print("βœ… Manifest safely written: {manifest_path}") From b5e46e0487c6e2baf3fe535a2aa82ad10733dfd4 Mon Sep 17 00:00:00 2001 From: lfquintaz <158081978+lfquintaz@users.noreply.github.com> Date: Mon, 11 Aug 2025 10:02:05 -0700 Subject: [PATCH 06/20] Add files via upload From 601f74ac0f691fac68374a5722fe4a588a7ccd13 Mon Sep 17 00:00:00 2001 From: leslie q Date: Fri, 15 Aug 2025 11:03:29 -0700 Subject: [PATCH 07/20] removed print statement at restart that shows all the fully processed tarball hashes --- dataset-generation/spack_db/spack_db.py | 481 ++++++++++++++++++------ 1 file changed, 374 insertions(+), 107 deletions(-) diff --git a/dataset-generation/spack_db/spack_db.py b/dataset-generation/spack_db/spack_db.py index a8e7ee2..ba69d3e 100644 --- a/dataset-generation/spack_db/spack_db.py +++ b/dataset-generation/spack_db/spack_db.py @@ -4,11 +4,6 @@ # requests used to download URL automatically import requests -# COME BACK TO THIS LATER -# certifi and urllib3 used for certificate verfication -#import certifi -#import urllib3 - # io for reading and writing streams such as text, binary, and raw data import io @@ -22,35 +17,93 @@ # os to create a cache folder - disk-based caching import os +# tempfile and shutil for index temp file +import tempfile +import shutil + # Configuration -SPEC_CACHE_DIR = "cache/spec_manifests" -BINARY_CACHE_DIR = "cache/binary_packages" + +# spack.index.db.json maps each package back to the packagename, version, SHA256hash, + # path to manifest json file, path to tarinfo file list +INDEX_FILE = "cache/spack.index.db.json" + +# MANIFEST_DIR is the source of metadata per package used in master index +MANIFEST_DIR = "cache/manifest" + +# TARINFO_DIR contains the extracted list of files from each binary tarball +# this will be used in making the SQLite database without having to reprocess the tarballs again +TARINFO_DIR = "cache/tarinfo" + +# SPEC_CACHE_DIR is meant to be a temporary cache of raw spec manifest files downloaded from the internet +# a clean copy is meant to be placed in MANIFEST_DIR +# SPEC_CACHE_DIR avoids redownloading if the script is restarted. +SPEC_CACHE_DIR = "cache/spec_manifests" + +# BINARY_CACHE_DIR contains the downloaded tarballs temporarily +# the file is deleted after processing. 
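# For reference, a hedged sketch of the on-disk layout this configuration produces
# (paths as defined in this block; the tracking-file names follow below):
#
#   cache/
#     spack.index.db.json   package_hash -> name, version, sha256, manifest/tarinfo paths
#     manifest/             cleaned spec manifests, <name>-<version>-<hash>.json
#     tarinfo/              per-tarball file lists, <sha256>.json
#     spec_manifests/       raw spec manifest cache (slated for removal, see FIX ME)
#     binary_packages/      tarballs kept only while being processed
#     timeouts.txt, skipped_manifests.txt, missing_tarballs.txt,
#     shared_tarballs.txt, failed_tarball_downloads.txt
#   progress.txt            checkpoint: last processed package hash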
+BINARY_CACHE_DIR = "cache/binary_packages" +TIMEOUT_LOG_FILE = "cache/timeouts.txt" # checkpoint to safely stop the script at any time -CHECKPOINT_FILE = "progress.txt" +# progress.txt saves the spec_manifest hash +CHECKPOINT_FILE = "progress.txt" + +# file to track all the manifest files that were unable to download +SKIPPED_MANIFESTS_FILE = "cache/skipped_manifests.txt" +MISSING_TARBALL_HASH_FILE = "cache/missing_tarballs.txt" +SHARED_TARBALL_HASH_FILE = "cache/shared_tarballs.txt" +FAILED_TARBALL_DOWNLOAD_FILE = "cache/failed_tarball_downloads.txt" # create cache directories for faster download +# FIX ME: Remove SPEC_CACHE_DIR +os.makedirs(MANIFEST_DIR, exist_ok=True) +os.makedirs(TARINFO_DIR, exist_ok = True) os.makedirs(SPEC_CACHE_DIR, exist_ok = True) os.makedirs(BINARY_CACHE_DIR, exist_ok = True) +# look for index if it exists to add info to +def load_index(): + # if the index_file exists, read it + if os.path.exists(INDEX_FILE): + with open(INDEX_FILE, "r") as f: + # return as a dictionary for easy manipulation and JSON formatting + return json.load(f) + # if the index does not exist, an empty dictionary is returned + return {} + +# save index +def save_index(index): + + # create a backup of the previous index + if os.path.exists(INDEX_FILE): + shutil.copy(INDEX_FILE, INDEX_FILE + ".bak") + + # Save to a temp file, then move to replace + temp_dir = os.path.dirname(INDEX_FILE) + with tempfile.NamedTemporaryFile("w", dir=temp_dir, delete=False) as tmp: + json.dump(index, tmp, indent=2) + temp_name = tmp.name + + shutil.move(temp_name, INDEX_FILE) + +# format for entried added to index +def update_index_entry(index, package_hash, package_value, package_zip_hash): + name = package_value['spec']['name'] + version = package_value['spec']['version'] + manifest_filename = f"{name}-{version}-{package_hash}.json" + tarinfo_filename = f"{package_zip_hash}.json" + + index[package_hash] = { + "name": name, + "version": version, + "sha256": package_zip_hash, + "manifest_path": os.path.join(MANIFEST_DIR, manifest_filename), + "tarinfo_path": os.path.join(TARINFO_DIR, tarinfo_filename) + } -# reading file -def readmyfile(myfile): - try: - with open(myfile, 'r') as file: - # database is the spack database json, within the spack build cache - db = json.load(file) # 8.6 seconds to read in large json file - - # returns database - return db - - except FileNotFoundError: - print(f"Error: The file '{myfile}' not found.") - except Exception as e: - print(f"Error occured in readmyfile: {e}") # load that last saved package hash -def load_checkpoint(): +def load_checkpoint(): # # checks if progress.txt exists if os.path.exists(CHECKPOINT_FILE): with open(CHECKPOINT_FILE, "r") as f: @@ -64,16 +117,32 @@ def load_checkpoint(): return None -# saves the last processed package_hash to progress.txt +# saves the last processed manifest package_hash to progress.txt # if the program is interrupted, the saved package_hash will be # the starting point when rerun -def save_checkpoint(package_hash): +def save_checkpoint(package_hash): # with open(CHECKPOINT_FILE, "w") as f: f.write(package_hash) +# reading file +def readmyfile(myfile): + try: + with open(myfile, 'r') as file: + # database is the spack database json, within the spack build cache + db = json.load(file) # 8.6 seconds to read in large json file + + # returns database + return db + + except FileNotFoundError: + print(f"Error: The file '{myfile}' not found.") + except Exception as e: + print(f"Error occured in readmyfile: {e}") + + # make the spec manifest 
downloadable URL -def make_spec_manifest_URL(package_hash, package_hash_value): +def make_spec_manifest_URL(package_hash, package_hash_value): # goal is to make a URL that looks like this -> # https://binaries.spack.io/develop/v3/manifests/spec//--.spec.manifest.json # example URL for compiler-wrapper: @@ -98,8 +167,9 @@ def make_spec_manifest_URL(package_hash, package_hash_value): return myURL, package_filename # automatically download contents from the URL -def download_from_URL(theURL, package, is_spec=True): +def download_from_URL(theURL, package, is_spec=True): + # makes filename # Example: # This -> "compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3.spec.manifest.json" @@ -107,14 +177,19 @@ def download_from_URL(theURL, package, is_spec=True): package_name = package.replace('/', '__') # if is_spec is true, meaning the file ends with ".spec.manifest.json", - # then the file is saved in SPEC_CACHE_DIR + # then the file is not saved, but the reponse is returned to remove_lines_spec_manifest() for further manipulation # if the file ends with .tar.gz # then the file is saved in BINARY_CACHE_DIR cache_dir = SPEC_CACHE_DIR if is_spec else BINARY_CACHE_DIR + print(f"location to be saved in {cache_dir} for {package_name}") + + # full file path then is: # "cache/spec_manifests/compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3" + #cache/manifest\\compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3.json cached_path = os.path.join(cache_dir, package_name) + print(f"this is the cached_path {cached_path}") #if cache exists, it does not need to be redownloaded if os.path.exists(cached_path): @@ -125,18 +200,23 @@ def download_from_URL(theURL, package, is_spec=True): return f.read() try: - response = requests.get(theURL, verify=False) - + print("in try block for download_from_URL") + # adding timeout for 60 seconds + response = requests.get(theURL, timeout=60, verify=False) + print(f"trying download for {cached_path} ") # a response status code is 200 then the request was successful # response.status_code of 404 means does not exist if response.status_code == 200: - + print(f"download successful for {cached_path}") # saves to cache if request is successful # wb is write binary - with open(cached_path, "wb") as f: - f.write(response.content) + if is_spec == False: + with open(cached_path, "wb") as f: + f.write(response.content) - return response.content + return response.content + else: + return response.content else: # if URL does not exist, skip and move to next package @@ -145,6 +225,13 @@ def download_from_URL(theURL, package, is_spec=True): # return None to stop process due to download failing - goes back to run_program function return None + except requests.exceptions.Timeout: + print(f"⏰ Timeout: Skipping package that took too long to download: {package_name}") + # Append to file immediately + with open(TIMEOUT_LOG_FILE, "a") as f: + f.write(f"{package_name}\t{theURL}\n") + return None + except Exception as e: print(f"download_from_URL package {package_name}, error: {e}") @@ -152,18 +239,18 @@ def download_from_URL(theURL, package, is_spec=True): return None # remove unnecessary lines in file -def remove_lines_spec_manifest(myfile): +def remove_lines_spec_manifest(myfile): # removes unnecessary bytes removed_ends = myfile[49:-834] # converts bytes to dictionary database = json.loads(removed_ends) - + return database # returns checksum, sha256 hash used to download the binary tarball -def access_spec_manifest_media_type(db): +def 
access_spec_manifest_media_type(db): try: # get the value for the key 'data' from the db @@ -177,9 +264,10 @@ def access_spec_manifest_media_type(db): except Exception as e: print(f"Error occured in access_spec_manifest_media_type: {e}") + return None # uses checksum returned to generate URL for binary tarball download -def make_binary_package_URL(package_zip_hash): +def make_binary_package_URL(package_zip_hash): # example URL for compiler-wrapper binary package: # https://binaries.spack.io/develop/blobs/sha256/f4/f4d1969c7a82c76b962ae969c91d7b54cc11e0ce9f1ec9277789990f58aab351 @@ -191,11 +279,32 @@ def make_binary_package_URL(package_zip_hash): return myURL +############ updated 8/10 ############ +# ensure tarinfo completeness +def write_tarinfo_safely(tarinfo_path, file_list): + temp_dir = os.path.dirname(tarinfo_path) + with tempfile.NamedTemporaryFile("w", dir=temp_dir, delete=False) as tmp: + json.dump(file_list, tmp, indent=2) + temp_name = tmp.name + shutil.move(temp_name, tarinfo_path) + + +# ensure manifest completeness +def write_manifest_safely(manifest_path, manifest_data): + temp_dir = os.path.dirname(manifest_path) + with tempfile.NamedTemporaryFile("w", dir=temp_dir, delete=False) as tmp: + json.dump(manifest_data, tmp, indent=2) + temp_name = tmp.name + shutil.move(temp_name, manifest_path) +##################################### + # using python tarfile module io module to list all the files in the downloaded tarball # myfile is the tar_file response.content and the package is the hash we will split by -def read_binary_package(myfile, package): - +# package is the package name and the version +def read_binary_package(myfile, package, package_zip_hash): + file_list = [] try: + # with io.BytesIO(myfile) as tar_buffer: with tarfile.open(fileobj = tar_buffer, mode="r") as tar: @@ -203,7 +312,6 @@ def read_binary_package(myfile, package): i = 1 for member in tar.getmembers(): if member.isfile(): - #breakpoint() #print(f"{i}: {member.name}") #i += 1 @@ -214,17 +322,40 @@ def read_binary_package(myfile, package): # __spack_path_placeh/morepadding/linux-x86_64_v3/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3/ # .spack/install_environment.json" # package for compiler-wrapper is "compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3" - remove_placeholder_directories(i, member.name, package) - # instead of just printing, we'll add this to a list (also save a copy of this list to the cache directory) - if we have to restart the program, then trying to get tarball from the list, we can skip the the + clean_path = remove_placeholder_directories(i, member.name, package) + # ??? 
instead of just printing, we'll add this to a list (also save a copy of this list to the cache directory) - if we have to restart the program, then trying to get tarball from the list, we can skip the the # # remove_placeholder function because we will already have a copy of the list of the files in that package # # to make it easier to add to sqlite database + + # this will add the files that are in the package to a clean_list + if clean_path: + file_list.append(clean_path) i += 1 except tarfile.ReadError as e: print(f"Error reading tar file: {e}") + return + + name = package.split('/')[0] + version = package.split('/')[1].split('-')[1] + + ############ updated 8/10 ############ + # saves file names to the tarinfo file + ##tarinfo_path = os.path.join(TARINFO_DIR, f"{package_zip_hash}.json") + ##with open(tarinfo_path, "w") as f: + ##json.dump(file_list, f, indent=2) + tarinfo_path = os.path.join(TARINFO_DIR, f"{package_zip_hash}.json") + write_tarinfo_safely(tarinfo_path, file_list) + ##################################### + + # removes tarball once processed + tarball_path = os.path.join(BINARY_CACHE_DIR, package.replace('/','__')) + if os.path.exists(tarball_path): + os.remove(tarball_path) + del myfile # removing the placeholder directories in the file path -def remove_placeholder_directories(i, name, package): +def remove_placeholder_directories(i, name, package): # i is the counter for file enumeration # name for compiler-wrapper is "home/software/spack/__spack_path_placeholder__/__spack_path_placeholder__/ # __spack_path_placeholder__/__spack_path_placeholder__/__spack_path_placeholder__/__spack_path_placeholder__/ @@ -246,129 +377,265 @@ def remove_placeholder_directories(i, name, package): try: if placeholder not in split_list[0]: print("split_list", split_list) + + # returns file name without the placeholder path elif len(split_list) > 1: updatedname = split_list[1][1:] print(f"file {i}: ", updatedname) + return updatedname # return to add to list of files for respective package except Exception as e: print(f"Error in remove_placeholder_directories: {e}") + return None -def print_files(package_hash, package_value): +def print_files(package_hash, package_value, index, existing_tarinfo_files, seen_tarball_hashes): + name = package_value['spec']['name'] + version = package_value['spec']['version'] - # returns the URL for the spec manifest file and the package_filename - theURL, package_filename = make_spec_manifest_URL(package_hash, package_value) - - # download the spec manifest json for the package of interest - #temp = download_from_URL(theURL, package_filename) - temp = download_from_URL(theURL, package_filename, is_spec = True) + ############ updated 8/10 ############ + manifest_filename = f"{name}-{version}-{package_hash}.json" + manifest_path = os.path.join(MANIFEST_DIR, manifest_filename) + + # use existing cleaned manifest if available + if os.path.exists(manifest_path): + print(f"Using existing cleaned manifest: {manifest_path}") + with open(manifest_path, "r") as f: + clean_spec_manifest = json.load(f) + + # returns the URL for the spec manifest file and the package_filename + theURL, package_filename = make_spec_manifest_URL(package_hash, package_value) + ## and indented theURL to print("βœ… Cleaned and parsed spec manifest") + else: + # download if manifest does not exist - # return if URL does not exist - if temp is None: - print(f"Skipping package {package_filename}\n") + # returns the URL for the spec manifest file and the package_filename + theURL, package_filename = 
make_spec_manifest_URL(package_hash, package_value) - # exit print_files function and goes back to run_program function - return + # download the spec manifest json for the package of interest + temp = download_from_URL(theURL, package_filename, is_spec = True) + - # remove unneccessary lines from downloaded spec manifest - spec_manifest_database = remove_lines_spec_manifest(temp) + # return if URL does not exist + if temp is None: + print(f"Could not download manifest: {package_filename} - recording and skipping.\n") + + # recording the failed manifest filename and hash + with open(SKIPPED_MANIFESTS_FILE, "a") as f: + f.write(f"{package_hash}\n") + + # exit print_files function and goes back to run_program function + return + + print("βœ… Loaded cached spec manifest") + + # remove unneccessary lines from downloaded spec manifest + clean_spec_manifest = remove_lines_spec_manifest(temp) + + # writes cleaned manifest information to manifest file + ##with open(manifest_path, "w") as f: + ##json.dump(clean_spec_manifest, f, indent=2) + write_manifest_safely(manifest_path, clean_spec_manifest) + print(f"βœ… Manifest safely written: {manifest_path}") + + + + # writes cleaned manifest information to manifest file + ##manifest_file = os.path.join(MANIFEST_DIR, f"{name}-{version}-{package_hash}.json") + ##with open(manifest_file, "w") as f: + ##json.dump(clean_spec_manifest, f, indent=2) + + ###################################### - # find the mediaType that contains the hash for the package install - package_zip_hash = access_spec_manifest_media_type(spec_manifest_database) + # find the mediaType that contains the hash for the package tarball install + package_zip_hash = access_spec_manifest_media_type(clean_spec_manifest) + print(f"βœ… Extracted zip hash: {package_zip_hash}") # if 'data' key was not found in access_spec_manifest_media_type, None is returned and we go back to run_program function if package_zip_hash is None: - print("mediaType not found - skipping this package.") + print(f"No Tarball hash found in manifest: {package_filename}") + + # Track taht this manifest has no downloadable binary tarball + with open(MISSING_TARBALL_HASH_FILE, "a") as f: + f.write(f"{package_hash}\n") # go back to run_program function return + + # track if the tarball hash has already been processed + if package_zip_hash in seen_tarball_hashes: + with open(SHARED_TARBALL_HASH_FILE, "a") as f: + # if this manifest points to a tarball that has already been seen, + # it will not create a new tarinfo entry + # it will have a new manifest entry + f.write(f"{package_hash}\t{package_zip_hash}\n") + else: + seen_tarball_hashes.add(package_zip_hash) + - # make the binary package URL for installing the package - binary_package_URL = make_binary_package_URL(package_zip_hash) + expected_tarinfo_hash = package_zip_hash - # download the binary package file from the generated URL - tempbinary = download_from_URL(binary_package_URL, package_filename, is_spec = False) + if expected_tarinfo_hash in existing_tarinfo_files: + print(f"βœ… Already have tarinfo for {name}-{version}-{expected_tarinfo_hash}, skipping binary download.") + update_index_entry(index, package_hash, package_value, package_zip_hash) + save_index(index) + print(f"βœ… Saved Index") + return + + else: + + # make the binary package URL for installing the package + binary_package_URL = make_binary_package_URL(package_zip_hash) + print(f"πŸ”— Downloading binary: {binary_package_URL}") + + # download the binary package file from the generated URL + tempbinary = 
download_from_URL(binary_package_URL, package_filename, is_spec = False) + print("βœ… Binary package downloaded") + + if tempbinary is None: + + # Track failed tarball download + with open(FAILED_TARBALL_DOWNLOAD_FILE, "a") as f: + f.write(f"{package_filename}: manifest hash: {package_hash}, tarball hash: {package_zip_hash}\n") + + return - # read the binary package - read_binary_package(tempbinary, package_filename) + # read the binary package + read_binary_package(tempbinary, package_filename, package_zip_hash) + print("βœ… Finished reading binary package") + update_index_entry(index, package_hash, package_value, package_zip_hash) + save_index(index) + print(f"Updated Index with {package_filename}-{package_zip_hash}") + save_checkpoint(package_hash) + print(f"βœ… Saved Index") # program dispatcher -def run_program(package_hash, database): +def run_program(package_hash, database, index, existing_tarinfo_files, seen_tarball_hashes): installs = database['database']['installs'] # gets installs key value, aka name of package, version, etc. package_value = installs[package_hash] - print_files(package_hash, package_value) + print_files(package_hash, package_value, index, existing_tarinfo_files, seen_tarball_hashes) def main(): - # file_name = "myMedjson.json" + #file_name = "myMedjson.json" # file_name = "myjson.json" # file_name = 'Med_w_compilerwrapper_packages_at_end.json' file_name = "e2a6969c742c8ee33deba2d210ce2243cd3941c6553a3ffc53780ac6463537a9" database = readmyfile(file_name) + # load list of previously skipped manifest hashes + if os.path.exists(SKIPPED_MANIFESTS_FILE): + with open(SKIPPED_MANIFESTS_FILE, "r") as f: + skipped_hashes = set(line.strip() for line in f) + + else: + skipped_hashes = set() + + # load manifests that are missing binary tarball hashes (e.g. 
mediaType not found) + if os.path.exists(MISSING_TARBALL_HASH_FILE): + with open(MISSING_TARBALL_HASH_FILE, "r") as f: + missing_tarball_hashes = set(line.strip() for line in f) + + else: + missing_tarball_hashes = set() + + # load tarballs that were not downloadable + if os.path.exists(FAILED_TARBALL_DOWNLOAD_FILE): + with open(FAILED_TARBALL_DOWNLOAD_FILE, "r") as f: + failed_tarball_hashes = set( + line.strip().split("tarball hash: ")[-1] for line in f if "tarball hash:" in line + ) + else: + failed_tarball_hashes = set() + + # lists install keys install_keys = list(database['database']['installs']) num_keys = len(install_keys) # load last processed package hash from checkpoint from cache if it exists last_processed = load_checkpoint() + index = load_index() skip = True if last_processed else False + existing_tarinfo_files = set(os.listdir(TARINFO_DIR)) + existing_tarinfo_hashes = { + fname.rsplit("-", 1)[-1].replace(".json", "") for fname in existing_tarinfo_files + } + existing_tarinfo_files = existing_tarinfo_hashes + + + + # track already-processed tarball hashes to find shared ones + seen_tarball_hashes = set() + + SAVE_INTERVAL = 50 + + print("Starting...Will skip packages already fully processed.") + try: for i, package_hash in enumerate(install_keys): - # Skip previously completed packages until the last processed one - if skip: - print(f"Skipping {package_hash}") - if package_hash == last_processed: - # start processing after the last one - skip = False - continue + # skip if package_hash is in the skipped manifests file + if package_hash in skipped_hashes: + continue + + # skip if manifest had no usable tarball + if package_hash in missing_tarball_hashes: + continue + + if package_hash in index: + entry = index[package_hash] + manifest_path = entry.get("manifest_path", "") + tarinfo_path = entry.get("tarinfo_path", "") + tarball_hash = entry.get("sha256", "") + + manifest_exists = manifest_path and os.path.exists(manifest_path) + tarinfo_exists = tarinfo_path and os.path.exists(tarinfo_path) + + if manifest_exists and tarinfo_exists: + #print(f"Skipping fully processed package: {package_hash}") + continue + + # if tarball previously failed, skip retrying it + if tarball_hash in failed_tarball_hashes: + print(f"🚫 Skipping manifest with previously failed tarball download: {package_hash}") + continue + print(f"πŸ“¦ package {i + 1} out of {num_keys} packages\n") - run_program(package_hash, database) + run_program(package_hash, database, index, existing_tarinfo_files, seen_tarball_hashes) - save_checkpoint(package_hash) + # Save checkpoint and index every N packages + if (i + 1) % SAVE_INTERVAL == 0: + save_checkpoint(package_hash) + save_index(index) + print(f"Saved checkpoint and index at package {i + 1}") except KeyboardInterrupt: - print("\nπŸ›‘ Interrupted by user. Progress saved.") + save_checkpoint(package_hash) + save_index(index) + print("\nπŸ›‘ Interrupted. 
Progress saved.") if last_processed: - installs = database['database']['installs'] - if last_processed in installs: - package_value = installs[last_processed] - package_name = package_value['spec']['name'] - package_version = package_value['spec']['version'] - print(f"Last checkpoint was: {package_name}-{package_version}, {last_processed}") + val = database['database']['installs'].get(last_processed) + if val: + name = val['spec']['name'] + version = val['spec']['version'] + + print(f"Last checkpoint was: {name}-{version}, {last_processed}") else: - print(f"Last checkpoint was: unknown, {last_processed}") + print(f"Checkpoint not found in current file: {last_processed}") print(f"file may have changed since the last run") finally: + save_checkpoint(package_hash) + save_index(index) print("\n🎊 Complete (or safely stopped). Script will resume where it left off.") if __name__ == "__main__": - main() - -### WHAT I NEED TO DO ### -# interested in adding a function that gives us how much cpu is being used for the program -## ways to make it faster - running it, then stopping it and then later proceeding from where it left off - ## Steven and Monwen - scrapers, whenever there is a network request, save a copy of the file locally (currently doing this from inputting the file) but only use it - ## when the program gets restarted - (if the file didn't exist - then we could have a directory where we have a copy of the stuff we downloaded ) -# can I also add the time it takes to run? -# can I have a calculation for how long it would take if I wanted to run it for 130464 files? -### - we're fetching the spec manifest files for each of these packages - 130thousean network requests - downloading the tarball is slow -## dont save the entire tarball - in the cache(local folder ) save the list of files - the tarball may take up a lot of space (may not want to save it) -# look into timescaling pyramid - memory hierarchy (the peak is faster, bottom level is slowest - downloading stuff from the internet is very slow) -# create a branch - commit files to branch on dapper -## FROM RYAN ## -# next step is play with the Python sqlite module and try to figure out how to -# 1) create a new sqlite db -# 2) add a new entry to it (the PyPI db scraping script that Steven made could be helpful for seeing how to work with sqlite) - -#meow - + main() \ No newline at end of file From 670be1605116e4ca5673ebbac06443cc1eabbe54 Mon Sep 17 00:00:00 2001 From: leslie q Date: Fri, 15 Aug 2025 14:33:42 -0700 Subject: [PATCH 08/20] added Create_spack_DB.py --- .../spack_db/Create_spack_DB.py | 143 ++++++++++++++++++ dataset-generation/spack_db/spack_db.py | 46 ------ 2 files changed, 143 insertions(+), 46 deletions(-) create mode 100644 dataset-generation/spack_db/Create_spack_DB.py diff --git a/dataset-generation/spack_db/Create_spack_DB.py b/dataset-generation/spack_db/Create_spack_DB.py new file mode 100644 index 0000000..e2ccc8d --- /dev/null +++ b/dataset-generation/spack_db/Create_spack_DB.py @@ -0,0 +1,143 @@ +import os +import json +import sqlite3 +import time +from dapper_python.normalize import normalize_file_name + +# configuration +INDEX_PATH = "cache/spack.index.db.json" +SQLITE_DB_PATH = "cache/spack-v1.db" + +def build_package_filelist_db(): + # load index + if not os.path.exists(INDEX_PATH): + print("❌ Index file not found.") + return + + with open(INDEX_PATH, "r") as f: + index = json.load(f) + + # Create SQLite DB + conn = sqlite3.connect(SQLITE_DB_PATH) + cursor = conn.cursor() + + # Create table columns + 
cursor.execute(''' + CREATE TABLE IF NOT EXISTS package_files ( + id INTEGER PRIMARY KEY, + file_name TEXT, + normalized_file_name TEXT, + file_path TEXT, + package_name TEXT, + UNIQUE(file_path, package_name) + ) + ''') + + # Create indices for efficient lookups + cursor.execute('CREATE INDEX IF NOT EXISTS idx_file_name ON package_files(file_name)') + cursor.execute('CREATE INDEX IF NOT EXISTS idx_normalized_file_name ON package_files(normalized_file_name)') + + + # Create dataset_version table + cursor.execute(''' + CREATE TABLE IF NOT EXISTS dataset_version( + version INTEGER, + format TEXT, + timestamp INTEGER + ) + ''') + + # Clear the dataset_version table + cursor.execute("DELETE FROM dataset_version") + + # Create table columns + cursor.execute( + "INSERT INTO dataset_version (version, format, timestamp)" \ + "VALUES (?, ?, ?)", + (1, "Spack", int(time.time())) + ) + + inserted_packages = 0 + inserted_files = 0 + for package_hash, entry in index.items(): + try: + package_name = entry["name"] + version = entry["version"] + sha256 = entry["sha256"] + + + tarinfo_path = entry.get("tarinfo_path") + if not tarinfo_path or not os.path.exists(tarinfo_path): + print(f"⚠️ Missing tarinfo for: {package_name}-{version}-{sha256}") + continue + + with open(tarinfo_path, "r") as f: + file_list = json.load(f) + + package_inserted_or_updated = False + + for file_path in file_list: + # skipping .spack/ files + if file_path.startswith(".spack/"): + continue + + # Extract file name + file_name = os.path.basename(file_path) + + # Normalize the file name + try: + normalized = normalize_file_name(file_name) + normalized_file_name = str(normalized).lower() + except Exception as e: + print(f"⚠️ Failed to normalize '{file_name}': {e}") + normalized_file_name = file_name.lower() + + # Insert into DB + cursor.execute( + '''INSERT OR IGNORE INTO package_files + (file_name, normalized_file_name, file_path, package_name) + VALUES (?, ?, ?, ?)''', + (file_name, normalized_file_name, file_path, package_name) + ) + + if cursor.rowcount > 0: + inserted_files += 1 + package_inserted_or_updated = True # New row added + continue # No need to update - freshly inserted + #breakpoint() + # Row already exists - check if any values changed + cursor.execute( + ''' SELECT file_name, normalized_file_name FROM package_files + WHERE file_path = ? AND package_name = ?''', + (file_path, package_name) + ) + result = cursor.fetchone() + if result: + existing_file_name, existing_normalized_name = result + if (existing_file_name != file_name) or (existing_normalized_name != normalized_file_name): + # Something changed - update + + + # Update the row + cursor.execute( + ''' UPDATE package_files + SET file_name = ?, normalized_file_name = ? + WHERE file_path = ? AND package_name = ?''', + (file_name, normalized_file_name, file_path, package_name) + ) + package_inserted_or_updated = True # A row was updated + if package_inserted_or_updated: + inserted_packages += 1 + + + except Exception as e: + print(f"❌ Failed to insert {package_hash}: {e}") + continue + + conn.commit() + conn.close() + + print(f"πŸŽ‰ Done. 
Inserted {inserted_files} new files from {inserted_packages} packages into {SQLITE_DB_PATH}") + +if __name__ == "__main__": + build_package_filelist_db() diff --git a/dataset-generation/spack_db/spack_db.py b/dataset-generation/spack_db/spack_db.py index 0cccc00..ba69d3e 100644 --- a/dataset-generation/spack_db/spack_db.py +++ b/dataset-generation/spack_db/spack_db.py @@ -25,7 +25,6 @@ # spack.index.db.json maps each package back to the packagename, version, SHA256hash, # path to manifest json file, path to tarinfo file list -<<<<<<< HEAD INDEX_FILE = "cache/spack.index.db.json" # MANIFEST_DIR is the source of metadata per package used in master index @@ -34,21 +33,10 @@ # TARINFO_DIR contains the extracted list of files from each binary tarball # this will be used in making the SQLite database without having to reprocess the tarballs again TARINFO_DIR = "cache/tarinfo" -======= -INDEX_FILE = "cache/DEMO_spack.index.db.json" - -# MANIFEST_DIR is the source of metadata per package used in master index -MANIFEST_DIR = "cache/DEMO_manifest" - -# TARINFO_DIR contains the extracted list of files from each binary tarball -# this will be used in making the SQLite database without having to reprocess the tarballs again -TARINFO_DIR = "cache/DEMO_tarinfo" ->>>>>>> origin/spack_scraping_LQ # SPEC_CACHE_DIR is meant to be a temporary cache of raw spec manifest files downloaded from the internet # a clean copy is meant to be placed in MANIFEST_DIR # SPEC_CACHE_DIR avoids redownloading if the script is restarted. -<<<<<<< HEAD SPEC_CACHE_DIR = "cache/spec_manifests" # BINARY_CACHE_DIR contains the downloaded tarballs temporarily @@ -65,24 +53,6 @@ MISSING_TARBALL_HASH_FILE = "cache/missing_tarballs.txt" SHARED_TARBALL_HASH_FILE = "cache/shared_tarballs.txt" FAILED_TARBALL_DOWNLOAD_FILE = "cache/failed_tarball_downloads.txt" -======= -SPEC_CACHE_DIR = "cache/DEMO_spec_manifests" - -# BINARY_CACHE_DIR contains the downloaded tarballs temporarily -# the file is deleted after processing. 
-BINARY_CACHE_DIR = "cache/DEMO_binary_packages" -TIMEOUT_LOG_FILE = "cache/DEMO_timeouts.txt" - -# checkpoint to safely stop the script at any time -# progress.txt saves the spec_manifest hash -CHECKPOINT_FILE = "DEMO_progress.txt" - -# file to track all the manifest files that were unable to download -SKIPPED_MANIFESTS_FILE = "cache/DEMO_skipped_manifests.txt" -MISSING_TARBALL_HASH_FILE = "cache/DEMO_missing_tarballs.txt" -SHARED_TARBALL_HASH_FILE = "cache/DEMO_shared_tarballs.txt" -FAILED_TARBALL_DOWNLOAD_FILE = "cache/DEMO_failed_tarball_downloads.txt" ->>>>>>> origin/spack_scraping_LQ # create cache directories for faster download # FIX ME: Remove SPEC_CACHE_DIR @@ -217,11 +187,7 @@ def download_from_URL(theURL, package, is_spec=True): # full file path then is: # "cache/spec_manifests/compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3" -<<<<<<< HEAD #cache/manifest\\compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3.json -======= - #cache/DEMO_manifest\\compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3.json ->>>>>>> origin/spack_scraping_LQ cached_path = os.path.join(cache_dir, package_name) print(f"this is the cached_path {cached_path}") @@ -472,11 +438,7 @@ def print_files(package_hash, package_value, index, existing_tarinfo_files, seen ##with open(manifest_path, "w") as f: ##json.dump(clean_spec_manifest, f, indent=2) write_manifest_safely(manifest_path, clean_spec_manifest) -<<<<<<< HEAD print(f"βœ… Manifest safely written: {manifest_path}") -======= - print("βœ… Manifest safely written: {manifest_path}") ->>>>>>> origin/spack_scraping_LQ @@ -558,11 +520,7 @@ def run_program(package_hash, database, index, existing_tarinfo_files, seen_tarb def main(): -<<<<<<< HEAD #file_name = "myMedjson.json" -======= - #file_name = "myMedjson_DEMO.json" ->>>>>>> origin/spack_scraping_LQ # file_name = "myjson.json" # file_name = 'Med_w_compilerwrapper_packages_at_end.json' file_name = "e2a6969c742c8ee33deba2d210ce2243cd3941c6553a3ffc53780ac6463537a9" @@ -640,11 +598,7 @@ def main(): tarinfo_exists = tarinfo_path and os.path.exists(tarinfo_path) if manifest_exists and tarinfo_exists: -<<<<<<< HEAD #print(f"Skipping fully processed package: {package_hash}") -======= - print(f"Skipping fully processed package: {package_hash}") ->>>>>>> origin/spack_scraping_LQ continue # if tarball previously failed, skip retrying it From cefc0fccf399b52c4ef246ea49aa53117a52a29a Mon Sep 17 00:00:00 2001 From: leslie q Date: Fri, 15 Aug 2025 17:16:43 -0700 Subject: [PATCH 09/20] added test files --- dataset-generation/spack_db/conftest.py | 110 ++++++ dataset-generation/spack_db/test_spack_db.py | 356 +++++++++++++++++++ 2 files changed, 466 insertions(+) create mode 100644 dataset-generation/spack_db/conftest.py create mode 100644 dataset-generation/spack_db/test_spack_db.py diff --git a/dataset-generation/spack_db/conftest.py b/dataset-generation/spack_db/conftest.py new file mode 100644 index 0000000..3acaeb6 --- /dev/null +++ b/dataset-generation/spack_db/conftest.py @@ -0,0 +1,110 @@ +# conftest.py +import io +import json +import tarfile +import types +import pytest + + +import spack_db as pl + +@pytest.fixture(autouse=True) +def isolate_fs(tmp_path, monkeypatch): + """Redirect all cache/config paths to a temp dir per test.""" + cache = tmp_path / "cache" + (cache / "manifest").mkdir(parents=True, exist_ok=True) + (cache / "tarinfo").mkdir(parents=True, exist_ok=True) + (cache / "spec_manifests").mkdir(parents=True, exist_ok=True) + (cache / "binary_packages").mkdir(parents=True, 
exist_ok=True) + + monkeypatch.setattr(pl, "INDEX_FILE", str(cache / "spack.index.db.json"), raising=False) + monkeypatch.setattr(pl, "MANIFEST_DIR", str(cache / "manifest"), raising=False) + monkeypatch.setattr(pl, "TARINFO_DIR", str(cache / "tarinfo"), raising=False) + monkeypatch.setattr(pl, "SPEC_CACHE_DIR", str(cache / "spec_manifests"), raising=False) + monkeypatch.setattr(pl, "BINARY_CACHE_DIR", str(cache / "binary_packages"), raising=False) + + monkeypatch.setattr(pl, "CHECKPOINT_FILE", str(tmp_path / "progress.txt"), raising=False) + monkeypatch.setattr(pl, "SKIPPED_MANIFESTS_FILE", str(cache / "skipped_manifests.txt"), raising=False) + monkeypatch.setattr(pl, "MALFORMED_MANIFESTS_FILE", str(cache / "malformed_manifests.txt"), raising=False) + monkeypatch.setattr(pl, "TIMEOUT_LOG_FILE", str(cache / "timeouts.txt"), raising=False) + monkeypatch.setattr(pl, "MISSING_TARBALL_HASH_FILE", str(cache / "missing_tarballs.txt"), raising=False) + monkeypatch.setattr(pl, "SHARED_TARBALL_HASH_FILE", str(cache / "shared_tarballs.txt"), raising=False) + monkeypatch.setattr(pl, "FAILED_TARBALL_DOWNLOAD_FILE", str(cache / "failed_tarball_downloads.txt"), raising=False) + + # Ensure directories exist for atomic writes + (tmp_path / "cache").mkdir(exist_ok=True) + yield + + +@pytest.fixture +def sample_manifest_json(): + """ + Create the *actual bytes* expected by remove_lines_spec_manifest: + take a valid JSON, then pad 49 bytes in front and 834 bytes at the end. + """ + body = { + "data": [ + {"mediaType": "irrelevant/type", "checksum": "abc"}, + {"mediaType": "application/vnd.spack.install.v2.tar+gzip", + "checksum": "f4d1969c7a82c76b962ae969c91d7b54cc11e0ce9f1ec9277789990f58aab351"} + ] + } + raw = json.dumps(body).encode("utf-8") + return b"x" * 49 + raw + b"y" * 834 + + +@pytest.fixture +def tar_with_placeholder_bytes(): + """ + Build a tar in-memory whose members include the __spack_path_placeh segments + and the package-tail folder (e.g., 'compiler-wrapper-1.0-'). + """ + pkg_tail = "compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3" + member_name = ( + "home/software/spack/__spack_path_placeholder__/__spack_path_placeholder__/" + "__spack_path_placeholder__/__spack_path_placeh/morepadding/linux-x86_64_v3/" + f"{pkg_tail}/.spack/install_environment.json" + ) + + buf = io.BytesIO() + with tarfile.open(fileobj=buf, mode="w:gz") as tf: + data = b"{}" + tarinfo = tarfile.TarInfo(name=member_name) + tarinfo.size = len(data) + tf.addfile(tarinfo, io.BytesIO(data)) + return buf.getvalue() + + +class DummyResp: + def __init__(self, status_code=200, content=b""): + self.status_code = status_code + self.content = content + + +@pytest.fixture +def fake_requests(monkeypatch): + """ + Monkeypatch requests.get with programmable behavior per-URL. 
+ Usage: + table = {} + def _route(url, *a, **kw): return table[url]() + fake = fake_requests + fake.route = _route + monkeypatch.setattr(pl.requests, "get", _route) + table["...json"] = lambda: DummyResp(200, b"...") + """ + table = {} + + def _get(url, *args, **kwargs): + if url not in table: + raise AssertionError(f"Unexpected URL requested: {url}") + result = table[url]() + # Allow raising exceptions (e.g., Timeout) from factories + if isinstance(result, Exception): + raise result + return result + + # Expose for tests to fill + _get.table = table + monkeypatch.setattr(pl.requests, "get", _get) + return _get \ No newline at end of file diff --git a/dataset-generation/spack_db/test_spack_db.py b/dataset-generation/spack_db/test_spack_db.py new file mode 100644 index 0000000..dcf3181 --- /dev/null +++ b/dataset-generation/spack_db/test_spack_db.py @@ -0,0 +1,356 @@ +# test_spack_db.py +import json +import os +import io +import tarfile +import pytest +import spack_db as pl +from requests import exceptions as req_exc + + +def test_update_index_entry_sets_paths(tmp_path): + idx = {} + pkg_hash = "deadbeef" + value = {"spec": {"name": "foo", "version": "1.2.3"}} + tar_hash = "abc123" + pl.update_index_entry(idx, pkg_hash, value, tar_hash) + assert pkg_hash in idx + entry = idx[pkg_hash] + assert entry["name"] == "foo" + assert entry["version"] == "1.2.3" + assert entry["sha256"] == "abc123" + assert entry["manifest_path"].endswith(f"manifest/foo-1.2.3-deadbeef.json") + assert entry["tarinfo_path"].endswith(f"tarinfo/abc123.json") + + +def test_index_save_and_backup(tmp_path): + pl.save_index({"a": 1}) + # First save: no .bak yet + assert os.path.exists(pl.INDEX_FILE) + assert not os.path.exists(pl.INDEX_FILE + ".bak") + + # Second save should write a .bak of previous + pl.save_index({"a": 2}) + assert os.path.exists(pl.INDEX_FILE + ".bak") + with open(pl.INDEX_FILE) as f: + assert json.load(f)["a"] == 2 + with open(pl.INDEX_FILE + ".bak") as f: + assert json.load(f)["a"] == 1 + + +def test_checkpoint_roundtrip(tmp_path): + pl.save_checkpoint("pkg-hash-123") + assert pl.load_checkpoint() == "pkg-hash-123" + + +def test_remove_lines_spec_manifest_and_extract_hash(sample_manifest_json): + cleaned = pl.remove_lines_spec_manifest(sample_manifest_json) + # ensure we got dict and our access function finds the right tarball hash + assert isinstance(cleaned, dict) + h = pl.access_spec_manifest_media_type(cleaned) + assert h == "f4d1969c7a82c76b962ae969c91d7b54cc11e0ce9f1ec9277789990f58aab351" + +def test_malformed_manifest_is_logged(fake_requests, tmp_path): + # Set up a fake package hash and minimal database entry + pkg_hash = "badbadbadbadbadbadbadbadbadbadba" + name = "broken-pkg" + ver = "1.0" + db_entry = {"spec": {"name": name, "version": ver}} + + # Manifest URL that print_files will request + manifest_url = f"https://binaries.spack.io/develop/v3/manifests/spec/{name}/{name}-{ver}-{pkg_hash}.spec.manifest.json" + + # Make fake_requests return invalid bytes (invalid JSON) + fake_requests.table[manifest_url] = lambda: type( + "R", (), {"status_code": 200, "content": b"xxxx"} # bad data + )() + + # Run print_files, which should try to parse the manifest and fail + index = {} + existing_tarinfo_files = set() + seen_tarball_hashes = set() + pl.print_files(pkg_hash, db_entry, index, existing_tarinfo_files, seen_tarball_hashes) + + # Check: malformed_manifests.txt exists and has the hash + URL + error + assert os.path.exists(pl.MALFORMED_MANIFESTS_FILE) + with open(pl.MALFORMED_MANIFESTS_FILE, "r") 
as f: + log_content = f.read() + assert pkg_hash in log_content + assert manifest_url in log_content + assert "ValueError" in log_content or "JSON" in log_content + + # Check: skipped_manifests.txt + assert os.path.exists(pl.SKIPPED_MANIFESTS_FILE) + with open(pl.SKIPPED_MANIFESTS_FILE, "r") as f: + skipped_content = f.read() + assert pkg_hash in skipped_content + +def test_make_urls(): + tar_hash = "f4d1969c7a82c76b962ae969c91d7b54cc11e0ce9f1ec9277789990f58aab351" + url = pl.make_binary_package_URL(tar_hash) + + # Basic correctness + assert url.endswith("/" + tar_hash) + assert f"/{tar_hash[:2]}/" in url # path uses first two hex chars as subdir + + # manifest URL + pkg_hash = "bsavlbvtqsc7yjtvka3ko3aem4wye2u3" + pkg_val = {"spec": {"name": "compiler-wrapper", "version": "1.0"}} + murl, package_filename = pl.make_spec_manifest_URL(pkg_hash, pkg_val) + assert "manifests/spec/compiler-wrapper/" in murl + assert murl.endswith(f"compiler-wrapper-1.0-{pkg_hash}.spec.manifest.json") + assert package_filename == f"compiler-wrapper/compiler-wrapper-1.0-{pkg_hash}" + + + +def test_download_from_URL_spec_does_not_persist(fake_requests, tmp_path): + # For is_spec=True, function returns bytes but should NOT save a file to SPEC_CACHE_DIR + url = "https://example.com/x.spec.manifest.json" + content = b"hello" + fake_requests.table[url] = lambda: type("R", (), {"status_code": 200, "content": content})() + + out = pl.download_from_URL(url, "compiler/x-1.0-abc", is_spec=True) + # Returned content + assert out == content + # Not persisted + cached_path = os.path.join(pl.SPEC_CACHE_DIR, "compiler__x-1.0-abc") + assert not os.path.exists(cached_path) + + +def test_download_from_URL_binary_persists(fake_requests, tmp_path): + url = "https://example.com/blob.tar.gz" + content = b"tarbytes" + fake_requests.table[url] = lambda: type("R", (), {"status_code": 200, "content": content})() + + out = pl.download_from_URL(url, "compiler/x-1.0-abc", is_spec=False) + assert out == content + cached_path = os.path.join(pl.BINARY_CACHE_DIR, "compiler__x-1.0-abc") + assert os.path.exists(cached_path) + with open(cached_path, "rb") as f: + assert f.read() == content + + +def test_download_timeout_logs(fake_requests, tmp_path): + url = "https://timeout.test/blob.tar.gz" + fake_requests.table[url] = lambda: req_exc.Timeout() + + out = pl.download_from_URL(url, "compiler/x-1.0-abc", is_spec=False) + assert out is None + with open(pl.TIMEOUT_LOG_FILE, "r") as f: + txt = f.read() + assert "compiler__x-1.0-abc" in txt + assert url in txt + + +def test_read_binary_package_extracts_and_cleans_paths(tar_with_placeholder_bytes, tmp_path): + # Prepare the "downloaded" tar path that read_binary_package will delete afterward + pkg = "compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3" + tar_hash = "f4d1969c7a82c76b962ae969c91d7b54cc11e0ce9f1ec9277789990f58aab351" + dl_path = os.path.join(pl.BINARY_CACHE_DIR, pkg.replace("/", "__")) + with open(dl_path, "wb") as f: + f.write(b"placeholder") + + # Run + pl.read_binary_package(tar_with_placeholder_bytes, pkg, tar_hash) + + # Verify cleaned tarinfo file written atomically + tarinfo_path = os.path.join(pl.TARINFO_DIR, f"{tar_hash}.json") + assert os.path.exists(tarinfo_path) + with open(tarinfo_path) as f: + items = json.load(f) + # Path should start *after* the 'pkg-tail/' segment due to remove_placeholder_directories + assert any(p.endswith(".spack/install_environment.json") for p in items) + + # tarball removed and buffer freed + assert not os.path.exists(dl_path) + + 
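The tests above exercise `remove_placeholder_directories` only indirectly through `read_binary_package`. A minimal direct test sketch, reusing the member layout from the `tar_with_placeholder_bytes` fixture and assuming the same cleaning behavior those tests rely on, might look like this:

```python
def test_remove_placeholder_directories_strips_prefix():
    # Sketch only: same package string and member name as the tar_with_placeholder_bytes fixture.
    pkg = "compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3"
    member_name = (
        "home/software/spack/__spack_path_placeholder__/__spack_path_placeholder__/"
        "__spack_path_placeholder__/__spack_path_placeh/morepadding/linux-x86_64_v3/"
        "compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3/.spack/install_environment.json"
    )
    cleaned = pl.remove_placeholder_directories(1, member_name, pkg)
    # The placeholder prefix should be stripped, leaving the path inside the package.
    assert cleaned is not None
    assert cleaned.endswith(".spack/install_environment.json")
```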
+def test_print_files_happy_path(fake_requests, sample_manifest_json, tar_with_placeholder_bytes, tmp_path): + # minimal database with one install + pkg_hash = "bsavlbvtqsc7yjtvka3ko3aem4wye2u3" + db = { + "database": { + "installs": { + pkg_hash: {"spec": {"name": "compiler-wrapper", "version": "1.0"}} + } + } + } + index = {} + existing_tarinfo_files = set() # no prior tarinfo + seen_tarball_hashes = set() + + # Wire URLs expected by print_files flow + manifest_url = ( + "https://binaries.spack.io/develop/v3/manifests/spec/" + f"compiler-wrapper/compiler-wrapper-1.0-{pkg_hash}.spec.manifest.json" + ) + tar_hash = "f4d1969c7a82c76b962ae969c91d7b54cc11e0ce9f1ec9277789990f58aab351" + blob_url = f"https://binaries.spack.io/develop/blobs/sha256/{tar_hash[:2]}/{tar_hash}" + + fake_requests.table[manifest_url] = lambda: type("R", (), {"status_code": 200, "content": sample_manifest_json})() + fake_requests.table[blob_url] = lambda: type("R", (), {"status_code": 200, "content": tar_with_placeholder_bytes})() + + # Act + pl.print_files(pkg_hash, db["database"]["installs"][pkg_hash], index, existing_tarinfo_files, seen_tarball_hashes) + + # Assert manifest saved safely + manifest_path = os.path.join(pl.MANIFEST_DIR, f"compiler-wrapper-1.0-{pkg_hash}.json") + assert os.path.exists(manifest_path) + # Assert tarinfo created + tarinfo_path = os.path.join(pl.TARINFO_DIR, f"{tar_hash}.json") + assert os.path.exists(tarinfo_path) + # Index updated & saved + assert pkg_hash in index + assert index[pkg_hash]["sha256"] == tar_hash + assert os.path.exists(pl.INDEX_FILE) + + +def test_print_files_skips_when_tarinfo_exists(sample_manifest_json, tmp_path): + # Prepare: existing tarinfo means no binary download + pkg_hash = "zzz111" + spec = {"spec": {"name": "foo", "version": "9.9"}} + index = {} + + # Pre-create tarinfo + prehash = "abcdead00face" + existing_tarinfo_files = {prehash} + seen_tarball_hashes = set() + + # Monkeypatch access_spec_manifest_media_type to return prehash so it matches existing_tarinfo_files + orig = pl.access_spec_manifest_media_type + pl.access_spec_manifest_media_type = lambda _db: prehash + + # Also ensure manifest path is considered existing so we don't try to download it + manifest_path = os.path.join(pl.MANIFEST_DIR, f"foo-9.9-{pkg_hash}.json") + with open(manifest_path, "w") as f: + json.dump({"dummy": True}, f) + + try: + pl.print_files(pkg_hash, spec, index, existing_tarinfo_files, seen_tarball_hashes) + finally: + pl.access_spec_manifest_media_type = orig + + # Should just update index and save, no download attempted + assert pkg_hash in index + assert index[pkg_hash]["sha256"] == prehash + with open(pl.INDEX_FILE) as f: + data = json.load(f) + assert pkg_hash in data + + +def test_download_404_returns_none(fake_requests): + url = "https://example.com/notfound" + fake_requests.table[url] = lambda: type("R", (), {"status_code": 404, "content": b""})() + assert pl.download_from_URL(url, "some/pkg-1.0-deadbeef", is_spec=True) is None + +def _mk_manifest_bytes_with_hash(tar_hash: str): + """Helper: build bytes that match remove_lines_spec_manifest's slicing contract.""" + body = {"data": [ + {"mediaType": "x/ignored", "checksum": "zzz"}, + {"mediaType": "application/vnd.spack.install.v2.tar+gzip", "checksum": tar_hash}, + ]} + raw = json.dumps(body).encode("utf-8") + return b"x" * 49 + raw + b"y" * 834 + + +def test_shared_tarball_logs_and_skips_second_download(fake_requests, tmp_path): + """ + Two different package hashes point to the same tarball hash. 
+ We expect: + - First run downloads tarball and writes tarinfo. + - Second run logs to SHARED_TARBALL_HASH_FILE and (since we update + existing_tarinfo_files to include the first tarinfo) skips re-download. + """ + # Common tarball hash & blob URL + tar_hash = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcd" + blob_url = f"https://binaries.spack.io/develop/blobs/sha256/{tar_hash[:2]}/{tar_hash}" + + # Package A + pkg_hash_a = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" + name = "compiler-wrapper" + ver = "1.0" + man_url_a = f"https://binaries.spack.io/develop/v3/manifests/spec/{name}/{name}-{ver}-{pkg_hash_a}.spec.manifest.json" + + # Package B (different spec hash, same tarball) + pkg_hash_b = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" + man_url_b = f"https://binaries.spack.io/develop/v3/manifests/spec/{name}/{name}-{ver}-{pkg_hash_b}.spec.manifest.json" + + # Route manifests to same tarball hash, and the blob to some tar bytes + tar_bytes = b"\x1f\x8b" + b"tar" * 100 # not actually parsed; we won't open it here + # For the first call we want a real tar.gz + # We'll just reuse download + skip path by creating a minimal valid tar.gz: + import io, tarfile + buf = io.BytesIO() + with tarfile.open(fileobj=buf, mode="w:gz") as tf: + ti = tarfile.TarInfo(name=f"{name}-{ver}-{pkg_hash_a}/.spack/install_environment.json") + data = b"{}" + ti.size = len(data) + tf.addfile(ti, io.BytesIO(data)) + tar_bytes = buf.getvalue() + + fake_requests.table[man_url_a] = lambda: type("R", (), {"status_code": 200, "content": _mk_manifest_bytes_with_hash(tar_hash)})() + fake_requests.table[man_url_b] = lambda: type("R", (), {"status_code": 200, "content": _mk_manifest_bytes_with_hash(tar_hash)})() + fake_requests.table[blob_url] = lambda: type("R", (), {"status_code": 200, "content": tar_bytes})() + + index = {} + existing_tarinfo_files = set() + seen_tarball_hashes = set() + + # Run A (creates tarinfo and index) + pl.print_files(pkg_hash_a, {"spec": {"name": name, "version": ver}}, index, existing_tarinfo_files, seen_tarball_hashes) + tarinfo_path = os.path.join(pl.TARINFO_DIR, f"{tar_hash}.json") + assert os.path.exists(tarinfo_path) + + # Emulate the main loop behavior: keep using *the same* existing_tarinfo_files set, + # but update it to reflect that we've now created tarinfo. + existing_tarinfo_files.add(tar_hash) + + # Guard: if the second call tries to re-download, we'd need another blob mapping. + # We purposely *don't* add one hereβ€”so if it tries, the test will fail. + + # Run B (should log as shared and skip binary download due to existing_tarinfo_files) + pl.print_files(pkg_hash_b, {"spec": {"name": name, "version": ver}}, index, existing_tarinfo_files, seen_tarball_hashes) + + # Check shared log file captured the second spec hash + the shared tar hash + assert os.path.exists(pl.SHARED_TARBALL_HASH_FILE) + with open(pl.SHARED_TARBALL_HASH_FILE, "r") as f: + shared_log = f.read() + assert f"{pkg_hash_b}\t{tar_hash}" in shared_log + + # Index should contain both manifests, same sha256 + assert index[pkg_hash_a]["sha256"] == tar_hash + assert index[pkg_hash_b]["sha256"] == tar_hash + + +def test_failed_tarball_download_is_logged(fake_requests, tmp_path): + """ + If the blob download fails (404 or None), we should append to FAILED_TARBALL_DOWNLOAD_FILE + and not produce a tarinfo file. 
+ """ + name = "foo" + ver = "9.9" + pkg_hash = "cccccccccccccccccccccccccccccccc" + tar_hash = "dddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd" + man_url = f"https://binaries.spack.io/develop/v3/manifests/spec/{name}/{name}-{ver}-{pkg_hash}.spec.manifest.json" + blob_url = f"https://binaries.spack.io/develop/blobs/sha256/{tar_hash[:2]}/{tar_hash}" + + # Manifest OK, blob 404 + fake_requests.table[man_url] = lambda: type("R", (), {"status_code": 200, "content": _mk_manifest_bytes_with_hash(tar_hash)})() + fake_requests.table[blob_url] = lambda: type("R", (), {"status_code": 404, "content": b""})() + + index = {} + existing_tarinfo_files = set() + seen_tarball_hashes = set() + + pl.print_files(pkg_hash, {"spec": {"name": name, "version": ver}}, index, existing_tarinfo_files, seen_tarball_hashes) + + # No tarinfo created + tarinfo_path = os.path.join(pl.TARINFO_DIR, f"{tar_hash}.json") + assert not os.path.exists(tarinfo_path) + + # Log entry created with package filename, manifest hash, and tarball hash + assert os.path.exists(pl.FAILED_TARBALL_DOWNLOAD_FILE) + with open(pl.FAILED_TARBALL_DOWNLOAD_FILE, "r") as f: + log = f.read() + # Contains the manifest hash + tarball hash; also includes the package_filename prefix + assert f"manifest hash: {pkg_hash}, tarball hash: {tar_hash}" in log From 81afc13ece50311e069dcfe9bc5dd573c66f0b93 Mon Sep 17 00:00:00 2001 From: leslie q Date: Mon, 18 Aug 2025 11:30:51 -0700 Subject: [PATCH 10/20] updated remove_lines_spec_manifest function --- dataset-generation/spack_db/spack_db.py | 127 ++++++++++++++++++------ 1 file changed, 95 insertions(+), 32 deletions(-) diff --git a/dataset-generation/spack_db/spack_db.py b/dataset-generation/spack_db/spack_db.py index ba69d3e..1f89395 100644 --- a/dataset-generation/spack_db/spack_db.py +++ b/dataset-generation/spack_db/spack_db.py @@ -21,6 +21,8 @@ import tempfile import shutil +from pathlib import Path + # Configuration # spack.index.db.json maps each package back to the packagename, version, SHA256hash, @@ -50,6 +52,9 @@ # file to track all the manifest files that were unable to download SKIPPED_MANIFESTS_FILE = "cache/skipped_manifests.txt" +############ updated 8/15 ############ +MALFORMED_MANIFESTS_FILE = "cache/malformed_manifests.txt" +###################################### MISSING_TARBALL_HASH_FILE = "cache/missing_tarballs.txt" SHARED_TARBALL_HASH_FILE = "cache/shared_tarballs.txt" FAILED_TARBALL_DOWNLOAD_FILE = "cache/failed_tarball_downloads.txt" @@ -61,6 +66,37 @@ os.makedirs(SPEC_CACHE_DIR, exist_ok = True) os.makedirs(BINARY_CACHE_DIR, exist_ok = True) +# purge truncated binaries in the binary_package directory on restart/resume +# def purge_truncated_binaries(): + +# if not os.path.isdir(BINARY_CACHE_DIR): +# return +# removed = 0 +# for fname in os.listdir(BINARY_CACHE_DIR): +# fpath = os.path.join(BINARY_CACHE_DIR, fname) +# if not os.path.isfile(fpath): +# continue +# try: +# # Autodetect compression; iterate to EOF to ensure integrity +# with tarfile.open(fpath, mode="r:*") as tar: +# for _ in tar: +# pass +# except (tarfile.ReadError, EOFError, OSError) as e: +# print(f"🧹 Removing corrupt binary cache: {fpath} ({e})") +# try: +# os.remove(fpath) +# removed += 1 +# except OSError as oe: +# print(f"⚠️ Could not remove {fpath}: {oe}") +# if removed: +# print(f"βœ… Purged {removed} corrupt/partial tarball(s).") + +############ updated 8/15 ############ +# Normalize any filesystem path to a forward-slash (POSIX) string for JSON storage. 
+def _to_posix(p:str) -> str: + return Path(p).as_posix() +###################################### + # look for index if it exists to add info to def load_index(): # if the index_file exists, read it @@ -93,12 +129,25 @@ def update_index_entry(index, package_hash, package_value, package_zip_hash): manifest_filename = f"{name}-{version}-{package_hash}.json" tarinfo_filename = f"{package_zip_hash}.json" + ############ updated 8/15 ############ + manifest_path_fs = os.path.join(MANIFEST_DIR, manifest_filename) + tarinfo_path_fs = os.path.join(TARINFO_DIR, tarinfo_filename) + ###################################### + index[package_hash] = { "name": name, "version": version, "sha256": package_zip_hash, - "manifest_path": os.path.join(MANIFEST_DIR, manifest_filename), - "tarinfo_path": os.path.join(TARINFO_DIR, tarinfo_filename) + ############ updated 8/15 ############ + ##"manifest_path": os.path.join(MANIFEST_DIR, manifest_filename), + ##"tarinfo_path": os.path.join(TARINFO_DIR, tarinfo_filename) + + # store forward-slash form in JSON for portability + "manifest_path": _to_posix(manifest_path_fs), + "tarinfo_path": _to_posix(tarinfo_path_fs), + + ###################################### + } @@ -240,14 +289,27 @@ def download_from_URL(theURL, package, is_spec=True): # remove unnecessary lines in file def remove_lines_spec_manifest(myfile): - + ############ updated 8/15 ############ # removes unnecessary bytes - removed_ends = myfile[49:-834] + # removed_ends = myfile[49:-834] - # converts bytes to dictionary - database = json.loads(removed_ends) + # # converts bytes to dictionary + # database = json.loads(removed_ends) - return database + # return database + + # Accept bytes or str; extract between first '{' and last '}' + data = myfile if isinstance(myfile, (bytes, bytearray)) else myfile.encode("utf-8") + + start = data.find(b"{") + end = data.rfind(b"}") + + if start == -1 or end == -1 or end < start: + raise ValueError("Malformed manifest: JSON braces not found") + + return json.loads(data[start:end+1].decode("utf-8")) + ###################################### + # returns checksum, sha256 hash used to download the binary tarball def access_spec_manifest_media_type(db): @@ -279,7 +341,6 @@ def make_binary_package_URL(package_zip_hash): return myURL -############ updated 8/10 ############ # ensure tarinfo completeness def write_tarinfo_safely(tarinfo_path, file_list): temp_dir = os.path.dirname(tarinfo_path) @@ -296,7 +357,6 @@ def write_manifest_safely(manifest_path, manifest_data): json.dump(manifest_data, tmp, indent=2) temp_name = tmp.name shutil.move(temp_name, manifest_path) -##################################### # using python tarfile module io module to list all the files in the downloaded tarball # myfile is the tar_file response.content and the package is the hash we will split by @@ -339,14 +399,9 @@ def read_binary_package(myfile, package, package_zip_hash): name = package.split('/')[0] version = package.split('/')[1].split('-')[1] - ############ updated 8/10 ############ # saves file names to the tarinfo file - ##tarinfo_path = os.path.join(TARINFO_DIR, f"{package_zip_hash}.json") - ##with open(tarinfo_path, "w") as f: - ##json.dump(file_list, f, indent=2) tarinfo_path = os.path.join(TARINFO_DIR, f"{package_zip_hash}.json") write_tarinfo_safely(tarinfo_path, file_list) - ##################################### # removes tarball once processed tarball_path = os.path.join(BINARY_CACHE_DIR, package.replace('/','__')) @@ -394,7 +449,6 @@ def print_files(package_hash, package_value, index, 
existing_tarinfo_files, seen name = package_value['spec']['name'] version = package_value['spec']['version'] - ############ updated 8/10 ############ manifest_filename = f"{name}-{version}-{package_hash}.json" manifest_path = os.path.join(MANIFEST_DIR, manifest_filename) @@ -431,24 +485,27 @@ def print_files(package_hash, package_value, index, existing_tarinfo_files, seen print("βœ… Loaded cached spec manifest") - # remove unneccessary lines from downloaded spec manifest - clean_spec_manifest = remove_lines_spec_manifest(temp) + + ############ updated 8/15 ############ + + try: + # remove unneccessary lines from downloaded spec manifest + clean_spec_manifest = remove_lines_spec_manifest(temp) + except Exception as e: + print(f"Failed to parse manifest {package_filename}: {e}") + with open(MALFORMED_MANIFESTS_FILE, "a") as f: + f.write(f"{package_hash}\t{theURL}\t{type(e).__name__}: {e}\n") + # also adding it to file for skipping processing at restart + with open(SKIPPED_MANIFESTS_FILE, "a") as f: + f.write(f"{package_hash}\n") + return + + ###################################### # writes cleaned manifest information to manifest file - ##with open(manifest_path, "w") as f: - ##json.dump(clean_spec_manifest, f, indent=2) write_manifest_safely(manifest_path, clean_spec_manifest) print(f"βœ… Manifest safely written: {manifest_path}") - - - # writes cleaned manifest information to manifest file - ##manifest_file = os.path.join(MANIFEST_DIR, f"{name}-{version}-{package_hash}.json") - ##with open(manifest_file, "w") as f: - ##json.dump(clean_spec_manifest, f, indent=2) - - ###################################### - # find the mediaType that contains the hash for the package tarball install package_zip_hash = access_spec_manifest_media_type(clean_spec_manifest) print(f"βœ… Extracted zip hash: {package_zip_hash}") @@ -520,6 +577,9 @@ def run_program(package_hash, database, index, existing_tarinfo_files, seen_tarb def main(): + + # remove any corrupt tarballs if they exist before resuming program + ##purge_truncated_binaries() #file_name = "myMedjson.json" # file_name = "myjson.json" # file_name = 'Med_w_compilerwrapper_packages_at_end.json' @@ -593,10 +653,13 @@ def main(): manifest_path = entry.get("manifest_path", "") tarinfo_path = entry.get("tarinfo_path", "") tarball_hash = entry.get("sha256", "") - - manifest_exists = manifest_path and os.path.exists(manifest_path) - tarinfo_exists = tarinfo_path and os.path.exists(tarinfo_path) - + + ############ updated 8/15 ############ + ##manifest_exists = manifest_path and os.path.exists(manifest_path) + ##tarinfo_exists = tarinfo_path and os.path.exists(tarinfo_path) + manifest_exists = bool(manifest_path) and Path(manifest_path).exists() + tarinfo_exists = bool(tarinfo_path) and Path(tarinfo_path).exists() + ###################################### if manifest_exists and tarinfo_exists: #print(f"Skipping fully processed package: {package_hash}") continue From f18976708f2f28b8408668b871a7066877f7deff Mon Sep 17 00:00:00 2001 From: leslie q Date: Mon, 18 Aug 2025 15:39:07 -0700 Subject: [PATCH 11/20] added README.md --- dataset-generation/spack_db/README.md | 70 +++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 dataset-generation/spack_db/README.md diff --git a/dataset-generation/spack_db/README.md b/dataset-generation/spack_db/README.md new file mode 100644 index 0000000..4c40155 --- /dev/null +++ b/dataset-generation/spack_db/README.md @@ -0,0 +1,70 @@ +# Spack Build Cache Data Scraper + +This project aims to scrape the Spack 
build cache by downloading, cleaning, and indexing spec manifests and binary tarballs into a local cache, then converted into a Spack SQLite database.
+
+The program builds a master index called spack.index.db.json. The layout of the index:
+ * spec manifest hash as the unique key
+ * package name and version
+ * package tarball unique SHA256 hash
+ * package manifest path to the local cache directory
+ * package tarinfo path to the local cache directory
+
+The program allows for restart/resume in case there are program run interruptions. Skipped or malformed manifests are recorded and if the information exists for both manifest and tarball, re-downloading files is avoided.
+
+## Directory Structure
+ * `cache/spack.index.db.json` - master index
+ * `cache/manifest/` - cleaned spec manifests
+ * `cache/tarinfo/` - JSON file lists extracted from tarballs
+ * `cache/spec_manifests/` - temporary cache of raw manifests before clean up
+ * 'cache/binary_packages/` - temporary cache of downloaded tarballs
+ * `cache/timeouts.txt` - packages that timed out while downloading
+ * `cache/skipped_manifests.txt` - a list of manifests that could not be downloaded
+ * `cache/malformed_manifests.txt` - manifests that failed parsing
+ * `cache/missing_tarballs.txt` - manifests without a tarball hash
+ * `cache/shared_tarballs.txt` - records multiple manifests that point to the same tarball
+ * `cache/failed_tarball_downloads.txt` - tarballs that failed to download
+
+## Features
+ * Retrieves package `.spec.manifest.json` from Spack's binary mirror
+ * Extracts the valid JSON payload and removes extra characters
+ * Retrieves binary tarballs and extracts file lists
+ * Creates and maintains a canonical JSON index that maps each package to its manifest and tarball information
+ * Contains multiple checkpoints for safe restart/resume of the program
+ * Records skipped/malformed manifests, missing hashes, failed tarball downloads
+ * Stores forward-slash paths in JSON index for cross-platform use
+
+## Usage
+1. Install dependencies
+```bash
+pip install requests
+```
+The rest of the necessary modules are part of Python's standard library.
+
+2. Provide a database file
+Update the file_name in `main()` if needed
+
+3. Run the script
+```bash
+python spack_db.py
+```
+
+4. Resume after interruption
+If an interruption occurs, it is safe to re-run the script without losing data already processed.
+
+5. Run Create_spack_DB.py to create SQLite database
+```bash
+python Create_spack_DB.py
+```
+Database will include all files extracted from the packages from the Spack build cache.
+
+## Contributing
+
+Pull requests are welcome. For major changes, please open an issue first to discuss what you would like to change.
+
+Please make sure to update tests as appropriate.
+
+For more information on contributing see the [CONTRIBUTING](./CONTRIBUTING.md) file.
+
+## License
+
+MIT license
\ No newline at end of file

From b434859d0b42d60fff1c205c8f15dac1dbe927d1 Mon Sep 17 00:00:00 2001
From: leslie q
Date: Mon, 18 Aug 2025 15:42:51 -0700
Subject: [PATCH 12/20] fixed bullet points in Directory Structure and Features

---
 dataset-generation/spack_db/README.md | 36 +++++++++++++--------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/dataset-generation/spack_db/README.md b/dataset-generation/spack_db/README.md
index 4c40155..86e937a 100644
--- a/dataset-generation/spack_db/README.md
+++ b/dataset-generation/spack_db/README.md
@@ -12,26 +12,26 @@ The program builds a master index called spack.index.db.json. The layout of the
 The program allows for restart/resume in case there are program run interruptions. Skipped or malformed manifests are recorded and if the information exists for both manifest and tarball, re-downloading files is avoided.
 
 ## Directory Structure
- * `cache/spack.index.db.json` - master index
- * `cache/manifest/` - cleaned spec manifests
- * `cache/tarinfo/` - JSON file lists extracted from tarballs
- * `cache/spec_manifests/` - temporary cache of raw manifests before clean up
- * 'cache/binary_packages/` - temporary cache of downloaded tarballs
- * `cache/timeouts.txt` - packages that timed out while downloading
- * `cache/skipped_manifests.txt` - a list of manifests that could not be downloaded
- * `cache/malformed_manifests.txt` - manifests that failed parsing
- * `cache/missing_tarballs.txt` - manifests without a tarball hash
- * `cache/shared_tarballs.txt` - records multiple manifests that point to the same tarball
- * `cache/failed_tarball_downloads.txt` - tarballs that failed to download
+* `cache/spack.index.db.json` - master index
+* `cache/manifest/` - cleaned spec manifests
+* `cache/tarinfo/` - JSON file lists extracted from tarballs
+* `cache/spec_manifests/` - temporary cache of raw manifests before clean up
+* `cache/binary_packages/` - temporary cache of downloaded tarballs
+* `cache/timeouts.txt` - packages that timed out while downloading
+* `cache/skipped_manifests.txt` - a list of manifests that could not be downloaded
+* `cache/malformed_manifests.txt` - manifests that failed parsing
+* `cache/missing_tarballs.txt` - manifests without a tarball hash
+* `cache/shared_tarballs.txt` - records multiple manifests that point to the same tarball
+* `cache/failed_tarball_downloads.txt` - tarballs that failed to download
 
 ## Features
- * Retrieves package `.spec.manifest.json` from Spack's binary mirror
- * Extracts the valid JSON payload and removes extra characters
- * Retrieves binary tarballs and extracts file lists
- * Creates and maintains a canonical JSON index that maps each package to its manifest and tarball information
- * Contains multiple checkpoints for safe restart/resume of the program
- * Records skipped/malformed manifests, missing hashes, failed tarball downloads
- * Stores forward-slash paths in JSON index for cross-platform use
+* Retrieves package `.spec.manifest.json` from Spack's binary mirror
+* Extracts the valid JSON payload and removes extra characters
+* Retrieves binary tarballs and extracts file lists
+* Creates and maintains a canonical JSON index that maps each package to its manifest and tarball information
+* Contains multiple checkpoints for safe restart/resume of the program
+* Records skipped/malformed manifests, missing hashes, failed tarball downloads
+* Stores forward-slash paths in JSON index for cross-platform use
 
 ## Usage
 1.
Install dependencies From bfa7dbe8acdae4e09ca459d725a28b57fefdc5e1 Mon Sep 17 00:00:00 2001 From: leslie q Date: Mon, 18 Aug 2025 15:45:52 -0700 Subject: [PATCH 13/20] fixed bullet points for the layout of the index --- dataset-generation/spack_db/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dataset-generation/spack_db/README.md b/dataset-generation/spack_db/README.md index 86e937a..42c4e5d 100644 --- a/dataset-generation/spack_db/README.md +++ b/dataset-generation/spack_db/README.md @@ -3,11 +3,11 @@ This project aims to scrape the Spack build cache by downloading, cleaning, and indexing spec manifests and binary tarballs into a local cache, then converted into a Spack SQLite database. The program builds a master index called spack.index.db.json. The layout of the index: - * spec manifest hash as the unique key - * package name and version - * package tarball unique SHA256 hash - * package manifest path to the local cache directory - * package tarinfo path to the local cache directory +* spec manifest hash as the unique key +* package name and version +* package tarball unique SHA256 hash +* package manifest path to the local cache directory +* package tarinfo path to the local cache directory The program allows for restart/resume in case there are program run interruptions. Skipped or malformed manifests are recorded and if the information exists for both manifest and tarball, re-downloading files is avoided. From c3ba8d55654ffb7ff294c9ad4c5f971f30499afa Mon Sep 17 00:00:00 2001 From: leslie q Date: Mon, 18 Aug 2025 15:51:46 -0700 Subject: [PATCH 14/20] updated Usage --- dataset-generation/spack_db/README.md | 41 ++++++++++++++------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/dataset-generation/spack_db/README.md b/dataset-generation/spack_db/README.md index 42c4e5d..39b027e 100644 --- a/dataset-generation/spack_db/README.md +++ b/dataset-generation/spack_db/README.md @@ -1,13 +1,14 @@ # Spack Build Cache Data Scraper -This project aims to scrape the Spack build cache by downloading, cleaning, and indexing spec manifests and binary tarballs into a local cache, then converted into a Spack SQLite database. +This project aims to scrape the Spack build cache by downloading, cleaning, and indexing spec manifests and binary tarballs into a local cache, then convert the data into a Spack SQLite database. -The program builds a master index called spack.index.db.json. The layout of the index: -* spec manifest hash as the unique key -* package name and version -* package tarball unique SHA256 hash -* package manifest path to the local cache directory -* package tarinfo path to the local cache directory +The program builds a master index called `spack.index.db.json`. +* Index layout: + * spec manifest hash as the unique key + * package name and version + * package tarball unique SHA256 hash + * package manifest path to the local cache directory + * package tarinfo path to the local cache directory The program allows for restart/resume in case there are program run interruptions. Skipped or malformed manifests are recorded and if the information exists for both manifest and tarball, re-downloading files is avoided. @@ -35,27 +36,27 @@ The program allows for restart/resume in case there are program run interruption ## Usage 1. Install dependencies -```bash -pip install requests -``` -The rest of the necessary modules are part of Python's standard library. 
+ ```bash + pip install requests + ``` + The rest of the necessary modules are part of Python's standard library. 2. Provide a database file -Update the file_name in `main()` if needed + Update the file_name in `main()` if needed 3. Run the script -```bash -python spack_db.py -``` + ```bash + python spack_db.py + ``` 4. Resume after interruption -If an interruption occurs, it is safe to re-run the script without losing data already processed. + If an interruption occurs, it is safe to re-run the script without losing data already processed. 5. Run Create_spack_DB.py to create SQLite database -```bash -python Create_spack_DB.py -``` -Database will include all files extracted from the packages from the Spack build cache. + ```bash + python Create_spack_DB.py + ``` + Database will include all files extracted from the packages from the Spack build cache. ## Contributing From fe63504609e84697775d2ee8db0780d52220e5e1 Mon Sep 17 00:00:00 2001 From: leslie q Date: Tue, 19 Aug 2025 13:53:03 -0700 Subject: [PATCH 15/20] updated README.md with retry manifest/tarball download instruction --- dataset-generation/spack_db/README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/dataset-generation/spack_db/README.md b/dataset-generation/spack_db/README.md index 39b027e..86d4c01 100644 --- a/dataset-generation/spack_db/README.md +++ b/dataset-generation/spack_db/README.md @@ -1,4 +1,4 @@ -# Spack Build Cache Data Scraper +# Spack Build Cache Data Scraper & SQLite Database This project aims to scrape the Spack build cache by downloading, cleaning, and indexing spec manifests and binary tarballs into a local cache, then convert the data into a Spack SQLite database. @@ -52,7 +52,10 @@ The program allows for restart/resume in case there are program run interruption 4. Resume after interruption If an interruption occurs, it is safe to re-run the script without losing data already processed. -5. Run Create_spack_DB.py to create SQLite database +5. Retry manifests or tarballs + Delete the files `skipped_manifests.txt`, `malformed_manifests.txt`, `failed_tarball_downloads.txt`, to retry failed manifest or tarball downloads. + +6. Run Create_spack_DB.py to create SQLite database ```bash python Create_spack_DB.py ``` From 19e76fb1684c606cf2a7fbc8988a4934892b8519 Mon Sep 17 00:00:00 2001 From: leslie q Date: Tue, 19 Aug 2025 16:10:35 -0700 Subject: [PATCH 16/20] updated _to_posix for printing --- dataset-generation/spack_db/spack_db.py | 150 +++++++----------------- 1 file changed, 42 insertions(+), 108 deletions(-) diff --git a/dataset-generation/spack_db/spack_db.py b/dataset-generation/spack_db/spack_db.py index 1f89395..2cf7fca 100644 --- a/dataset-generation/spack_db/spack_db.py +++ b/dataset-generation/spack_db/spack_db.py @@ -36,11 +36,6 @@ # this will be used in making the SQLite database without having to reprocess the tarballs again TARINFO_DIR = "cache/tarinfo" -# SPEC_CACHE_DIR is meant to be a temporary cache of raw spec manifest files downloaded from the internet -# a clean copy is meant to be placed in MANIFEST_DIR -# SPEC_CACHE_DIR avoids redownloading if the script is restarted. -SPEC_CACHE_DIR = "cache/spec_manifests" - # BINARY_CACHE_DIR contains the downloaded tarballs temporarily # the file is deleted after processing. 
BINARY_CACHE_DIR = "cache/binary_packages" @@ -52,50 +47,24 @@ # file to track all the manifest files that were unable to download SKIPPED_MANIFESTS_FILE = "cache/skipped_manifests.txt" -############ updated 8/15 ############ MALFORMED_MANIFESTS_FILE = "cache/malformed_manifests.txt" -###################################### MISSING_TARBALL_HASH_FILE = "cache/missing_tarballs.txt" SHARED_TARBALL_HASH_FILE = "cache/shared_tarballs.txt" FAILED_TARBALL_DOWNLOAD_FILE = "cache/failed_tarball_downloads.txt" # create cache directories for faster download -# FIX ME: Remove SPEC_CACHE_DIR os.makedirs(MANIFEST_DIR, exist_ok=True) os.makedirs(TARINFO_DIR, exist_ok = True) -os.makedirs(SPEC_CACHE_DIR, exist_ok = True) os.makedirs(BINARY_CACHE_DIR, exist_ok = True) -# purge truncated binaries in the binary_package directory on restart/resume -# def purge_truncated_binaries(): - -# if not os.path.isdir(BINARY_CACHE_DIR): -# return -# removed = 0 -# for fname in os.listdir(BINARY_CACHE_DIR): -# fpath = os.path.join(BINARY_CACHE_DIR, fname) -# if not os.path.isfile(fpath): -# continue -# try: -# # Autodetect compression; iterate to EOF to ensure integrity -# with tarfile.open(fpath, mode="r:*") as tar: -# for _ in tar: -# pass -# except (tarfile.ReadError, EOFError, OSError) as e: -# print(f"🧹 Removing corrupt binary cache: {fpath} ({e})") -# try: -# os.remove(fpath) -# removed += 1 -# except OSError as oe: -# print(f"⚠️ Could not remove {fpath}: {oe}") -# if removed: -# print(f"βœ… Purged {removed} corrupt/partial tarball(s).") - -############ updated 8/15 ############ # Normalize any filesystem path to a forward-slash (POSIX) string for JSON storage. def _to_posix(p:str) -> str: return Path(p).as_posix() -###################################### + +########### UPDATED 8/19 ########### +def _to_posix(p: str) -> str: + return Path(p).as_posix() +#################################### # look for index if it exists to add info to def load_index(): @@ -129,28 +98,19 @@ def update_index_entry(index, package_hash, package_value, package_zip_hash): manifest_filename = f"{name}-{version}-{package_hash}.json" tarinfo_filename = f"{package_zip_hash}.json" - ############ updated 8/15 ############ manifest_path_fs = os.path.join(MANIFEST_DIR, manifest_filename) tarinfo_path_fs = os.path.join(TARINFO_DIR, tarinfo_filename) - ###################################### index[package_hash] = { "name": name, "version": version, "sha256": package_zip_hash, - ############ updated 8/15 ############ - ##"manifest_path": os.path.join(MANIFEST_DIR, manifest_filename), - ##"tarinfo_path": os.path.join(TARINFO_DIR, tarinfo_filename) # store forward-slash form in JSON for portability "manifest_path": _to_posix(manifest_path_fs), "tarinfo_path": _to_posix(tarinfo_path_fs), - - ###################################### - } - # load that last saved package hash def load_checkpoint(): # # checks if progress.txt exists @@ -223,49 +183,54 @@ def download_from_URL(theURL, package, is_spec=True): # Example: # This -> "compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3.spec.manifest.json" # is turned into this -> "compiler-wrapper__compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3.spec.manifest.json" - package_name = package.replace('/', '__') + package_name = package.replace('/','__') # if is_spec is true, meaning the file ends with ".spec.manifest.json", # then the file is not saved, but the reponse is returned to remove_lines_spec_manifest() for further manipulation # if the file ends with .tar.gz # then the file is saved in 
BINARY_CACHE_DIR - cache_dir = SPEC_CACHE_DIR if is_spec else BINARY_CACHE_DIR - - print(f"location to be saved in {cache_dir} for {package_name}") - + cache_dir = BINARY_CACHE_DIR if not is_spec else None # full file path then is: # "cache/spec_manifests/compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3" #cache/manifest\\compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3.json - cached_path = os.path.join(cache_dir, package_name) - print(f"this is the cached_path {cached_path}") + cached_path = os.path.join(cache_dir, package_name) if cache_dir else None - #if cache exists, it does not need to be redownloaded - if os.path.exists(cached_path): - print(f"Using cached file: {cached_path}") + if is_spec: + print(f"downloading manifest for {package_name}") + else: + ########### UPDATED 8/19 ########### + print(f"temporary save location: {_to_posix(cached_path)} for {package_name}") + #################################### + #if cache exists, it does not need to be redownloaded + if cached_path and os.path.exists(cached_path): + ########### UPDATED 8/19 ########### + print(f"Using cached file: {_to_posix(cached_path)}") + #################################### # rb is read binary with open(cached_path, "rb") as f: return f.read() try: - print("in try block for download_from_URL") - # adding timeout for 60 seconds + ########### UPDATED 8/19 ########### + label = _to_posix(cached_path) if cached_path else f"manifest: {package_name}" + print(f"trying download for {label}") + #################################### + # timeout for 60 seconds response = requests.get(theURL, timeout=60, verify=False) - print(f"trying download for {cached_path} ") + # a response status code is 200 then the request was successful # response.status_code of 404 means does not exist if response.status_code == 200: - print(f"download successful for {cached_path}") - # saves to cache if request is successful - # wb is write binary - if is_spec == False: + if cached_path: + print(f"download successful for {_to_posix(cached_path)}") + #################################### + # saves to cache if request is successful + # wb is write binary with open(cached_path, "wb") as f: f.write(response.content) - - return response.content - else: - return response.content + return response.content else: # if URL does not exist, skip and move to next package @@ -289,14 +254,6 @@ def download_from_URL(theURL, package, is_spec=True): # remove unnecessary lines in file def remove_lines_spec_manifest(myfile): - ############ updated 8/15 ############ - # removes unnecessary bytes - # removed_ends = myfile[49:-834] - - # # converts bytes to dictionary - # database = json.loads(removed_ends) - - # return database # Accept bytes or str; extract between first '{' and last '}' data = myfile if isinstance(myfile, (bytes, bytearray)) else myfile.encode("utf-8") @@ -308,8 +265,6 @@ def remove_lines_spec_manifest(myfile): raise ValueError("Malformed manifest: JSON braces not found") return json.loads(data[start:end+1].decode("utf-8")) - ###################################### - # returns checksum, sha256 hash used to download the binary tarball def access_spec_manifest_media_type(db): @@ -364,17 +319,13 @@ def write_manifest_safely(manifest_path, manifest_data): def read_binary_package(myfile, package, package_zip_hash): file_list = [] try: - # with io.BytesIO(myfile) as tar_buffer: - with tarfile.open(fileobj = tar_buffer, mode="r") as tar: - + with tarfile.open(fileobj = tar_buffer, mode="r:*") as tar: + print(f"Files in the tar archive 
for {package.split('/')[0]}:") i = 1 for member in tar.getmembers(): if member.isfile(): - #print(f"{i}: {member.name}") - #i += 1 - # member.name for compiler-wrapper is "home/software/spack/__spack_path_placeholder__/ # __spack_path_placeholder__/__spack_path_placeholder__/ # __spack_path_placeholder__/__spack_path_placeholder__/ @@ -383,9 +334,6 @@ def read_binary_package(myfile, package, package_zip_hash): # .spack/install_environment.json" # package for compiler-wrapper is "compiler-wrapper/compiler-wrapper-1.0-bsavlbvtqsc7yjtvka3ko3aem4wye2u3" clean_path = remove_placeholder_directories(i, member.name, package) - # ??? instead of just printing, we'll add this to a list (also save a copy of this list to the cache directory) - if we have to restart the program, then trying to get tarball from the list, we can skip the the - # # remove_placeholder function because we will already have a copy of the list of the files in that package - # # to make it easier to add to sqlite database # this will add the files that are in the package to a clean_list if clean_path: @@ -454,14 +402,13 @@ def print_files(package_hash, package_value, index, existing_tarinfo_files, seen # use existing cleaned manifest if available if os.path.exists(manifest_path): - print(f"Using existing cleaned manifest: {manifest_path}") + print(f"Using existing cleaned manifest: {_to_posix(manifest_path)}") with open(manifest_path, "r") as f: clean_spec_manifest = json.load(f) # returns the URL for the spec manifest file and the package_filename theURL, package_filename = make_spec_manifest_URL(package_hash, package_value) - ## and indented theURL to print("βœ… Cleaned and parsed spec manifest") else: # download if manifest does not exist @@ -484,9 +431,6 @@ def print_files(package_hash, package_value, index, existing_tarinfo_files, seen return print("βœ… Loaded cached spec manifest") - - - ############ updated 8/15 ############ try: # remove unneccessary lines from downloaded spec manifest @@ -499,12 +443,10 @@ def print_files(package_hash, package_value, index, existing_tarinfo_files, seen with open(SKIPPED_MANIFESTS_FILE, "a") as f: f.write(f"{package_hash}\n") return - - ###################################### # writes cleaned manifest information to manifest file write_manifest_safely(manifest_path, clean_spec_manifest) - print(f"βœ… Manifest safely written: {manifest_path}") + print(f"βœ… Manifest safely written: {_to_posix(manifest_path)}") # find the mediaType that contains the hash for the package tarball install package_zip_hash = access_spec_manifest_media_type(clean_spec_manifest) @@ -577,9 +519,6 @@ def run_program(package_hash, database, index, existing_tarinfo_files, seen_tarb def main(): - - # remove any corrupt tarballs if they exist before resuming program - ##purge_truncated_binaries() #file_name = "myMedjson.json" # file_name = "myjson.json" # file_name = 'Med_w_compilerwrapper_packages_at_end.json' @@ -622,13 +561,11 @@ def main(): index = load_index() skip = True if last_processed else False - existing_tarinfo_files = set(os.listdir(TARINFO_DIR)) - existing_tarinfo_hashes = { - fname.rsplit("-", 1)[-1].replace(".json", "") for fname in existing_tarinfo_files + existing_tarinfo_files = { + Path(fname).stem + for fname in os.listdir(TARINFO_DIR) + if fname.endswith(".json") } - existing_tarinfo_files = existing_tarinfo_hashes - - # track already-processed tarball hashes to find shared ones seen_tarball_hashes = set() @@ -653,15 +590,12 @@ def main(): manifest_path = entry.get("manifest_path", "") 
tarinfo_path = entry.get("tarinfo_path", "") tarball_hash = entry.get("sha256", "") - - ############ updated 8/15 ############ - ##manifest_exists = manifest_path and os.path.exists(manifest_path) - ##tarinfo_exists = tarinfo_path and os.path.exists(tarinfo_path) + manifest_exists = bool(manifest_path) and Path(manifest_path).exists() tarinfo_exists = bool(tarinfo_path) and Path(tarinfo_path).exists() - ###################################### + if manifest_exists and tarinfo_exists: - #print(f"Skipping fully processed package: {package_hash}") + continue # if tarball previously failed, skip retrying it From 880f545e69fa8cdb285385c229b92ba194dd8fad Mon Sep 17 00:00:00 2001 From: leslie q Date: Thu, 21 Aug 2025 13:54:13 -0700 Subject: [PATCH 17/20] removed spec_manifest info from README.md --- dataset-generation/spack_db/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/dataset-generation/spack_db/README.md b/dataset-generation/spack_db/README.md index 86d4c01..01bbd9e 100644 --- a/dataset-generation/spack_db/README.md +++ b/dataset-generation/spack_db/README.md @@ -16,7 +16,6 @@ The program allows for restart/resume in case there are program run interruption * `cache/spack.index.db.json` - master index * `cache/manifest/` - cleaned spec manifests * `cache/tarinfo/` - JSON file lists extracted from tarballs -* `cache/spec_manifests/` - temporary cache of raw manifests before clean up * `cache/binary_packages/` - temporary cache of downloaded tarballs * `cache/timeouts.txt` - packages that timed out while downloading * `cache/skipped_manifests.txt` - a list of manifests that could not be downloaded From b2de168d3f875a529af9bd8d4f7247be4d0ce2d9 Mon Sep 17 00:00:00 2001 From: leslie q Date: Thu, 21 Aug 2025 13:59:52 -0700 Subject: [PATCH 18/20] removed 'updated' comments --- dataset-generation/spack_db/spack_db.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/dataset-generation/spack_db/spack_db.py b/dataset-generation/spack_db/spack_db.py index 2cf7fca..cea6d15 100644 --- a/dataset-generation/spack_db/spack_db.py +++ b/dataset-generation/spack_db/spack_db.py @@ -61,10 +61,8 @@ def _to_posix(p:str) -> str: return Path(p).as_posix() -########### UPDATED 8/19 ########### def _to_posix(p: str) -> str: return Path(p).as_posix() -#################################### # look for index if it exists to add info to def load_index(): @@ -199,24 +197,21 @@ def download_from_URL(theURL, package, is_spec=True): if is_spec: print(f"downloading manifest for {package_name}") else: - ########### UPDATED 8/19 ########### print(f"temporary save location: {_to_posix(cached_path)} for {package_name}") - #################################### #if cache exists, it does not need to be redownloaded if cached_path and os.path.exists(cached_path): - ########### UPDATED 8/19 ########### + print(f"Using cached file: {_to_posix(cached_path)}") - #################################### + # rb is read binary with open(cached_path, "rb") as f: return f.read() try: - ########### UPDATED 8/19 ########### label = _to_posix(cached_path) if cached_path else f"manifest: {package_name}" print(f"trying download for {label}") - #################################### + # timeout for 60 seconds response = requests.get(theURL, timeout=60, verify=False) @@ -225,7 +220,7 @@ def download_from_URL(theURL, package, is_spec=True): if response.status_code == 200: if cached_path: print(f"download successful for {_to_posix(cached_path)}") - #################################### + # saves to cache if request is 
successful # wb is write binary with open(cached_path, "wb") as f: From dc0ad5cedf923a2775355028c90c62d6b1762dea Mon Sep 17 00:00:00 2001 From: leslie q Date: Thu, 21 Aug 2025 14:43:41 -0700 Subject: [PATCH 19/20] removed 'contributing' from README.md --- dataset-generation/spack_db/README.md | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/dataset-generation/spack_db/README.md b/dataset-generation/spack_db/README.md index 01bbd9e..3158899 100644 --- a/dataset-generation/spack_db/README.md +++ b/dataset-generation/spack_db/README.md @@ -59,15 +59,3 @@ The program allows for restart/resume in case there are program run interruption python Create_spack_DB.py ``` Database will include all files extracted from the packages from the Spack build cache. - -## Contributing - -Pull requests are welcome. For major changes, please open an issue first to discuss what you would like to change. - -Please make sure to update tests as appropriate. - -For more information on contributing see the [CONTRIBUTING](./CONTRIBUTING.md) file. - -## License - -MIT license \ No newline at end of file From 9ebd7fc2ca80e8dd2309dd6c1113e3eef0abe0a5 Mon Sep 17 00:00:00 2001 From: leslie q Date: Thu, 21 Aug 2025 15:09:33 -0700 Subject: [PATCH 20/20] added supporting scripts for testing --- .../spack_db/rename_tarinfo_file.py | 35 +++++++++++++++++++ .../spack_db/spack_index_length.py | 19 ++++++++++ 2 files changed, 54 insertions(+) create mode 100644 dataset-generation/spack_db/rename_tarinfo_file.py create mode 100644 dataset-generation/spack_db/spack_index_length.py diff --git a/dataset-generation/spack_db/rename_tarinfo_file.py b/dataset-generation/spack_db/rename_tarinfo_file.py new file mode 100644 index 0000000..3d16af6 --- /dev/null +++ b/dataset-generation/spack_db/rename_tarinfo_file.py @@ -0,0 +1,35 @@ +import os +import re + +# Path to your tarinfo directory +TARINFO_DIR = "cache/tarinfo" + +# Updated regex: match -<64-char-sha256>.json +pattern = re.compile(r"^(.*)-([a-f0-9]{64})\.json$") + +# Counter +renamed = 0 +skipped = 0 + +for filename in os.listdir(TARINFO_DIR): + match = pattern.match(filename) + if match: + sha256_hash = match.group(2) + new_filename = f"{sha256_hash}.json" + + old_path = os.path.join(TARINFO_DIR, filename) + new_path = os.path.join(TARINFO_DIR, new_filename) + + # Skip if target file already exists + if os.path.exists(new_path): + print(f"⚠️ Skipping {filename} (target {new_filename} already exists)") + skipped += 1 + continue + + os.rename(old_path, new_path) + renamed += 1 + else: + print(f"❓ Skipping non-matching file: {filename}") + skipped += 1 + +print(f"\nβœ… Done! Renamed {renamed} files. Skipped {skipped} files.") diff --git a/dataset-generation/spack_db/spack_index_length.py b/dataset-generation/spack_db/spack_index_length.py new file mode 100644 index 0000000..0111f73 --- /dev/null +++ b/dataset-generation/spack_db/spack_index_length.py @@ -0,0 +1,19 @@ +import json + +# Path to your index file +INDEX_FILE = "cache/spack.index.db.json" + +def main(): + try: + with open(INDEX_FILE, "r") as f: + index = json.load(f) + print(f"βœ… Number of entries in index: {len(index)}") + except FileNotFoundError: + print(f"❌ File not found: {INDEX_FILE}") + except json.JSONDecodeError: + print(f"❌ Failed to parse JSON. The file may be corrupted.") + except Exception as e: + print(f"❌ Unexpected error: {e}") + +if __name__ == "__main__": + main() \ No newline at end of file
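
The README above ends by telling the user to run `Create_spack_DB.py`, but that script is not included in this patch series. Below is a minimal, illustrative sketch of how the master index and the per-tarball file lists could be loaded into SQLite, assuming the index layout described in the README (`name`, `version`, `sha256`, `manifest_path`, `tarinfo_path`) and assuming each `cache/tarinfo/<sha256>.json` file is a flat JSON list of file paths. The output filename and the `packages`/`files` table schema are assumptions for illustration, not the project's actual `Create_spack_DB.py`.

```python
# Illustrative sketch only: load cache/spack.index.db.json and the per-tarball
# file lists in cache/tarinfo/ into a SQLite database. Table names, schema,
# and the output filename are assumptions, not the project's Create_spack_DB.py.
import json
import os
import sqlite3

INDEX_FILE = "cache/spack.index.db.json"
DB_FILE = "spack.db"  # assumed output name

def build_db():
    with open(INDEX_FILE, "r") as f:
        index = json.load(f)

    conn = sqlite3.connect(DB_FILE)
    cur = conn.cursor()
    cur.execute("""CREATE TABLE IF NOT EXISTS packages (
                       package_hash TEXT PRIMARY KEY,
                       name TEXT,
                       version TEXT,
                       sha256 TEXT)""")
    cur.execute("""CREATE TABLE IF NOT EXISTS files (
                       package_hash TEXT,
                       path TEXT)""")

    for package_hash, entry in index.items():
        # one row per package, keyed by the spec manifest hash
        cur.execute("INSERT OR REPLACE INTO packages VALUES (?, ?, ?, ?)",
                    (package_hash, entry.get("name"), entry.get("version"),
                     entry.get("sha256")))

        # the tarinfo JSON is assumed to be a flat list of file paths
        tarinfo_path = entry.get("tarinfo_path", "")
        if tarinfo_path and os.path.exists(tarinfo_path):
            with open(tarinfo_path, "r") as tf:
                file_list = json.load(tf)
            cur.executemany("INSERT INTO files VALUES (?, ?)",
                            [(package_hash, p) for p in file_list])

    conn.commit()
    conn.close()

if __name__ == "__main__":
    build_db()
```

With a layout like this, a join such as `SELECT p.name, p.version FROM packages p JOIN files f ON f.package_hash = p.package_hash WHERE f.path LIKE '%bin/cmake%'` would answer the typical question of which cached packages ship a given file.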
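
Because `rename_tarinfo_file.py` renames cached tarinfo files to `<sha256>.json` while the master index keeps storing explicit `manifest_path` and `tarinfo_path` values, a quick consistency pass over the index can catch entries whose files were renamed, moved, or deleted. Here is a small sketch along those lines, assuming only the index layout shown above (this helper is not part of the patch series):

```python
# Hypothetical helper: list index entries whose recorded manifest or tarinfo
# file no longer exists on disk (e.g. after renaming or pruning the cache).
import json
from pathlib import Path

INDEX_FILE = "cache/spack.index.db.json"

def main():
    with open(INDEX_FILE, "r") as f:
        index = json.load(f)

    missing = 0
    for package_hash, entry in index.items():
        for key in ("manifest_path", "tarinfo_path"):
            path = entry.get(key, "")
            if path and not Path(path).exists():
                print(f"MISSING {key} for {package_hash}: {path}")
                missing += 1

    print(f"Checked {len(index)} entries, {missing} missing file(s).")

if __name__ == "__main__":
    main()
```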