diff --git a/install/helioviewer/hvpull/servers/__init__.py b/install/helioviewer/hvpull/servers/__init__.py index 1ac237ae..5c871dd5 100644 --- a/install/helioviewer/hvpull/servers/__init__.py +++ b/install/helioviewer/hvpull/servers/__init__.py @@ -1,22 +1,49 @@ """Classes for working with known data servers""" import os +import re import datetime + +def get_datetime_from_file(filename): + """Extract datetime from filename using regex matching for date formats '%Y_%m_%d__%H_%M_%S', '%Y%m%dT%H%M%S', or '%Y%m%d%H%M%S'""" + url_filename = os.path.basename(filename) + + # Try format: YYYY_MM_DD__HH_MM_SS + match = re.search(r'(\d{4}_\d{2}_\d{2}__\d{2}_\d{2}_\d{2})', url_filename) + if match: + url_datetime = match.group(1) + return datetime.datetime.strptime(url_datetime, '%Y_%m_%d__%H_%M_%S') + + # Try format: YYYYMMDDTHHMMSS + match = re.search(r'(\d{8}T\d{6})', url_filename) + if match: + url_datetime = match.group(1) + return datetime.datetime.strptime(url_datetime, '%Y%m%dT%H%M%S') + + # Try format: YYYYMMDDHHMMSS + match = re.search(r'(\d{14})', url_filename) + if match: + url_datetime = match.group(1) + return datetime.datetime.strptime(url_datetime, '%Y%m%d%H%M%S') + + raise ValueError(f"No valid datetime format found in filename: {filename}") + + class DataServer: """Class for interacting with data servers.""" def __init__(self, uri, name, pause=3): self.uri = uri self.name = name self.pause = datetime.timedelta(minutes=pause) - + # Example: 2011_11_17__08_13_08_13__SDO_AIA_AIA_304.jp2 self.filename_regex = ( "^(?P\d{4})_(?P\d{2})_(?P\d{2})__" + - "(?P\d{2})_(?P\d{2})_(?P\d{2})_" + + "(?P\d{2})_(?P\d{2})_(?P\d{2})_" + "(?P\d{2,3})__" + "(?P[a-zA-Z0-9]{3})_(?P[a-zA-Z0-9]{3})_" + "(?P[a-zA-Z0-9]{3})_(?P[a-zA-Z0-9]{2,11})\.jp2$") - + def compute_directories(self, start_date, end_date): """Creates a list of possible directories containing new files""" return [] @@ -24,7 +51,7 @@ def compute_directories(self, start_date, end_date): def get_starttime(self): """Default start time to use when retrieving data""" return datetime.datetime.utcnow() - datetime.timedelta(hours=6) - + def get_dates(self, starttime, endtime): """Get a complete list of dates between the start and the end time""" fmt = "%Y/%m/%d" @@ -34,13 +61,13 @@ def get_dates(self, starttime, endtime): while date < endtime.date(): date = date + datetime.timedelta(days=1) dates.append(date.strftime(fmt)) - + # Ensure the dates are most recent first dates.sort() dates.reverse() - + return dates - + def get_file_regex(self): """Returns a regex which described the expected format of filenames on the server""" @@ -49,15 +76,13 @@ def get_file_regex(self): def get_measurements(self, nicknames, dates): """Get a list of all the URIs down to the measurement""" return None - + def get_uri(self): """Return the server URI""" return self.uri def get_datetime_from_file(self, filename): - url_filename = os.path.basename(filename) - url_datetime = url_filename[0:20] - return datetime.datetime.strptime(url_datetime, '%Y_%m_%d__%H_%M_%S') + return get_datetime_from_file(filename) class DataServerPauseDelayDefinesDefaultStartTime: @@ -115,7 +140,5 @@ def get_uri(self): return self.uri def get_datetime_from_file(self, filename): - url_filename = os.path.basename(filename) - url_datetime = url_filename[0:20] - return datetime.datetime.strptime(url_datetime, '%Y_%m_%d__%H_%M_%S') + return get_datetime_from_file(filename) diff --git a/install/helioviewer/hvpull/servers/hv.py b/install/helioviewer/hvpull/servers/hv.py index 04fc9ef7..d9cfe6d5 100644 --- a/install/helioviewer/hvpull/servers/hv.py +++ b/install/helioviewer/hvpull/servers/hv.py @@ -16,28 +16,35 @@ def compute_directories(self, start_date, end_date): # Start with date directories for date in self.get_dates(start_date, end_date): date_url = os.path.join(self.uri, date) + # Recursively enumerate subdirectories starting from date URL + dirs.extend(self._enumerate_subdirectories(date_url)) - # Query the URL to find subdirectories - try: - response = requests.get(date_url) - response.raise_for_status() + return dirs - # Extract subdirectory links from HTML - subdirs = self._parse_directory_links(response.content.decode('utf-8')) + def _enumerate_subdirectories(self, url): + """Recursively enumerate subdirectories by querying the URL""" + try: + response = requests.get(url) + response.raise_for_status() - if subdirs: - # Add each subdirectory with date_url as prefix - for subdir in subdirs: - dirs.append(f"{date_url}/{subdir}") - else: - # No subdirectories found, add the date URL itself - dirs.append(date_url) + # Extract subdirectory links from HTML + subdirs = self._parse_directory_links(response.content.decode('utf-8')) - except requests.RequestException: - # If we can't query the URL, add it as-is - dirs.append(date_url) + if not subdirs: + # No subdirectories found, this is a leaf directory + return [url] - return dirs + # Recursively enumerate each subdirectory + all_dirs = [] + for subdir in subdirs: + subdir_url = f"{url}/{subdir}" + all_dirs.extend(self._enumerate_subdirectories(subdir_url)) + + return all_dirs + + except requests.RequestException: + # If we can't query the URL, return it as a leaf directory + return [url] def _parse_directory_links(self, html): """Parse HTML content and extract directory links"""