diff --git a/src/nomadic/start/data/NOMADS_Library_Worksheet.xlsx b/src/nomadic/start/data/NOMADS_Library_Worksheet.xlsx index 4d547a8..10f4dd4 100644 Binary files a/src/nomadic/start/data/NOMADS_Library_Worksheet.xlsx and b/src/nomadic/start/data/NOMADS_Library_Worksheet.xlsx differ diff --git a/src/nomadic/util/metadata.py b/src/nomadic/util/metadata.py index e287d74..c70fcbf 100644 --- a/src/nomadic/util/metadata.py +++ b/src/nomadic/util/metadata.py @@ -4,10 +4,10 @@ from typing import List, Optional import pandas as pd +from openpyxl import load_workbook from .exceptions import MetadataFormatError - STANDARD_METADATA_FILENAME = "samples.csv" @@ -49,6 +49,9 @@ def correct_barcode_format(barcode: str, try_to_fix: bool = True) -> str: EXPECTED = "barcode[0-9]{2}$" EXAMPLE = "barcode01" + if isinstance(barcode, float): + barcode = int(barcode) + if not isinstance(barcode, str): barcode = str(barcode) @@ -58,11 +61,6 @@ def correct_barcode_format(barcode: str, try_to_fix: bool = True) -> str: f"Barcode '{barcode}' has bad format: must conform to '{EXAMPLE}'." ) - # Raise a warning - warnings.warn( - f"Barcode '{barcode}' has bad format: must conform to '{EXAMPLE}'. Trying to fix..." - ) - nums = re.findall("[0-9]+", barcode) if not nums: @@ -96,7 +94,7 @@ class MetadataTableParser: # If the required columns are not found, try these alternative names, case insensitive ALTERNATIVE_NAMES = { - "barcode": ["barcodes"], + "barcode": ["barcodes", "barcode#"], "sample_id": [ "sample", "sampleid", @@ -133,17 +131,45 @@ def _load_metadata(self, path: str): _, ext = os.path.splitext(path) ext = ext.lower() if ext == ".xlsx": - xlsx = pd.ExcelFile(path, engine="openpyxl") + warnings.filterwarnings("ignore", category=UserWarning, module="openpyxl") + xlsx = load_workbook(path, data_only=True) # name in nomadic excel template, and in the (legacy) warehouse template - target_sheets = ["nomadic", "rxn_metadata"] + target_sheets = ["Library", "rxn_metadata"] # Find first matching sheetname or use first sheet sheet_names = [ - sheetname - for sheetname in target_sheets - if sheetname in xlsx.sheet_names - ] + [xlsx.sheet_names[0]] - data = pd.read_excel(path, sheet_name=sheet_names[0], engine="openpyxl") + sheetname for sheetname in target_sheets if sheetname in xlsx.sheetnames + ] + [xlsx.sheetnames[0]] + # Get the sheet and table + ws = xlsx[sheet_names[0]] + tbl_name = "tbl_SeqLib" + tbl = ws.tables[tbl_name] + cells = ws[tbl.ref] + start_col = cells[0][0].column # 1-based worksheet column index + + # Collect ALL hidden column ranges (including grouped ones) to identify only + # visible columns + hidden_ranges = [ + (dim.min, dim.max) + for dim in ws.column_dimensions.values() + if dim.hidden is True + ] + visible_cols = [] + for i in range(len(cells[0])): + col_idx = start_col + i + hidden = any(lo <= col_idx <= hi for lo, hi in hidden_ranges) + + if not hidden: + visible_cols.append(i) + + # Extract data from visible columns ONLY + cells = ws[tbl.ref] + rows = [[cell.value for cell in row] for row in cells] + rows_filt = [[row[i] for i in visible_cols] for row in rows] + data = pd.DataFrame(rows_filt[1:], columns=rows_filt[0]) + + # Ensure that empty rows or those with missing sample_id are not included data.dropna(how="all", inplace=True) + data = data.dropna(subset=["Sample ID"]) self.df = data else: self.df = pd.read_csv(path, delimiter=get_csv_delimiter(path))