Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified src/nomadic/start/data/NOMADS_Library_Worksheet.xlsx
Binary file not shown.
54 changes: 40 additions & 14 deletions src/nomadic/util/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
from typing import List, Optional

import pandas as pd
from openpyxl import load_workbook

from .exceptions import MetadataFormatError


STANDARD_METADATA_FILENAME = "samples.csv"


Expand Down Expand Up @@ -49,6 +49,9 @@ def correct_barcode_format(barcode: str, try_to_fix: bool = True) -> str:
EXPECTED = "barcode[0-9]{2}$"
EXAMPLE = "barcode01"

if isinstance(barcode, float):
barcode = int(barcode)

if not isinstance(barcode, str):
barcode = str(barcode)

Expand All @@ -58,11 +61,6 @@ def correct_barcode_format(barcode: str, try_to_fix: bool = True) -> str:
f"Barcode '{barcode}' has bad format: must conform to '{EXAMPLE}'."
)

# Raise a warning
warnings.warn(
f"Barcode '{barcode}' has bad format: must conform to '{EXAMPLE}'. Trying to fix..."
)

nums = re.findall("[0-9]+", barcode)

if not nums:
Expand Down Expand Up @@ -96,7 +94,7 @@ class MetadataTableParser:

# If the required columns are not found, try these alternative names, case insensitive
ALTERNATIVE_NAMES = {
"barcode": ["barcodes"],
"barcode": ["barcodes", "barcode#"],
"sample_id": [
"sample",
"sampleid",
Expand Down Expand Up @@ -133,17 +131,45 @@ def _load_metadata(self, path: str):
_, ext = os.path.splitext(path)
ext = ext.lower()
if ext == ".xlsx":
xlsx = pd.ExcelFile(path, engine="openpyxl")
warnings.filterwarnings("ignore", category=UserWarning, module="openpyxl")
xlsx = load_workbook(path, data_only=True)
# name in nomadic excel template, and in the (legacy) warehouse template
target_sheets = ["nomadic", "rxn_metadata"]
target_sheets = ["Library", "rxn_metadata"]
# Find first matching sheetname or use first sheet
sheet_names = [
sheetname
for sheetname in target_sheets
if sheetname in xlsx.sheet_names
] + [xlsx.sheet_names[0]]
data = pd.read_excel(path, sheet_name=sheet_names[0], engine="openpyxl")
sheetname for sheetname in target_sheets if sheetname in xlsx.sheetnames
] + [xlsx.sheetnames[0]]
# Get the sheet and table
ws = xlsx[sheet_names[0]]
tbl_name = "tbl_SeqLib"
tbl = ws.tables[tbl_name]
cells = ws[tbl.ref]
start_col = cells[0][0].column # 1-based worksheet column index

# Collect ALL hidden column ranges (including grouped ones) to identify only
# visible columns
hidden_ranges = [
(dim.min, dim.max)
for dim in ws.column_dimensions.values()
if dim.hidden is True
]
visible_cols = []
for i in range(len(cells[0])):
col_idx = start_col + i
hidden = any(lo <= col_idx <= hi for lo, hi in hidden_ranges)

if not hidden:
visible_cols.append(i)

# Extract data from visible columns ONLY
cells = ws[tbl.ref]
rows = [[cell.value for cell in row] for row in cells]
rows_filt = [[row[i] for i in visible_cols] for row in rows]
data = pd.DataFrame(rows_filt[1:], columns=rows_filt[0])

# Ensure that empty rows or those with missing sample_id are not included
data.dropna(how="all", inplace=True)
data = data.dropna(subset=["Sample ID"])
self.df = data
else:
self.df = pd.read_csv(path, delimiter=get_csv_delimiter(path))
Expand Down
Loading