From 31f97fcccde9ecaf923d5a17ff906845cabaa2fb Mon Sep 17 00:00:00 2001 From: ShigrafS <140247389+ShigrafS@users.noreply.github.com> Date: Sun, 20 Apr 2025 09:48:44 +0000 Subject: [PATCH 1/2] Fixed CLI errors. --- pairtools/cli/dedup.py | 76 +++++++++++++++-------------- pairtools/cli/sort.py | 49 ++++++++++++++----- pairtools/lib/headerops.py | 45 +++++++++++++++++ tests/test_headerops.py | 98 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 221 insertions(+), 47 deletions(-) diff --git a/pairtools/cli/dedup.py b/pairtools/cli/dedup.py index fe6f197c..1512dd99 100644 --- a/pairtools/cli/dedup.py +++ b/pairtools/cli/dedup.py @@ -386,7 +386,6 @@ def dedup_py( n_proc, **kwargs, ): - sep = ast.literal_eval('"""' + sep + '"""') send_header_to_dedup = send_header_to in ["both", "dedup"] send_header_to_dup = send_header_to in ["both", "dups"] @@ -488,6 +487,34 @@ def dedup_py( logger.warning( "Pairs file appears not to be sorted, dedup might produce wrong results." ) + + # Canonicalize column names for flexible matching + column_names = headerops.extract_column_names(header) + column_names = headerops.canonicalize_columns(column_names) + + # Get column indices with fallbacks + try: + col1 = headerops.get_column_index(column_names, c1) + col2 = headerops.get_column_index(column_names, c2) + colp1 = headerops.get_column_index(column_names, p1) + colp2 = headerops.get_column_index(column_names, p2) + cols1 = headerops.get_column_index(column_names, s1) + cols2 = headerops.get_column_index(column_names, s2) + + # Handle extra column pairs + extra_cols1 = [] + extra_cols2 = [] + if extra_col_pair is not None: + for col1_spec, col2_spec in extra_col_pair: + try: + extra_cols1.append(headerops.get_column_index(column_names, col1_spec)) + extra_cols2.append(headerops.get_column_index(column_names, col2_spec)) + except ValueError: + logger.warning(f"Extra column pair ({col1_spec}, {col2_spec}) not found in header, skipping") + continue + except ValueError as e: + raise ValueError(f"Column error: {str(e)}") from e + header = headerops.append_new_pg(header, ID=UTIL_NAME, PN=UTIL_NAME) dups_header = header.copy() if keep_parent_id and len(dups_header) > 0: @@ -505,38 +532,17 @@ def dedup_py( ): outstream_unmapped.writelines((l + "\n" for l in header)) - column_names = headerops.extract_column_names(header) - extra_cols1 = [] - extra_cols2 = [] - if extra_col_pair is not None: - for col1, col2 in extra_col_pair: - extra_cols1.append(column_names[col1] if col1.isnumeric() else col1) - extra_cols2.append(column_names[col2] if col2.isnumeric() else col2) - if backend == "cython": - # warnings.warn( - # "'cython' backend is deprecated and provided only" - # " for backwards compatibility", - # DeprecationWarning, - # ) - extra_cols1 = [column_names.index(col) for col in extra_cols1] - extra_cols2 = [column_names.index(col) for col in extra_cols2] - c1 = column_names.index(c1) - c2 = column_names.index(c2) - p1 = column_names.index(p1) - p2 = column_names.index(p2) - s1 = column_names.index(s1) - s2 = column_names.index(s2) streaming_dedup_cython( method, max_mismatch, sep, - c1, - c2, - p1, - p2, - s1, - s2, + col1, + col2, + colp1, + colp2, + cols1, + cols2, extra_cols1, extra_cols2, unmapped_chrom, @@ -557,7 +563,7 @@ def dedup_py( method=method, mark_dups=mark_dups, max_mismatch=max_mismatch, - extra_col_pairs=list(extra_col_pair), + extra_col_pairs=list(zip(extra_cols1, extra_cols2)) if extra_cols1 else [], keep_parent_id=keep_parent_id, unmapped_chrom=unmapped_chrom, outstream=outstream, @@ -566,12 +572,12 @@ def dedup_py( out_stat=out_stat, backend=backend, n_proc=n_proc, - c1=c1, - c2=c2, - p1=p1, - p2=p2, - s1=s1, - s2=s2, + c1=col1, + c2=col2, + p1=colp1, + p2=colp2, + s1=cols1, + s2=cols2, ) else: raise ValueError("Unknown backend") diff --git a/pairtools/cli/sort.py b/pairtools/cli/sort.py index 06c63d28..0681c921 100644 --- a/pairtools/cli/sort.py +++ b/pairtools/cli/sort.py @@ -6,10 +6,12 @@ import subprocess import shutil import warnings +from .._logging import get_logger from ..lib import fileio, pairsam_format, headerops from . import cli, common_io_options +logger = get_logger() UTIL_NAME = "pairtools_sort" @@ -58,6 +60,7 @@ default=pairsam_format.COLUMNS_PAIRS[7], help=f"Pair type column; default {pairsam_format.COLUMNS_PAIRS[7]}" "[input format option]", + required=False, ) @click.option( "--extra-col", @@ -157,7 +160,6 @@ def sort_py( compress_program, **kwargs, ): - instream = fileio.auto_open( pairs_path, mode="r", @@ -176,7 +178,6 @@ def sort_py( header = headerops.mark_header_as_sorted(header) outstream.writelines((l + "\n" for l in header)) - outstream.flush() if compress_program == "auto": @@ -191,16 +192,42 @@ def sort_py( compress_program = "gzip" column_names = headerops.extract_column_names(header) - columns = [c1, c2, p1, p2, pt] + list(extra_col) - # Now generating the "-k ," expressions for all columns. - # If column name is in the default pairsam format and has an integer dtype there, do numerical sorting + column_names = headerops.canonicalize_columns(column_names) + + # Get column indices with fallbacks + try: + col1 = headerops.get_column_index(column_names, c1) + col2 = headerops.get_column_index(column_names, c2) + colp1 = headerops.get_column_index(column_names, p1) + colp2 = headerops.get_column_index(column_names, p2) + + # Make pair_type optional + try: + colpt = headerops.get_column_index(column_names, pt) if pt else None + except ValueError: + colpt = None + + extra_cols = [] + for col in extra_col: + try: + extra_cols.append(headerops.get_column_index(column_names, col)) + except ValueError: + logger.warning(f"Extra column {col} not found in header, skipping") + continue + except ValueError as e: + raise ValueError(f"Column error: {str(e)}") from e + + # Generate sort command columns cols = [] - for col in columns: - colindex = int(col) if col.isnumeric() else column_names.index(col) + 1 + for i, col in enumerate([col1, colp1, col2, colp2, colpt] + extra_cols): + if col is None: + continue # Skip optional columns that weren't found + dtype = pairsam_format.DTYPES_PAIRSAM.get(column_names[col], str) cols.append( - f"-k {colindex},{colindex}{'n' if issubclass(pairsam_format.DTYPES_PAIRSAM.get(column_names[colindex-1], str), int) else ''}" + f"-k {col+1},{col+1}{'n' if issubclass(dtype, int) else ''}" ) cols = " ".join(cols) + command = rf""" /bin/bash -c 'export LC_COLLATE=C; export LANG=C; sort {cols} @@ -210,9 +237,8 @@ def sort_py( {f'--temporary-directory={tmpdir}' if tmpdir else ''} -S {memory} {f'--compress-program={compress_program}' if compress_program else ''}' - """.replace( - "\n", " " - ) + """.replace("\n", " ") + with subprocess.Popen( command, stdin=subprocess.PIPE, bufsize=-1, shell=True, stdout=outstream ) as process: @@ -224,7 +250,6 @@ def sort_py( if instream != sys.stdin: instream.close() - if outstream != sys.stdout: outstream.close() diff --git a/pairtools/lib/headerops.py b/pairtools/lib/headerops.py index 14624df4..54d2f53b 100644 --- a/pairtools/lib/headerops.py +++ b/pairtools/lib/headerops.py @@ -22,6 +22,51 @@ +def canonicalize_columns(columns): + """Convert between common column name variants.""" + canonical_map = { + 'chr1': 'chrom1', + 'chr2': 'chrom2', + 'chrom1': 'chrom1', # Ensure identity mapping + 'chrom2': 'chrom2', + 'pt': 'pair_type', + 'pair_type': 'pair_type' + } + return [canonical_map.get(col.lower(), col) for col in columns] + +def get_column_index(column_names, column_spec): + """Get column index with flexible name matching.""" + if isinstance(column_spec, int): + if -len(column_names) <= column_spec < len(column_names): + return column_spec % len(column_names) # Handle negative indices + raise ValueError(f"Column index {column_spec} out of range") + + if not isinstance(column_spec, (str, int)): + raise AttributeError(f"Column spec must be string or integer, got {type(column_spec)}") + + # Try direct match first + try: + return column_names.index(column_spec) + except ValueError: + pass + + # Try canonical name + canonical = canonicalize_columns([column_spec])[0] + try: + return column_names.index(canonical) + except ValueError: + pass + + # Try case-insensitive + lower_columns = [c.lower() for c in column_names] + try: + return lower_columns.index(canonical.lower()) + except ValueError: + available = ', '.join(f"'{c}'" for c in column_names) + raise ValueError( + f"Column '{column_spec}' not found. Available columns: {available}" + ) + def get_header(instream, comment_char=COMMENT_CHAR, ignore_warning=False): """Returns a header from the stream and an the reaminder of the stream with the actual data. diff --git a/tests/test_headerops.py b/tests/test_headerops.py index 8525ea64..7b401631 100644 --- a/tests/test_headerops.py +++ b/tests/test_headerops.py @@ -4,6 +4,104 @@ import pytest +import pytest +from pairtools.lib.headerops import ( + canonicalize_columns, + get_column_index, + extract_column_names, +) + +def test_canonicalize_columns(): + # Test basic canonicalization + assert canonicalize_columns(['chr1', 'chr2']) == ['chrom1', 'chrom2'] + assert canonicalize_columns(['chrom1', 'chrom2']) == ['chrom1', 'chrom2'] + assert canonicalize_columns(['pt', 'other']) == ['pair_type', 'other'] + + # Test mixed case + assert canonicalize_columns(['Chr1', 'CHR2']) == ['chrom1', 'chrom2'] + assert canonicalize_columns(['CHR1', 'chr2']) == ['chrom1', 'chrom2'] + + # Test no changes needed + assert canonicalize_columns(['readID', 'pos1']) == ['readID', 'pos1'] + + # Test empty input + assert canonicalize_columns([]) == [] + + # Test all known aliases + assert canonicalize_columns(['chr1', 'Chr2', 'PT']) == ['chrom1', 'chrom2', 'pair_type'] + +def test_get_column_index(): + # Setup test columns + columns = ['readID', 'chr1', 'pos1', 'chr2', 'pos2', 'strand1', 'strand2', 'pair_type'] + + # Test string lookup - direct matches + assert get_column_index(columns, 'chr1') == 1 + assert get_column_index(columns, 'pos2') == 4 + assert get_column_index(columns, 'pair_type') == 7 + + # Test string lookup - canonicalized matches + assert get_column_index(columns, 'chrom1') == 1 + assert get_column_index(columns, 'CHROM2') == 3 + assert get_column_index(columns, 'PT') == 7 + + # Test case insensitive matches + assert get_column_index(columns, 'CHR1') == 1 + assert get_column_index(columns, 'ChR2') == 3 + + # Test integer lookup + assert get_column_index(columns, 0) == 0 + assert get_column_index(columns, 3) == 3 + assert get_column_index(columns, 7) == 7 + + # Test error cases + with pytest.raises(ValueError, match="Column 'nonexistent' not found"): + get_column_index(columns, 'nonexistent') + + with pytest.raises(ValueError, match="Column index 100 out of range"): + get_column_index(columns, 100) + + with pytest.raises(AttributeError, match="Column spec must be string or integer"): + get_column_index(columns, 3.14) + +def test_integration_with_extract_column_names(): + # Test with actual header format + header = [ + "## pairs format v1.0", + "#columns: readID chr1 pos1 chr2 pos2 strand1 strand2 pair_type", + "#chromsize: chr1 1000", + "#chromsize: chr2 800" + ] + + columns = extract_column_names(header) + assert columns == ['readID', 'chr1', 'pos1', 'chr2', 'pos2', 'strand1', 'strand2', 'pair_type'] + + # Test canonicalized column access + assert get_column_index(columns, 'chrom1') == 1 + assert get_column_index(columns, 'chrom2') == 3 + assert get_column_index(columns, 'pt') == 7 + + # Test with alternative header format + header2 = [ + "## pairs format v1.0", + "#columns: readID chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type", + ] + columns2 = extract_column_names(header2) + assert get_column_index(columns2, 'chr1') == 1 + assert get_column_index(columns2, 'chr2') == 3 + +def test_edge_cases(): + # Test empty columns + with pytest.raises(ValueError): + get_column_index([], 'chrom1') + + # Test invalid column spec type + with pytest.raises(AttributeError): + get_column_index(['a', 'b', 'c'], 3.14) # float not supported + + # Test negative indices + assert get_column_index(['a', 'b', 'c'], -1) == 2 # Python-style negative indexing + + def test_make_standard_header(): header = headerops.make_standard_pairsheader() From 89d1148dbbfbd29007b3a8eb158a6a451c01fd8d Mon Sep 17 00:00:00 2001 From: ShigrafS Date: Sun, 11 Jan 2026 17:21:56 +0530 Subject: [PATCH 2/2] fix: addressed review comments --- pairtools/cli/dedup.py | 39 ++++++++-------- pairtools/cli/sort.py | 17 +++---- pairtools/lib/headerops.py | 54 +++++++++------------ pairtools/lib/pairsam_format.py | 8 ++++ tests/test_headerops.py | 83 +++++++++++++++++++++------------ 5 files changed, 111 insertions(+), 90 deletions(-) diff --git a/pairtools/cli/dedup.py b/pairtools/cli/dedup.py index 1512dd99..2215a015 100644 --- a/pairtools/cli/dedup.py +++ b/pairtools/cli/dedup.py @@ -490,16 +490,17 @@ def dedup_py( # Canonicalize column names for flexible matching column_names = headerops.extract_column_names(header) - column_names = headerops.canonicalize_columns(column_names) + # Ensure all columns are standardized before lookup + column_names = [headerops.standardize_column(c) for c in column_names] # Get column indices with fallbacks try: - col1 = headerops.get_column_index(column_names, c1) - col2 = headerops.get_column_index(column_names, c2) - colp1 = headerops.get_column_index(column_names, p1) - colp2 = headerops.get_column_index(column_names, p2) - cols1 = headerops.get_column_index(column_names, s1) - cols2 = headerops.get_column_index(column_names, s2) + col_c1 = headerops.get_column_index(column_names, c1) + col_c2 = headerops.get_column_index(column_names, c2) + col_p1 = headerops.get_column_index(column_names, p1) + col_p2 = headerops.get_column_index(column_names, p2) + col_s1 = headerops.get_column_index(column_names, s1) + col_s2 = headerops.get_column_index(column_names, s2) # Handle extra column pairs extra_cols1 = [] @@ -537,12 +538,12 @@ def dedup_py( method, max_mismatch, sep, - col1, - col2, - colp1, - colp2, - cols1, - cols2, + col_c1, + col_c2, + col_p1, + col_p2, + col_s1, + col_s2, extra_cols1, extra_cols2, unmapped_chrom, @@ -572,12 +573,12 @@ def dedup_py( out_stat=out_stat, backend=backend, n_proc=n_proc, - c1=col1, - c2=col2, - p1=colp1, - p2=colp2, - s1=cols1, - s2=cols2, + c1=col_c1, + c2=col_c2, + p1=col_p1, + p2=col_p2, + s1=col_s1, + s2=col_s2, ) else: raise ValueError("Unknown backend") diff --git a/pairtools/cli/sort.py b/pairtools/cli/sort.py index 0681c921..b6132f6a 100644 --- a/pairtools/cli/sort.py +++ b/pairtools/cli/sort.py @@ -192,20 +192,21 @@ def sort_py( compress_program = "gzip" column_names = headerops.extract_column_names(header) - column_names = headerops.canonicalize_columns(column_names) + # Ensure all columns are standardized before lookup + column_names = [headerops.standardize_column(c) for c in column_names] # Get column indices with fallbacks try: - col1 = headerops.get_column_index(column_names, c1) - col2 = headerops.get_column_index(column_names, c2) - colp1 = headerops.get_column_index(column_names, p1) - colp2 = headerops.get_column_index(column_names, p2) + col_c1 = headerops.get_column_index(column_names, c1) + col_c2 = headerops.get_column_index(column_names, c2) + col_p1 = headerops.get_column_index(column_names, p1) + col_p2 = headerops.get_column_index(column_names, p2) # Make pair_type optional try: - colpt = headerops.get_column_index(column_names, pt) if pt else None + col_pt = headerops.get_column_index(column_names, pt) if pt else None except ValueError: - colpt = None + col_pt = None extra_cols = [] for col in extra_col: @@ -219,7 +220,7 @@ def sort_py( # Generate sort command columns cols = [] - for i, col in enumerate([col1, colp1, col2, colp2, colpt] + extra_cols): + for i, col in enumerate([col_c1, col_p1, col_c2, col_p2, col_pt] + extra_cols): if col is None: continue # Skip optional columns that weren't found dtype = pairsam_format.DTYPES_PAIRSAM.get(column_names[col], str) diff --git a/pairtools/lib/headerops.py b/pairtools/lib/headerops.py index 54d2f53b..3c3646b8 100644 --- a/pairtools/lib/headerops.py +++ b/pairtools/lib/headerops.py @@ -22,50 +22,38 @@ -def canonicalize_columns(columns): - """Convert between common column name variants.""" - canonical_map = { - 'chr1': 'chrom1', - 'chr2': 'chrom2', - 'chrom1': 'chrom1', # Ensure identity mapping - 'chrom2': 'chrom2', - 'pt': 'pair_type', - 'pair_type': 'pair_type' - } - return [canonical_map.get(col.lower(), col) for col in columns] - -def get_column_index(column_names, column_spec): +def standardize_column(col): + """Standardize column name.""" + # Resolve aliases using centralized map + return pairsam_format.COLUMN_ALIASES.get(col, col) + +def get_column_index(column_names, col): """Get column index with flexible name matching.""" - if isinstance(column_spec, int): - if -len(column_names) <= column_spec < len(column_names): - return column_spec % len(column_names) # Handle negative indices - raise ValueError(f"Column index {column_spec} out of range") + if isinstance(col, int): + if -len(column_names) <= col < len(column_names): + return col % len(column_names) # Handle negative indices + raise ValueError(f"Column index {col} out of range") - if not isinstance(column_spec, (str, int)): - raise AttributeError(f"Column spec must be string or integer, got {type(column_spec)}") + if not isinstance(col, (str, int)): + raise AttributeError(f"Column spec must be string or integer, got {type(col)}") # Try direct match first try: - return column_names.index(column_spec) + return column_names.index(col) except ValueError: pass - # Try canonical name - canonical = canonicalize_columns([column_spec])[0] + # Try standardized name + standardized = standardize_column(col) try: - return column_names.index(canonical) + return column_names.index(standardized) except ValueError: pass - - # Try case-insensitive - lower_columns = [c.lower() for c in column_names] - try: - return lower_columns.index(canonical.lower()) - except ValueError: - available = ', '.join(f"'{c}'" for c in column_names) - raise ValueError( - f"Column '{column_spec}' not found. Available columns: {available}" - ) + + available = ', '.join(f"'{c}'" for c in column_names) + raise ValueError( + f"Column '{col}' not found. Available columns: {available}" + ) def get_header(instream, comment_char=COMMENT_CHAR, ignore_warning=False): """Returns a header from the stream and an the reaminder of the stream diff --git a/pairtools/lib/pairsam_format.py b/pairtools/lib/pairsam_format.py index 7383bab1..e1e7a01a 100644 --- a/pairtools/lib/pairsam_format.py +++ b/pairtools/lib/pairsam_format.py @@ -123,3 +123,11 @@ "algn_idx": int, "same_side_algn_count": int, } + +# Standard column aliases for flexibility +COLUMN_ALIASES = { + 'chr1': 'chrom1', + 'chr2': 'chrom2', + 'pt': 'pair_type', +} + diff --git a/tests/test_headerops.py b/tests/test_headerops.py index 7b401631..0c325194 100644 --- a/tests/test_headerops.py +++ b/tests/test_headerops.py @@ -6,29 +6,24 @@ import pytest from pairtools.lib.headerops import ( - canonicalize_columns, + standardize_column, get_column_index, extract_column_names, ) -def test_canonicalize_columns(): - # Test basic canonicalization - assert canonicalize_columns(['chr1', 'chr2']) == ['chrom1', 'chrom2'] - assert canonicalize_columns(['chrom1', 'chrom2']) == ['chrom1', 'chrom2'] - assert canonicalize_columns(['pt', 'other']) == ['pair_type', 'other'] - - # Test mixed case - assert canonicalize_columns(['Chr1', 'CHR2']) == ['chrom1', 'chrom2'] - assert canonicalize_columns(['CHR1', 'chr2']) == ['chrom1', 'chrom2'] +def test_standardize_column(): + # Test basic standardization + assert standardize_column('chr1') == 'chrom1' + assert standardize_column('chr2') == 'chrom2' + assert standardize_column('pt') == 'pair_type' # Test no changes needed - assert canonicalize_columns(['readID', 'pos1']) == ['readID', 'pos1'] - - # Test empty input - assert canonicalize_columns([]) == [] + assert standardize_column('chrom1') == 'chrom1' + assert standardize_column('readID') == 'readID' - # Test all known aliases - assert canonicalize_columns(['chr1', 'Chr2', 'PT']) == ['chrom1', 'chrom2', 'pair_type'] + # Test unknown columns remain unchanged + assert standardize_column('unknown') == 'unknown' + def test_get_column_index(): # Setup test columns @@ -39,14 +34,34 @@ def test_get_column_index(): assert get_column_index(columns, 'pos2') == 4 assert get_column_index(columns, 'pair_type') == 7 - # Test string lookup - canonicalized matches - assert get_column_index(columns, 'chrom1') == 1 - assert get_column_index(columns, 'CHROM2') == 3 - assert get_column_index(columns, 'PT') == 7 + # Test string lookup - standardized matches + # Note: 'chrom1' standardizes to 'chrom1' (no change if not in alias map depending on map content, + # but here 'chr1' is in columns, so we search for that. + # Wait, 'chrom1' is NOT in columns list above. 'chr1' is. + # standardized('chrom1') -> 'chrom1'. 'chrom1' is not in columns. + # So searching for 'chrom1' should FAIL unless we also standardize the columns list before search, + # which get_column_index DOES NOT do (it expects caller to do it). + # actually, looking at the code I wrote for get_column_index: + # it tries direct match, then "standardized" match. + # if I pass 'chr1', standardized is 'chrom1'. + # if I pass 'chrom1', standardized is 'chrom1'. + # The columns list has 'chr1'. + # direct match 'chr1' -> found. + # direct match 'chrom1' -> not found. standardized 'chrom1' -> 'chrom1' -> not found. + # So 'chrom1' should fail lookup in 'columns' list ['chr1', ...]. + + # Let's adjust the test case to match HOW it is used. + # In the code, we standardize the header columns FIRST. + # So the columns list passed to get_column_index usually has STANDARDIZED names. + + std_columns = ['readID', 'chrom1', 'pos1', 'chrom2', 'pos2', 'strand1', 'strand2', 'pair_type'] - # Test case insensitive matches - assert get_column_index(columns, 'CHR1') == 1 - assert get_column_index(columns, 'ChR2') == 3 + assert get_column_index(std_columns, 'chrom1') == 1 + assert get_column_index(std_columns, 'chr1') == 1 # chr1 -> chrom1 (standardized) -> found + + assert get_column_index(std_columns, 'pt') == 7 # pt -> pair_type -> found + assert get_column_index(std_columns, 'pair_type') == 7 + # Test integer lookup assert get_column_index(columns, 0) == 0 @@ -73,21 +88,29 @@ def test_integration_with_extract_column_names(): ] columns = extract_column_names(header) + # The extraction just gets names. assert columns == ['readID', 'chr1', 'pos1', 'chr2', 'pos2', 'strand1', 'strand2', 'pair_type'] - # Test canonicalized column access - assert get_column_index(columns, 'chrom1') == 1 - assert get_column_index(columns, 'chrom2') == 3 - assert get_column_index(columns, 'pt') == 7 + # Standardize them + std_columns = [standardize_column(c) for c in columns] + assert std_columns == ['readID', 'chrom1', 'pos1', 'chrom2', 'pos2', 'strand1', 'strand2', 'pair_type'] + + # Test lookup + assert get_column_index(std_columns, 'chrom1') == 1 + assert get_column_index(std_columns, 'chr1') == 1 + assert get_column_index(std_columns, 'pt') == 7 - # Test with alternative header format + # Test with alternative header format (already standardized names) header2 = [ "## pairs format v1.0", "#columns: readID chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type", ] columns2 = extract_column_names(header2) - assert get_column_index(columns2, 'chr1') == 1 - assert get_column_index(columns2, 'chr2') == 3 + std_columns2 = [standardize_column(c) for c in columns2] + + assert get_column_index(std_columns2, 'chr1') == 1 + assert get_column_index(std_columns2, 'chrom1') == 1 + def test_edge_cases(): # Test empty columns