From 31f97fcccde9ecaf923d5a17ff906845cabaa2fb Mon Sep 17 00:00:00 2001
From: ShigrafS <140247389+ShigrafS@users.noreply.github.com>
Date: Sun, 20 Apr 2025 09:48:44 +0000
Subject: [PATCH 1/2] Add alias-aware column lookup to dedup and sort CLIs

---
 pairtools/cli/dedup.py     | 76 +++++++++++++++--------------
 pairtools/cli/sort.py      | 49 ++++++++++++++-----
 pairtools/lib/headerops.py | 45 +++++++++++++++++
 tests/test_headerops.py    | 98 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 221 insertions(+), 47 deletions(-)

diff --git a/pairtools/cli/dedup.py b/pairtools/cli/dedup.py
index fe6f197c..1512dd99 100644
--- a/pairtools/cli/dedup.py
+++ b/pairtools/cli/dedup.py
@@ -386,7 +386,6 @@ def dedup_py(
     n_proc,
     **kwargs,
 ):
-
     sep = ast.literal_eval('"""' + sep + '"""')
     send_header_to_dedup = send_header_to in ["both", "dedup"]
     send_header_to_dup = send_header_to in ["both", "dups"]
@@ -488,6 +487,34 @@ def dedup_py(
         logger.warning(
             "Pairs file appears not to be sorted, dedup might produce wrong results."
         )
+    
+    # Canonicalize column names for flexible matching
+    column_names = headerops.extract_column_names(header)
+    column_names = headerops.canonicalize_columns(column_names)
+    
+    # Get column indices with fallbacks
+    try:
+        col1 = headerops.get_column_index(column_names, c1)
+        col2 = headerops.get_column_index(column_names, c2)
+        colp1 = headerops.get_column_index(column_names, p1)
+        colp2 = headerops.get_column_index(column_names, p2)
+        cols1 = headerops.get_column_index(column_names, s1)
+        cols2 = headerops.get_column_index(column_names, s2)
+        
+        # Handle extra column pairs
+        extra_cols1 = []
+        extra_cols2 = []
+        if extra_col_pair is not None:
+            for col1_spec, col2_spec in extra_col_pair:
+                try:
+                    extra_cols1.append(headerops.get_column_index(column_names, col1_spec))
+                    extra_cols2.append(headerops.get_column_index(column_names, col2_spec))
+                except ValueError:
+                    logger.warning(f"Extra column pair ({col1_spec}, {col2_spec}) not found in header, skipping")
+                    continue
+    except ValueError as e:
+        raise ValueError(f"Column error: {str(e)}") from e
+
     header = headerops.append_new_pg(header, ID=UTIL_NAME, PN=UTIL_NAME)
     dups_header = header.copy()
     if keep_parent_id and len(dups_header) > 0:
@@ -505,38 +532,17 @@ def dedup_py(
     ):
         outstream_unmapped.writelines((l + "\n" for l in header))
 
-    column_names = headerops.extract_column_names(header)
-    extra_cols1 = []
-    extra_cols2 = []
-    if extra_col_pair is not None:
-        for col1, col2 in extra_col_pair:
-            extra_cols1.append(column_names[col1] if col1.isnumeric() else col1)
-            extra_cols2.append(column_names[col2] if col2.isnumeric() else col2)
-
     if backend == "cython":
-        # warnings.warn(
-        #     "'cython' backend is deprecated and provided only"
-        #     " for backwards compatibility",
-        #     DeprecationWarning,
-        # )
-        extra_cols1 = [column_names.index(col) for col in extra_cols1]
-        extra_cols2 = [column_names.index(col) for col in extra_cols2]
-        c1 = column_names.index(c1)
-        c2 = column_names.index(c2)
-        p1 = column_names.index(p1)
-        p2 = column_names.index(p2)
-        s1 = column_names.index(s1)
-        s2 = column_names.index(s2)
         streaming_dedup_cython(
             method,
             max_mismatch,
             sep,
-            c1,
-            c2,
-            p1,
-            p2,
-            s1,
-            s2,
+            col1,
+            col2,
+            colp1,
+            colp2,
+            cols1,
+            cols2,
             extra_cols1,
             extra_cols2,
             unmapped_chrom,
@@ -557,7 +563,7 @@ def dedup_py(
             method=method,
             mark_dups=mark_dups,
             max_mismatch=max_mismatch,
-            extra_col_pairs=list(extra_col_pair),
+            extra_col_pairs=[(column_names[i], column_names[j]) for i, j in zip(extra_cols1, extra_cols2)],  # names, as before this patch; indices are for the cython backend
             keep_parent_id=keep_parent_id,
             unmapped_chrom=unmapped_chrom,
             outstream=outstream,
@@ -566,12 +572,12 @@ def dedup_py(
             out_stat=out_stat,
             backend=backend,
             n_proc=n_proc,
-            c1=c1,
-            c2=c2,
-            p1=p1,
-            p2=p2,
-            s1=s1,
-            s2=s2,
+            c1=col1,
+            c2=col2,
+            p1=colp1,
+            p2=colp2,
+            s1=cols1,
+            s2=cols2,
         )
     else:
         raise ValueError("Unknown backend")
diff --git a/pairtools/cli/sort.py b/pairtools/cli/sort.py
index 06c63d28..0681c921 100644
--- a/pairtools/cli/sort.py
+++ b/pairtools/cli/sort.py
@@ -6,10 +6,12 @@
 import subprocess
 import shutil
 import warnings
+from .._logging import get_logger
 
 from ..lib import fileio, pairsam_format, headerops
 from . import cli, common_io_options
 
+logger = get_logger()
 UTIL_NAME = "pairtools_sort"
 
 
@@ -58,6 +60,7 @@
     default=pairsam_format.COLUMNS_PAIRS[7],
     help=f"Pair type column; default {pairsam_format.COLUMNS_PAIRS[7]}"
     "[input format option]",
+    required=False,
 )
 @click.option(
     "--extra-col",
@@ -157,7 +160,6 @@ def sort_py(
     compress_program,
     **kwargs,
 ):
-
     instream = fileio.auto_open(
         pairs_path,
         mode="r",
@@ -176,7 +178,6 @@ def sort_py(
     header = headerops.mark_header_as_sorted(header)
 
     outstream.writelines((l + "\n" for l in header))
-
     outstream.flush()
 
     if compress_program == "auto":
@@ -191,16 +192,42 @@ def sort_py(
             compress_program = "gzip"
 
     column_names = headerops.extract_column_names(header)
-    columns = [c1, c2, p1, p2, pt] + list(extra_col)
-    # Now generating the "-k <i>,<i><mode>" expressions for all columns.
-    # If column name is in the default pairsam format and has an integer dtype there, do numerical sorting
+    column_names = headerops.canonicalize_columns(column_names)
+
+    # Get column indices with fallbacks
+    try:
+        col1 = headerops.get_column_index(column_names, c1)
+        col2 = headerops.get_column_index(column_names, c2)
+        colp1 = headerops.get_column_index(column_names, p1)
+        colp2 = headerops.get_column_index(column_names, p2)
+        
+        # Make pair_type optional
+        try:
+            colpt = headerops.get_column_index(column_names, pt) if pt else None
+        except ValueError:
+            colpt = None
+            
+        extra_cols = []
+        for col in extra_col:
+            try:
+                extra_cols.append(headerops.get_column_index(column_names, col))
+            except ValueError:
+                logger.warning(f"Extra column {col} not found in header, skipping")
+                continue
+    except ValueError as e:
+        raise ValueError(f"Column error: {str(e)}") from e
+
+    # Generate sort command columns
     cols = []
-    for col in columns:
-        colindex = int(col) if col.isnumeric() else column_names.index(col) + 1
+    for i, col in enumerate([col1, colp1, col2, colp2, colpt] + extra_cols):
+        if col is None:
+            continue  # Skip optional columns that weren't found
+        dtype = pairsam_format.DTYPES_PAIRSAM.get(column_names[col], str)
         cols.append(
-            f"-k {colindex},{colindex}{'n' if issubclass(pairsam_format.DTYPES_PAIRSAM.get(column_names[colindex-1], str), int) else ''}"
+            f"-k {col+1},{col+1}{'n' if issubclass(dtype, int) else ''}"
         )
     cols = " ".join(cols)
+    
     command = rf"""
         /bin/bash -c 'export LC_COLLATE=C; export LANG=C; sort 
         {cols}
@@ -210,9 +237,8 @@ def sort_py(
         {f'--temporary-directory={tmpdir}' if tmpdir else ''}
         -S {memory}
         {f'--compress-program={compress_program}' if compress_program else ''}'
-        """.replace(
-        "\n", " "
-    )
+        """.replace("\n", " ")
+        
     with subprocess.Popen(
         command, stdin=subprocess.PIPE, bufsize=-1, shell=True, stdout=outstream
     ) as process:
@@ -224,7 +250,6 @@ def sort_py(
 
     if instream != sys.stdin:
         instream.close()
-
     if outstream != sys.stdout:
         outstream.close()
 
diff --git a/pairtools/lib/headerops.py b/pairtools/lib/headerops.py
index 14624df4..54d2f53b 100644
--- a/pairtools/lib/headerops.py
+++ b/pairtools/lib/headerops.py
@@ -22,6 +22,51 @@
 
 
 
+def canonicalize_columns(columns):
+    """Convert between common column name variants."""
+    canonical_map = {
+        'chr1': 'chrom1',
+        'chr2': 'chrom2',
+        'chrom1': 'chrom1',  # Ensure identity mapping
+        'chrom2': 'chrom2',
+        'pt': 'pair_type',
+        'pair_type': 'pair_type'
+    }
+    return [canonical_map.get(col.lower(), col) for col in columns]
+
+def get_column_index(column_names, column_spec):
+    """Get column index with flexible name matching."""
+    if isinstance(column_spec, int):
+        if -len(column_names) <= column_spec < len(column_names):
+            return column_spec % len(column_names)  # Handle negative indices
+        raise ValueError(f"Column index {column_spec} out of range")
+    
+    if not isinstance(column_spec, (str, int)):
+        raise AttributeError(f"Column spec must be string or integer, got {type(column_spec)}")
+
+    # Try direct match first
+    try:
+        return column_names.index(column_spec)
+    except ValueError:
+        pass
+        
+    # Try canonical name
+    canonical = canonicalize_columns([column_spec])[0]
+    try:
+        return column_names.index(canonical)
+    except ValueError:
+        pass
+        
+    # Try case-insensitive
+    lower_columns = [c.lower() for c in column_names]
+    try:
+        return lower_columns.index(canonical.lower())
+    except ValueError:
+        available = ', '.join(f"'{c}'" for c in column_names)
+        raise ValueError(
+            f"Column '{column_spec}' not found. Available columns: {available}"
+        )
+
 def get_header(instream, comment_char=COMMENT_CHAR, ignore_warning=False):
     """Returns a header from the stream and an the reaminder of the stream
     with the actual data.
diff --git a/tests/test_headerops.py b/tests/test_headerops.py
index 8525ea64..7b401631 100644
--- a/tests/test_headerops.py
+++ b/tests/test_headerops.py
@@ -4,6 +4,104 @@
 import pytest
 
 
+import pytest
+from pairtools.lib.headerops import (
+    canonicalize_columns,
+    get_column_index,
+    extract_column_names,
+)
+
+def test_canonicalize_columns():
+    # Test basic canonicalization
+    assert canonicalize_columns(['chr1', 'chr2']) == ['chrom1', 'chrom2']
+    assert canonicalize_columns(['chrom1', 'chrom2']) == ['chrom1', 'chrom2']
+    assert canonicalize_columns(['pt', 'other']) == ['pair_type', 'other']
+    
+    # Test mixed case
+    assert canonicalize_columns(['Chr1', 'CHR2']) == ['chrom1', 'chrom2']
+    assert canonicalize_columns(['CHR1', 'chr2']) == ['chrom1', 'chrom2']
+    
+    # Test no changes needed
+    assert canonicalize_columns(['readID', 'pos1']) == ['readID', 'pos1']
+    
+    # Test empty input
+    assert canonicalize_columns([]) == []
+    
+    # Test all known aliases
+    assert canonicalize_columns(['chr1', 'Chr2', 'PT']) == ['chrom1', 'chrom2', 'pair_type']
+
+def test_get_column_index():
+    # Setup test columns
+    columns = ['readID', 'chr1', 'pos1', 'chr2', 'pos2', 'strand1', 'strand2', 'pair_type']
+    
+    # Test string lookup - direct matches
+    assert get_column_index(columns, 'chr1') == 1
+    assert get_column_index(columns, 'pos2') == 4
+    assert get_column_index(columns, 'pair_type') == 7
+    
+    # Test string lookup - canonicalized matches
+    assert get_column_index(columns, 'chrom1') == 1
+    assert get_column_index(columns, 'CHROM2') == 3
+    assert get_column_index(columns, 'PT') == 7
+    
+    # Test case insensitive matches
+    assert get_column_index(columns, 'CHR1') == 1
+    assert get_column_index(columns, 'ChR2') == 3
+    
+    # Test integer lookup
+    assert get_column_index(columns, 0) == 0
+    assert get_column_index(columns, 3) == 3
+    assert get_column_index(columns, 7) == 7
+    
+    # Test error cases
+    with pytest.raises(ValueError, match="Column 'nonexistent' not found"):
+        get_column_index(columns, 'nonexistent')
+        
+    with pytest.raises(ValueError, match="Column index 100 out of range"):
+        get_column_index(columns, 100)
+        
+    with pytest.raises(AttributeError, match="Column spec must be string or integer"):
+        get_column_index(columns, 3.14)
+
+def test_integration_with_extract_column_names():
+    # Test with actual header format
+    header = [
+        "## pairs format v1.0",
+        "#columns: readID chr1 pos1 chr2 pos2 strand1 strand2 pair_type",
+        "#chromsize: chr1 1000",
+        "#chromsize: chr2 800"
+    ]
+    
+    columns = extract_column_names(header)
+    assert columns == ['readID', 'chr1', 'pos1', 'chr2', 'pos2', 'strand1', 'strand2', 'pair_type']
+    
+    # Test canonicalized column access
+    assert get_column_index(columns, 'chrom1') == 1
+    assert get_column_index(columns, 'chrom2') == 3
+    assert get_column_index(columns, 'pt') == 7
+    
+    # Test with alternative header format
+    header2 = [
+        "## pairs format v1.0",
+        "#columns: readID chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type",
+    ]
+    columns2 = extract_column_names(header2)
+    assert get_column_index(columns2, 'chr1') == 1
+    assert get_column_index(columns2, 'chr2') == 3
+
+def test_edge_cases():
+    # Test empty columns
+    with pytest.raises(ValueError):
+        get_column_index([], 'chrom1')
+    
+    # Test invalid column spec type
+    with pytest.raises(AttributeError):
+        get_column_index(['a', 'b', 'c'], 3.14)  # float not supported
+        
+    # Test negative indices
+    assert get_column_index(['a', 'b', 'c'], -1) == 2  # Python-style negative indexing
+
+
 def test_make_standard_header():
     header = headerops.make_standard_pairsheader()
 

From 89d1148dbbfbd29007b3a8eb158a6a451c01fd8d Mon Sep 17 00:00:00 2001
From: ShigrafS <shigrafsalik@proton.me>
Date: Sun, 11 Jan 2026 17:21:56 +0530
Subject: [PATCH 2/2] fix: addressed review comments

---
 pairtools/cli/dedup.py          | 39 ++++++++--------
 pairtools/cli/sort.py           | 17 +++----
 pairtools/lib/headerops.py      | 54 +++++++++------------
 pairtools/lib/pairsam_format.py |  8 ++++
 tests/test_headerops.py         | 83 +++++++++++++++++++++------------
 5 files changed, 111 insertions(+), 90 deletions(-)

diff --git a/pairtools/cli/dedup.py b/pairtools/cli/dedup.py
index 1512dd99..2215a015 100644
--- a/pairtools/cli/dedup.py
+++ b/pairtools/cli/dedup.py
@@ -490,16 +490,17 @@ def dedup_py(
     
     # Canonicalize column names for flexible matching
     column_names = headerops.extract_column_names(header)
-    column_names = headerops.canonicalize_columns(column_names)
+    # Ensure all columns are standardized before lookup
+    column_names = [headerops.standardize_column(c) for c in column_names]
     
     # Get column indices with fallbacks
     try:
-        col1 = headerops.get_column_index(column_names, c1)
-        col2 = headerops.get_column_index(column_names, c2)
-        colp1 = headerops.get_column_index(column_names, p1)
-        colp2 = headerops.get_column_index(column_names, p2)
-        cols1 = headerops.get_column_index(column_names, s1)
-        cols2 = headerops.get_column_index(column_names, s2)
+        col_c1 = headerops.get_column_index(column_names, c1)
+        col_c2 = headerops.get_column_index(column_names, c2)
+        col_p1 = headerops.get_column_index(column_names, p1)
+        col_p2 = headerops.get_column_index(column_names, p2)
+        col_s1 = headerops.get_column_index(column_names, s1)
+        col_s2 = headerops.get_column_index(column_names, s2)
         
         # Handle extra column pairs
         extra_cols1 = []
@@ -537,12 +538,12 @@ def dedup_py(
             method,
             max_mismatch,
             sep,
-            col1,
-            col2,
-            colp1,
-            colp2,
-            cols1,
-            cols2,
+            col_c1,
+            col_c2,
+            col_p1,
+            col_p2,
+            col_s1,
+            col_s2,
             extra_cols1,
             extra_cols2,
             unmapped_chrom,
@@ -572,12 +573,12 @@ def dedup_py(
             out_stat=out_stat,
             backend=backend,
             n_proc=n_proc,
-            c1=col1,
-            c2=col2,
-            p1=colp1,
-            p2=colp2,
-            s1=cols1,
-            s2=cols2,
+            c1=column_names[col_c1],  # NOTE(review): this backend was passed column names before; only cython took indices
+            c2=column_names[col_c2],
+            p1=column_names[col_p1],
+            p2=column_names[col_p2],
+            s1=column_names[col_s1],
+            s2=column_names[col_s2],
         )
     else:
         raise ValueError("Unknown backend")
diff --git a/pairtools/cli/sort.py b/pairtools/cli/sort.py
index 0681c921..b6132f6a 100644
--- a/pairtools/cli/sort.py
+++ b/pairtools/cli/sort.py
@@ -192,20 +192,21 @@ def sort_py(
             compress_program = "gzip"
 
     column_names = headerops.extract_column_names(header)
-    column_names = headerops.canonicalize_columns(column_names)
+    # Ensure all columns are standardized before lookup
+    column_names = [headerops.standardize_column(c) for c in column_names]
 
     # Get column indices with fallbacks
     try:
-        col1 = headerops.get_column_index(column_names, c1)
-        col2 = headerops.get_column_index(column_names, c2)
-        colp1 = headerops.get_column_index(column_names, p1)
-        colp2 = headerops.get_column_index(column_names, p2)
+        col_c1 = headerops.get_column_index(column_names, c1)
+        col_c2 = headerops.get_column_index(column_names, c2)
+        col_p1 = headerops.get_column_index(column_names, p1)
+        col_p2 = headerops.get_column_index(column_names, p2)
         
         # Make pair_type optional
         try:
-            colpt = headerops.get_column_index(column_names, pt) if pt else None
+            col_pt = headerops.get_column_index(column_names, pt) if pt else None
         except ValueError:
-            colpt = None
+            col_pt = None
             
         extra_cols = []
         for col in extra_col:
@@ -219,7 +220,7 @@ def sort_py(
 
     # Generate sort command columns
     cols = []
-    for i, col in enumerate([col1, colp1, col2, colp2, colpt] + extra_cols):
+    for col in [col_c1, col_c2, col_p1, col_p2, col_pt] + extra_cols:  # key order: chrom1, chrom2, pos1, pos2, pair_type
         if col is None:
             continue  # Skip optional columns that weren't found
         dtype = pairsam_format.DTYPES_PAIRSAM.get(column_names[col], str)
diff --git a/pairtools/lib/headerops.py b/pairtools/lib/headerops.py
index 54d2f53b..3c3646b8 100644
--- a/pairtools/lib/headerops.py
+++ b/pairtools/lib/headerops.py
@@ -22,50 +22,38 @@
 
 
 
-def canonicalize_columns(columns):
-    """Convert between common column name variants."""
-    canonical_map = {
-        'chr1': 'chrom1',
-        'chr2': 'chrom2',
-        'chrom1': 'chrom1',  # Ensure identity mapping
-        'chrom2': 'chrom2',
-        'pt': 'pair_type',
-        'pair_type': 'pair_type'
-    }
-    return [canonical_map.get(col.lower(), col) for col in columns]
-
-def get_column_index(column_names, column_spec):
+def standardize_column(col):
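+    """Return the canonical name for ``col``; names without an alias are returned unchanged."""
+    # Aliases are defined centrally in pairsam_format.COLUMN_ALIASES; the lookup is case-sensitive.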
+    return pairsam_format.COLUMN_ALIASES.get(col, col)
+
+def get_column_index(column_names, col):
     """Get column index with flexible name matching."""
-    if isinstance(column_spec, int):
-        if -len(column_names) <= column_spec < len(column_names):
-            return column_spec % len(column_names)  # Handle negative indices
-        raise ValueError(f"Column index {column_spec} out of range")
+    if isinstance(col, int):
+        if -len(column_names) <= col < len(column_names):
+            return col % len(column_names)  # Handle negative indices
+        raise ValueError(f"Column index {col} out of range")
     
-    if not isinstance(column_spec, (str, int)):
-        raise AttributeError(f"Column spec must be string or integer, got {type(column_spec)}")
+    if not isinstance(col, (str, int)):
+        raise AttributeError(f"Column spec must be string or integer, got {type(col)}")
 
     # Try direct match first
     try:
-        return column_names.index(column_spec)
+        return column_names.index(col)
     except ValueError:
         pass
         
-    # Try canonical name
-    canonical = canonicalize_columns([column_spec])[0]
+    # Try standardized name
+    standardized = standardize_column(col)
     try:
-        return column_names.index(canonical)
+        return column_names.index(standardized)
     except ValueError:
         pass
-        
-    # Try case-insensitive
-    lower_columns = [c.lower() for c in column_names]
-    try:
-        return lower_columns.index(canonical.lower())
-    except ValueError:
-        available = ', '.join(f"'{c}'" for c in column_names)
-        raise ValueError(
-            f"Column '{column_spec}' not found. Available columns: {available}"
-        )
+
+    available = ', '.join(f"'{c}'" for c in column_names)
+    raise ValueError(
+        f"Column '{col}' not found. Available columns: {available}"
+    )
 
 def get_header(instream, comment_char=COMMENT_CHAR, ignore_warning=False):
     """Returns a header from the stream and an the reaminder of the stream
diff --git a/pairtools/lib/pairsam_format.py b/pairtools/lib/pairsam_format.py
index 7383bab1..e1e7a01a 100644
--- a/pairtools/lib/pairsam_format.py
+++ b/pairtools/lib/pairsam_format.py
@@ -123,3 +123,11 @@
     "algn_idx": int,
     "same_side_algn_count": int,
 }
+
+# Maps accepted column-name aliases (keys) to their canonical column names (values).
+COLUMN_ALIASES = {
+    'chr1': 'chrom1',
+    'chr2': 'chrom2',
+    'pt': 'pair_type',
+}
+
diff --git a/tests/test_headerops.py b/tests/test_headerops.py
index 7b401631..0c325194 100644
--- a/tests/test_headerops.py
+++ b/tests/test_headerops.py
@@ -6,29 +6,24 @@
 
 import pytest
 from pairtools.lib.headerops import (
-    canonicalize_columns,
+    standardize_column,
     get_column_index,
     extract_column_names,
 )
 
-def test_canonicalize_columns():
-    # Test basic canonicalization
-    assert canonicalize_columns(['chr1', 'chr2']) == ['chrom1', 'chrom2']
-    assert canonicalize_columns(['chrom1', 'chrom2']) == ['chrom1', 'chrom2']
-    assert canonicalize_columns(['pt', 'other']) == ['pair_type', 'other']
-    
-    # Test mixed case
-    assert canonicalize_columns(['Chr1', 'CHR2']) == ['chrom1', 'chrom2']
-    assert canonicalize_columns(['CHR1', 'chr2']) == ['chrom1', 'chrom2']
+def test_standardize_column():
+    # Test basic standardization
+    assert standardize_column('chr1') == 'chrom1'
+    assert standardize_column('chr2') == 'chrom2'
+    assert standardize_column('pt') == 'pair_type'
     
     # Test no changes needed
-    assert canonicalize_columns(['readID', 'pos1']) == ['readID', 'pos1']
-    
-    # Test empty input
-    assert canonicalize_columns([]) == []
+    assert standardize_column('chrom1') == 'chrom1'
+    assert standardize_column('readID') == 'readID'
     
-    # Test all known aliases
-    assert canonicalize_columns(['chr1', 'Chr2', 'PT']) == ['chrom1', 'chrom2', 'pair_type']
+    # Test unknown columns remain unchanged
+    assert standardize_column('unknown') == 'unknown'
+
 
 def test_get_column_index():
     # Setup test columns
@@ -39,14 +34,34 @@ def test_get_column_index():
     assert get_column_index(columns, 'pos2') == 4
     assert get_column_index(columns, 'pair_type') == 7
     
-    # Test string lookup - canonicalized matches
-    assert get_column_index(columns, 'chrom1') == 1
-    assert get_column_index(columns, 'CHROM2') == 3
-    assert get_column_index(columns, 'PT') == 7
+    # Test string lookup - standardized matches
+    # get_column_index() resolves a name in two steps: an exact match against
+    # the supplied column list, then a match on standardize_column(name).
+    # It never standardizes the column list itself; callers must do that.
+    #
+    # For the raw `columns` list above, which contains 'chr1':
+    #   - 'chr1' is found by the exact match;
+    #   - 'chrom1' is NOT found, because standardize_column('chrom1') is still
+    #     'chrom1' and the list only contains 'chr1'.
+    #
+    # Alias lookups are therefore only meaningful against a column list that
+    # has already been standardized.
+    #
+    # That is also how the function is used: dedup and sort standardize the
+    # header columns first and only then call get_column_index().
+    #
+    # The assertions below therefore run against a standardized column list
+    # rather than the raw `columns` list; the integer-lookup assertions further
+    # down still use `columns`, since indices do not depend on the names.
+    
+    std_columns = ['readID', 'chrom1', 'pos1', 'chrom2', 'pos2', 'strand1', 'strand2', 'pair_type']
     
-    # Test case insensitive matches
-    assert get_column_index(columns, 'CHR1') == 1
-    assert get_column_index(columns, 'ChR2') == 3
+    assert get_column_index(std_columns, 'chrom1') == 1
+    assert get_column_index(std_columns, 'chr1') == 1  # chr1 -> chrom1 (standardized) -> found
+    
+    assert get_column_index(std_columns, 'pt') == 7    # pt -> pair_type -> found
+    assert get_column_index(std_columns, 'pair_type') == 7
+
     
     # Test integer lookup
     assert get_column_index(columns, 0) == 0
@@ -73,21 +88,29 @@ def test_integration_with_extract_column_names():
     ]
     
     columns = extract_column_names(header)
+    # The extraction just gets names.
     assert columns == ['readID', 'chr1', 'pos1', 'chr2', 'pos2', 'strand1', 'strand2', 'pair_type']
     
-    # Test canonicalized column access
-    assert get_column_index(columns, 'chrom1') == 1
-    assert get_column_index(columns, 'chrom2') == 3
-    assert get_column_index(columns, 'pt') == 7
+    # Standardize them
+    std_columns = [standardize_column(c) for c in columns]
+    assert std_columns == ['readID', 'chrom1', 'pos1', 'chrom2', 'pos2', 'strand1', 'strand2', 'pair_type']
+    
+    # Test lookup
+    assert get_column_index(std_columns, 'chrom1') == 1
+    assert get_column_index(std_columns, 'chr1') == 1
+    assert get_column_index(std_columns, 'pt') == 7
     
-    # Test with alternative header format
+    # Test with alternative header format (already standardized names)
     header2 = [
         "## pairs format v1.0",
         "#columns: readID chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type",
     ]
     columns2 = extract_column_names(header2)
-    assert get_column_index(columns2, 'chr1') == 1
-    assert get_column_index(columns2, 'chr2') == 3
+    std_columns2 = [standardize_column(c) for c in columns2]
+    
+    assert get_column_index(std_columns2, 'chr1') == 1
+    assert get_column_index(std_columns2, 'chrom1') == 1
+
 
 def test_edge_cases():
     # Test empty columns