From cd7b2483c0737592f559a1a70c6be74db2888f34 Mon Sep 17 00:00:00 2001 From: Matthew Thompson Date: Tue, 22 Jul 2025 20:28:13 -0400 Subject: [PATCH 01/11] Deterministic common names, English prioritized; get names for higher ranks as well; convert common names tooling to Polars --- src/taxonopy/resolve_common_names.py | 238 ++++++++++++++++++++------- 1 file changed, 179 insertions(+), 59 deletions(-) diff --git a/src/taxonopy/resolve_common_names.py b/src/taxonopy/resolve_common_names.py index e02217e..2958168 100644 --- a/src/taxonopy/resolve_common_names.py +++ b/src/taxonopy/resolve_common_names.py @@ -1,6 +1,6 @@ import os import argparse -import pandas as pd +import polars as pl import glob import zipfile import requests @@ -26,7 +26,7 @@ def download_and_extract_backbone(cache_dir: Path): # Download if needed if not zip_path.exists() or zip_path.stat().st_size < 900_000_000: # Expect ~926MB - print(f"Downloading GBIF backbone (~926MB) into cache → {zip_path}") + print(f"Downloading GBIF backbone into cache → {zip_path}") try: # Remove partial/corrupt file if it exists if zip_path.exists(): @@ -112,63 +112,151 @@ def merge_taxon_id(anno_df, taxon_df): :param taxon_df: taxon dataframe :return: merged dataframe """ - new_anno_df = anno_df.copy() - new_anno_df = new_anno_df.replace('', None) - new_anno_df = new_anno_df.replace(pd.NA, None) + new_anno_df = anno_df.clone() + + # Cast join key columns to Utf8 and convert empty strings to null + taxonomic_cols = ['species', 'genus', 'family', 'order', 'class', 'phylum', 'kingdom'] + existing_cols = [col for col in taxonomic_cols if col in new_anno_df.columns] + + new_anno_df = new_anno_df.with_columns([ + pl.col(col).cast(pl.Utf8).map_elements(lambda x: None if x == '' else x, return_dtype=pl.Utf8) + for col in existing_cols + ]) print('Start merging with taxon_df') for key in ['species', 'genus']: - new_anno_df = pd.merge( - new_anno_df, - taxon_df[['canonicalName', 'taxonID', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus']], - how='left', - left_on=[key, 'kingdom', 'phylum', 'class', 'order', 'family', 'genus'], - right_on=['canonicalName', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus'], - suffixes=('', f'_{key}') + if key not in new_anno_df.columns: + continue + + # Select and rename taxonID to avoid conflicts + backbone_subset = taxon_df.select([ + 'canonicalName', + pl.col('taxonID').alias(f'taxonID_{key}'), + 'kingdom', 'phylum', 'class', 'order', 'family', 'genus' + ]) + + # Get columns that exist in both dataframes for joining + join_cols = [col for col in ['kingdom', 'phylum', 'class', 'order', 'family', 'genus'] + if col in new_anno_df.columns] + + new_anno_df = new_anno_df.join( + backbone_subset, + left_on=[key] + join_cols, + right_on=['canonicalName'] + join_cols, + how='left' ) - new_anno_df = new_anno_df.drop(columns=['canonicalName']) - new_anno_df.rename(columns={'taxonID': 'taxonID_species'}, inplace=True) + + # Drop canonicalName if it exists + if 'canonicalName' in new_anno_df.columns: + new_anno_df = new_anno_df.drop('canonicalName') - # Only keep the smallest taxonID for each uuid - duplicated_uuids = new_anno_df[new_anno_df.duplicated(subset='uuid', keep=False)] - non_duplicated_df = new_anno_df[~new_anno_df['uuid'].isin(duplicated_uuids['uuid'])] - duplicated_uuids = duplicated_uuids.loc[duplicated_uuids.groupby('uuid')['taxonID_genus'].idxmin()] - new_anno_df = pd.concat([non_duplicated_df, duplicated_uuids], ignore_index=True) + # Only keep the smallest taxonID for each uuid (handle duplicates) + if 'uuid' in new_anno_df.columns and 'taxonID_genus' in new_anno_df.columns: + duplicated_uuids = new_anno_df.filter(pl.col('uuid').is_duplicated()) + if len(duplicated_uuids) > 0: + non_duplicated_df = new_anno_df.filter(~pl.col('uuid').is_in(duplicated_uuids['uuid'])) + duplicated_uuids = duplicated_uuids.group_by('uuid').agg(pl.col('taxonID_genus').min()).join( + duplicated_uuids, on=['uuid', 'taxonID_genus'], how='inner' + ) + new_anno_df = pl.concat([non_duplicated_df, duplicated_uuids]) assert len(new_anno_df) == len(anno_df), f"Length mismatch: {len(new_anno_df)} != {len(anno_df)}" return new_anno_df -def merge_common_name(anno_df, common_name_df): +def merge_common_name(anno_df, common_name_df, taxon_df): """ - This function is used to merge common name with annotation dataframe + This function merges common names with annotation dataframe using hierarchical lookup. + Common names are always derived from backbone lookup data for consistent mapping. + Prefers English names, falls back to any language if English unavailable. + Searches from most specific taxonomic rank to least specific. + :param anno_df: annotation dataframe with taxonID - :param common_name_df: common name dataframe + :param common_name_df: common name dataframe (prioritized) + :param taxon_df: taxon dataframe for rank information :return: merged dataframe """ - new_anno_df = anno_df.copy() - print('Start merging with common_name_df') - for key in ['species', 'genus']: - new_anno_df = pd.merge( - new_anno_df, - common_name_df, - how='left', - left_on=f'taxonID_{key}', - right_on='taxonID', - suffixes=('', f'_{key}') + new_anno_df = anno_df.clone() + print('Start hierarchical common name lookup using backbone data only') + + # Normalize common_name_df to one row per taxonID (handle duplicates) + common_lookup = ( + common_name_df + .group_by("taxonID") + .agg(pl.col("vernacularName").first().alias("common_name")) + ) + + # Define hierarchical order of taxonomic ranks + rank_columns = ['species', 'genus', 'family', 'order', 'class', 'phylum', 'kingdom'] + + # Find which taxonomic ranks we actually have taxonIDs for + available_taxonid_cols = [f"taxonID_{rank}" for rank in rank_columns + if f"taxonID_{rank}" in new_anno_df.columns] + + # Find which taxonomic classification columns we have + available_rank_cols = [rank for rank in rank_columns + if rank in new_anno_df.columns] + + # Get taxonIDs for all ranks from taxon_df (this ensures we have the authoritative mapping) + for rank in available_rank_cols: + taxonid_col = f"taxonID_{rank}" + temp_taxonid_col = f"temp_taxonID_{rank}" + + # Always look up the authoritative taxonID from taxon_df + rank_taxa = ( + taxon_df + .filter(pl.col("taxonRank") == rank) + .select([pl.col("taxonID").alias(temp_taxonid_col), pl.col("canonicalName").alias(rank)]) ) - new_anno_df = new_anno_df.drop(columns=['taxonID']) - - print('Update the common_name column') - new_anno_df.rename(columns={'vernacularName': 'vernacularName_species'}, inplace=True) - for key in ['species', 'genus']: - new_anno_df['common_name'] = new_anno_df.apply( - lambda x: x['common_name'] if x['common_name'] is not None else x[f'vernacularName_{key}'], - axis=1 + + new_anno_df = new_anno_df.join( + rank_taxa, + on=rank, + how="left" ) - new_anno_df = new_anno_df.drop(columns=[f'vernacularName_{key}']) - new_anno_df = new_anno_df.drop(columns=[f'taxonID_{key}']) + + # Use the authoritative taxonID, falling back to existing one if lookup failed + if taxonid_col in new_anno_df.columns: + new_anno_df = new_anno_df.with_columns([ + pl.coalesce([pl.col(temp_taxonid_col), pl.col(taxonid_col)]).alias(taxonid_col) + ]).drop(temp_taxonid_col) + else: + new_anno_df = new_anno_df.rename({temp_taxonid_col: taxonid_col}) + + # Initialize common_name column + new_anno_df = new_anno_df.with_columns(pl.lit(None).cast(pl.Utf8).alias("common_name")) + + # Apply hierarchical lookup - check each rank in priority order + for rank in rank_columns: + taxonid_col = f"taxonID_{rank}" + if taxonid_col not in new_anno_df.columns: + continue + + # Join common names for this rank + temp_df = new_anno_df.join( + common_lookup.select([ + "taxonID", + pl.col("common_name").alias(f"temp_common_{rank}") + ]), + left_on=taxonid_col, + right_on="taxonID", + how="left" + ) + + # Update common_name where it's null and this rank has a name + new_anno_df = temp_df.with_columns([ + pl.coalesce([ + pl.col("common_name"), + pl.col(f"temp_common_{rank}") + ]).alias("common_name") + ]).drop(f"temp_common_{rank}") + + # Clean up temporary taxonID columns (keep original taxonomic classification columns) + cleanup_cols = [f"taxonID_{rank}" for rank in rank_columns] + existing_cleanup_cols = [col for col in cleanup_cols if col in new_anno_df.columns] + if existing_cleanup_cols: + new_anno_df = new_anno_df.drop(existing_cleanup_cols) assert len(new_anno_df) == len(anno_df), f"Length mismatch: {len(new_anno_df)} != {len(anno_df)}" @@ -212,26 +300,56 @@ def main(annotation_dir=None, output_dir=None): # Load the two TSVs print(f"Loading taxonomy data from {taxon_file}") - common_name_df = ( - pd.read_csv(common_name_file, sep="\t", low_memory=False) - .query("language == 'en'") + + # Load all vernacular names, prioritizing English but keeping others as fallback + # Turn off schema inference to handle improperly escaped quotes in GBIF data + vernacular_df = pl.read_csv(common_name_file, separator="\t", infer_schema_length=0, quote_char=None) + + # Create prioritized vernacular names: prefer English, fallback to any language + english_names = ( + vernacular_df + .filter(pl.col("language") == "en") + .with_columns([ + pl.col("vernacularName").str.to_lowercase().str.to_titlecase().alias("vernacularName"), + pl.lit(1).alias("priority") + ]) + .group_by("taxonID") + .agg([ + pl.col("vernacularName").first().alias("vernacularName"), + pl.col("priority").first().alias("priority") + ]) ) - common_name_df["vernacularName"] = ( - common_name_df["vernacularName"] - .str.lower() - .str.capitalize() + + other_names = ( + vernacular_df + .filter(pl.col("language") != "en") + .with_columns([ + pl.col("vernacularName").str.to_lowercase().str.to_titlecase().alias("vernacularName"), + pl.lit(2).alias("priority") + ]) + .group_by("taxonID") + .agg([ + pl.col("vernacularName").first().alias("vernacularName"), + pl.col("priority").first().alias("priority") + ]) ) + + # Combine with English preference common_name_df = ( - common_name_df - .groupby("taxonID")["vernacularName"] - .agg(lambda x: x.value_counts().index[0]) - .reset_index() + pl.concat([english_names, other_names]) + .group_by("taxonID") + .agg([ + pl.col("vernacularName").sort_by("priority").first().alias("vernacularName") + ]) ) print(f"Loading taxon data from {taxon_file}") taxon_df = ( - pd.read_csv(taxon_file, sep="\t", quoting=3, low_memory=False) - .query("taxonomicStatus == 'accepted' and canonicalName.notnull()") + pl.read_csv(taxon_file, separator="\t", infer_schema_length=0, quote_char=None) + .filter( + (pl.col("taxonomicStatus") == "accepted") & + (pl.col("canonicalName").is_not_null()) + ) ) # Find all .resolved.parquet under annotation_dir @@ -243,14 +361,16 @@ def main(annotation_dir=None, output_dir=None): # Process one-by-one, preserving subdirs for idx, annotation_path in enumerate(annotation_paths, start=1): print(f"[{idx}/{len(annotation_paths)}] {annotation_path}") - anno_df = pd.read_parquet(annotation_path) + anno_df = pl.read_parquet(annotation_path) new_df = merge_taxon_id(anno_df, taxon_df) - new_df = merge_common_name(new_df, common_name_df) - new_df["scientific_name"] = new_df["scientific_name"].astype(str) + new_df = merge_common_name(new_df, common_name_df, taxon_df) + new_df = new_df.with_columns([ + pl.col("scientific_name").cast(pl.Utf8) + ]) rel = os.path.relpath(annotation_path, annotation_dir) out_path = os.path.join(output_dir, rel) os.makedirs(os.path.dirname(out_path), exist_ok=True) - new_df.to_parquet(out_path, index=False) + new_df.write_parquet(out_path) print(f" → wrote {out_path}") From 6f20c1855cf1da2e87e7ed3e732c5bd0881351cb Mon Sep 17 00:00:00 2001 From: Matthew Thompson Date: Wed, 23 Jul 2025 18:22:50 -0400 Subject: [PATCH 02/11] Modularize helper functions --- src/taxonopy/resolve_common_names.py | 285 +++++++++++++++++---------- 1 file changed, 183 insertions(+), 102 deletions(-) diff --git a/src/taxonopy/resolve_common_names.py b/src/taxonopy/resolve_common_names.py index 2958168..79ce283 100644 --- a/src/taxonopy/resolve_common_names.py +++ b/src/taxonopy/resolve_common_names.py @@ -105,6 +105,71 @@ def download_and_extract_backbone(cache_dir: Path): return taxon_file, vernacular_file +def _normalize_one_column(col: str) -> pl.Expr: + """Build the Utf8‐cast + empty→null expression for a single column.""" + # Cast to string + casted = pl.col(col).cast(pl.Utf8) + # Turn "" into None + cleaned = casted.map_elements(lambda x: None if x == "" else x, return_dtype=pl.Utf8) + # Give it back its original name + return cleaned.alias(col) + +def normalize_taxonomic_columns(df: pl.DataFrame) -> pl.DataFrame: + """ + Normalize taxonomic columns by casting to Utf8 and converting empty strings to null. + + :param df: DataFrame with taxonomic columns + :return: DataFrame with normalized taxonomic columns + """ + taxonomic_cols = ['species', 'genus', 'family', 'order', 'class', 'phylum', 'kingdom'] + existing = [c for c in taxonomic_cols if c in df.columns] + + # Build a list of clean‐up expressions, one per column + exprs = [_normalize_one_column(c) for c in existing] + + # Apply them all at once + return df.with_columns(exprs) + +# Module-level constant for join columns to avoid duplication +TAXONOMIC_HIERARCHY = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus'] + + +def join_single_rank(anno_df: pl.DataFrame, taxon_df: pl.DataFrame, rank: str) -> pl.DataFrame: + """ + Join annotation dataframe with taxon dataframe for a single taxonomic rank. + + :param anno_df: Annotation dataframe (should have normalized taxonomic columns) + :param taxon_df: Taxon dataframe + :param rank: Taxonomic rank to join on ('species' or 'genus') + :return: DataFrame with taxonID_{rank} column added + """ + if rank not in anno_df.columns: + return anno_df + + # Select and rename taxonID to avoid conflicts + backbone_subset = taxon_df.select([ + 'canonicalName', + pl.col('taxonID').alias(f'taxonID_{rank}'), + *TAXONOMIC_HIERARCHY + ]) + + # Get columns that exist in both dataframes for joining + join_cols = [col for col in TAXONOMIC_HIERARCHY if col in anno_df.columns] + + result = anno_df.join( + backbone_subset, + left_on=[rank] + join_cols, + right_on=['canonicalName'] + join_cols, + how='left' + ) + + # Drop canonicalName if it exists + if 'canonicalName' in result.columns: + result = result.drop('canonicalName') + + return result + + def merge_taxon_id(anno_df, taxon_df): """ This function is used to retrieve taxon_id from taxon_df @@ -112,43 +177,11 @@ def merge_taxon_id(anno_df, taxon_df): :param taxon_df: taxon dataframe :return: merged dataframe """ - new_anno_df = anno_df.clone() - - # Cast join key columns to Utf8 and convert empty strings to null - taxonomic_cols = ['species', 'genus', 'family', 'order', 'class', 'phylum', 'kingdom'] - existing_cols = [col for col in taxonomic_cols if col in new_anno_df.columns] - - new_anno_df = new_anno_df.with_columns([ - pl.col(col).cast(pl.Utf8).map_elements(lambda x: None if x == '' else x, return_dtype=pl.Utf8) - for col in existing_cols - ]) + new_anno_df = normalize_taxonomic_columns(anno_df.clone()) print('Start merging with taxon_df') - for key in ['species', 'genus']: - if key not in new_anno_df.columns: - continue - - # Select and rename taxonID to avoid conflicts - backbone_subset = taxon_df.select([ - 'canonicalName', - pl.col('taxonID').alias(f'taxonID_{key}'), - 'kingdom', 'phylum', 'class', 'order', 'family', 'genus' - ]) - - # Get columns that exist in both dataframes for joining - join_cols = [col for col in ['kingdom', 'phylum', 'class', 'order', 'family', 'genus'] - if col in new_anno_df.columns] - - new_anno_df = new_anno_df.join( - backbone_subset, - left_on=[key] + join_cols, - right_on=['canonicalName'] + join_cols, - how='left' - ) - - # Drop canonicalName if it exists - if 'canonicalName' in new_anno_df.columns: - new_anno_df = new_anno_df.drop('canonicalName') + for rank in ['species', 'genus']: + new_anno_df = join_single_rank(new_anno_df, taxon_df, rank) # Only keep the smallest taxonID for each uuid (handle duplicates) if 'uuid' in new_anno_df.columns and 'taxonID_genus' in new_anno_df.columns: @@ -165,6 +198,118 @@ def merge_taxon_id(anno_df, taxon_df): return new_anno_df +def prioritize_vernacular(vernacular_df: pl.DataFrame) -> pl.DataFrame: + """ + Prioritize vernacular names with English preference. + + :param vernacular_df: Raw GBIF vernacular DataFrame with taxonID, vernacularName, language columns + :return: DataFrame with columns (taxonID, vernacularName) prioritizing English, then any other language + """ + # Tag English vs. other + english = ( + vernacular_df + .filter(pl.col("language") == "en") + .with_columns([ + pl.col("vernacularName") + .str.to_lowercase() + .str.to_titlecase() + .alias("vernacularName"), + pl.lit(1).alias("priority") + ]) + .group_by("taxonID") + .agg([ + pl.col("vernacularName").first(), + pl.col("priority").first() + ]) + ) + + other = ( + vernacular_df + .filter(pl.col("language") != "en") + .with_columns([ + pl.col("vernacularName") + .str.to_lowercase() + .str.to_titlecase() + .alias("vernacularName"), + pl.lit(2).alias("priority") + ]) + .group_by("taxonID") + .agg([ + pl.col("vernacularName").first(), + pl.col("priority").first() + ]) + ) + + # Merge and pick the top‐priority name, then drop the priority column + result = ( + pl.concat([english, other]) + .group_by("taxonID") + .agg( + pl.col("vernacularName") + .sort_by("priority") + .first() + .alias("vernacularName") + ) + ) + + # Just in case, ensure we only have the two columns + return result.select(["taxonID", "vernacularName"]) + + +def apply_hierarchical_common_name_lookup(anno_df: pl.DataFrame, common_lookup: pl.DataFrame) -> pl.DataFrame: + """ + Apply hierarchical common name lookup from most specific to least specific rank. + + :param anno_df: Annotation dataframe with taxonID_* columns + :param common_lookup: Common name lookup table with (taxonID, common_name) columns + :return: DataFrame with common_name column populated using hierarchical fallback + """ + rank_columns = ['species', 'genus', 'family', 'order', 'class', 'phylum', 'kingdom'] + + # Initialize common_name column + result_df = anno_df.with_columns(pl.lit(None).cast(pl.Utf8).alias("common_name")) + + # Apply hierarchical lookup - check each rank in priority order + for rank in rank_columns: + taxonid_col = f"taxonID_{rank}" + if taxonid_col not in result_df.columns: + continue + + # Join common names for this rank + temp_df = result_df.join( + common_lookup.select([ + "taxonID", + pl.col("common_name").alias(f"temp_common_{rank}") + ]), + left_on=taxonid_col, + right_on="taxonID", + how="left" + ) + + # Update common_name where it's null and this rank has a name + result_df = temp_df.with_columns([ + pl.coalesce([ + pl.col("common_name"), + pl.col(f"temp_common_{rank}") + ]).alias("common_name") + ]).drop(f"temp_common_{rank}") + + return result_df + + +def override_input_common_name(df: pl.DataFrame, common_lookup: pl.DataFrame) -> pl.DataFrame: + """ + Override any existing common_name column with backbone-derived common names. + + :param df: DataFrame that may have a pre-existing common_name column + :param common_lookup: Common name lookup table with hierarchical fallback applied + :return: DataFrame with backbone-derived common_name (input common_name completely replaced) + """ + # Drop any existing common_name column and apply the backbone lookup + df_clean = df.drop("common_name") if "common_name" in df.columns else df + return apply_hierarchical_common_name_lookup(df_clean, common_lookup) + + def merge_common_name(anno_df, common_name_df, taxon_df): """ This function merges common names with annotation dataframe using hierarchical lookup. @@ -190,10 +335,6 @@ def merge_common_name(anno_df, common_name_df, taxon_df): # Define hierarchical order of taxonomic ranks rank_columns = ['species', 'genus', 'family', 'order', 'class', 'phylum', 'kingdom'] - # Find which taxonomic ranks we actually have taxonIDs for - available_taxonid_cols = [f"taxonID_{rank}" for rank in rank_columns - if f"taxonID_{rank}" in new_anno_df.columns] - # Find which taxonomic classification columns we have available_rank_cols = [rank for rank in rank_columns if rank in new_anno_df.columns] @@ -224,33 +365,8 @@ def merge_common_name(anno_df, common_name_df, taxon_df): else: new_anno_df = new_anno_df.rename({temp_taxonid_col: taxonid_col}) - # Initialize common_name column - new_anno_df = new_anno_df.with_columns(pl.lit(None).cast(pl.Utf8).alias("common_name")) - - # Apply hierarchical lookup - check each rank in priority order - for rank in rank_columns: - taxonid_col = f"taxonID_{rank}" - if taxonid_col not in new_anno_df.columns: - continue - - # Join common names for this rank - temp_df = new_anno_df.join( - common_lookup.select([ - "taxonID", - pl.col("common_name").alias(f"temp_common_{rank}") - ]), - left_on=taxonid_col, - right_on="taxonID", - how="left" - ) - - # Update common_name where it's null and this rank has a name - new_anno_df = temp_df.with_columns([ - pl.coalesce([ - pl.col("common_name"), - pl.col(f"temp_common_{rank}") - ]).alias("common_name") - ]).drop(f"temp_common_{rank}") + # Override any input common_name with backbone data + new_anno_df = override_input_common_name(new_anno_df, common_lookup) # Clean up temporary taxonID columns (keep original taxonomic classification columns) cleanup_cols = [f"taxonID_{rank}" for rank in rank_columns] @@ -306,42 +422,7 @@ def main(annotation_dir=None, output_dir=None): vernacular_df = pl.read_csv(common_name_file, separator="\t", infer_schema_length=0, quote_char=None) # Create prioritized vernacular names: prefer English, fallback to any language - english_names = ( - vernacular_df - .filter(pl.col("language") == "en") - .with_columns([ - pl.col("vernacularName").str.to_lowercase().str.to_titlecase().alias("vernacularName"), - pl.lit(1).alias("priority") - ]) - .group_by("taxonID") - .agg([ - pl.col("vernacularName").first().alias("vernacularName"), - pl.col("priority").first().alias("priority") - ]) - ) - - other_names = ( - vernacular_df - .filter(pl.col("language") != "en") - .with_columns([ - pl.col("vernacularName").str.to_lowercase().str.to_titlecase().alias("vernacularName"), - pl.lit(2).alias("priority") - ]) - .group_by("taxonID") - .agg([ - pl.col("vernacularName").first().alias("vernacularName"), - pl.col("priority").first().alias("priority") - ]) - ) - - # Combine with English preference - common_name_df = ( - pl.concat([english_names, other_names]) - .group_by("taxonID") - .agg([ - pl.col("vernacularName").sort_by("priority").first().alias("vernacularName") - ]) - ) + common_name_df = prioritize_vernacular(vernacular_df) print(f"Loading taxon data from {taxon_file}") taxon_df = ( From ffa0322bd0316f0282f723244ae3487ce0272cf1 Mon Sep 17 00:00:00 2001 From: Matthew Thompson Date: Wed, 23 Jul 2025 18:23:09 -0400 Subject: [PATCH 03/11] Test common name functionality --- tests/test_resolve_common_names.py | 559 +++++++++++++++++++++++++++++ 1 file changed, 559 insertions(+) create mode 100644 tests/test_resolve_common_names.py diff --git a/tests/test_resolve_common_names.py b/tests/test_resolve_common_names.py new file mode 100644 index 0000000..da75185 --- /dev/null +++ b/tests/test_resolve_common_names.py @@ -0,0 +1,559 @@ +import pytest +import polars as pl +from taxonopy.resolve_common_names import ( + merge_taxon_id, merge_common_name, prioritize_vernacular, + normalize_taxonomic_columns, join_single_rank, + apply_hierarchical_common_name_lookup, override_input_common_name +) + + +class TestMergeTaxonId: + """Unit tests for merge_taxon_id function""" + + def test_merge_taxon_id_basic_functionality(self): + """Test that merge_taxon_id correctly assigns specific taxonIDs""" + anno_df = pl.DataFrame({ + "uuid": ["test1", "test2"], + "species": ["Canis lupus", "Felis catus"], + "genus": ["Canis", "Felis"], + "family": ["Canidae", "Felidae"], + "order": ["Carnivora", "Carnivora"], + "class": ["Mammalia", "Mammalia"], + "phylum": ["Chordata", "Chordata"], + "kingdom": ["Animalia", "Animalia"] + }) + + taxon_df = pl.DataFrame({ + "taxonID": [100, 200, 300, 400], + "canonicalName": ["Canis lupus", "Felis catus", "Canis", "Felis"], + "kingdom": ["Animalia", "Animalia", "Animalia", "Animalia"], + "phylum": ["Chordata", "Chordata", "Chordata", "Chordata"], + "class": ["Mammalia", "Mammalia", "Mammalia", "Mammalia"], + "order": ["Carnivora", "Carnivora", "Carnivora", "Carnivora"], + "family": ["Canidae", "Felidae", "Canidae", "Felidae"], + "genus": ["Canis", "Felis", "Canis", "Felis"] + }) + + result = merge_taxon_id(anno_df, taxon_df) + + # Basic contract checks + assert len(result) == len(anno_df), "Row count should be preserved" + assert "taxonID_species" in result.columns, "Should add taxonID_species column" + assert "taxonID_genus" in result.columns, "Should add taxonID_genus column" + + # Specific behavior checks + canis_row = result.filter(pl.col("species") == "Canis lupus") + assert canis_row["taxonID_species"].to_list()[0] == 100, "Canis lupus should get taxonID 100" + assert canis_row["taxonID_genus"].to_list()[0] == 300, "Canis genus should get taxonID 300" + + felis_row = result.filter(pl.col("species") == "Felis catus") + assert felis_row["taxonID_species"].to_list()[0] == 200, "Felis catus should get taxonID 200" + assert felis_row["taxonID_genus"].to_list()[0] == 400, "Felis genus should get taxonID 400" + + @pytest.mark.parametrize("missing_field,expected_null_column", [ + (("species", None), "taxonID_species"), + (("species", ""), "taxonID_species"), + (("genus", None), "taxonID_genus"), + (("genus", ""), "taxonID_genus"), + ]) + def test_merge_taxon_id_handles_missing_data(self, missing_field, expected_null_column): + """Test that merge_taxon_id handles missing/empty data gracefully""" + field_name, field_value = missing_field + + anno_data = { + "uuid": ["test1"], + "species": ["Canis lupus"], + "genus": ["Canis"], + "family": ["Canidae"], + "order": ["Carnivora"], + "class": ["Mammalia"], + "phylum": ["Chordata"], + "kingdom": ["Animalia"] + } + anno_data[field_name] = [field_value] # Override with test value + anno_df = pl.DataFrame(anno_data) + + taxon_df = pl.DataFrame({ + "taxonID": [100, 300], + "canonicalName": ["Canis lupus", "Canis"], + "kingdom": ["Animalia", "Animalia"], + "phylum": ["Chordata", "Chordata"], + "class": ["Mammalia", "Mammalia"], + "order": ["Carnivora", "Carnivora"], + "family": ["Canidae", "Canidae"], + "genus": ["Canis", "Canis"] + }) + + result = merge_taxon_id(anno_df, taxon_df) + + # Missing/empty fields should result in null taxonIDs + assert len(result) == 1 + assert result[expected_null_column].to_list()[0] is None, f"Missing {field_name} should result in null {expected_null_column}" + + +class TestMergeCommonName: + """Unit tests for merge_common_name function""" + + def test_merge_common_name_species_priority(self): + """Test that species-level common names take priority""" + anno_df = pl.DataFrame({ + "uuid": ["test1"], + "species": ["Canis lupus"], + "genus": ["Canis"], + "family": ["Canidae"], + "order": ["Carnivora"], + "class": ["Mammalia"], + "phylum": ["Chordata"], + "kingdom": ["Animalia"], + "taxonID_species": [100], + "taxonID_genus": [300] + }) + + common_name_df = pl.DataFrame({ + "taxonID": [100, 300], + "vernacularName": ["Gray Wolf", "Dog Genus"] + }) + + taxon_df = pl.DataFrame({ + "taxonID": [100, 300, 500], + "canonicalName": ["Canis lupus", "Canis", "Canidae"], + "taxonRank": ["species", "genus", "family"], + "kingdom": ["Animalia"] * 3, + "phylum": ["Chordata"] * 3, + "class": ["Mammalia"] * 3, + "order": ["Carnivora"] * 3, + "family": ["Canidae"] * 3, + "genus": ["Canis", "Canis", None] + }) + + result = merge_common_name(anno_df, common_name_df, taxon_df) + + assert len(result) == 1 + assert result["common_name"].to_list()[0] == "Gray Wolf", "Should prefer species over genus name" + + def test_merge_common_name_cleans_up_intermediate_columns(self): + """Test that all intermediate columns are removed after processing""" + anno_df = pl.DataFrame({ + "uuid": ["test1"], + "species": ["Canis lupus"], + "genus": ["Canis"], + "family": ["Canidae"], + "order": ["Carnivora"], + "class": ["Mammalia"], + "phylum": ["Chordata"], + "kingdom": ["Animalia"], + "taxonID_species": [100], + "taxonID_genus": [300] + }) + + common_name_df = pl.DataFrame({ + "taxonID": [100], + "vernacularName": ["Gray Wolf"] + }) + + taxon_df = pl.DataFrame({ + "taxonID": [100, 300, 500], + "canonicalName": ["Canis lupus", "Canis", "Canidae"], + "taxonRank": ["species", "genus", "family"], + "kingdom": ["Animalia"] * 3, + "phylum": ["Chordata"] * 3, + "class": ["Mammalia"] * 3, + "order": ["Carnivora"] * 3, + "family": ["Canidae"] * 3, + "genus": ["Canis", "Canis", None] + }) + + result = merge_common_name(anno_df, common_name_df, taxon_df) + + # Check that intermediate columns are cleaned up + for col in result.columns: + assert not col.startswith("vernacular_"), f"Should not have intermediate column: {col}" + assert not col.startswith("taxonID_"), f"Should not have intermediate column: {col}" + + # Lock in the exact final column set + expected_columns = { + "uuid", "species", "genus", "family", "order", "class", "phylum", "kingdom", "common_name" + } + assert set(result.columns) == expected_columns, f"Final columns should be exactly {expected_columns}" + + def test_merge_common_name_overrides_existing_common_name(self): + """ + Test that pre-existing common_name in input is overridden by backbone data. + This is the core requirement from PR #10 - input data should not be given deference. + """ + # Store original input common_name to verify it gets overridden + original_input_name = "BAD INPUT NAME" + + anno_df = pl.DataFrame({ + "uuid": ["test1"], + "species": ["Canis lupus"], + "genus": ["Canis"], + "family": ["Canidae"], + "order": ["Carnivora"], + "class": ["Mammalia"], + "phylum": ["Chordata"], + "kingdom": ["Animalia"], + "common_name": [original_input_name], # This should be completely replaced + "taxonID_species": [100], + "taxonID_genus": [300] + }) + + backbone_name = "Backbone Gray Wolf" + common_name_df = pl.DataFrame({ + "taxonID": [100], + "vernacularName": [backbone_name] + }) + + taxon_df = pl.DataFrame({ + "taxonID": [100, 300], + "canonicalName": ["Canis lupus", "Canis"], + "taxonRank": ["species", "genus"], + "kingdom": ["Animalia"] * 2, + "phylum": ["Chordata"] * 2, + "class": ["Mammalia"] * 2, + "order": ["Carnivora"] * 2, + "family": ["Canidae"] * 2, + "genus": ["Canis"] * 2 + }) + + result = merge_common_name(anno_df, common_name_df, taxon_df) + + # Core PR requirement: original input common_name must be completely replaced + final_common_name = result["common_name"].to_list()[0] + assert final_common_name == backbone_name, "Must use backbone data for common name" + assert final_common_name != original_input_name, "Must NOT use input data common name" + + # Verify that the original input common_name has been dropped and replaced + # (not just appended to or modified) + assert final_common_name == "Backbone Gray Wolf", "Should be exactly the backbone value" + assert "BAD INPUT NAME" not in str(result), "Original input name should be completely gone" + + # Should only have the final common_name column, no intermediate columns + for col in result.columns: + assert not col.startswith("vernacular_"), f"Should not have intermediate column: {col}" + + # Verify exact final column set (original input had common_name, should be overridden) + expected_columns = { + "uuid", "species", "genus", "family", "order", "class", "phylum", "kingdom", "common_name" + } + assert set(result.columns) == expected_columns, f"Final columns should be exactly {expected_columns}" + + def test_merge_common_name_duplicate_vernacular_deterministic(self): + """Test that merge_common_name handles duplicate vernacular names deterministically""" + anno_df = pl.DataFrame({ + "uuid": ["test1"], + "species": ["Canis lupus"], + "genus": ["Canis"], + "family": ["Canidae"], + "taxonID_species": [100], + "taxonID_genus": [300] + }) + + # Multiple vernacular names for same taxonID - first one should be selected + common_name_df = pl.DataFrame({ + "taxonID": [100, 100, 300], + "vernacularName": ["Gray Wolf", "Loup Gris", "Dog Genus"] + }) + + taxon_df = pl.DataFrame({ + "taxonID": [100, 300], + "canonicalName": ["Canis lupus", "Canis"], + "taxonRank": ["species", "genus"], + "kingdom": ["Animalia"] * 2, + "phylum": ["Chordata"] * 2, + "class": ["Mammalia"] * 2, + "order": ["Carnivora"] * 2, + "family": ["Canidae"] * 2, + "genus": ["Canis"] * 2 + }) + + result = merge_common_name(anno_df, common_name_df, taxon_df) + + # Should deterministically pick the first vernacular name (Gray Wolf) + assert result["common_name"].to_list()[0] == "Gray Wolf" + + @pytest.mark.parametrize("available_rank,expected_name", [ + ("species", "Species Name"), + ("genus", "Genus Name"), + ("family", "Family Name"), + ("order", "Order Name"), + ("class", "Class Name"), + ("phylum", "Phylum Name"), + ("kingdom", "Kingdom Name"), + ]) + def test_merge_common_name_hierarchical_fallback_levels(self, available_rank, expected_name): + """Test hierarchical fallback at all taxonomic levels from species to kingdom""" + anno_df = pl.DataFrame({ + "uuid": ["test1"], + "species": ["Test species"], + "genus": ["Test genus"], + "family": ["Test family"], + "order": ["Test order"], + "class": ["Test class"], + "phylum": ["Test phylum"], + "kingdom": ["Test kingdom"], + "taxonID_species": [999], # Won't match + "taxonID_genus": [998] # Won't match + }) + + # Create taxonID based on rank being tested + rank_to_taxonid = { + "species": 100, + "genus": 200, + "family": 300, + "order": 400, + "class": 500, + "phylum": 600, + "kingdom": 700 + } + + common_name_df = pl.DataFrame({ + "taxonID": [rank_to_taxonid[available_rank]], + "vernacularName": [expected_name] + }) + + taxon_df = pl.DataFrame({ + "taxonID": [rank_to_taxonid[available_rank]], + "canonicalName": [f"Test {available_rank}"], + "taxonRank": [available_rank], + "kingdom": ["Test kingdom"], + "phylum": ["Test phylum"], + "class": ["Test class"], + "order": ["Test order"], + "family": ["Test family"], + "genus": ["Test genus"] + }) + + result = merge_common_name(anno_df, common_name_df, taxon_df) + + assert result["common_name"].to_list()[0] == expected_name, f"Should fallback to {available_rank} level name" + + +class TestNormalizeTaxonomicColumns: + """Unit tests for normalize_taxonomic_columns function""" + + def test_empty_strings_to_null(self): + """Test that empty strings are converted to null""" + df = pl.DataFrame({ + "species": ["", "Canis lupus", ""], + "genus": ["Canis", "", "Felis"], + "other_col": ["keep", "me", "unchanged"] + }) + + result = normalize_taxonomic_columns(df) + + assert result["species"].to_list() == [None, "Canis lupus", None] + assert result["genus"].to_list() == ["Canis", None, "Felis"] + assert result["other_col"].to_list() == ["keep", "me", "unchanged"] + + def test_casts_to_utf8(self): + """Test that columns are cast to Utf8""" + df = pl.DataFrame({ + "species": [123, 456], # Numeric input + "genus": ["text", "already"] + }) + + result = normalize_taxonomic_columns(df) + + assert result["species"].dtype == pl.Utf8 + assert result["genus"].dtype == pl.Utf8 + assert result["species"].to_list() == ["123", "456"] + + +class TestJoinSingleRank: + """Unit tests for join_single_rank function""" + + @pytest.mark.parametrize("rank", ["species", "genus"]) + def test_adds_taxonID_column(self, rank): + """Test that join_single_rank adds taxonID_{rank} column""" + anno_df = pl.DataFrame({ + rank: ["Canis"], + "kingdom": ["Animalia"], + "genus": ["Canis"] + }) + + taxon_df = pl.DataFrame({ + "canonicalName": ["Canis"], + "taxonID": [42], + "kingdom": ["Animalia"], + "phylum": ["Chordata"], + "class": ["Mammalia"], + "order": ["Carnivora"], + "family": ["Canidae"], + "genus": ["Canis"] + }) + + result = join_single_rank(anno_df, taxon_df, rank) + + expected_col = f"taxonID_{rank}" + assert expected_col in result.columns + assert result[expected_col].to_list() == [42] + + def test_returns_unchanged_if_rank_missing(self): + """Test that missing rank columns are handled gracefully""" + anno_df = pl.DataFrame({"genus": ["Canis"]}) + taxon_df = pl.DataFrame({"canonicalName": ["Species"], "taxonID": [1]}) + + result = join_single_rank(anno_df, taxon_df, "species") + + # Should return unchanged since species column doesn't exist + assert result.equals(anno_df) + + +class TestHierarchicalCommonNameLookup: + """Unit tests for apply_hierarchical_common_name_lookup function""" + + @pytest.mark.parametrize("available_rank,expected_name", [ + ("species", "Species Name"), + ("genus", "Genus Name"), + ("family", "Family Name"), + ]) + def test_hierarchical_fallback_levels(self, available_rank, expected_name): + """Test hierarchical fallback at different taxonomic levels""" + anno_df = pl.DataFrame({ + "uuid": ["test1"], + f"taxonID_{available_rank}": [100] + }) + + common_lookup = pl.DataFrame({ + "taxonID": [100], + "common_name": [expected_name] + }) + + result = apply_hierarchical_common_name_lookup(anno_df, common_lookup) + + assert result["common_name"].to_list()[0] == expected_name + + def test_species_takes_priority_over_genus(self): + """Test that species-level names take priority over genus""" + anno_df = pl.DataFrame({ + "uuid": ["test1"], + "taxonID_species": [100], + "taxonID_genus": [200] + }) + + common_lookup = pl.DataFrame({ + "taxonID": [100, 200], + "common_name": ["Gray Wolf", "Dog Genus"] + }) + + result = apply_hierarchical_common_name_lookup(anno_df, common_lookup) + + assert result["common_name"].to_list()[0] == "Gray Wolf" + + +class TestOverrideInputCommonName: + """Unit tests for override_input_common_name function""" + + def test_overrides_existing_common_name(self): + """Test that pre-existing common_name is completely replaced""" + df_with_input = pl.DataFrame({ + "uuid": ["test1"], + "common_name": ["BAD INPUT NAME"], + "taxonID_species": [100] + }) + + common_lookup = pl.DataFrame({ + "taxonID": [100], + "common_name": ["Backbone Name"] + }) + + result = override_input_common_name(df_with_input, common_lookup) + + assert result["common_name"].to_list()[0] == "Backbone Name" + assert "BAD INPUT NAME" not in str(result) + + def test_handles_no_existing_common_name(self): + """Test that function works when no common_name column exists""" + df_no_input = pl.DataFrame({ + "uuid": ["test1"], + "taxonID_species": [100] + }) + + common_lookup = pl.DataFrame({ + "taxonID": [100], + "common_name": ["Backbone Name"] + }) + + result = override_input_common_name(df_no_input, common_lookup) + + assert result["common_name"].to_list()[0] == "Backbone Name" + + +class TestEnglishPreference: + """Test English language preference in vernacular name processing""" + + def test_english_preferred_over_other_languages(self): + """Test the vernacular name processing logic that prefers English""" + vernacular_df = pl.DataFrame({ + "taxonID": [100, 100, 200, 300], + "vernacularName": ["Gray Wolf", "Loup gris", "House Cat", "Roble blanco"], + "language": ["en", "fr", "en", "es"] + }) + + result = prioritize_vernacular(vernacular_df) + + # Check English preference + wolf_name = result.filter(pl.col("taxonID") == 100)["vernacularName"].to_list()[0] + assert wolf_name == "Gray Wolf", "Should prefer English 'Gray Wolf' over French 'Loup Gris'" + + cat_name = result.filter(pl.col("taxonID") == 200)["vernacularName"].to_list()[0] + assert cat_name == "House Cat", "Should use English name when only English available" + + # When only non-English available, should use that + spanish_name = result.filter(pl.col("taxonID") == 300)["vernacularName"].to_list()[0] + assert spanish_name == "Roble Blanco", "Should use non-English when English unavailable" + + +class TestDataIntegrity: + """Tests to ensure data integrity through the pipeline""" + + def test_pipeline_preserves_row_count_and_uuids(self): + """Test that the full pipeline preserves data integrity""" + anno_df = pl.DataFrame({ + "uuid": ["uuid1", "uuid2", "uuid3"], + "species": ["Canis lupus", "Felis catus", "Unknown species"], + "genus": ["Canis", "Felis", "Unknown genus"], + "family": ["Canidae", "Felidae", "Unknown family"], + "order": ["Carnivora", "Carnivora", "Unknown order"], + "class": ["Mammalia", "Mammalia", "Unknown class"], + "phylum": ["Chordata", "Chordata", "Unknown phylum"], + "kingdom": ["Animalia", "Animalia", "Unknown kingdom"] + }) + + taxon_df = pl.DataFrame({ + "taxonID": [100, 200, 300, 400], + "canonicalName": ["Canis lupus", "Felis catus", "Canis", "Felis"], + "taxonRank": ["species", "species", "genus", "genus"], + "kingdom": ["Animalia"] * 4, + "phylum": ["Chordata"] * 4, + "class": ["Mammalia"] * 4, + "order": ["Carnivora"] * 4, + "family": ["Canidae", "Felidae", "Canidae", "Felidae"], + "genus": ["Canis", "Felis", "Canis", "Felis"] + }) + + common_name_df = pl.DataFrame({ + "taxonID": [100, 200], + "vernacularName": ["Gray Wolf", "House Cat"] + }) + + # Run full pipeline + step1 = merge_taxon_id(anno_df, taxon_df) + step2 = merge_common_name(step1, common_name_df, taxon_df) + + # Data integrity checks + assert len(step2) == len(anno_df), "Row count should be preserved through pipeline" + + original_uuids = set(anno_df["uuid"].to_list()) + final_uuids = set(step2["uuid"].to_list()) + assert original_uuids == final_uuids, "All UUIDs should be preserved" + assert len(step2["uuid"].unique()) == len(step2), "UUIDs should remain unique" + + # Should have final common names for matched taxa + matched_rows = step2.filter(pl.col("common_name").is_not_null()) + assert len(matched_rows) >= 2, "Should have common names for at least the matched taxa" + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) \ No newline at end of file From aee8c7ae81a3263ed904ea5f3848fc2771965636 Mon Sep 17 00:00:00 2001 From: Matthew Thompson Date: Wed, 23 Jul 2025 18:24:29 -0400 Subject: [PATCH 04/11] Run tests with Actions workflow --- .github/workflows/run-tests.yaml | 33 ++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 .github/workflows/run-tests.yaml diff --git a/.github/workflows/run-tests.yaml b/.github/workflows/run-tests.yaml new file mode 100644 index 0000000..6e01ccc --- /dev/null +++ b/.github/workflows/run-tests.yaml @@ -0,0 +1,33 @@ +name: run-tests + +on: + pull_request: + push: + +jobs: + run-tests: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10", "3.11", "3.12", "3.13"] + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: install dependencies + run: pip install .[dev] + shell: bash + + - name: run Ruff linter + run: ruff check . + shell: bash + + - name: run test + run: pytest -rs + shell: bash \ No newline at end of file From fe0ccd899c19e45ac796743b20603eb28d6755f0 Mon Sep 17 00:00:00 2001 From: Matthew Thompson Date: Wed, 23 Jul 2025 18:51:10 -0400 Subject: [PATCH 05/11] Address issues identified running on real data --- src/taxonopy/resolve_common_names.py | 76 +++++++++++++++++----------- 1 file changed, 46 insertions(+), 30 deletions(-) diff --git a/src/taxonopy/resolve_common_names.py b/src/taxonopy/resolve_common_names.py index 79ce283..99bc8b6 100644 --- a/src/taxonopy/resolve_common_names.py +++ b/src/taxonopy/resolve_common_names.py @@ -146,16 +146,21 @@ def join_single_rank(anno_df: pl.DataFrame, taxon_df: pl.DataFrame, rank: str) - if rank not in anno_df.columns: return anno_df - # Select and rename taxonID to avoid conflicts - backbone_subset = taxon_df.select([ - 'canonicalName', - pl.col('taxonID').alias(f'taxonID_{rank}'), - *TAXONOMIC_HIERARCHY - ]) - - # Get columns that exist in both dataframes for joining + # Figure out which higher-rank cols we actually have in the anno_df join_cols = [col for col in TAXONOMIC_HIERARCHY if col in anno_df.columns] + # Select, rename, *and* drop duplicate backbone rows on the full key + backbone_subset = ( + taxon_df + .select([ + 'canonicalName', + pl.col('taxonID').alias(f'taxonID_{rank}'), + *join_cols + ]) + # ensure (canonicalName + all join_cols) is unique + .unique(subset=['canonicalName'] + join_cols) + ) + result = anno_df.join( backbone_subset, left_on=[rank] + join_cols, @@ -183,17 +188,11 @@ def merge_taxon_id(anno_df, taxon_df): for rank in ['species', 'genus']: new_anno_df = join_single_rank(new_anno_df, taxon_df, rank) - # Only keep the smallest taxonID for each uuid (handle duplicates) - if 'uuid' in new_anno_df.columns and 'taxonID_genus' in new_anno_df.columns: - duplicated_uuids = new_anno_df.filter(pl.col('uuid').is_duplicated()) - if len(duplicated_uuids) > 0: - non_duplicated_df = new_anno_df.filter(~pl.col('uuid').is_in(duplicated_uuids['uuid'])) - duplicated_uuids = duplicated_uuids.group_by('uuid').agg(pl.col('taxonID_genus').min()).join( - duplicated_uuids, on=['uuid', 'taxonID_genus'], how='inner' - ) - new_anno_df = pl.concat([non_duplicated_df, duplicated_uuids]) - - assert len(new_anno_df) == len(anno_df), f"Length mismatch: {len(new_anno_df)} != {len(anno_df)}" + # With the backbone_subset de-duped above, joins are one-to-one, so + # the row count will always match. + assert len(new_anno_df) == len(anno_df), ( + f"Length mismatch after taxon joins: {len(new_anno_df)} != {len(anno_df)}" + ) return new_anno_df @@ -287,12 +286,21 @@ def apply_hierarchical_common_name_lookup(anno_df: pl.DataFrame, common_lookup: ) # Update common_name where it's null and this rank has a name - result_df = temp_df.with_columns([ - pl.coalesce([ - pl.col("common_name"), - pl.col(f"temp_common_{rank}") - ]).alias("common_name") - ]).drop(f"temp_common_{rank}") + result_df = ( + temp_df + # pick up the new common_name, drop the temp join field + .with_columns([ + pl.coalesce([ + pl.col("common_name"), + pl.col(f"temp_common_{rank}") + ]).alias("common_name") + ]) + .drop(f"temp_common_{rank}") + ) + + # Drop taxonID column if it exists (may not exist if no matches) + if "taxonID" in result_df.columns: + result_df = result_df.drop("taxonID") return result_df @@ -345,10 +353,15 @@ def merge_common_name(anno_df, common_name_df, taxon_df): temp_taxonid_col = f"temp_taxonID_{rank}" # Always look up the authoritative taxonID from taxon_df + # Only one row per canonicalName at this rank rank_taxa = ( taxon_df .filter(pl.col("taxonRank") == rank) - .select([pl.col("taxonID").alias(temp_taxonid_col), pl.col("canonicalName").alias(rank)]) + .select([ + pl.col("taxonID").alias(temp_taxonid_col), + pl.col("canonicalName").alias(rank) + ]) + .unique(subset=[rank]) ) new_anno_df = new_anno_df.join( @@ -374,7 +387,10 @@ def merge_common_name(anno_df, common_name_df, taxon_df): if existing_cleanup_cols: new_anno_df = new_anno_df.drop(existing_cleanup_cols) - assert len(new_anno_df) == len(anno_df), f"Length mismatch: {len(new_anno_df)} != {len(anno_df)}" + # With all of our backbone joins de-duplicated, we should never change row count: + assert len(new_anno_df) == len(anno_df), ( + f"Length mismatch after common-name merge: {len(new_anno_df)} != {len(anno_df)}" + ) return new_anno_df @@ -415,8 +431,8 @@ def main(annotation_dir=None, output_dir=None): taxon_file, common_name_file = download_and_extract_backbone(cache_dir) # Load the two TSVs - print(f"Loading taxonomy data from {taxon_file}") - + print(f"Loading vernacular names from {common_name_file}") + # Load all vernacular names, prioritizing English but keeping others as fallback # Turn off schema inference to handle improperly escaped quotes in GBIF data vernacular_df = pl.read_csv(common_name_file, separator="\t", infer_schema_length=0, quote_char=None) @@ -424,7 +440,7 @@ def main(annotation_dir=None, output_dir=None): # Create prioritized vernacular names: prefer English, fallback to any language common_name_df = prioritize_vernacular(vernacular_df) - print(f"Loading taxon data from {taxon_file}") + print(f"Loading backbone taxa from {taxon_file}") taxon_df = ( pl.read_csv(taxon_file, separator="\t", infer_schema_length=0, quote_char=None) .filter( From 4e6ec07078301fe81800485660e272f322aaaa65 Mon Sep 17 00:00:00 2001 From: Matthew Thompson Date: Thu, 24 Jul 2025 09:58:01 -0400 Subject: [PATCH 06/11] Dry code; sync tests --- src/taxonopy/resolve_common_names.py | 49 ++++++++++++++++------------ 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/src/taxonopy/resolve_common_names.py b/src/taxonopy/resolve_common_names.py index 99bc8b6..60875ee 100644 --- a/src/taxonopy/resolve_common_names.py +++ b/src/taxonopy/resolve_common_names.py @@ -7,6 +7,11 @@ from pathlib import Path import shutil +from taxonopy.constants import TAXONOMIC_RANKS_BY_SPECIFICITY, INVALID_VALUES, TAXONOMIC_RANKS + +# Module-level constant for join columns to avoid duplication +PARENT_RANKS = TAXONOMIC_RANKS[:-1] + def download_and_extract_backbone(cache_dir: Path): """Download and extract the GBIF backbone taxonomy files.""" cache_dir.mkdir(parents=True, exist_ok=True) @@ -110,7 +115,7 @@ def _normalize_one_column(col: str) -> pl.Expr: # Cast to string casted = pl.col(col).cast(pl.Utf8) # Turn "" into None - cleaned = casted.map_elements(lambda x: None if x == "" else x, return_dtype=pl.Utf8) + cleaned = casted.map_elements(lambda x: None if str(x).lower() in INVALID_VALUES else x, return_dtype=pl.Utf8) # Give it back its original name return cleaned.alias(col) @@ -130,10 +135,6 @@ def normalize_taxonomic_columns(df: pl.DataFrame) -> pl.DataFrame: # Apply them all at once return df.with_columns(exprs) -# Module-level constant for join columns to avoid duplication -TAXONOMIC_HIERARCHY = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus'] - - def join_single_rank(anno_df: pl.DataFrame, taxon_df: pl.DataFrame, rank: str) -> pl.DataFrame: """ Join annotation dataframe with taxon dataframe for a single taxonomic rank. @@ -147,18 +148,25 @@ def join_single_rank(anno_df: pl.DataFrame, taxon_df: pl.DataFrame, rank: str) - return anno_df # Figure out which higher-rank cols we actually have in the anno_df - join_cols = [col for col in TAXONOMIC_HIERARCHY if col in anno_df.columns] - - # Select, rename, *and* drop duplicate backbone rows on the full key + join_cols = [c for c in PARENT_RANKS + if c in anno_df.columns and c != rank] + + # Select, rename, and drop duplicate backbone rows on the full key + # - if the taxon_df actually has a taxonRank column, filter by it; + # - otherwise just use the whole table + if "taxonRank" in taxon_df.columns: + candidate = taxon_df.filter(pl.col("taxonRank") == rank) + else: + candidate = taxon_df + backbone_subset = ( - taxon_df - .select([ - 'canonicalName', - pl.col('taxonID').alias(f'taxonID_{rank}'), + candidate + .select( + pl.col("canonicalName"), + pl.col("taxonID").alias(f"taxonID_{rank}"), *join_cols - ]) - # ensure (canonicalName + all join_cols) is unique - .unique(subset=['canonicalName'] + join_cols) + ) + .unique(subset=["canonicalName"] + join_cols) ) result = anno_df.join( @@ -263,8 +271,9 @@ def apply_hierarchical_common_name_lookup(anno_df: pl.DataFrame, common_lookup: :param common_lookup: Common name lookup table with (taxonID, common_name) columns :return: DataFrame with common_name column populated using hierarchical fallback """ - rank_columns = ['species', 'genus', 'family', 'order', 'class', 'phylum', 'kingdom'] - + # Define hierarchical order of taxonomic ranks (map class_ to class) + rank_columns = [r.rstrip('_') for r in TAXONOMIC_RANKS_BY_SPECIFICITY] + # Initialize common_name column result_df = anno_df.with_columns(pl.lit(None).cast(pl.Utf8).alias("common_name")) @@ -340,9 +349,9 @@ def merge_common_name(anno_df, common_name_df, taxon_df): .agg(pl.col("vernacularName").first().alias("common_name")) ) - # Define hierarchical order of taxonomic ranks - rank_columns = ['species', 'genus', 'family', 'order', 'class', 'phylum', 'kingdom'] - + # Define hierarchical order of taxonomic ranks (map class_ to class) + rank_columns = [r.rstrip('_') for r in TAXONOMIC_RANKS_BY_SPECIFICITY] + # Find which taxonomic classification columns we have available_rank_cols = [rank for rank in rank_columns if rank in new_anno_df.columns] From a4ef5aeb5b0b1959d7b7c1c668d908fb9a93da04 Mon Sep 17 00:00:00 2001 From: Matthew Thompson Date: Thu, 24 Jul 2025 10:17:17 -0400 Subject: [PATCH 07/11] Opt to preserve source data capitalization vs title case --- src/taxonopy/resolve_common_names.py | 25 +++++++------------------ tests/test_resolve_common_names.py | 4 ++-- 2 files changed, 9 insertions(+), 20 deletions(-) diff --git a/src/taxonopy/resolve_common_names.py b/src/taxonopy/resolve_common_names.py index 60875ee..1c9556a 100644 --- a/src/taxonopy/resolve_common_names.py +++ b/src/taxonopy/resolve_common_names.py @@ -212,38 +212,27 @@ def prioritize_vernacular(vernacular_df: pl.DataFrame) -> pl.DataFrame: :param vernacular_df: Raw GBIF vernacular DataFrame with taxonID, vernacularName, language columns :return: DataFrame with columns (taxonID, vernacularName) prioritizing English, then any other language """ - # Tag English vs. other + # English vernaculars, priority 1 english = ( vernacular_df .filter(pl.col("language") == "en") - .with_columns([ - pl.col("vernacularName") - .str.to_lowercase() - .str.to_titlecase() - .alias("vernacularName"), - pl.lit(1).alias("priority") - ]) + .with_columns([pl.lit(1).alias("priority")]) .group_by("taxonID") .agg([ pl.col("vernacularName").first(), - pl.col("priority").first() + pl.col("priority").first(), ]) ) + # Mon‑English vernaculars, priority 2 other = ( vernacular_df .filter(pl.col("language") != "en") - .with_columns([ - pl.col("vernacularName") - .str.to_lowercase() - .str.to_titlecase() - .alias("vernacularName"), - pl.lit(2).alias("priority") - ]) + .with_columns([pl.lit(2).alias("priority")]) .group_by("taxonID") .agg([ pl.col("vernacularName").first(), - pl.col("priority").first() + pl.col("priority").first(), ]) ) @@ -273,7 +262,7 @@ def apply_hierarchical_common_name_lookup(anno_df: pl.DataFrame, common_lookup: """ # Define hierarchical order of taxonomic ranks (map class_ to class) rank_columns = [r.rstrip('_') for r in TAXONOMIC_RANKS_BY_SPECIFICITY] - + # Initialize common_name column result_df = anno_df.with_columns(pl.lit(None).cast(pl.Utf8).alias("common_name")) diff --git a/tests/test_resolve_common_names.py b/tests/test_resolve_common_names.py index da75185..f3ebfc6 100644 --- a/tests/test_resolve_common_names.py +++ b/tests/test_resolve_common_names.py @@ -500,9 +500,9 @@ def test_english_preferred_over_other_languages(self): cat_name = result.filter(pl.col("taxonID") == 200)["vernacularName"].to_list()[0] assert cat_name == "House Cat", "Should use English name when only English available" - # When only non-English available, should use that + # When only non-English available, should use the exact source casing spanish_name = result.filter(pl.col("taxonID") == 300)["vernacularName"].to_list()[0] - assert spanish_name == "Roble Blanco", "Should use non-English when English unavailable" + assert spanish_name == "Roble blanco", "Should use non‑English name exactly as in GBIF" class TestDataIntegrity: From ca8d0a2daf24e7af2ec4d99ace91424df0e3a998 Mon Sep 17 00:00:00 2001 From: Matt Thompson <31709066+thompsonmj@users.noreply.github.com> Date: Thu, 24 Jul 2025 13:14:39 -0400 Subject: [PATCH 08/11] Fix typeo Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/taxonopy/resolve_common_names.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/taxonopy/resolve_common_names.py b/src/taxonopy/resolve_common_names.py index 1c9556a..9d512c7 100644 --- a/src/taxonopy/resolve_common_names.py +++ b/src/taxonopy/resolve_common_names.py @@ -224,7 +224,7 @@ def prioritize_vernacular(vernacular_df: pl.DataFrame) -> pl.DataFrame: ]) ) - # Mon‑English vernaculars, priority 2 + # Non-English vernaculars, priority 2 other = ( vernacular_df .filter(pl.col("language") != "en") From a59f63e4c34af35b2e87edc3a42adf96971592d7 Mon Sep 17 00:00:00 2001 From: Matt Thompson <31709066+thompsonmj@users.noreply.github.com> Date: Thu, 24 Jul 2025 13:17:25 -0400 Subject: [PATCH 09/11] Fix typo Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- tests/test_resolve_common_names.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_resolve_common_names.py b/tests/test_resolve_common_names.py index f3ebfc6..4de3aea 100644 --- a/tests/test_resolve_common_names.py +++ b/tests/test_resolve_common_names.py @@ -502,7 +502,7 @@ def test_english_preferred_over_other_languages(self): # When only non-English available, should use the exact source casing spanish_name = result.filter(pl.col("taxonID") == 300)["vernacularName"].to_list()[0] - assert spanish_name == "Roble blanco", "Should use non‑English name exactly as in GBIF" + assert spanish_name == "Roble blanco", "Should use non-English name exactly as in GBIF" class TestDataIntegrity: From c6d7b8606543ac5961d6693cdd7201c3b632e8c6 Mon Sep 17 00:00:00 2001 From: Matt Thompson <31709066+thompsonmj@users.noreply.github.com> Date: Thu, 24 Jul 2025 13:17:55 -0400 Subject: [PATCH 10/11] Fix typo Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/taxonopy/resolve_common_names.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/taxonopy/resolve_common_names.py b/src/taxonopy/resolve_common_names.py index 9d512c7..668a70a 100644 --- a/src/taxonopy/resolve_common_names.py +++ b/src/taxonopy/resolve_common_names.py @@ -111,7 +111,7 @@ def download_and_extract_backbone(cache_dir: Path): return taxon_file, vernacular_file def _normalize_one_column(col: str) -> pl.Expr: - """Build the Utf8‐cast + empty→null expression for a single column.""" + """Build the Utf8-cast + empty->null expression for a single column.""" # Cast to string casted = pl.col(col).cast(pl.Utf8) # Turn "" into None From 1302e465e66cfc0e045f27d0a9c471dbeedf49be Mon Sep 17 00:00:00 2001 From: Matt Thompson <31709066+thompsonmj@users.noreply.github.com> Date: Thu, 24 Jul 2025 13:18:18 -0400 Subject: [PATCH 11/11] Fix typo Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/taxonopy/resolve_common_names.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/taxonopy/resolve_common_names.py b/src/taxonopy/resolve_common_names.py index 668a70a..e669a5d 100644 --- a/src/taxonopy/resolve_common_names.py +++ b/src/taxonopy/resolve_common_names.py @@ -236,7 +236,7 @@ def prioritize_vernacular(vernacular_df: pl.DataFrame) -> pl.DataFrame: ]) ) - # Merge and pick the top‐priority name, then drop the priority column + # Merge and pick the top-priority name, then drop the priority column result = ( pl.concat([english, other]) .group_by("taxonID")