From 6dc2781c6a9c5c4da354325ad38f609fa40e6c24 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Thu, 17 Jul 2025 11:28:06 -0700 Subject: [PATCH 01/13] Raise AlignmentError if no alignment result for target --- src/dcd_mapping/align.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/dcd_mapping/align.py b/src/dcd_mapping/align.py index b64f67a..2d02d41 100644 --- a/src/dcd_mapping/align.py +++ b/src/dcd_mapping/align.py @@ -363,6 +363,11 @@ def align( msg = f"BLAT result {target_label} matches multiple target gene names in scoreset {scoreset_metadata.urn}" target_gene = scoreset_metadata.target_genes[target_label] alignment_results[target_label] = _get_best_match(blat_result, target_gene) + # confirm that there is an alignment result for each target gene + for target_gene in scoreset_metadata.target_genes: + if target_gene not in alignment_results: + msg = f"No BLAT result found for target gene {target_gene} in scoreset {scoreset_metadata.urn}" + raise AlignmentError(msg) return alignment_results From 377a8c441f8fb6f52eeb8ab64057f1b2c0e68041 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Thu, 17 Jul 2025 11:28:48 -0700 Subject: [PATCH 02/13] Translate NT target sequence for targets with protein-level variants If a target has only protein-level variants, but the provided target sequence is a nucleotide sequence, translate the nucleotide sequence to an amino acid sequence immediately after metadata ingestion. This change avoids alignment errors that can occur when a target sequence has been codon-optimized to a non-human organism. Since we do not have sufficient metadata to assume that a target sequence has been codon-optimized, always perform translation when there are no nucleotide-level variants for a target. --- src/api/routers/map.py | 2 ++ src/dcd_mapping/main.py | 2 ++ src/dcd_mapping/mavedb_data.py | 24 ++++++++++++++++++++++++ 3 files changed, 28 insertions(+) diff --git a/src/api/routers/map.py b/src/api/routers/map.py index 32b3856..8b27573 100644 --- a/src/api/routers/map.py +++ b/src/api/routers/map.py @@ -16,6 +16,7 @@ from dcd_mapping.lookup import DataLookupError from dcd_mapping.mavedb_data import ( ScoresetNotSupportedError, + correct_target_sequence_type, get_raw_scoreset_metadata, get_scoreset_metadata, get_scoreset_records, @@ -48,6 +49,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse try: metadata = get_scoreset_metadata(urn, store_path) records = get_scoreset_records(metadata, True, store_path) + metadata = correct_target_sequence_type(metadata, records) except ScoresetNotSupportedError as e: return JSONResponse( content=ScoresetMapping( diff --git a/src/dcd_mapping/main.py b/src/dcd_mapping/main.py index 7ccf26c..4966ca5 100644 --- a/src/dcd_mapping/main.py +++ b/src/dcd_mapping/main.py @@ -22,6 +22,7 @@ ) from dcd_mapping.mavedb_data import ( ScoresetNotSupportedError, + correct_target_sequence_type, get_scoreset_metadata, get_scoreset_records, with_mavedb_score_set, @@ -332,6 +333,7 @@ async def map_scoreset_urn( try: metadata = get_scoreset_metadata(urn, store_path) records = get_scoreset_records(metadata, silent, store_path) + metadata = correct_target_sequence_type(metadata, records) except ScoresetNotSupportedError as e: _emit_info(f"Score set not supported: {e}", silent, logging.ERROR) final_output = write_scoreset_mapping_to_json( diff --git a/src/dcd_mapping/mavedb_data.py b/src/dcd_mapping/mavedb_data.py index a137e30..582a873 100644 --- a/src/dcd_mapping/mavedb_data.py +++ b/src/dcd_mapping/mavedb_data.py @@ -30,8 +30,10 @@ ScoresetMapping, ScoresetMetadata, TargetGene, + TargetSequenceType, UniProtRef, ) +from dcd_mapping.transcripts import _get_protein_sequence __all__ = [ "get_scoreset_urns", @@ -324,6 +326,28 @@ def get_scoreset_records( return _load_scoreset_records(scores_csv, metadata) +def correct_target_sequence_type( + metadata: ScoresetMetadata, records: dict +) -> ScoresetMetadata: + """If target sequence type is DNA but all variants are protein-level, change to protein. + This avoids BLAT errors in cases where the target sequence was codon-optimized + for a non-human organism + """ + for target_label, target in metadata.target_genes.items(): + if target.target_sequence_type == TargetSequenceType.DNA: + all_protein = True + for record in records.get(target_label, []): + if record.hgvs_pro == "NA" or not record.hgvs_pro: + all_protein = False + break + if all_protein: + msg = f"Changing target sequence type for {metadata.urn} target {target_label} from DNA to protein because all variants are protein-level" + _logger.info(msg) + target.target_sequence = _get_protein_sequence(target.target_sequence) + target.target_sequence_type = TargetSequenceType.PROTEIN + return metadata + + def with_mavedb_score_set(fn: Callable) -> Callable: @wraps(fn) async def wrapper(*args, **kwargs) -> ScoresetMapping: # noqa: ANN002 From a0b69709959009265bf9cc46debaa973f18f8733 Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Thu, 17 Jul 2025 12:58:02 -0400 Subject: [PATCH 03/13] Bump dataset file download timeout to 60 seconds This should help slightly with timeout issues we see in production --- src/dcd_mapping/resource_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dcd_mapping/resource_utils.py b/src/dcd_mapping/resource_utils.py index 153a99c..3689b6d 100644 --- a/src/dcd_mapping/resource_utils.py +++ b/src/dcd_mapping/resource_utils.py @@ -39,7 +39,7 @@ def http_download(url: str, out_path: Path, silent: bool = True) -> Path: if not silent: click.echo(f"Downloading {out_path.name} to {out_path.parents[0].absolute()}") with requests.get( - url, stream=True, timeout=30, headers=authentication_header() + url, stream=True, timeout=60, headers=authentication_header() ) as r: r.raise_for_status() total_size = int(r.headers.get("content-length", 0)) From 01427c4bc0c1ca055bbeed19e0f2245e2c40f65b Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Fri, 18 Jul 2025 13:58:41 -0700 Subject: [PATCH 04/13] Change function name to patch_target_sequence_type --- src/api/routers/map.py | 4 ++-- src/dcd_mapping/main.py | 4 ++-- src/dcd_mapping/mavedb_data.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/api/routers/map.py b/src/api/routers/map.py index 8b27573..fca57d7 100644 --- a/src/api/routers/map.py +++ b/src/api/routers/map.py @@ -16,10 +16,10 @@ from dcd_mapping.lookup import DataLookupError from dcd_mapping.mavedb_data import ( ScoresetNotSupportedError, - correct_target_sequence_type, get_raw_scoreset_metadata, get_scoreset_metadata, get_scoreset_records, + patch_target_sequence_type, with_mavedb_score_set, ) from dcd_mapping.resource_utils import ResourceAcquisitionError @@ -49,7 +49,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse try: metadata = get_scoreset_metadata(urn, store_path) records = get_scoreset_records(metadata, True, store_path) - metadata = correct_target_sequence_type(metadata, records) + metadata = patch_target_sequence_type(metadata, records) except ScoresetNotSupportedError as e: return JSONResponse( content=ScoresetMapping( diff --git a/src/dcd_mapping/main.py b/src/dcd_mapping/main.py index 4966ca5..107f885 100644 --- a/src/dcd_mapping/main.py +++ b/src/dcd_mapping/main.py @@ -22,9 +22,9 @@ ) from dcd_mapping.mavedb_data import ( ScoresetNotSupportedError, - correct_target_sequence_type, get_scoreset_metadata, get_scoreset_records, + patch_target_sequence_type, with_mavedb_score_set, ) from dcd_mapping.resource_utils import ResourceAcquisitionError @@ -333,7 +333,7 @@ async def map_scoreset_urn( try: metadata = get_scoreset_metadata(urn, store_path) records = get_scoreset_records(metadata, silent, store_path) - metadata = correct_target_sequence_type(metadata, records) + metadata = patch_target_sequence_type(metadata, records) except ScoresetNotSupportedError as e: _emit_info(f"Score set not supported: {e}", silent, logging.ERROR) final_output = write_scoreset_mapping_to_json( diff --git a/src/dcd_mapping/mavedb_data.py b/src/dcd_mapping/mavedb_data.py index 582a873..2cab736 100644 --- a/src/dcd_mapping/mavedb_data.py +++ b/src/dcd_mapping/mavedb_data.py @@ -326,7 +326,7 @@ def get_scoreset_records( return _load_scoreset_records(scores_csv, metadata) -def correct_target_sequence_type( +def patch_target_sequence_type( metadata: ScoresetMetadata, records: dict ) -> ScoresetMetadata: """If target sequence type is DNA but all variants are protein-level, change to protein. From d60e81a0cb29477301ed3b3b8980d4b2005eae7b Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Fri, 18 Jul 2025 14:01:31 -0700 Subject: [PATCH 05/13] Bump version number This update changes how alignment is performed for some score sets, so bump major version. --- src/dcd_mapping/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dcd_mapping/version.py b/src/dcd_mapping/version.py index 8c31b0b..848b44d 100644 --- a/src/dcd_mapping/version.py +++ b/src/dcd_mapping/version.py @@ -1,3 +1,3 @@ """Provide dcd mapping version""" -dcd_mapping_version = "2025.1.0" +dcd_mapping_version = "2025.2.0" From 36bc828915aca52c544010bfd72cfc1de6a93241 Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Tue, 22 Jul 2025 15:19:39 -0700 Subject: [PATCH 06/13] Enforce a 1 to 1 relationship between incoming score rows and outgoing annotated mappings Prior to this change, it was possible for some score rows to generate valid mappings with other score rows not creating a mapped variant. This had some negative downstream consequences, which will be remedied by ensuring that if any variant receives a mapped variant, all variants receive a mapped variant. --- src/api/routers/map.py | 45 ++++++++++++++++++++++---- src/dcd_mapping/annotate.py | 59 ++++++++++++++++++++-------------- src/dcd_mapping/transcripts.py | 25 ++++++-------- src/dcd_mapping/vrs_map.py | 35 +++++++++++++++----- 4 files changed, 109 insertions(+), 55 deletions(-) diff --git a/src/api/routers/map.py b/src/api/routers/map.py index 8b27573..f6fe6f9 100644 --- a/src/api/routers/map.py +++ b/src/api/routers/map.py @@ -68,6 +68,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse error_message="Score set contains no variants to map", ).model_dump(exclude_none=True) ) + total_score_records = sum(len(v) for v in records.values()) try: alignment_results = build_alignment_result(metadata, True) @@ -120,15 +121,27 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse metadata=metadata, error_message=str(e).strip("'") ).model_dump(exclude_none=True) ) - if not vrs_results or all( - mapping_result is None for mapping_result in vrs_results.values() - ): + + nonetype_vrs_results = [ + result is None + for target_gene in vrs_results + for result in vrs_results[target_gene] + ] + + if not vrs_results or all(nonetype_vrs_results): return JSONResponse( content=ScoresetMapping( metadata=metadata, error_message="No variant mappings available for this score set", ).model_dump(exclude_none=True) ) + if any(nonetype_vrs_results): + return JSONResponse( + content=ScoresetMapping( + metadata=metadata, + error_message="Some variants generated vrs results, but not all. If any variants were mapped, all should have been.", + ).model_dump(exclude_none=True) + ) annotated_vrs_results = {} try: @@ -146,15 +159,27 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse metadata=metadata, error_message=str(e).strip("'") ).model_dump(exclude_none=True) ) - if not annotated_vrs_results or all( - mapping_result is None for mapping_result in annotated_vrs_results.values() - ): + + nonetype_annotated_vrs_results = [ + result is None + for target_gene in annotated_vrs_results + for result in annotated_vrs_results[target_gene] + ] + + if not annotated_vrs_results or all(nonetype_annotated_vrs_results): return JSONResponse( content=ScoresetMapping( metadata=metadata, error_message="No annotated variant mappings available for this score set", ).model_dump(exclude_none=True) ) + if any(nonetype_annotated_vrs_results): + return JSONResponse( + content=ScoresetMapping( + metadata=metadata, + error_message="Some variants generated annotated vrs results, but not all. If any variants were annotated, all should have been.", + ).model_dump(exclude_none=True) + ) try: raw_metadata = get_raw_scoreset_metadata(urn, store_path) @@ -235,6 +260,14 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse ).model_dump(exclude_none=True) ) + if len(mapped_scores) != total_score_records: + return JSONResponse( + content=ScoresetMapping( + metadata=metadata, + error_message=f"Mismatch between number of mapped scores ({len(mapped_scores)}) and total score records ({total_score_records}). This is unexpected and indicates an issue with the mapping process.", + ).model_dump(exclude_none=True) + ) + return JSONResponse( content=ScoresetMapping( metadata=raw_metadata, diff --git a/src/dcd_mapping/annotate.py b/src/dcd_mapping/annotate.py index 9a9532e..39e6902 100644 --- a/src/dcd_mapping/annotate.py +++ b/src/dcd_mapping/annotate.py @@ -266,13 +266,16 @@ def _annotate_allele_mapping( sequence_id = f"ga4gh:{mapped_score.post_mapped.location.sequenceReference.refgetAccession}" accession = get_chromosome_identifier_from_vrs_id(sequence_id) if accession is None: - raise ValueError - if accession.startswith("refseq:"): + accession = None + mapped_score.error_message = "Could not determine accession for this annotation. No allele expression is available." + elif accession.startswith("refseq:"): accession = accession[7:] else: if tx_results is None or isinstance(tx_results, TxSelectError): - raise ValueError # impossible by definition - accession = tx_results.np + accession = None + mapped_score.error_message = "Could not determine accession for this annotation. No allele expression is available." + else: + accession = tx_results.np sr = get_seqrepo() loc = mapped_score.post_mapped.location @@ -281,8 +284,9 @@ def _annotate_allele_mapping( post_mapped.extensions = [ Extension(type="Extension", name="vrs_ref_allele_seq", value=ref) ] - hgvs_string, syntax = _get_hgvs_string(post_mapped, accession) - post_mapped.expressions = [Expression(syntax=syntax, value=hgvs_string)] + if accession: + hgvs_string, syntax = _get_hgvs_string(post_mapped, accession) + post_mapped.expressions = [Expression(syntax=syntax, value=hgvs_string)] if vrs_version == VrsVersion.V_1_3: pre_mapped = _allele_to_vod(pre_mapped) @@ -295,9 +299,7 @@ def _annotate_allele_mapping( mavedb_id=mapped_score.accession_id, score=float(mapped_score.score) if mapped_score.score else None, annotation_layer=mapped_score.annotation_layer, - error_message=mapped_score.error_message - if mapped_score.error_message - else None, # TODO might not need if statement here + error_message=mapped_score.error_message, ) @@ -321,13 +323,17 @@ def _annotate_haplotype_mapping( sequence_id = f"ga4gh:{post_mapped.members[0].location.sequenceReference.refgetAccession}" accession = get_chromosome_identifier_from_vrs_id(sequence_id) if accession is None: - raise ValueError - if accession.startswith("refseq:"): + accession = None + mapped_score.error_message = "Could not determine accession for this annotation. No allele expression is available." + elif accession.startswith("refseq:"): accession = accession[7:] else: if tx_results is None or isinstance(tx_results, TxSelectError): - raise ValueError # impossible by definition - accession = tx_results.np + # impossible by definition + accession = None + mapped_score.error_message = "Could not determine accession for this annotation. No allele expression is available." + else: + accession = tx_results.np sr = get_seqrepo() for allele in post_mapped.members: @@ -337,8 +343,9 @@ def _annotate_haplotype_mapping( allele.extensions = [ Extension(type="Extension", name="vrs_ref_allele_seq", value=ref) ] - hgvs, syntax = _get_hgvs_string(allele, accession) - allele.expressions = [Expression(syntax=syntax, value=hgvs)] + if accession: + hgvs, syntax = _get_hgvs_string(allele, accession) + allele.expressions = [Expression(syntax=syntax, value=hgvs)] if vrs_version == VrsVersion.V_1_3: pre_mapped = _haplotype_to_haplotype_1_3(pre_mapped) @@ -351,9 +358,7 @@ def _annotate_haplotype_mapping( mavedb_id=mapped_score.accession_id, score=float(mapped_score.score) if mapped_score.score is not None else None, annotation_layer=mapped_score.annotation_layer, - error_message=mapped_score.error_message - if mapped_score.error_message - else None, # TODO might not need if statement here + error_message=mapped_score.error_message, ) @@ -388,6 +393,7 @@ def annotate( ScoreAnnotationWithLayer( mavedb_id=mapped_score.accession_id, score=float(mapped_score.score) if mapped_score.score else None, + vrs_version=vrs_version, error_message=mapped_score.error_message, ) ) @@ -410,8 +416,16 @@ def annotate( ) ) else: - # TODO how to combine this error message with other potential error messages? - ValueError("inconsistent variant structure") + score_annotations.append( + ScoreAnnotationWithLayer( + pre_mapped=mapped_score.pre_mapped, + post_mapped=mapped_score.post_mapped, + vrs_version=vrs_version, + mavedb_id=mapped_score.accession_id, + score=float(mapped_score.score) if mapped_score.score else None, + error_message=f"Multiple issues with annotation: Inconsistent variant structure (Allele and Haplotype mix).{' ' + mapped_score.error_message if mapped_score.error_message else ''}", + ) + ) return score_annotations @@ -519,11 +533,6 @@ def _set_scoreset_layer( expressions. If genomic expressions are available, that's what we'd like to use. This function tells us how to filter the final annotation objects. """ - if urn.startswith("urn:mavedb:00000097"): - _logger.debug( - "Manually selecting protein annotation for scores from urn:mavedb:00000097" - ) - return AnnotationLayer.PROTEIN for mapping in mappings: if mapping.annotation_layer == AnnotationLayer.GENOMIC: return AnnotationLayer.GENOMIC diff --git a/src/dcd_mapping/transcripts.py b/src/dcd_mapping/transcripts.py index 9034894..feb310d 100644 --- a/src/dcd_mapping/transcripts.py +++ b/src/dcd_mapping/transcripts.py @@ -54,7 +54,10 @@ async def _get_compatible_transcripts( chromosome = get_chromosome_identifier(aligned_chrom) gene_symbol = get_gene_symbol(target_gene) if not gene_symbol: - raise TxSelectError + msg = ( + f"Unable to find gene symbol for target gene {target_gene.target_gene_name}" + ) + raise TxSelectError(msg) transcript_matches = [] for hit_range in align_result.hit_subranges: matches_list = await get_transcripts( @@ -179,7 +182,8 @@ async def _select_protein_reference( if not best_tx: best_tx = await _get_longest_compatible_transcript(common_transcripts) if not best_tx: - raise TxSelectError + msg = f"Unable to find matching MANE transcripts for target gene {target_gene.target_gene_name}" + raise TxSelectError(msg) ref_sequence = get_sequence(best_tx.refseq_prot) nm_accession = best_tx.refseq_nuc np_accession = best_tx.refseq_prot @@ -323,19 +327,6 @@ async def select_transcript( :param align_result: alignment results :return: Transcript description (accession ID, offset, selected sequence, etc) """ - if scoreset_urn.startswith("urn:mavedb:00000097"): - _logger.info( - "Score sets in urn:mavedb:00000097 are already expressed in full HGVS strings -- using predefined results because additional hard-coding is unnecessary" - ) - return TxSelectResult( - nm="NM_007294.3", - np="NP_009225.1", - start=0, - is_full_match=False, - transcript_mode=TranscriptPriority.MANE_SELECT, - sequence=_get_protein_sequence(target_gene.target_sequence), - ) - if target_gene.target_gene_category != TargetType.PROTEIN_CODING: _logger.debug( "%s is regulatory/noncoding -- skipping transcript selection", @@ -366,7 +357,9 @@ async def select_transcripts( * Dict where keys are target labels and values are alignment result objects :return: dict where keys are target labels and values are objects describing selected transcript (accession ID, offset, selected sequence, etc) """ - selected_transcripts = {} + selected_transcripts: dict[ + str, TxSelectResult | TxSelectError | KeyError | None + ] = {} for target_gene in scoreset_metadata.target_genes: if scoreset_metadata.target_genes[target_gene].target_accession_id: # for accession-based targets, create tx select objects for protein sequence accessions only diff --git a/src/dcd_mapping/vrs_map.py b/src/dcd_mapping/vrs_map.py index b03742c..ff928a7 100644 --- a/src/dcd_mapping/vrs_map.py +++ b/src/dcd_mapping/vrs_map.py @@ -453,8 +453,11 @@ def _map_genomic( # if the sequence id starts with SQ, it is a target sequence which is in the ga4gh namespace namespace = "ga4gh" else: - msg = f"Namespace could not be inferred from sequence: {sequence_id}" - raise ValueError(msg) + return MappedScore( + accession_id=row.accession, + score=row.score, + error_message=f"Namespace could not be inferred from sequence: {sequence_id}", + ) if ( row.hgvs_nt in {"_wt", "_sy", "="} @@ -609,8 +612,11 @@ def _map_genomic( error_message=str(e), ) else: - msg = f"Reference sequence namespace not supported: {namespace}" - raise ValueError(msg) + return MappedScore( + accession_id=row.accession, + score=row.score, + error_message=f"Reference sequence namespace not supported: {namespace}", + ) return MappedScore( accession_id=row.accession, @@ -783,7 +789,14 @@ def _map_accession( variations: list[MappedScore] = [] sequence_id = metadata.target_accession_id if sequence_id is None: - raise ValueError + return [ + MappedScore( + accession_id=row.accession, + score=row.score, + error_message="Could not generate mapped allele objects. No sequence id was provided.", + ) + for row in records + ] store_accession(sequence_id) @@ -802,8 +815,14 @@ def _map_accession( hgvs_nt_mappings = _map_genomic(row, sequence_id, align_result) variations.append(hgvs_nt_mappings) else: - msg = f"Unrecognized accession prefix for accession id {metadata.target_accession_id}" - raise ValueError(msg) + [ + MappedScore( + accession_id=row.accession, + score=row.score, + error_message=f"Unrecognized accession prefix for accession id {metadata.target_accession_id}", + ) + for row in records + ] return variations @@ -887,7 +906,7 @@ def vrs_map( records: list[ScoreRow], transcript: TxSelectResult | TxSelectError | None = None, silent: bool = True, -) -> list[MappedScore] | None: +) -> list[MappedScore]: """Given a description of a MAVE scoreset and an aligned transcript, generate the corresponding VRS objects. From f6f354059aa7bf7d74475508bac32c1c933db98b Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Fri, 18 Jul 2025 13:58:41 -0700 Subject: [PATCH 07/13] Change function name to patch_target_sequence_type --- src/api/routers/map.py | 4 ++-- src/dcd_mapping/main.py | 4 ++-- src/dcd_mapping/mavedb_data.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/api/routers/map.py b/src/api/routers/map.py index f6fe6f9..9a6503d 100644 --- a/src/api/routers/map.py +++ b/src/api/routers/map.py @@ -16,10 +16,10 @@ from dcd_mapping.lookup import DataLookupError from dcd_mapping.mavedb_data import ( ScoresetNotSupportedError, - correct_target_sequence_type, get_raw_scoreset_metadata, get_scoreset_metadata, get_scoreset_records, + patch_target_sequence_type, with_mavedb_score_set, ) from dcd_mapping.resource_utils import ResourceAcquisitionError @@ -49,7 +49,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse try: metadata = get_scoreset_metadata(urn, store_path) records = get_scoreset_records(metadata, True, store_path) - metadata = correct_target_sequence_type(metadata, records) + metadata = patch_target_sequence_type(metadata, records) except ScoresetNotSupportedError as e: return JSONResponse( content=ScoresetMapping( diff --git a/src/dcd_mapping/main.py b/src/dcd_mapping/main.py index 4966ca5..107f885 100644 --- a/src/dcd_mapping/main.py +++ b/src/dcd_mapping/main.py @@ -22,9 +22,9 @@ ) from dcd_mapping.mavedb_data import ( ScoresetNotSupportedError, - correct_target_sequence_type, get_scoreset_metadata, get_scoreset_records, + patch_target_sequence_type, with_mavedb_score_set, ) from dcd_mapping.resource_utils import ResourceAcquisitionError @@ -333,7 +333,7 @@ async def map_scoreset_urn( try: metadata = get_scoreset_metadata(urn, store_path) records = get_scoreset_records(metadata, silent, store_path) - metadata = correct_target_sequence_type(metadata, records) + metadata = patch_target_sequence_type(metadata, records) except ScoresetNotSupportedError as e: _emit_info(f"Score set not supported: {e}", silent, logging.ERROR) final_output = write_scoreset_mapping_to_json( diff --git a/src/dcd_mapping/mavedb_data.py b/src/dcd_mapping/mavedb_data.py index 582a873..2cab736 100644 --- a/src/dcd_mapping/mavedb_data.py +++ b/src/dcd_mapping/mavedb_data.py @@ -326,7 +326,7 @@ def get_scoreset_records( return _load_scoreset_records(scores_csv, metadata) -def correct_target_sequence_type( +def patch_target_sequence_type( metadata: ScoresetMetadata, records: dict ) -> ScoresetMetadata: """If target sequence type is DNA but all variants are protein-level, change to protein. From a75d55ae36f61d80ff4f195334c4df5163ab4cdb Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Fri, 18 Jul 2025 14:01:31 -0700 Subject: [PATCH 08/13] Bump version number This update changes how alignment is performed for some score sets, so bump major version. --- src/dcd_mapping/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dcd_mapping/version.py b/src/dcd_mapping/version.py index 8c31b0b..848b44d 100644 --- a/src/dcd_mapping/version.py +++ b/src/dcd_mapping/version.py @@ -1,3 +1,3 @@ """Provide dcd mapping version""" -dcd_mapping_version = "2025.1.0" +dcd_mapping_version = "2025.2.0" From 3deeeaf309a6e42d9d7a42a9d06208e68563795e Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Wed, 23 Jul 2025 14:33:44 -0700 Subject: [PATCH 09/13] fix: vrs_ref_allele_seq extension could potentially have an empty string as its sequence --- src/dcd_mapping/annotate.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/src/dcd_mapping/annotate.py b/src/dcd_mapping/annotate.py index 39e6902..145dafb 100644 --- a/src/dcd_mapping/annotate.py +++ b/src/dcd_mapping/annotate.py @@ -140,7 +140,7 @@ def _get_vrs_ref_allele_seq( metadata: TargetGene, urn: str, tx_select_results: TxSelectResult | None, -) -> Extension: +) -> Extension | None: """Create `vrs_ref_allele_seq` property.""" start, end = _offset_allele_ref_seq(urn, allele.location.start, allele.location.end) if ( @@ -161,8 +161,12 @@ def _get_vrs_ref_allele_seq( seq = f"ga4gh:{allele.location.sequenceReference.refgetAccession}" # type: ignore sr = get_seqrepo() ref = sr.get_sequence(seq, start, end) - if ref is None: - raise ValueError + + if not ref: + msg = f"Could not retrieve reference sequence for allele {allele.id} in urn {urn} with start {start} and end {end}" + _logger.warning(msg) + return None + return Extension(type="Extension", name="vrs_ref_allele_seq", value=ref) @@ -256,9 +260,11 @@ def _annotate_allele_mapping( post_mapped: Allele = mapped_score.post_mapped # get vrs_ref_allele_seq for pre-mapped variants - pre_mapped.extensions = [ - _get_vrs_ref_allele_seq(pre_mapped, metadata, urn, tx_results) - ] + ref_allele_seq_extension = _get_vrs_ref_allele_seq( + pre_mapped, metadata, urn, tx_results + ) + if ref_allele_seq_extension is not None: + pre_mapped.extensions = [ref_allele_seq_extension] if post_mapped: # Determine reference sequence @@ -313,9 +319,14 @@ def _annotate_haplotype_mapping( """Perform annotations and, if necessary, create VRS 1.3 equivalents for haplotype mappings.""" pre_mapped: Haplotype = mapped_score.pre_mapped # type: ignore post_mapped: Haplotype = mapped_score.post_mapped # type: ignore + # get vrs_ref_allele_seq for pre-mapped variants for allele in pre_mapped.members: - allele.extensions = [_get_vrs_ref_allele_seq(allele, metadata, urn, tx_results)] + ref_allele_seq_extension = _get_vrs_ref_allele_seq( + allele, metadata, urn, tx_results + ) + if ref_allele_seq_extension is not None: + allele.extensions = [ref_allele_seq_extension] if post_mapped: # Determine reference sequence From efe1ca4784862165a068becf695c5a7fc00686af Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Thu, 24 Jul 2025 11:23:16 -0700 Subject: [PATCH 10/13] Bump dev Seqrepo version --- docker-compose-dev.yml | 2 +- settings/.env.dev | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker-compose-dev.yml b/docker-compose-dev.yml index f4266f8..25a0ca7 100644 --- a/docker-compose-dev.yml +++ b/docker-compose-dev.yml @@ -30,7 +30,7 @@ services: - vrs-mapping-data-dev:/var/lib/postgresql/data seqrepo: - image: biocommons/seqrepo:2021-01-29 + image: biocommons/seqrepo:2024-12-20 volumes: - vrs-mapping-seqrepo-dev:/usr/local/share/seqrepo diff --git a/settings/.env.dev b/settings/.env.dev index 956bfae..359fc2a 100644 --- a/settings/.env.dev +++ b/settings/.env.dev @@ -30,4 +30,4 @@ MAVEDB_API_KEY= # Environment variables for seqrepo #################################################################################################### -SEQREPO_ROOT_DIR=/usr/local/share/seqrepo/2021-01-29 +SEQREPO_ROOT_DIR=/usr/local/share/seqrepo/2024-12-20 From 28289d49b949b5ebebfe052f4dc7e082a3fca6b5 Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Thu, 24 Jul 2025 11:23:42 -0700 Subject: [PATCH 11/13] fix: update logic in patch_target_sequence_type to handle cases with no nucleotide variants --- src/dcd_mapping/mavedb_data.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/src/dcd_mapping/mavedb_data.py b/src/dcd_mapping/mavedb_data.py index 2cab736..8dd0928 100644 --- a/src/dcd_mapping/mavedb_data.py +++ b/src/dcd_mapping/mavedb_data.py @@ -329,22 +329,19 @@ def get_scoreset_records( def patch_target_sequence_type( metadata: ScoresetMetadata, records: dict ) -> ScoresetMetadata: - """If target sequence type is DNA but all variants are protein-level, change to protein. + """If target sequence type is DNA but no nucleotide variants are defined, treat the target as if + it were a protein level target. This avoids BLAT errors in cases where the target sequence was codon-optimized for a non-human organism """ for target_label, target in metadata.target_genes.items(): - if target.target_sequence_type == TargetSequenceType.DNA: - all_protein = True - for record in records.get(target_label, []): - if record.hgvs_pro == "NA" or not record.hgvs_pro: - all_protein = False - break - if all_protein: - msg = f"Changing target sequence type for {metadata.urn} target {target_label} from DNA to protein because all variants are protein-level" - _logger.info(msg) - target.target_sequence = _get_protein_sequence(target.target_sequence) - target.target_sequence_type = TargetSequenceType.PROTEIN + if target.target_sequence_type == TargetSequenceType.DNA and not any( + record.hgvs_nt for record in records.get(target_label, []) + ): + msg = f"Changing target sequence type for {metadata.urn} target {target_label} from DNA to protein because target only has protein-level variants" + _logger.info(msg) + target.target_sequence = _get_protein_sequence(target.target_sequence) + target.target_sequence_type = TargetSequenceType.PROTEIN return metadata From 1e732a27f36e50f310e07c4a8db3ad19bc3755a5 Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Mon, 28 Jul 2025 09:38:23 -0700 Subject: [PATCH 12/13] feat: add custom exceptions for VRS mapping errors and update error handling in mapping functions --- src/api/routers/map.py | 15 ++++++++++++-- src/dcd_mapping/cli.py | 2 +- src/dcd_mapping/exceptions.py | 17 ++++++++++++++++ src/dcd_mapping/main.py | 15 ++++++++++++-- src/dcd_mapping/vrs_map.py | 38 +++++++++++------------------------ 5 files changed, 56 insertions(+), 31 deletions(-) create mode 100644 src/dcd_mapping/exceptions.py diff --git a/src/api/routers/map.py b/src/api/routers/map.py index 9a6503d..1c9c0b7 100644 --- a/src/api/routers/map.py +++ b/src/api/routers/map.py @@ -13,6 +13,12 @@ _set_scoreset_layer, annotate, ) +from dcd_mapping.exceptions import ( + MissingSequenceIdError, + UnsupportedReferenceSequenceNameSpaceError, + UnsupportedReferenceSequencePrefixError, + VrsMapError, +) from dcd_mapping.lookup import DataLookupError from dcd_mapping.mavedb_data import ( ScoresetNotSupportedError, @@ -31,7 +37,7 @@ VrsVersion, ) from dcd_mapping.transcripts import select_transcripts -from dcd_mapping.vrs_map import VrsMapError, vrs_map +from dcd_mapping.vrs_map import vrs_map router = APIRouter( prefix="/api/v1", tags=["mappings"], responses={404: {"description": "Not found"}} @@ -115,7 +121,12 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse transcript=transcripts[target_gene], silent=True, ) - except VrsMapError as e: + except ( + UnsupportedReferenceSequenceNameSpaceError, + VrsMapError, + UnsupportedReferenceSequencePrefixError, + MissingSequenceIdError, + ) as e: return JSONResponse( content=ScoresetMapping( metadata=metadata, error_message=str(e).strip("'") diff --git a/src/dcd_mapping/cli.py b/src/dcd_mapping/cli.py index 0db86b3..c5ae702 100644 --- a/src/dcd_mapping/cli.py +++ b/src/dcd_mapping/cli.py @@ -7,11 +7,11 @@ import click from dcd_mapping.align import AlignmentError +from dcd_mapping.exceptions import VrsMapError from dcd_mapping.main import map_scoreset_urn from dcd_mapping.resource_utils import ResourceAcquisitionError from dcd_mapping.schemas import VrsVersion from dcd_mapping.transcripts import TxSelectError -from dcd_mapping.vrs_map import VrsMapError _logger = logging.getLogger(__name__) diff --git a/src/dcd_mapping/exceptions.py b/src/dcd_mapping/exceptions.py new file mode 100644 index 0000000..8f5452b --- /dev/null +++ b/src/dcd_mapping/exceptions.py @@ -0,0 +1,17 @@ +"""Exceptions for DCD Mapping Module""" + + +class VrsMapError(Exception): + """Raise in case of generic VRS mapping errors.""" + + +class UnsupportedReferenceSequenceNameSpaceError(ValueError): + """Raised when a reference sequence name space is not supported.""" + + +class MissingSequenceIdError(ValueError): + """Raised when a sequence ID is not provided.""" + + +class UnsupportedReferenceSequencePrefixError(ValueError): + """Raised when a reference sequence prefix is not supported.""" diff --git a/src/dcd_mapping/main.py b/src/dcd_mapping/main.py index 107f885..aea0205 100644 --- a/src/dcd_mapping/main.py +++ b/src/dcd_mapping/main.py @@ -14,6 +14,12 @@ save_mapped_output_json, write_scoreset_mapping_to_json, ) +from dcd_mapping.exceptions import ( + MissingSequenceIdError, + UnsupportedReferenceSequenceNameSpaceError, + UnsupportedReferenceSequencePrefixError, + VrsMapError, +) from dcd_mapping.lookup import ( DataLookupError, check_gene_normalizer, @@ -35,7 +41,7 @@ VrsVersion, ) from dcd_mapping.transcripts import select_transcripts -from dcd_mapping.vrs_map import VrsMapError, vrs_map +from dcd_mapping.vrs_map import vrs_map _logger = logging.getLogger(__name__) @@ -223,7 +229,12 @@ async def map_scoreset( transcript=transcripts[target_gene], silent=silent, ) - except VrsMapError as e: + except ( + MissingSequenceIdError, + UnsupportedReferenceSequencePrefixError, + UnsupportedReferenceSequenceNameSpaceError, + VrsMapError, + ) as e: _emit_info( f"VRS mapping failed for scoreset {metadata.urn}", silent, logging.ERROR ) diff --git a/src/dcd_mapping/vrs_map.py b/src/dcd_mapping/vrs_map.py index ff928a7..db81d57 100644 --- a/src/dcd_mapping/vrs_map.py +++ b/src/dcd_mapping/vrs_map.py @@ -20,6 +20,11 @@ from mavehgvs.util import parse_variant_strings from mavehgvs.variant import Variant +from dcd_mapping.exceptions import ( + MissingSequenceIdError, + UnsupportedReferenceSequenceNameSpaceError, + UnsupportedReferenceSequencePrefixError, +) from dcd_mapping.lookup import ( cdot_rest, get_chromosome_identifier, @@ -37,16 +42,12 @@ ) from dcd_mapping.transcripts import TxSelectError -__all__ = ["vrs_map", "VrsMapError"] +__all__ = ["vrs_map"] _logger = logging.getLogger(__name__) -class VrsMapError(Exception): - """Raise in case of VRS mapping errors.""" - - def _hgvs_variant_is_valid(hgvs_string: str) -> bool: return not hgvs_string.endswith((".=", ")", "X")) @@ -612,11 +613,8 @@ def _map_genomic( error_message=str(e), ) else: - return MappedScore( - accession_id=row.accession, - score=row.score, - error_message=f"Reference sequence namespace not supported: {namespace}", - ) + msg = f"Unsupported reference sequence namespace: {namespace}" + raise UnsupportedReferenceSequenceNameSpaceError(msg) return MappedScore( accession_id=row.accession, @@ -789,14 +787,8 @@ def _map_accession( variations: list[MappedScore] = [] sequence_id = metadata.target_accession_id if sequence_id is None: - return [ - MappedScore( - accession_id=row.accession, - score=row.score, - error_message="Could not generate mapped allele objects. No sequence id was provided.", - ) - for row in records - ] + msg = " No target_accession_id was provided by target gene metadata. Target gene metadata must have a target_accession_id to map to VRS." + raise MissingSequenceIdError(msg) store_accession(sequence_id) @@ -815,14 +807,8 @@ def _map_accession( hgvs_nt_mappings = _map_genomic(row, sequence_id, align_result) variations.append(hgvs_nt_mappings) else: - [ - MappedScore( - accession_id=row.accession, - score=row.score, - error_message=f"Unrecognized accession prefix for accession id {metadata.target_accession_id}", - ) - for row in records - ] + msg = f"Unrecognized accession prefix for accession id: {metadata.target_accession_id}" + raise UnsupportedReferenceSequencePrefixError(msg) return variations From 430f1b794f80dfdd790aaf5a97212892240185d2 Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Mon, 28 Jul 2025 09:53:50 -0700 Subject: [PATCH 13/13] feat: refactor custom exceptions to exceptions module --- src/api/routers/map.py | 10 +++++--- src/dcd_mapping/align.py | 19 ++++++-------- src/dcd_mapping/cli.py | 10 +++++--- src/dcd_mapping/exceptions.py | 42 +++++++++++++++++++++++++++++++ src/dcd_mapping/lookup.py | 21 ++++++++++------ src/dcd_mapping/main.py | 10 +++++--- src/dcd_mapping/mavedb_data.py | 11 ++++---- src/dcd_mapping/resource_utils.py | 4 --- src/dcd_mapping/transcripts.py | 7 ++---- 9 files changed, 87 insertions(+), 47 deletions(-) diff --git a/src/api/routers/map.py b/src/api/routers/map.py index 1c9c0b7..d53cc24 100644 --- a/src/api/routers/map.py +++ b/src/api/routers/map.py @@ -6,7 +6,7 @@ from fastapi.responses import JSONResponse from requests import HTTPError -from dcd_mapping.align import AlignmentError, BlatNotFoundError, build_alignment_result +from dcd_mapping.align import build_alignment_result from dcd_mapping.annotate import ( _get_computed_reference_sequence, _get_mapped_reference_sequence, @@ -14,21 +14,23 @@ annotate, ) from dcd_mapping.exceptions import ( + AlignmentError, + BlatNotFoundError, + DataLookupError, MissingSequenceIdError, + ResourceAcquisitionError, + ScoresetNotSupportedError, UnsupportedReferenceSequenceNameSpaceError, UnsupportedReferenceSequencePrefixError, VrsMapError, ) -from dcd_mapping.lookup import DataLookupError from dcd_mapping.mavedb_data import ( - ScoresetNotSupportedError, get_raw_scoreset_metadata, get_scoreset_metadata, get_scoreset_records, patch_target_sequence_type, with_mavedb_score_set, ) -from dcd_mapping.resource_utils import ResourceAcquisitionError from dcd_mapping.schemas import ( ScoreAnnotation, ScoresetMapping, diff --git a/src/dcd_mapping/align.py b/src/dcd_mapping/align.py index 2d02d41..7f54b41 100644 --- a/src/dcd_mapping/align.py +++ b/src/dcd_mapping/align.py @@ -12,12 +12,15 @@ from Bio.SearchIO._model import Hit, QueryResult from cool_seq_tool.schemas import Strand -from dcd_mapping.lookup import get_chromosome_identifier, get_gene_location -from dcd_mapping.mavedb_data import LOCAL_STORE_PATH, ScoresetNotSupportedError -from dcd_mapping.resource_utils import ( +from dcd_mapping.exceptions import ( + AlignmentError, + BlatNotFoundError, ResourceAcquisitionError, - http_download, + ScoresetNotSupportedError, ) +from dcd_mapping.lookup import get_chromosome_identifier, get_gene_location +from dcd_mapping.mavedb_data import LOCAL_STORE_PATH +from dcd_mapping.resource_utils import http_download from dcd_mapping.schemas import ( AlignmentResult, GeneLocation, @@ -32,14 +35,6 @@ _logger = logging.getLogger(__name__) -class AlignmentError(Exception): - """Raise when errors encountered during alignment.""" - - -class BlatNotFoundError(AlignmentError): - """Raise when BLAT binary appears to be missing.""" - - def _write_query_file(file: Path, lines: list[str]) -> None: """Write lines to query file. This method is broken out to enable easy mocking while testing. diff --git a/src/dcd_mapping/cli.py b/src/dcd_mapping/cli.py index c5ae702..dafa44f 100644 --- a/src/dcd_mapping/cli.py +++ b/src/dcd_mapping/cli.py @@ -6,12 +6,14 @@ import click -from dcd_mapping.align import AlignmentError -from dcd_mapping.exceptions import VrsMapError +from dcd_mapping.exceptions import ( + AlignmentError, + ResourceAcquisitionError, + TxSelectError, + VrsMapError, +) from dcd_mapping.main import map_scoreset_urn -from dcd_mapping.resource_utils import ResourceAcquisitionError from dcd_mapping.schemas import VrsVersion -from dcd_mapping.transcripts import TxSelectError _logger = logging.getLogger(__name__) diff --git a/src/dcd_mapping/exceptions.py b/src/dcd_mapping/exceptions.py index 8f5452b..a2817c9 100644 --- a/src/dcd_mapping/exceptions.py +++ b/src/dcd_mapping/exceptions.py @@ -1,6 +1,9 @@ """Exceptions for DCD Mapping Module""" +### Custom Exceptions for VRS Mapping Errors ### + + class VrsMapError(Exception): """Raise in case of generic VRS mapping errors.""" @@ -15,3 +18,42 @@ class MissingSequenceIdError(ValueError): class UnsupportedReferenceSequencePrefixError(ValueError): """Raised when a reference sequence prefix is not supported.""" + + +### Custom Exceptions for Alignment Errors ### + + +class AlignmentError(ValueError): + """Raise when errors encountered during alignment.""" + + +class BlatNotFoundError(AlignmentError): + """Raise when BLAT binary appears to be missing.""" + + +### Custom Exceptions for Data Lookup Errors ### + + +class DataLookupError(Exception): + """Raise for misc. issues related to resource acquisition/lookup.""" + + +### Custom Exceptions for MaveDB Data Errors ### + + +class ScoresetNotSupportedError(ValueError): + """Raise when a score set cannot be mapped because it has characteristics that are not currently supported.""" + + +### Custom Exceptions for Resource Acquisition Errors ### + + +class ResourceAcquisitionError(ValueError): + """Raise when resource acquisition fails.""" + + +### Custom Exceptions for Transcript Selection Errors ### + + +class TxSelectError(ValueError): + """Raise for transcript selection failure.""" diff --git a/src/dcd_mapping/lookup.py b/src/dcd_mapping/lookup.py index bf53f48..34de2f0 100644 --- a/src/dcd_mapping/lookup.py +++ b/src/dcd_mapping/lookup.py @@ -49,6 +49,7 @@ from gene.query import QueryHandler from gene.schemas import MatchType, SourceName +from dcd_mapping.exceptions import DataLookupError from dcd_mapping.schemas import ( GeneLocation, ManeDescription, @@ -93,10 +94,6 @@ def cdot_rest() -> RESTDataProvider: # ---------------------------------- Global ---------------------------------- # -class DataLookupError(Exception): - """Raise for misc. issues related to resource acquisition/lookup.""" - - class CoolSeqToolBuilder: """Singleton constructor for ``cool-seq-tool`` instance.""" @@ -241,7 +238,9 @@ async def check_uta() -> None: query = f"select * from {uta.schema}.meta" # noqa: S608 result = await uta.execute_query(query) if not result: - raise DataLookupError + msg = "UTA schema check failed. No results returned." + _logger.error(msg) + raise DataLookupError(msg) async def get_protein_accession(transcript: str) -> str | None: @@ -302,9 +301,13 @@ async def get_transcripts( def check_gene_normalizer() -> None: q = GeneNormalizerBuilder() if (not q.db.check_schema_initialized()) or not (q.db.check_tables_populated()): - raise DataLookupError + msg = "Gene Normalizer database schema check failed. No results returned." + _logger.error(msg) + raise DataLookupError(msg) if q.normalize("BRAF").match_type == MatchType.NO_MATCH: - raise DataLookupError + msg = "Gene Normalizer returned no normalization results for BRAF. This indicates an underlying issue with the database that should be investigated." + _logger.error(msg) + raise DataLookupError(msg) def _get_hgnc_symbol(term: str) -> str | None: @@ -436,7 +439,9 @@ def get_gene_location(target_gene: TargetGene) -> GeneLocation | None: def check_seqrepo() -> None: sr = get_seqrepo() if not sr.sr["NC_000001.11"][780000:780020]: - raise DataLookupError + msg = "SeqRepo returned no sequence for NC_000001.11 at 780000:780020. This indicates an underlying issue with SeqRepo that should be investigated." + _logger.error(msg) + raise DataLookupError(msg) conn = sr.sr.aliases._db try: # conn = sr.sr.aliases._db diff --git a/src/dcd_mapping/main.py b/src/dcd_mapping/main.py index aea0205..dac25d4 100644 --- a/src/dcd_mapping/main.py +++ b/src/dcd_mapping/main.py @@ -8,32 +8,34 @@ import click from requests import HTTPError -from dcd_mapping.align import AlignmentError, BlatNotFoundError, build_alignment_result +from dcd_mapping.align import build_alignment_result from dcd_mapping.annotate import ( annotate, save_mapped_output_json, write_scoreset_mapping_to_json, ) from dcd_mapping.exceptions import ( + AlignmentError, + BlatNotFoundError, + DataLookupError, MissingSequenceIdError, + ResourceAcquisitionError, + ScoresetNotSupportedError, UnsupportedReferenceSequenceNameSpaceError, UnsupportedReferenceSequencePrefixError, VrsMapError, ) from dcd_mapping.lookup import ( - DataLookupError, check_gene_normalizer, check_seqrepo, check_uta, ) from dcd_mapping.mavedb_data import ( - ScoresetNotSupportedError, get_scoreset_metadata, get_scoreset_records, patch_target_sequence_type, with_mavedb_score_set, ) -from dcd_mapping.resource_utils import ResourceAcquisitionError from dcd_mapping.schemas import ( ScoreRow, ScoresetMapping, diff --git a/src/dcd_mapping/mavedb_data.py b/src/dcd_mapping/mavedb_data.py index 8dd0928..b3510b4 100644 --- a/src/dcd_mapping/mavedb_data.py +++ b/src/dcd_mapping/mavedb_data.py @@ -17,11 +17,14 @@ from fastapi import HTTPException from pydantic import ValidationError -from dcd_mapping.lookup import DataLookupError +from dcd_mapping.exceptions import ( + DataLookupError, + ResourceAcquisitionError, + ScoresetNotSupportedError, +) from dcd_mapping.resource_utils import ( LOCAL_STORE_PATH, MAVEDB_BASE_URL, - ResourceAcquisitionError, authentication_header, http_download, ) @@ -47,10 +50,6 @@ _logger = logging.getLogger(__name__) -class ScoresetNotSupportedError(Exception): - """Raise when a score set cannot be mapped because it has characteristics that are not currently supported.""" - - def get_scoreset_urns() -> set[str]: """Fetch all scoreset URNs. Since species is annotated at the scoreset target level, we can't yet filter on anything like `homo sapien` -- meaning this is fairly slow. diff --git a/src/dcd_mapping/resource_utils.py b/src/dcd_mapping/resource_utils.py index 3689b6d..f84caf1 100644 --- a/src/dcd_mapping/resource_utils.py +++ b/src/dcd_mapping/resource_utils.py @@ -18,10 +18,6 @@ LOCAL_STORE_PATH.mkdir(exist_ok=True, parents=True) -class ResourceAcquisitionError(Exception): - """Raise when resource acquisition fails.""" - - def authentication_header() -> dict | None: """Fetch with api key envvar, if available.""" return {"X-API-key": MAVEDB_API_KEY} if MAVEDB_API_KEY is not None else None diff --git a/src/dcd_mapping/transcripts.py b/src/dcd_mapping/transcripts.py index feb310d..00080d3 100644 --- a/src/dcd_mapping/transcripts.py +++ b/src/dcd_mapping/transcripts.py @@ -7,6 +7,7 @@ from Bio.SeqUtils import seq1 from cool_seq_tool.schemas import TranscriptPriority +from dcd_mapping.exceptions import TxSelectError from dcd_mapping.lookup import ( get_chromosome_identifier, get_gene_symbol, @@ -29,15 +30,11 @@ TxSelectResult, ) -__all__ = ["select_transcript", "TxSelectError"] +__all__ = ["select_transcript"] _logger = logging.getLogger(__name__) -class TxSelectError(Exception): - """Raise for transcript selection failure.""" - - async def _get_compatible_transcripts( target_gene: TargetGene, align_result: AlignmentResult ) -> list[list[str]]: