diff --git a/docker-compose-dev.yml b/docker-compose-dev.yml index f4266f8..25a0ca7 100644 --- a/docker-compose-dev.yml +++ b/docker-compose-dev.yml @@ -30,7 +30,7 @@ services: - vrs-mapping-data-dev:/var/lib/postgresql/data seqrepo: - image: biocommons/seqrepo:2021-01-29 + image: biocommons/seqrepo:2024-12-20 volumes: - vrs-mapping-seqrepo-dev:/usr/local/share/seqrepo diff --git a/settings/.env.dev b/settings/.env.dev index 956bfae..359fc2a 100644 --- a/settings/.env.dev +++ b/settings/.env.dev @@ -30,4 +30,4 @@ MAVEDB_API_KEY= # Environment variables for seqrepo #################################################################################################### -SEQREPO_ROOT_DIR=/usr/local/share/seqrepo/2021-01-29 +SEQREPO_ROOT_DIR=/usr/local/share/seqrepo/2024-12-20 diff --git a/src/api/routers/map.py b/src/api/routers/map.py index 32b3856..d53cc24 100644 --- a/src/api/routers/map.py +++ b/src/api/routers/map.py @@ -6,22 +6,31 @@ from fastapi.responses import JSONResponse from requests import HTTPError -from dcd_mapping.align import AlignmentError, BlatNotFoundError, build_alignment_result +from dcd_mapping.align import build_alignment_result from dcd_mapping.annotate import ( _get_computed_reference_sequence, _get_mapped_reference_sequence, _set_scoreset_layer, annotate, ) -from dcd_mapping.lookup import DataLookupError -from dcd_mapping.mavedb_data import ( +from dcd_mapping.exceptions import ( + AlignmentError, + BlatNotFoundError, + DataLookupError, + MissingSequenceIdError, + ResourceAcquisitionError, ScoresetNotSupportedError, + UnsupportedReferenceSequenceNameSpaceError, + UnsupportedReferenceSequencePrefixError, + VrsMapError, +) +from dcd_mapping.mavedb_data import ( get_raw_scoreset_metadata, get_scoreset_metadata, get_scoreset_records, + patch_target_sequence_type, with_mavedb_score_set, ) -from dcd_mapping.resource_utils import ResourceAcquisitionError from dcd_mapping.schemas import ( ScoreAnnotation, ScoresetMapping, @@ -30,7 +39,7 @@ VrsVersion, ) from dcd_mapping.transcripts import select_transcripts -from dcd_mapping.vrs_map import VrsMapError, vrs_map +from dcd_mapping.vrs_map import vrs_map router = APIRouter( prefix="/api/v1", tags=["mappings"], responses={404: {"description": "Not found"}} @@ -48,6 +57,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse try: metadata = get_scoreset_metadata(urn, store_path) records = get_scoreset_records(metadata, True, store_path) + metadata = patch_target_sequence_type(metadata, records) except ScoresetNotSupportedError as e: return JSONResponse( content=ScoresetMapping( @@ -66,6 +76,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse error_message="Score set contains no variants to map", ).model_dump(exclude_none=True) ) + total_score_records = sum(len(v) for v in records.values()) try: alignment_results = build_alignment_result(metadata, True) @@ -112,21 +123,38 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse transcript=transcripts[target_gene], silent=True, ) - except VrsMapError as e: + except ( + UnsupportedReferenceSequenceNameSpaceError, + VrsMapError, + UnsupportedReferenceSequencePrefixError, + MissingSequenceIdError, + ) as e: return JSONResponse( content=ScoresetMapping( metadata=metadata, error_message=str(e).strip("'") ).model_dump(exclude_none=True) ) - if not vrs_results or all( - mapping_result is None for mapping_result in vrs_results.values() - ): + + nonetype_vrs_results = [ + result is None + for target_gene in vrs_results + for result in vrs_results[target_gene] + ] + + if not vrs_results or all(nonetype_vrs_results): return JSONResponse( content=ScoresetMapping( metadata=metadata, error_message="No variant mappings available for this score set", ).model_dump(exclude_none=True) ) + if any(nonetype_vrs_results): + return JSONResponse( + content=ScoresetMapping( + metadata=metadata, + error_message="Some variants generated vrs results, but not all. If any variants were mapped, all should have been.", + ).model_dump(exclude_none=True) + ) annotated_vrs_results = {} try: @@ -144,15 +172,27 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse metadata=metadata, error_message=str(e).strip("'") ).model_dump(exclude_none=True) ) - if not annotated_vrs_results or all( - mapping_result is None for mapping_result in annotated_vrs_results.values() - ): + + nonetype_annotated_vrs_results = [ + result is None + for target_gene in annotated_vrs_results + for result in annotated_vrs_results[target_gene] + ] + + if not annotated_vrs_results or all(nonetype_annotated_vrs_results): return JSONResponse( content=ScoresetMapping( metadata=metadata, error_message="No annotated variant mappings available for this score set", ).model_dump(exclude_none=True) ) + if any(nonetype_annotated_vrs_results): + return JSONResponse( + content=ScoresetMapping( + metadata=metadata, + error_message="Some variants generated annotated vrs results, but not all. If any variants were annotated, all should have been.", + ).model_dump(exclude_none=True) + ) try: raw_metadata = get_raw_scoreset_metadata(urn, store_path) @@ -233,6 +273,14 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse ).model_dump(exclude_none=True) ) + if len(mapped_scores) != total_score_records: + return JSONResponse( + content=ScoresetMapping( + metadata=metadata, + error_message=f"Mismatch between number of mapped scores ({len(mapped_scores)}) and total score records ({total_score_records}). This is unexpected and indicates an issue with the mapping process.", + ).model_dump(exclude_none=True) + ) + return JSONResponse( content=ScoresetMapping( metadata=raw_metadata, diff --git a/src/dcd_mapping/align.py b/src/dcd_mapping/align.py index b64f67a..7f54b41 100644 --- a/src/dcd_mapping/align.py +++ b/src/dcd_mapping/align.py @@ -12,12 +12,15 @@ from Bio.SearchIO._model import Hit, QueryResult from cool_seq_tool.schemas import Strand -from dcd_mapping.lookup import get_chromosome_identifier, get_gene_location -from dcd_mapping.mavedb_data import LOCAL_STORE_PATH, ScoresetNotSupportedError -from dcd_mapping.resource_utils import ( +from dcd_mapping.exceptions import ( + AlignmentError, + BlatNotFoundError, ResourceAcquisitionError, - http_download, + ScoresetNotSupportedError, ) +from dcd_mapping.lookup import get_chromosome_identifier, get_gene_location +from dcd_mapping.mavedb_data import LOCAL_STORE_PATH +from dcd_mapping.resource_utils import http_download from dcd_mapping.schemas import ( AlignmentResult, GeneLocation, @@ -32,14 +35,6 @@ _logger = logging.getLogger(__name__) -class AlignmentError(Exception): - """Raise when errors encountered during alignment.""" - - -class BlatNotFoundError(AlignmentError): - """Raise when BLAT binary appears to be missing.""" - - def _write_query_file(file: Path, lines: list[str]) -> None: """Write lines to query file. This method is broken out to enable easy mocking while testing. @@ -363,6 +358,11 @@ def align( msg = f"BLAT result {target_label} matches multiple target gene names in scoreset {scoreset_metadata.urn}" target_gene = scoreset_metadata.target_genes[target_label] alignment_results[target_label] = _get_best_match(blat_result, target_gene) + # confirm that there is an alignment result for each target gene + for target_gene in scoreset_metadata.target_genes: + if target_gene not in alignment_results: + msg = f"No BLAT result found for target gene {target_gene} in scoreset {scoreset_metadata.urn}" + raise AlignmentError(msg) return alignment_results diff --git a/src/dcd_mapping/annotate.py b/src/dcd_mapping/annotate.py index 9a9532e..145dafb 100644 --- a/src/dcd_mapping/annotate.py +++ b/src/dcd_mapping/annotate.py @@ -140,7 +140,7 @@ def _get_vrs_ref_allele_seq( metadata: TargetGene, urn: str, tx_select_results: TxSelectResult | None, -) -> Extension: +) -> Extension | None: """Create `vrs_ref_allele_seq` property.""" start, end = _offset_allele_ref_seq(urn, allele.location.start, allele.location.end) if ( @@ -161,8 +161,12 @@ def _get_vrs_ref_allele_seq( seq = f"ga4gh:{allele.location.sequenceReference.refgetAccession}" # type: ignore sr = get_seqrepo() ref = sr.get_sequence(seq, start, end) - if ref is None: - raise ValueError + + if not ref: + msg = f"Could not retrieve reference sequence for allele {allele.id} in urn {urn} with start {start} and end {end}" + _logger.warning(msg) + return None + return Extension(type="Extension", name="vrs_ref_allele_seq", value=ref) @@ -256,9 +260,11 @@ def _annotate_allele_mapping( post_mapped: Allele = mapped_score.post_mapped # get vrs_ref_allele_seq for pre-mapped variants - pre_mapped.extensions = [ - _get_vrs_ref_allele_seq(pre_mapped, metadata, urn, tx_results) - ] + ref_allele_seq_extension = _get_vrs_ref_allele_seq( + pre_mapped, metadata, urn, tx_results + ) + if ref_allele_seq_extension is not None: + pre_mapped.extensions = [ref_allele_seq_extension] if post_mapped: # Determine reference sequence @@ -266,13 +272,16 @@ def _annotate_allele_mapping( sequence_id = f"ga4gh:{mapped_score.post_mapped.location.sequenceReference.refgetAccession}" accession = get_chromosome_identifier_from_vrs_id(sequence_id) if accession is None: - raise ValueError - if accession.startswith("refseq:"): + accession = None + mapped_score.error_message = "Could not determine accession for this annotation. No allele expression is available." + elif accession.startswith("refseq:"): accession = accession[7:] else: if tx_results is None or isinstance(tx_results, TxSelectError): - raise ValueError # impossible by definition - accession = tx_results.np + accession = None + mapped_score.error_message = "Could not determine accession for this annotation. No allele expression is available." + else: + accession = tx_results.np sr = get_seqrepo() loc = mapped_score.post_mapped.location @@ -281,8 +290,9 @@ def _annotate_allele_mapping( post_mapped.extensions = [ Extension(type="Extension", name="vrs_ref_allele_seq", value=ref) ] - hgvs_string, syntax = _get_hgvs_string(post_mapped, accession) - post_mapped.expressions = [Expression(syntax=syntax, value=hgvs_string)] + if accession: + hgvs_string, syntax = _get_hgvs_string(post_mapped, accession) + post_mapped.expressions = [Expression(syntax=syntax, value=hgvs_string)] if vrs_version == VrsVersion.V_1_3: pre_mapped = _allele_to_vod(pre_mapped) @@ -295,9 +305,7 @@ def _annotate_allele_mapping( mavedb_id=mapped_score.accession_id, score=float(mapped_score.score) if mapped_score.score else None, annotation_layer=mapped_score.annotation_layer, - error_message=mapped_score.error_message - if mapped_score.error_message - else None, # TODO might not need if statement here + error_message=mapped_score.error_message, ) @@ -311,9 +319,14 @@ def _annotate_haplotype_mapping( """Perform annotations and, if necessary, create VRS 1.3 equivalents for haplotype mappings.""" pre_mapped: Haplotype = mapped_score.pre_mapped # type: ignore post_mapped: Haplotype = mapped_score.post_mapped # type: ignore + # get vrs_ref_allele_seq for pre-mapped variants for allele in pre_mapped.members: - allele.extensions = [_get_vrs_ref_allele_seq(allele, metadata, urn, tx_results)] + ref_allele_seq_extension = _get_vrs_ref_allele_seq( + allele, metadata, urn, tx_results + ) + if ref_allele_seq_extension is not None: + allele.extensions = [ref_allele_seq_extension] if post_mapped: # Determine reference sequence @@ -321,13 +334,17 @@ def _annotate_haplotype_mapping( sequence_id = f"ga4gh:{post_mapped.members[0].location.sequenceReference.refgetAccession}" accession = get_chromosome_identifier_from_vrs_id(sequence_id) if accession is None: - raise ValueError - if accession.startswith("refseq:"): + accession = None + mapped_score.error_message = "Could not determine accession for this annotation. No allele expression is available." + elif accession.startswith("refseq:"): accession = accession[7:] else: if tx_results is None or isinstance(tx_results, TxSelectError): - raise ValueError # impossible by definition - accession = tx_results.np + # impossible by definition + accession = None + mapped_score.error_message = "Could not determine accession for this annotation. No allele expression is available." + else: + accession = tx_results.np sr = get_seqrepo() for allele in post_mapped.members: @@ -337,8 +354,9 @@ def _annotate_haplotype_mapping( allele.extensions = [ Extension(type="Extension", name="vrs_ref_allele_seq", value=ref) ] - hgvs, syntax = _get_hgvs_string(allele, accession) - allele.expressions = [Expression(syntax=syntax, value=hgvs)] + if accession: + hgvs, syntax = _get_hgvs_string(allele, accession) + allele.expressions = [Expression(syntax=syntax, value=hgvs)] if vrs_version == VrsVersion.V_1_3: pre_mapped = _haplotype_to_haplotype_1_3(pre_mapped) @@ -351,9 +369,7 @@ def _annotate_haplotype_mapping( mavedb_id=mapped_score.accession_id, score=float(mapped_score.score) if mapped_score.score is not None else None, annotation_layer=mapped_score.annotation_layer, - error_message=mapped_score.error_message - if mapped_score.error_message - else None, # TODO might not need if statement here + error_message=mapped_score.error_message, ) @@ -388,6 +404,7 @@ def annotate( ScoreAnnotationWithLayer( mavedb_id=mapped_score.accession_id, score=float(mapped_score.score) if mapped_score.score else None, + vrs_version=vrs_version, error_message=mapped_score.error_message, ) ) @@ -410,8 +427,16 @@ def annotate( ) ) else: - # TODO how to combine this error message with other potential error messages? - ValueError("inconsistent variant structure") + score_annotations.append( + ScoreAnnotationWithLayer( + pre_mapped=mapped_score.pre_mapped, + post_mapped=mapped_score.post_mapped, + vrs_version=vrs_version, + mavedb_id=mapped_score.accession_id, + score=float(mapped_score.score) if mapped_score.score else None, + error_message=f"Multiple issues with annotation: Inconsistent variant structure (Allele and Haplotype mix).{' ' + mapped_score.error_message if mapped_score.error_message else ''}", + ) + ) return score_annotations @@ -519,11 +544,6 @@ def _set_scoreset_layer( expressions. If genomic expressions are available, that's what we'd like to use. This function tells us how to filter the final annotation objects. """ - if urn.startswith("urn:mavedb:00000097"): - _logger.debug( - "Manually selecting protein annotation for scores from urn:mavedb:00000097" - ) - return AnnotationLayer.PROTEIN for mapping in mappings: if mapping.annotation_layer == AnnotationLayer.GENOMIC: return AnnotationLayer.GENOMIC diff --git a/src/dcd_mapping/cli.py b/src/dcd_mapping/cli.py index 0db86b3..dafa44f 100644 --- a/src/dcd_mapping/cli.py +++ b/src/dcd_mapping/cli.py @@ -6,12 +6,14 @@ import click -from dcd_mapping.align import AlignmentError +from dcd_mapping.exceptions import ( + AlignmentError, + ResourceAcquisitionError, + TxSelectError, + VrsMapError, +) from dcd_mapping.main import map_scoreset_urn -from dcd_mapping.resource_utils import ResourceAcquisitionError from dcd_mapping.schemas import VrsVersion -from dcd_mapping.transcripts import TxSelectError -from dcd_mapping.vrs_map import VrsMapError _logger = logging.getLogger(__name__) diff --git a/src/dcd_mapping/exceptions.py b/src/dcd_mapping/exceptions.py new file mode 100644 index 0000000..a2817c9 --- /dev/null +++ b/src/dcd_mapping/exceptions.py @@ -0,0 +1,59 @@ +"""Exceptions for DCD Mapping Module""" + + +### Custom Exceptions for VRS Mapping Errors ### + + +class VrsMapError(Exception): + """Raise in case of generic VRS mapping errors.""" + + +class UnsupportedReferenceSequenceNameSpaceError(ValueError): + """Raised when a reference sequence name space is not supported.""" + + +class MissingSequenceIdError(ValueError): + """Raised when a sequence ID is not provided.""" + + +class UnsupportedReferenceSequencePrefixError(ValueError): + """Raised when a reference sequence prefix is not supported.""" + + +### Custom Exceptions for Alignment Errors ### + + +class AlignmentError(ValueError): + """Raise when errors encountered during alignment.""" + + +class BlatNotFoundError(AlignmentError): + """Raise when BLAT binary appears to be missing.""" + + +### Custom Exceptions for Data Lookup Errors ### + + +class DataLookupError(Exception): + """Raise for misc. issues related to resource acquisition/lookup.""" + + +### Custom Exceptions for MaveDB Data Errors ### + + +class ScoresetNotSupportedError(ValueError): + """Raise when a score set cannot be mapped because it has characteristics that are not currently supported.""" + + +### Custom Exceptions for Resource Acquisition Errors ### + + +class ResourceAcquisitionError(ValueError): + """Raise when resource acquisition fails.""" + + +### Custom Exceptions for Transcript Selection Errors ### + + +class TxSelectError(ValueError): + """Raise for transcript selection failure.""" diff --git a/src/dcd_mapping/lookup.py b/src/dcd_mapping/lookup.py index bf53f48..34de2f0 100644 --- a/src/dcd_mapping/lookup.py +++ b/src/dcd_mapping/lookup.py @@ -49,6 +49,7 @@ from gene.query import QueryHandler from gene.schemas import MatchType, SourceName +from dcd_mapping.exceptions import DataLookupError from dcd_mapping.schemas import ( GeneLocation, ManeDescription, @@ -93,10 +94,6 @@ def cdot_rest() -> RESTDataProvider: # ---------------------------------- Global ---------------------------------- # -class DataLookupError(Exception): - """Raise for misc. issues related to resource acquisition/lookup.""" - - class CoolSeqToolBuilder: """Singleton constructor for ``cool-seq-tool`` instance.""" @@ -241,7 +238,9 @@ async def check_uta() -> None: query = f"select * from {uta.schema}.meta" # noqa: S608 result = await uta.execute_query(query) if not result: - raise DataLookupError + msg = "UTA schema check failed. No results returned." + _logger.error(msg) + raise DataLookupError(msg) async def get_protein_accession(transcript: str) -> str | None: @@ -302,9 +301,13 @@ async def get_transcripts( def check_gene_normalizer() -> None: q = GeneNormalizerBuilder() if (not q.db.check_schema_initialized()) or not (q.db.check_tables_populated()): - raise DataLookupError + msg = "Gene Normalizer database schema check failed. No results returned." + _logger.error(msg) + raise DataLookupError(msg) if q.normalize("BRAF").match_type == MatchType.NO_MATCH: - raise DataLookupError + msg = "Gene Normalizer returned no normalization results for BRAF. This indicates an underlying issue with the database that should be investigated." + _logger.error(msg) + raise DataLookupError(msg) def _get_hgnc_symbol(term: str) -> str | None: @@ -436,7 +439,9 @@ def get_gene_location(target_gene: TargetGene) -> GeneLocation | None: def check_seqrepo() -> None: sr = get_seqrepo() if not sr.sr["NC_000001.11"][780000:780020]: - raise DataLookupError + msg = "SeqRepo returned no sequence for NC_000001.11 at 780000:780020. This indicates an underlying issue with SeqRepo that should be investigated." + _logger.error(msg) + raise DataLookupError(msg) conn = sr.sr.aliases._db try: # conn = sr.sr.aliases._db diff --git a/src/dcd_mapping/main.py b/src/dcd_mapping/main.py index 7ccf26c..dac25d4 100644 --- a/src/dcd_mapping/main.py +++ b/src/dcd_mapping/main.py @@ -8,25 +8,34 @@ import click from requests import HTTPError -from dcd_mapping.align import AlignmentError, BlatNotFoundError, build_alignment_result +from dcd_mapping.align import build_alignment_result from dcd_mapping.annotate import ( annotate, save_mapped_output_json, write_scoreset_mapping_to_json, ) -from dcd_mapping.lookup import ( +from dcd_mapping.exceptions import ( + AlignmentError, + BlatNotFoundError, DataLookupError, + MissingSequenceIdError, + ResourceAcquisitionError, + ScoresetNotSupportedError, + UnsupportedReferenceSequenceNameSpaceError, + UnsupportedReferenceSequencePrefixError, + VrsMapError, +) +from dcd_mapping.lookup import ( check_gene_normalizer, check_seqrepo, check_uta, ) from dcd_mapping.mavedb_data import ( - ScoresetNotSupportedError, get_scoreset_metadata, get_scoreset_records, + patch_target_sequence_type, with_mavedb_score_set, ) -from dcd_mapping.resource_utils import ResourceAcquisitionError from dcd_mapping.schemas import ( ScoreRow, ScoresetMapping, @@ -34,7 +43,7 @@ VrsVersion, ) from dcd_mapping.transcripts import select_transcripts -from dcd_mapping.vrs_map import VrsMapError, vrs_map +from dcd_mapping.vrs_map import vrs_map _logger = logging.getLogger(__name__) @@ -222,7 +231,12 @@ async def map_scoreset( transcript=transcripts[target_gene], silent=silent, ) - except VrsMapError as e: + except ( + MissingSequenceIdError, + UnsupportedReferenceSequencePrefixError, + UnsupportedReferenceSequenceNameSpaceError, + VrsMapError, + ) as e: _emit_info( f"VRS mapping failed for scoreset {metadata.urn}", silent, logging.ERROR ) @@ -332,6 +346,7 @@ async def map_scoreset_urn( try: metadata = get_scoreset_metadata(urn, store_path) records = get_scoreset_records(metadata, silent, store_path) + metadata = patch_target_sequence_type(metadata, records) except ScoresetNotSupportedError as e: _emit_info(f"Score set not supported: {e}", silent, logging.ERROR) final_output = write_scoreset_mapping_to_json( diff --git a/src/dcd_mapping/mavedb_data.py b/src/dcd_mapping/mavedb_data.py index a137e30..b3510b4 100644 --- a/src/dcd_mapping/mavedb_data.py +++ b/src/dcd_mapping/mavedb_data.py @@ -17,11 +17,14 @@ from fastapi import HTTPException from pydantic import ValidationError -from dcd_mapping.lookup import DataLookupError +from dcd_mapping.exceptions import ( + DataLookupError, + ResourceAcquisitionError, + ScoresetNotSupportedError, +) from dcd_mapping.resource_utils import ( LOCAL_STORE_PATH, MAVEDB_BASE_URL, - ResourceAcquisitionError, authentication_header, http_download, ) @@ -30,8 +33,10 @@ ScoresetMapping, ScoresetMetadata, TargetGene, + TargetSequenceType, UniProtRef, ) +from dcd_mapping.transcripts import _get_protein_sequence __all__ = [ "get_scoreset_urns", @@ -45,10 +50,6 @@ _logger = logging.getLogger(__name__) -class ScoresetNotSupportedError(Exception): - """Raise when a score set cannot be mapped because it has characteristics that are not currently supported.""" - - def get_scoreset_urns() -> set[str]: """Fetch all scoreset URNs. Since species is annotated at the scoreset target level, we can't yet filter on anything like `homo sapien` -- meaning this is fairly slow. @@ -324,6 +325,25 @@ def get_scoreset_records( return _load_scoreset_records(scores_csv, metadata) +def patch_target_sequence_type( + metadata: ScoresetMetadata, records: dict +) -> ScoresetMetadata: + """If target sequence type is DNA but no nucleotide variants are defined, treat the target as if + it were a protein level target. + This avoids BLAT errors in cases where the target sequence was codon-optimized + for a non-human organism + """ + for target_label, target in metadata.target_genes.items(): + if target.target_sequence_type == TargetSequenceType.DNA and not any( + record.hgvs_nt for record in records.get(target_label, []) + ): + msg = f"Changing target sequence type for {metadata.urn} target {target_label} from DNA to protein because target only has protein-level variants" + _logger.info(msg) + target.target_sequence = _get_protein_sequence(target.target_sequence) + target.target_sequence_type = TargetSequenceType.PROTEIN + return metadata + + def with_mavedb_score_set(fn: Callable) -> Callable: @wraps(fn) async def wrapper(*args, **kwargs) -> ScoresetMapping: # noqa: ANN002 diff --git a/src/dcd_mapping/resource_utils.py b/src/dcd_mapping/resource_utils.py index 153a99c..f84caf1 100644 --- a/src/dcd_mapping/resource_utils.py +++ b/src/dcd_mapping/resource_utils.py @@ -18,10 +18,6 @@ LOCAL_STORE_PATH.mkdir(exist_ok=True, parents=True) -class ResourceAcquisitionError(Exception): - """Raise when resource acquisition fails.""" - - def authentication_header() -> dict | None: """Fetch with api key envvar, if available.""" return {"X-API-key": MAVEDB_API_KEY} if MAVEDB_API_KEY is not None else None @@ -39,7 +35,7 @@ def http_download(url: str, out_path: Path, silent: bool = True) -> Path: if not silent: click.echo(f"Downloading {out_path.name} to {out_path.parents[0].absolute()}") with requests.get( - url, stream=True, timeout=30, headers=authentication_header() + url, stream=True, timeout=60, headers=authentication_header() ) as r: r.raise_for_status() total_size = int(r.headers.get("content-length", 0)) diff --git a/src/dcd_mapping/transcripts.py b/src/dcd_mapping/transcripts.py index 9034894..00080d3 100644 --- a/src/dcd_mapping/transcripts.py +++ b/src/dcd_mapping/transcripts.py @@ -7,6 +7,7 @@ from Bio.SeqUtils import seq1 from cool_seq_tool.schemas import TranscriptPriority +from dcd_mapping.exceptions import TxSelectError from dcd_mapping.lookup import ( get_chromosome_identifier, get_gene_symbol, @@ -29,15 +30,11 @@ TxSelectResult, ) -__all__ = ["select_transcript", "TxSelectError"] +__all__ = ["select_transcript"] _logger = logging.getLogger(__name__) -class TxSelectError(Exception): - """Raise for transcript selection failure.""" - - async def _get_compatible_transcripts( target_gene: TargetGene, align_result: AlignmentResult ) -> list[list[str]]: @@ -54,7 +51,10 @@ async def _get_compatible_transcripts( chromosome = get_chromosome_identifier(aligned_chrom) gene_symbol = get_gene_symbol(target_gene) if not gene_symbol: - raise TxSelectError + msg = ( + f"Unable to find gene symbol for target gene {target_gene.target_gene_name}" + ) + raise TxSelectError(msg) transcript_matches = [] for hit_range in align_result.hit_subranges: matches_list = await get_transcripts( @@ -179,7 +179,8 @@ async def _select_protein_reference( if not best_tx: best_tx = await _get_longest_compatible_transcript(common_transcripts) if not best_tx: - raise TxSelectError + msg = f"Unable to find matching MANE transcripts for target gene {target_gene.target_gene_name}" + raise TxSelectError(msg) ref_sequence = get_sequence(best_tx.refseq_prot) nm_accession = best_tx.refseq_nuc np_accession = best_tx.refseq_prot @@ -323,19 +324,6 @@ async def select_transcript( :param align_result: alignment results :return: Transcript description (accession ID, offset, selected sequence, etc) """ - if scoreset_urn.startswith("urn:mavedb:00000097"): - _logger.info( - "Score sets in urn:mavedb:00000097 are already expressed in full HGVS strings -- using predefined results because additional hard-coding is unnecessary" - ) - return TxSelectResult( - nm="NM_007294.3", - np="NP_009225.1", - start=0, - is_full_match=False, - transcript_mode=TranscriptPriority.MANE_SELECT, - sequence=_get_protein_sequence(target_gene.target_sequence), - ) - if target_gene.target_gene_category != TargetType.PROTEIN_CODING: _logger.debug( "%s is regulatory/noncoding -- skipping transcript selection", @@ -366,7 +354,9 @@ async def select_transcripts( * Dict where keys are target labels and values are alignment result objects :return: dict where keys are target labels and values are objects describing selected transcript (accession ID, offset, selected sequence, etc) """ - selected_transcripts = {} + selected_transcripts: dict[ + str, TxSelectResult | TxSelectError | KeyError | None + ] = {} for target_gene in scoreset_metadata.target_genes: if scoreset_metadata.target_genes[target_gene].target_accession_id: # for accession-based targets, create tx select objects for protein sequence accessions only diff --git a/src/dcd_mapping/version.py b/src/dcd_mapping/version.py index 8c31b0b..848b44d 100644 --- a/src/dcd_mapping/version.py +++ b/src/dcd_mapping/version.py @@ -1,3 +1,3 @@ """Provide dcd mapping version""" -dcd_mapping_version = "2025.1.0" +dcd_mapping_version = "2025.2.0" diff --git a/src/dcd_mapping/vrs_map.py b/src/dcd_mapping/vrs_map.py index b03742c..db81d57 100644 --- a/src/dcd_mapping/vrs_map.py +++ b/src/dcd_mapping/vrs_map.py @@ -20,6 +20,11 @@ from mavehgvs.util import parse_variant_strings from mavehgvs.variant import Variant +from dcd_mapping.exceptions import ( + MissingSequenceIdError, + UnsupportedReferenceSequenceNameSpaceError, + UnsupportedReferenceSequencePrefixError, +) from dcd_mapping.lookup import ( cdot_rest, get_chromosome_identifier, @@ -37,16 +42,12 @@ ) from dcd_mapping.transcripts import TxSelectError -__all__ = ["vrs_map", "VrsMapError"] +__all__ = ["vrs_map"] _logger = logging.getLogger(__name__) -class VrsMapError(Exception): - """Raise in case of VRS mapping errors.""" - - def _hgvs_variant_is_valid(hgvs_string: str) -> bool: return not hgvs_string.endswith((".=", ")", "X")) @@ -453,8 +454,11 @@ def _map_genomic( # if the sequence id starts with SQ, it is a target sequence which is in the ga4gh namespace namespace = "ga4gh" else: - msg = f"Namespace could not be inferred from sequence: {sequence_id}" - raise ValueError(msg) + return MappedScore( + accession_id=row.accession, + score=row.score, + error_message=f"Namespace could not be inferred from sequence: {sequence_id}", + ) if ( row.hgvs_nt in {"_wt", "_sy", "="} @@ -609,8 +613,8 @@ def _map_genomic( error_message=str(e), ) else: - msg = f"Reference sequence namespace not supported: {namespace}" - raise ValueError(msg) + msg = f"Unsupported reference sequence namespace: {namespace}" + raise UnsupportedReferenceSequenceNameSpaceError(msg) return MappedScore( accession_id=row.accession, @@ -783,7 +787,8 @@ def _map_accession( variations: list[MappedScore] = [] sequence_id = metadata.target_accession_id if sequence_id is None: - raise ValueError + msg = " No target_accession_id was provided by target gene metadata. Target gene metadata must have a target_accession_id to map to VRS." + raise MissingSequenceIdError(msg) store_accession(sequence_id) @@ -802,8 +807,8 @@ def _map_accession( hgvs_nt_mappings = _map_genomic(row, sequence_id, align_result) variations.append(hgvs_nt_mappings) else: - msg = f"Unrecognized accession prefix for accession id {metadata.target_accession_id}" - raise ValueError(msg) + msg = f"Unrecognized accession prefix for accession id: {metadata.target_accession_id}" + raise UnsupportedReferenceSequencePrefixError(msg) return variations @@ -887,7 +892,7 @@ def vrs_map( records: list[ScoreRow], transcript: TxSelectResult | TxSelectError | None = None, silent: bool = True, -) -> list[MappedScore] | None: +) -> list[MappedScore]: """Given a description of a MAVE scoreset and an aligned transcript, generate the corresponding VRS objects.