Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docker-compose-dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ services:
- vrs-mapping-data-dev:/var/lib/postgresql/data

seqrepo:
image: biocommons/seqrepo:2021-01-29
image: biocommons/seqrepo:2024-12-20
volumes:
- vrs-mapping-seqrepo-dev:/usr/local/share/seqrepo

Expand Down
2 changes: 1 addition & 1 deletion settings/.env.dev
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,4 @@ MAVEDB_API_KEY=
# Environment variables for seqrepo
####################################################################################################

SEQREPO_ROOT_DIR=/usr/local/share/seqrepo/2021-01-29
SEQREPO_ROOT_DIR=/usr/local/share/seqrepo/2024-12-20
72 changes: 60 additions & 12 deletions src/api/routers/map.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,22 +6,31 @@
from fastapi.responses import JSONResponse
from requests import HTTPError

from dcd_mapping.align import AlignmentError, BlatNotFoundError, build_alignment_result
from dcd_mapping.align import build_alignment_result
from dcd_mapping.annotate import (
_get_computed_reference_sequence,
_get_mapped_reference_sequence,
_set_scoreset_layer,
annotate,
)
from dcd_mapping.lookup import DataLookupError
from dcd_mapping.mavedb_data import (
from dcd_mapping.exceptions import (
AlignmentError,
BlatNotFoundError,
DataLookupError,
MissingSequenceIdError,
ResourceAcquisitionError,
ScoresetNotSupportedError,
UnsupportedReferenceSequenceNameSpaceError,
UnsupportedReferenceSequencePrefixError,
VrsMapError,
)
from dcd_mapping.mavedb_data import (
get_raw_scoreset_metadata,
get_scoreset_metadata,
get_scoreset_records,
patch_target_sequence_type,
with_mavedb_score_set,
)
from dcd_mapping.resource_utils import ResourceAcquisitionError
from dcd_mapping.schemas import (
ScoreAnnotation,
ScoresetMapping,
Expand All @@ -30,7 +39,7 @@
VrsVersion,
)
from dcd_mapping.transcripts import select_transcripts
from dcd_mapping.vrs_map import VrsMapError, vrs_map
from dcd_mapping.vrs_map import vrs_map

router = APIRouter(
prefix="/api/v1", tags=["mappings"], responses={404: {"description": "Not found"}}
Expand All @@ -48,6 +57,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
try:
metadata = get_scoreset_metadata(urn, store_path)
records = get_scoreset_records(metadata, True, store_path)
metadata = patch_target_sequence_type(metadata, records)
except ScoresetNotSupportedError as e:
return JSONResponse(
content=ScoresetMapping(
Expand All @@ -66,6 +76,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
error_message="Score set contains no variants to map",
).model_dump(exclude_none=True)
)
total_score_records = sum(len(v) for v in records.values())

try:
alignment_results = build_alignment_result(metadata, True)
Expand Down Expand Up @@ -112,21 +123,38 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
transcript=transcripts[target_gene],
silent=True,
)
except VrsMapError as e:
except (
UnsupportedReferenceSequenceNameSpaceError,
VrsMapError,
UnsupportedReferenceSequencePrefixError,
MissingSequenceIdError,
) as e:
return JSONResponse(
content=ScoresetMapping(
metadata=metadata, error_message=str(e).strip("'")
).model_dump(exclude_none=True)
)
if not vrs_results or all(
mapping_result is None for mapping_result in vrs_results.values()
):

nonetype_vrs_results = [
result is None
for target_gene in vrs_results
for result in vrs_results[target_gene]
]

if not vrs_results or all(nonetype_vrs_results):
return JSONResponse(
content=ScoresetMapping(
metadata=metadata,
error_message="No variant mappings available for this score set",
).model_dump(exclude_none=True)
)
if any(nonetype_vrs_results):
return JSONResponse(
content=ScoresetMapping(
metadata=metadata,
error_message="Some variants generated vrs results, but not all. If any variants were mapped, all should have been.",
).model_dump(exclude_none=True)
)

annotated_vrs_results = {}
try:
Expand All @@ -144,15 +172,27 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
metadata=metadata, error_message=str(e).strip("'")
).model_dump(exclude_none=True)
)
if not annotated_vrs_results or all(
mapping_result is None for mapping_result in annotated_vrs_results.values()
):

nonetype_annotated_vrs_results = [
result is None
for target_gene in annotated_vrs_results
for result in annotated_vrs_results[target_gene]
]

if not annotated_vrs_results or all(nonetype_annotated_vrs_results):
return JSONResponse(
content=ScoresetMapping(
metadata=metadata,
error_message="No annotated variant mappings available for this score set",
).model_dump(exclude_none=True)
)
if any(nonetype_annotated_vrs_results):
return JSONResponse(
content=ScoresetMapping(
metadata=metadata,
error_message="Some variants generated annotated vrs results, but not all. If any variants were annotated, all should have been.",
).model_dump(exclude_none=True)
)

try:
raw_metadata = get_raw_scoreset_metadata(urn, store_path)
Expand Down Expand Up @@ -233,6 +273,14 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
).model_dump(exclude_none=True)
)

if len(mapped_scores) != total_score_records:
return JSONResponse(
content=ScoresetMapping(
metadata=metadata,
error_message=f"Mismatch between number of mapped scores ({len(mapped_scores)}) and total score records ({total_score_records}). This is unexpected and indicates an issue with the mapping process.",
).model_dump(exclude_none=True)
)

return JSONResponse(
content=ScoresetMapping(
metadata=raw_metadata,
Expand Down
24 changes: 12 additions & 12 deletions src/dcd_mapping/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,15 @@
from Bio.SearchIO._model import Hit, QueryResult
from cool_seq_tool.schemas import Strand

from dcd_mapping.lookup import get_chromosome_identifier, get_gene_location
from dcd_mapping.mavedb_data import LOCAL_STORE_PATH, ScoresetNotSupportedError
from dcd_mapping.resource_utils import (
from dcd_mapping.exceptions import (
AlignmentError,
BlatNotFoundError,
ResourceAcquisitionError,
http_download,
ScoresetNotSupportedError,
)
from dcd_mapping.lookup import get_chromosome_identifier, get_gene_location
from dcd_mapping.mavedb_data import LOCAL_STORE_PATH
from dcd_mapping.resource_utils import http_download
from dcd_mapping.schemas import (
AlignmentResult,
GeneLocation,
Expand All @@ -32,14 +35,6 @@
_logger = logging.getLogger(__name__)


class AlignmentError(Exception):
"""Raise when errors encountered during alignment."""


class BlatNotFoundError(AlignmentError):
"""Raise when BLAT binary appears to be missing."""


def _write_query_file(file: Path, lines: list[str]) -> None:
"""Write lines to query file. This method is broken out to enable easy mocking while
testing.
Expand Down Expand Up @@ -363,6 +358,11 @@ def align(
msg = f"BLAT result {target_label} matches multiple target gene names in scoreset {scoreset_metadata.urn}"
target_gene = scoreset_metadata.target_genes[target_label]
alignment_results[target_label] = _get_best_match(blat_result, target_gene)
# confirm that there is an alignment result for each target gene
for target_gene in scoreset_metadata.target_genes:
if target_gene not in alignment_results:
msg = f"No BLAT result found for target gene {target_gene} in scoreset {scoreset_metadata.urn}"
raise AlignmentError(msg)
return alignment_results


Expand Down
84 changes: 52 additions & 32 deletions src/dcd_mapping/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def _get_vrs_ref_allele_seq(
metadata: TargetGene,
urn: str,
tx_select_results: TxSelectResult | None,
) -> Extension:
) -> Extension | None:
"""Create `vrs_ref_allele_seq` property."""
start, end = _offset_allele_ref_seq(urn, allele.location.start, allele.location.end)
if (
Expand All @@ -161,8 +161,12 @@ def _get_vrs_ref_allele_seq(
seq = f"ga4gh:{allele.location.sequenceReference.refgetAccession}" # type: ignore
sr = get_seqrepo()
ref = sr.get_sequence(seq, start, end)
if ref is None:
raise ValueError

if not ref:
msg = f"Could not retrieve reference sequence for allele {allele.id} in urn {urn} with start {start} and end {end}"
_logger.warning(msg)
return None

return Extension(type="Extension", name="vrs_ref_allele_seq", value=ref)


Expand Down Expand Up @@ -256,23 +260,28 @@ def _annotate_allele_mapping(
post_mapped: Allele = mapped_score.post_mapped

# get vrs_ref_allele_seq for pre-mapped variants
pre_mapped.extensions = [
_get_vrs_ref_allele_seq(pre_mapped, metadata, urn, tx_results)
]
ref_allele_seq_extension = _get_vrs_ref_allele_seq(
pre_mapped, metadata, urn, tx_results
)
if ref_allele_seq_extension is not None:
pre_mapped.extensions = [ref_allele_seq_extension]

if post_mapped:
# Determine reference sequence
if mapped_score.annotation_layer == AnnotationLayer.GENOMIC:
sequence_id = f"ga4gh:{mapped_score.post_mapped.location.sequenceReference.refgetAccession}"
accession = get_chromosome_identifier_from_vrs_id(sequence_id)
if accession is None:
raise ValueError
if accession.startswith("refseq:"):
accession = None
mapped_score.error_message = "Could not determine accession for this annotation. No allele expression is available."
elif accession.startswith("refseq:"):
accession = accession[7:]
else:
if tx_results is None or isinstance(tx_results, TxSelectError):
raise ValueError # impossible by definition
accession = tx_results.np
accession = None
mapped_score.error_message = "Could not determine accession for this annotation. No allele expression is available."
else:
accession = tx_results.np

sr = get_seqrepo()
loc = mapped_score.post_mapped.location
Expand All @@ -281,8 +290,9 @@ def _annotate_allele_mapping(
post_mapped.extensions = [
Extension(type="Extension", name="vrs_ref_allele_seq", value=ref)
]
hgvs_string, syntax = _get_hgvs_string(post_mapped, accession)
post_mapped.expressions = [Expression(syntax=syntax, value=hgvs_string)]
if accession:
hgvs_string, syntax = _get_hgvs_string(post_mapped, accession)
post_mapped.expressions = [Expression(syntax=syntax, value=hgvs_string)]

if vrs_version == VrsVersion.V_1_3:
pre_mapped = _allele_to_vod(pre_mapped)
Expand All @@ -295,9 +305,7 @@ def _annotate_allele_mapping(
mavedb_id=mapped_score.accession_id,
score=float(mapped_score.score) if mapped_score.score else None,
annotation_layer=mapped_score.annotation_layer,
error_message=mapped_score.error_message
if mapped_score.error_message
else None, # TODO might not need if statement here
error_message=mapped_score.error_message,
)


Expand All @@ -311,23 +319,32 @@ def _annotate_haplotype_mapping(
"""Perform annotations and, if necessary, create VRS 1.3 equivalents for haplotype mappings."""
pre_mapped: Haplotype = mapped_score.pre_mapped # type: ignore
post_mapped: Haplotype = mapped_score.post_mapped # type: ignore

# get vrs_ref_allele_seq for pre-mapped variants
for allele in pre_mapped.members:
allele.extensions = [_get_vrs_ref_allele_seq(allele, metadata, urn, tx_results)]
ref_allele_seq_extension = _get_vrs_ref_allele_seq(
allele, metadata, urn, tx_results
)
if ref_allele_seq_extension is not None:
allele.extensions = [ref_allele_seq_extension]

if post_mapped:
# Determine reference sequence
if mapped_score.annotation_layer == AnnotationLayer.GENOMIC:
sequence_id = f"ga4gh:{post_mapped.members[0].location.sequenceReference.refgetAccession}"
accession = get_chromosome_identifier_from_vrs_id(sequence_id)
if accession is None:
raise ValueError
if accession.startswith("refseq:"):
accession = None
mapped_score.error_message = "Could not determine accession for this annotation. No allele expression is available."
elif accession.startswith("refseq:"):
accession = accession[7:]
else:
if tx_results is None or isinstance(tx_results, TxSelectError):
raise ValueError # impossible by definition
accession = tx_results.np
# impossible by definition
accession = None
mapped_score.error_message = "Could not determine accession for this annotation. No allele expression is available."
else:
accession = tx_results.np

sr = get_seqrepo()
for allele in post_mapped.members:
Expand All @@ -337,8 +354,9 @@ def _annotate_haplotype_mapping(
allele.extensions = [
Extension(type="Extension", name="vrs_ref_allele_seq", value=ref)
]
hgvs, syntax = _get_hgvs_string(allele, accession)
allele.expressions = [Expression(syntax=syntax, value=hgvs)]
if accession:
hgvs, syntax = _get_hgvs_string(allele, accession)
allele.expressions = [Expression(syntax=syntax, value=hgvs)]

if vrs_version == VrsVersion.V_1_3:
pre_mapped = _haplotype_to_haplotype_1_3(pre_mapped)
Expand All @@ -351,9 +369,7 @@ def _annotate_haplotype_mapping(
mavedb_id=mapped_score.accession_id,
score=float(mapped_score.score) if mapped_score.score is not None else None,
annotation_layer=mapped_score.annotation_layer,
error_message=mapped_score.error_message
if mapped_score.error_message
else None, # TODO might not need if statement here
error_message=mapped_score.error_message,
)


Expand Down Expand Up @@ -388,6 +404,7 @@ def annotate(
ScoreAnnotationWithLayer(
mavedb_id=mapped_score.accession_id,
score=float(mapped_score.score) if mapped_score.score else None,
vrs_version=vrs_version,
error_message=mapped_score.error_message,
)
)
Expand All @@ -410,8 +427,16 @@ def annotate(
)
)
else:
# TODO how to combine this error message with other potential error messages?
ValueError("inconsistent variant structure")
score_annotations.append(
ScoreAnnotationWithLayer(
pre_mapped=mapped_score.pre_mapped,
post_mapped=mapped_score.post_mapped,
vrs_version=vrs_version,
mavedb_id=mapped_score.accession_id,
score=float(mapped_score.score) if mapped_score.score else None,
error_message=f"Multiple issues with annotation: Inconsistent variant structure (Allele and Haplotype mix).{' ' + mapped_score.error_message if mapped_score.error_message else ''}",
)
)

return score_annotations

Expand Down Expand Up @@ -519,11 +544,6 @@ def _set_scoreset_layer(
expressions. If genomic expressions are available, that's what we'd like to use.
This function tells us how to filter the final annotation objects.
"""
if urn.startswith("urn:mavedb:00000097"):
_logger.debug(
"Manually selecting protein annotation for scores from urn:mavedb:00000097"
)
return AnnotationLayer.PROTEIN
for mapping in mappings:
if mapping.annotation_layer == AnnotationLayer.GENOMIC:
return AnnotationLayer.GENOMIC
Expand Down
Loading