diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/ProteinMongoDBAdaptor.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/ProteinMongoDBAdaptor.java index d043ecfdac..455bba720f 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/ProteinMongoDBAdaptor.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/ProteinMongoDBAdaptor.java @@ -234,6 +234,7 @@ public CellBaseDataResult getVariantAnnotation(String if (!aaAlternate.equals("STOP") && !aaReference.equals("STOP")) { TranscriptQuery query = new TranscriptQuery(); query.setTranscriptsId(Collections.singletonList(ensemblTranscriptId)); + query.setDataRelease(dataRelease); proteinVariantAnnotation.setSubstitutionScores(getSubstitutionScores(query, position, aaAlternate).getResults()); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/VariantAnnotationUtils.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/VariantAnnotationUtils.java index 15d8018c99..de31deb874 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/VariantAnnotationUtils.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/VariantAnnotationUtils.java @@ -616,6 +616,28 @@ public static SequenceOntologyTerm newSequenceOntologyTerm(String name) throws S return new SequenceOntologyTerm(ConsequenceTypeMappings.getSoAccessionString(name), name); } + + public static String buildVariantId(Variant variant) { + if (variant == null) { + return null; + } + return buildVariantId(variant.getChromosome(), variant.getStart(), variant.getReference(), variant.getAlternate()); + } + + public static String buildVariantIds(List variants) { + StringBuilder variantIds = new StringBuilder(); + for (Variant variant : variants) { + if (variant == null) { + continue; + } + variantIds.append(buildVariantId(variant.getChromosome(), variant.getStart(), variant.getReference(), variant.getAlternate())); + } + if (variantIds == null) { + return null; + } + return variantIds.toString(); + } + public static String buildVariantId(String chromosome, int start, String reference, String alternate) { StringBuilder stringBuilder = new StringBuilder(); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/VariantAnnotationCalculator.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/VariantAnnotationCalculator.java index fdbd0fd35b..74d25d63be 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/VariantAnnotationCalculator.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/VariantAnnotationCalculator.java @@ -874,7 +874,8 @@ private void adjustPhasedConsequenceTypes(Object[] variantArray, int dataRelease consequenceType3.setSequenceOntologyTerms(soTerms); // Flag these transcripts as already updated for this variant - flagTranscriptAnnotationUpdated(variant2, consequenceType1.getEnsemblTranscriptId()); + flagTranscriptAnnotationUpdated(variant2, consequenceType1.getEnsemblTranscriptId(), + Arrays.asList(variant0, variant1)); variant2DisplayCTNeedsUpdate = true; @@ -921,8 +922,8 @@ private void adjustPhasedConsequenceTypes(Object[] variantArray, int dataRelease consequenceType2.setSequenceOntologyTerms(soTerms); // Flag these transcripts as already updated for this variant - flagTranscriptAnnotationUpdated(variant0, consequenceType1.getEnsemblTranscriptId()); - flagTranscriptAnnotationUpdated(variant1, consequenceType1.getEnsemblTranscriptId()); + flagTranscriptAnnotationUpdated(variant0, consequenceType1.getEnsemblTranscriptId(), Arrays.asList((variant1))); + flagTranscriptAnnotationUpdated(variant1, consequenceType1.getEnsemblTranscriptId(), Arrays.asList((variant0))); variant0DisplayCTNeedsUpdate = true; variant1DisplayCTNeedsUpdate = true; @@ -947,24 +948,25 @@ private void adjustPhasedConsequenceTypes(Object[] variantArray, int dataRelease } } - private void flagTranscriptAnnotationUpdated(Variant variant, String ensemblTranscriptId) { + private void flagTranscriptAnnotationUpdated(Variant variant, String ensemblTranscriptId, List phasedVariants) { Map additionalAttributesMap = variant.getAnnotation().getAdditionalAttributes(); if (additionalAttributesMap == null) { additionalAttributesMap = new HashMap<>(); AdditionalAttribute additionalAttribute = new AdditionalAttribute(); Map transcriptsSet = new HashMap<>(); - transcriptsSet.put(ensemblTranscriptId, null); + transcriptsSet.put(ensemblTranscriptId, VariantAnnotationUtils.buildVariantIds(phasedVariants)); additionalAttribute.setAttribute(transcriptsSet); additionalAttributesMap.put("phasedTranscripts", additionalAttribute); variant.getAnnotation().setAdditionalAttributes(additionalAttributesMap); } else if (additionalAttributesMap.get("phasedTranscripts") == null) { AdditionalAttribute additionalAttribute = new AdditionalAttribute(); Map transcriptsSet = new HashMap<>(); - transcriptsSet.put(ensemblTranscriptId, null); + transcriptsSet.put(ensemblTranscriptId, VariantAnnotationUtils.buildVariantIds(phasedVariants)); additionalAttribute.setAttribute(transcriptsSet); additionalAttributesMap.put("phasedTranscripts", additionalAttribute); } else { - additionalAttributesMap.get("phasedTranscripts").getAttribute().put(ensemblTranscriptId, null); + additionalAttributesMap.get("phasedTranscripts").getAttribute().put(ensemblTranscriptId, + VariantAnnotationUtils.buildVariantIds(phasedVariants)); } } @@ -1297,7 +1299,8 @@ private ConsequenceTypeCalculator getConsequenceTypeCalculator(Variant variant) } } - private boolean[] getRegulatoryRegionOverlaps(Variant variant) throws QueryException, IllegalAccessException, CellBaseException { + private boolean[] getRegulatoryRegionOverlaps(Variant variant, int dataRelease) + throws QueryException, IllegalAccessException, CellBaseException { // 0: overlaps any regulatory region type // 1: overlaps transcription factor binding site boolean[] overlapsRegulatoryRegion = {false, false}; @@ -1305,15 +1308,15 @@ private boolean[] getRegulatoryRegionOverlaps(Variant variant) throws QueryExcep // Variant type checked in expected order of frequency of occurrence to minimize number of checks // Most queries will be SNVs - it's worth implementing an special case for them if (VariantType.SNV.equals(variant.getType())) { - return getRegulatoryRegionOverlaps(variant.getChromosome(), variant.getStart()); + return getRegulatoryRegionOverlaps(variant.getChromosome(), variant.getStart(), dataRelease); } else if (VariantType.INDEL.equals(variant.getType()) && StringUtils.isBlank(variant.getReference())) { - return getRegulatoryRegionOverlaps(variant.getChromosome(), variant.getStart() - 1, variant.getEnd()); + return getRegulatoryRegionOverlaps(variant.getChromosome(), variant.getStart() - 1, variant.getEnd(), dataRelease); // Short deletions and symbolic variants except breakends } else if (!VariantType.BREAKEND.equals(variant.getType())) { - return getRegulatoryRegionOverlaps(variant.getChromosome(), variant.getStart(), variant.getEnd()); + return getRegulatoryRegionOverlaps(variant.getChromosome(), variant.getStart(), variant.getEnd(), dataRelease); // Breakend "variants" only annotate features overlapping the exact positions } else { - overlapsRegulatoryRegion = getRegulatoryRegionOverlaps(variant.getChromosome(), Math.max(1, variant.getStart())); + overlapsRegulatoryRegion = getRegulatoryRegionOverlaps(variant.getChromosome(), Math.max(1, variant.getStart()), dataRelease); // If already found one overlapping regulatory region there's no need to keep checking if (overlapsRegulatoryRegion[0]) { return overlapsRegulatoryRegion; @@ -1322,7 +1325,7 @@ private boolean[] getRegulatoryRegionOverlaps(Variant variant) throws QueryExcep if (variant.getSv() != null && variant.getSv().getBreakend() != null && variant.getSv().getBreakend().getMate() != null) { return getRegulatoryRegionOverlaps(variant.getSv().getBreakend().getMate().getChromosome(), - Math.max(1, variant.getSv().getBreakend().getMate().getPosition())); + Math.max(1, variant.getSv().getBreakend().getMate().getPosition()), dataRelease); } else { return overlapsRegulatoryRegion; } @@ -1330,7 +1333,7 @@ private boolean[] getRegulatoryRegionOverlaps(Variant variant) throws QueryExcep } } - private boolean[] getRegulatoryRegionOverlaps(String chromosome, Integer position) + private boolean[] getRegulatoryRegionOverlaps(String chromosome, Integer position, int dataRelease) throws QueryException, IllegalAccessException, CellBaseException { // 0: overlaps any regulatory region type // 1: overlaps transcription factor binding site @@ -1339,6 +1342,7 @@ private boolean[] getRegulatoryRegionOverlaps(String chromosome, Integer positio RegulationQuery query = new RegulationQuery(); query.setIncludes(Collections.singletonList(REGULATORY_REGION_FEATURE_TYPE_ATTRIBUTE)); query.setRegions(Collections.singletonList(new Region(chromosome, position))); + query.setDataRelease(dataRelease); CellBaseDataResult cellBaseDataResult = regulationManager.search(query); if (cellBaseDataResult.getNumResults() > 0) { @@ -1357,7 +1361,7 @@ private boolean[] getRegulatoryRegionOverlaps(String chromosome, Integer positio return overlapsRegulatoryRegion; } - private boolean[] getRegulatoryRegionOverlaps(String chromosome, Integer start, Integer end) + private boolean[] getRegulatoryRegionOverlaps(String chromosome, Integer start, Integer end, int dataRelease) throws QueryException, IllegalAccessException, CellBaseException { // 0: overlaps any regulatory region type // 1: overlaps transcription factor binding site @@ -1369,7 +1373,7 @@ private boolean[] getRegulatoryRegionOverlaps(String chromosome, Integer start, query.setLimit(1); query.setRegions(Collections.singletonList(new Region(chromosome, start, end))); query.setFeatureTypes(Collections.singletonList(TF_BINDING_SITE)); - + query.setDataRelease(dataRelease); CellBaseDataResult cellBaseDataResult = regulationManager.search(query); // Overlaps transcription factor binding site - it's therefore a regulatory variant @@ -1407,7 +1411,7 @@ private List getConsequenceTypeList(Variant variant, List throws QueryException, IllegalAccessException, CellBaseException { boolean[] overlapsRegulatoryRegion = {false, false}; if (regulatoryAnnotation) { - overlapsRegulatoryRegion = getRegulatoryRegionOverlaps(variant); + overlapsRegulatoryRegion = getRegulatoryRegionOverlaps(variant, dataRelease); } ConsequenceTypeCalculator consequenceTypeCalculator = getConsequenceTypeCalculator(variant); List consequenceTypeList = consequenceTypeCalculator.run(variant, geneList, diff --git a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/impl/core/VariantAnnotationCalculatorTest.java b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/impl/core/VariantAnnotationCalculatorTest.java index e1edc55cdc..ad75ab99f6 100644 --- a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/impl/core/VariantAnnotationCalculatorTest.java +++ b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/impl/core/VariantAnnotationCalculatorTest.java @@ -26,13 +26,16 @@ import org.junit.jupiter.api.Test; import org.junit.jupiter.api.TestInstance; import org.opencb.biodata.models.variant.Variant; +import org.opencb.biodata.models.variant.VariantBuilder; import org.opencb.biodata.models.variant.avro.*; import org.opencb.cellbase.core.api.query.QueryException; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.result.CellBaseDataResult; +import org.opencb.cellbase.core.variant.AnnotationBasedPhasedQueryManager; import org.opencb.cellbase.lib.GenericMongoDBAdaptorTest; import org.opencb.cellbase.lib.variant.annotation.VariantAnnotationCalculator; import org.opencb.commons.datastore.core.QueryOptions; +import org.opencb.commons.datastore.core.result.QueryResult; import java.io.IOException; import java.nio.file.Path; @@ -1995,4 +1998,150 @@ public void testSpliceAnnotation() throws Exception { assertEquals(1, cellBaseDataResult.getNumMatches()); } + private void initGrch38() throws Exception { + + clearDB(CELLBASE_DBNAME); + + createDataRelease(); + dataRelease = 1; + + jsonObjectMapper = new ObjectMapper(); + jsonObjectMapper.configure(MapperFeature.REQUIRE_SETTERS_FOR_GETTERS, true); + jsonObjectMapper.setSerializationInclusion(JsonInclude.Include.NON_NULL); + + + + Path path = Paths.get(getClass() + .getResource("/variant-annotation/grch38/gene.test.json.gz").toURI()); + loadRunner.load(path, "gene", dataRelease); + updateDataRelease(dataRelease, "gene", Collections.emptyList()); + path = Paths.get(getClass() + .getResource("/variant-annotation/grch38/genome_sequence.test.json.gz").toURI()); + loadRunner.load(path, "genome_sequence", dataRelease); + updateDataRelease(dataRelease, "genome_sequence", Collections.emptyList()); + path = Paths.get(getClass() + .getResource("/variant-annotation/grch38/clinical_variants.full.test.json.gz").toURI()); + loadRunner.load(path, "clinical_variants", dataRelease); + updateDataRelease(dataRelease, "clinical_variants", Collections.emptyList()); + + // Create empty collection + createEmptyCollection("regulatory_region", dataRelease); + updateDataRelease(dataRelease, "regulatory_region", Collections.emptyList()); + + // Create empty collection + createEmptyCollection("protein", dataRelease); + updateDataRelease(dataRelease, "protein", Collections.emptyList()); + + // Create empty collection + createEmptyCollection("missense_variation_functional_score", dataRelease); + updateDataRelease(dataRelease, "missense_variation_functional_score", Collections.emptyList()); + + // Create empty collection + createEmptyCollection("protein_functional_prediction", dataRelease); + updateDataRelease(dataRelease, "protein_functional_prediction", Collections.emptyList()); + + // Create empty collection + createEmptyCollection("refseq", dataRelease); + updateDataRelease(dataRelease, "refseq", Collections.emptyList()); + + // Create empty collection + createEmptyCollection("conservation", dataRelease); + updateDataRelease(dataRelease, "conservation", Collections.emptyList()); + + // Create empty collection + createEmptyCollection("variation_functional_score", dataRelease); + updateDataRelease(dataRelease, "variation_functional_score", Collections.emptyList()); + + // Create empty collection + createEmptyCollection("splice_score", dataRelease); + updateDataRelease(dataRelease, "splice_score", Collections.emptyList()); + + variantAnnotationCalculator = new VariantAnnotationCalculator("hsapiens", "GRCh37", dataRelease, + cellBaseManagerFactory); + } + + private static final String PHASE_DATA_URL_SEPARATOR = "\\+"; + private static final String VARIANT_STRING_FORMAT = "\\+"; + + @Test + public void testMNVAdditionalProperties() throws Exception { + + initGrch38(); + +// chr19:13025339:C:A+1|1+999 - synonymous +// chr19:13025341:G:T+1|1+999 - synonymous + + Variant variant1 = parseVariant("chr19:13025339:C:A+1|1+999"); + Variant variant2 = parseVariant("chr19:13025341:G:T+1|1+999"); + + List variantList = new ArrayList<>(); + variantList.add(variant1); + variantList.add(variant2); + + QueryOptions queryOptions = new QueryOptions("useCache", false); + queryOptions.put("include", "consequenceType, reference, alternate, clinical"); + queryOptions.put("normalize", true); + queryOptions.put("skipDecompose", false); + queryOptions.put("checkAminoAcidChange", false); + queryOptions.put("imprecise", true); + queryOptions.put("phased", true); + queryOptions.put("dataRelease", 1); + + List> queryResult = variantAnnotationCalculator.getAnnotationByVariantList(variantList, + queryOptions); + + assertEquals(2, queryResult.size()); + + VariantAnnotation v1 = queryResult.get(0).getResults().get(0); + VariantAnnotation v2 = queryResult.get(1).getResults().get(0); + + assertEquals("missense_variant", v1.getDisplayConsequenceType()); + assertEquals("missense_variant", v2.getDisplayConsequenceType()); + + Map additionalAttributes1 = (Map) v1.getAdditionalAttributes().get("phasedTranscripts").get("attribute"); + Map additionalAttributes2 = (Map) v1.getAdditionalAttributes().get("phasedTranscripts").get("attribute"); + + //System.out.println(additionalAttributes1); + + assertEquals(12, additionalAttributes1.size()); + assertEquals(12, additionalAttributes2.size()); + } + + private Variant parseVariant(String variantString) { + String[] variantStringPartArray = variantString.split(PHASE_DATA_URL_SEPARATOR); + + VariantBuilder variantBuilder; + if (variantStringPartArray.length > 0) { + variantBuilder = new VariantBuilder(variantStringPartArray[0]); + // Either 1 or 3 parts expected variant+GT+PS + if (variantStringPartArray.length == 3) { + List formatList = new ArrayList<>(2); + // If phase set tag is not provided not phase data is added at all to the Variant object + if (!variantStringPartArray[2].isEmpty()) { + formatList.add(AnnotationBasedPhasedQueryManager.PHASE_SET_TAG); + List sampleData = new ArrayList<>(2); + sampleData.add(variantStringPartArray[2]); + // Genotype field might be empty - just PS would be added to Variant object in that case + if (!variantStringPartArray[1].isEmpty()) { + formatList.add(AnnotationBasedPhasedQueryManager.GENOTYPE_TAG); + sampleData.add(variantStringPartArray[1]); + } + variantBuilder.setSampleDataKeys(formatList); + SampleEntry sampleEntry = new SampleEntry(); + sampleEntry.setData(sampleData); + variantBuilder.setSamples(Collections.singletonList(sampleEntry)); + } + } else if (variantStringPartArray.length > 3) { + throw new IllegalArgumentException("Malformed variant string " + variantString + ". " + + "variantString+GT+PS expected, where variantString needs 3 or 4 fields separated by ':'. " + + "Format: \"" + VARIANT_STRING_FORMAT + "\""); + } + } else { + throw new IllegalArgumentException("Malformed variant string " + variantString + ". " + + "variantString+GT+PS expected, where variantString needs 3 or 4 fields separated by ':'. " + + "Format: \"" + VARIANT_STRING_FORMAT + "\""); + } + + return variantBuilder.build(); + } } diff --git a/cellbase-lib/src/test/resources/variant-annotation/grch38/clinical_variants.full.test.json.gz b/cellbase-lib/src/test/resources/variant-annotation/grch38/clinical_variants.full.test.json.gz new file mode 100644 index 0000000000..db56943931 Binary files /dev/null and b/cellbase-lib/src/test/resources/variant-annotation/grch38/clinical_variants.full.test.json.gz differ diff --git a/cellbase-lib/src/test/resources/variant-annotation/grch38/gene.test.json.gz b/cellbase-lib/src/test/resources/variant-annotation/grch38/gene.test.json.gz new file mode 100644 index 0000000000..5b52794b72 Binary files /dev/null and b/cellbase-lib/src/test/resources/variant-annotation/grch38/gene.test.json.gz differ diff --git a/cellbase-lib/src/test/resources/variant-annotation/grch38/genome_sequence.test.json.gz b/cellbase-lib/src/test/resources/variant-annotation/grch38/genome_sequence.test.json.gz new file mode 100644 index 0000000000..6aa967849c Binary files /dev/null and b/cellbase-lib/src/test/resources/variant-annotation/grch38/genome_sequence.test.json.gz differ diff --git a/cellbase-lib/src/test/resources/variant-annotation/grch38/variation_chr11.test.json.gz b/cellbase-lib/src/test/resources/variant-annotation/grch38/variation_chr11.test.json.gz new file mode 100644 index 0000000000..d78f48d063 Binary files /dev/null and b/cellbase-lib/src/test/resources/variant-annotation/grch38/variation_chr11.test.json.gz differ