diff --git a/src/graph_caller.cpp b/src/graph_caller.cpp index 68342c5fed8..7080f95d4cc 100644 --- a/src/graph_caller.cpp +++ b/src/graph_caller.cpp @@ -448,7 +448,8 @@ bool VCFOutputCaller::emit_variant(const PathPositionHandleGraph& graph, SnarlCa const Snarl& snarl, const vector& called_traversals, const vector& genotype, int ref_trav_idx, const unique_ptr& call_info, const string& ref_path_name, int ref_offset, bool genotype_snarls, int ploidy, - function&, const vector&, int, int, int)> trav_to_string) { + function&, const vector&, int, int, int)> trav_to_string, + const vector>& info_tags) { #ifdef debug cerr << "emitting variant for " << pb2json(snarl) << endl; @@ -581,6 +582,11 @@ bool VCFOutputCaller::emit_variant(const PathPositionHandleGraph& graph, SnarlCa // add some support info snarl_caller.update_vcf_info(snarl, site_traversals, site_genotype, call_info, sample_name, out_variant); + // add the info + for (const pair& info_tag : info_tags) { + out_variant.info[info_tag.first].push_back(info_tag.second); + } + // if genotype_snarls, then we only flatten up to the snarl endpoints // (this is when we are in genotyping mode and want consistent calls regardless of the sample) int64_t flatten_len_s = 0; @@ -1827,13 +1833,24 @@ bool FlowCaller::call_snarl(const Snarl& managed_snarl) { vector travs; FlowTraversalFinder* flow_trav_finder = dynamic_cast(&traversal_finder); + vector> extra_info_tags; + int64_t site_allele_count = -1; if (flow_trav_finder != nullptr) { // find the max flow traversals using specialized interface that accepts avg heurstic toggle pair, vector> weighted_travs = flow_trav_finder->find_weighted_traversals(snarl, greedy_avg_flow); travs = std::move(weighted_travs.first); } else { - // find the traversals using the generic interface - travs = traversal_finder.find_traversals(snarl); + // find the traversals with the gbwt. also count the haplotypes in the site and remember it + // for a VCF tag (GAN) + GBWTTraversalFinder* gbwt_trav_finder = dynamic_cast(&traversal_finder); + if (gbwt_trav_finder != nullptr) { + pair, vector> gbwt_path_travs = gbwt_trav_finder->find_path_traversals(snarl); + travs = std::move(gbwt_path_travs.first); + extra_info_tags.push_back(make_pair("GAN", std::to_string(gbwt_trav_finder->count_haplotypes(gbwt_path_travs.second)))); + } else { + // find the traversals using the generic interface + travs = traversal_finder.find_traversals(snarl); + } } if (travs.empty()) { @@ -1893,7 +1910,7 @@ bool FlowCaller::call_snarl(const Snarl& managed_snarl) { bool added = true; if (!gaf_output) { added = emit_variant(graph, snarl_caller, snarl, travs, trav_genotype, ref_trav_idx, trav_call_info, ref_path_name, - ref_offsets[ref_path_name], genotype_snarls, ploidy); + ref_offsets[ref_path_name], genotype_snarls, ploidy, nullptr, extra_info_tags); } else { pair pos_info = get_ref_position(graph, snarl, ref_path_name, ref_offsets[ref_path_name]); emit_gaf_variant(graph, print_snarl(snarl), travs, trav_genotype, ref_trav_idx, pos_info.first, pos_info.second, &support_finder); @@ -1908,6 +1925,9 @@ bool FlowCaller::call_snarl(const Snarl& managed_snarl) { string FlowCaller::vcf_header(const PathHandleGraph& graph, const vector& contigs, const vector& contig_length_overrides) const { string header = VCFOutputCaller::vcf_header(graph, contigs, contig_length_overrides); + if (dynamic_cast(&this->traversal_finder) != nullptr) { + header+= "##INFO=\n"; + } header += "##FORMAT=\n"; snarl_caller.update_vcf_header(header); header += "##FILTER=\n"; diff --git a/src/graph_caller.hpp b/src/graph_caller.hpp index 526dcbef573..f8e79428fdc 100644 --- a/src/graph_caller.hpp +++ b/src/graph_caller.hpp @@ -119,7 +119,8 @@ class VCFOutputCaller { const Snarl& snarl, const vector& called_traversals, const vector& genotype, int ref_trav_idx, const unique_ptr& call_info, const string& ref_path_name, int ref_offset, bool genotype_snarls, int ploidy, - function&, const vector&, int, int, int)> trav_to_string = nullptr); + function&, const vector&, int, int, int)> trav_to_string = nullptr, + const vector>& info_tags = {}); /// get the interval of a snarl from our reference path using the PathPositionHandleGraph interface /// the bool is true if the snarl's backward on the path diff --git a/src/traversal_finder.cpp b/src/traversal_finder.cpp index b6bfba3112a..14349838caf 100644 --- a/src/traversal_finder.cpp +++ b/src/traversal_finder.cpp @@ -4,6 +4,7 @@ #include "cactus.hpp" #include "gbwt_helper.hpp" #include "haplotype_extracter.hpp" +#include //#define debug namespace vg { @@ -3435,7 +3436,7 @@ pair, vector> FlowTraversalFinder::find_weighted_ GBWTTraversalFinder::GBWTTraversalFinder(const HandleGraph& graph, const gbwt::GBWT& gbwt) : graph(graph), gbwt(gbwt) { - + this->gbwt_reference_samples = gbwtgraph::parse_reference_samples_tag(gbwt); } GBWTTraversalFinder::~GBWTTraversalFinder() { @@ -3599,7 +3600,26 @@ pair, vector> GBWTTraversalFinder::find_path_ return path_traversals; } +int64_t GBWTTraversalFinder::count_haplotypes(const vector& gbwt_paths) const { + unordered_set> haplotypes; + for (const gbwt::size_type& gbwt_path : gbwt_paths) { + gbwt::size_type path_id = gbwt::Path::id(gbwt_path); + // metadata check copied from vg deconsturct + if (!gbwt.hasMetadata() || !gbwt.metadata.hasPathNames() || path_id >= gbwt.metadata.paths()) { + continue; + } + PathSense sense = gbwtgraph::get_path_sense(gbwt, path_id, gbwt_reference_samples); + string sample_name = gbwtgraph::get_path_sample_name(gbwt, path_id, sense); + if (sample_name != PathMetadata::NO_SAMPLE_NAME) { + haplotypes.insert(make_pair(sample_name, + gbwtgraph::get_path_haplotype(gbwt, path_id, sense))); + } else { + haplotypes.insert(make_pair(gbwtgraph::compose_path_name(gbwt, path_id, sense), 0)); + } + } + return haplotypes.size(); +} } diff --git a/src/traversal_finder.hpp b/src/traversal_finder.hpp index e39499cba03..ca41bb38622 100644 --- a/src/traversal_finder.hpp +++ b/src/traversal_finder.hpp @@ -638,7 +638,9 @@ class GBWTTraversalFinder : public TraversalFinder { const HandleGraph& graph; const gbwt::GBWT& gbwt; - + // When using the gbwt we need some precomputed information to ask about stored paths. + unordered_set gbwt_reference_samples; + public: GBWTTraversalFinder(const HandleGraph& graph, const gbwt::GBWT& gbwt); @@ -666,6 +668,11 @@ class GBWTTraversalFinder : public TraversalFinder { virtual pair, vector> find_path_traversals(const handle_t& snarl_start, const handle_t& snarl_end); const gbwt::GBWT& get_gbwt() { return gbwt; } + + /** + * Count the unique sample/haplotype pairs that are found in the nsarl (from second output of find_path_traversals) + */ + int64_t count_haplotypes(const vector& gbwt_paths) const; protected: