diff --git a/deps/libvgio b/deps/libvgio
index 6a1daffebc..0a91f59cd0 160000
--- a/deps/libvgio
+++ b/deps/libvgio
@@ -1 +1 @@
-Subproject commit 6a1daffebc8c38f9875a22e5a35ff12c3a37eacb
+Subproject commit 0a91f59cd0e7e0532bcf080b6c2eb1dee21c4213
diff --git a/src/alignment.cpp b/src/alignment.cpp
index 75cd3cf6f2..8214569f47 100644
--- a/src/alignment.cpp
+++ b/src/alignment.cpp
@@ -3898,4 +3898,6 @@ Alignment target_alignment(const PathPositionHandleGraph* graph, const path_hand
     }
     return aln;
 }
+
+
 }
diff --git a/src/alignment.hpp b/src/alignment.hpp
index 12817b1e3c..7902c437b8 100644
--- a/src/alignment.hpp
+++ b/src/alignment.hpp
@@ -475,6 +475,162 @@ Alignment target_alignment(const PathPositionHandleGraph* graph, const path_hand
 Alignment target_alignment(const PathPositionHandleGraph* graph, const path_handle_t& path, size_t pos1, size_t pos2, const string& feature, bool is_reverse, Mapping& cigar_mapping);
+
+/// Returns indexes into a vector-like container of Alignments that correspond to supplementary alignments to the primary
+/// min_read_coverage Require the supplementaries and primary to cover this fraction of the read
+/// max_separation Require the separation or overlap between supplementaries to be at most this many bases
+/// min_score_fraction Require each supplementary to have this fraction of the primary's score
+/// min_size Require each supplementary to align at least this many bases
+template<class AlignmentVector>
+vector<size_t> identify_supplementaries(const AlignmentVector& alignments, double min_read_coverage, size_t max_separation,
+                                        double min_score_fraction, size_t min_size, size_t primary_idx = 0) {
+
+    assert(min_read_coverage >= 0.0 && min_read_coverage <= 1.0 && min_score_fraction >= 0.0 && min_score_fraction <= 1.0);
+
+    vector<size_t> supplementaries;
+
+    if (alignments.size() > 1) {
+
+        size_t seq_size = alignments[0].sequence().size();
+
+        size_t min_total_cov = ceil(min_read_coverage * seq_size);
+        int64_t min_score = ceil(min_score_fraction * alignments[primary_idx].score());
+
+        // do sparse DP over part of the read to determine which set of intervals achieves the highest read coverage, subject
+        // to the constraints
+        auto do_dp = [&](vector<tuple<size_t, size_t, size_t>>& intervals, size_t begin, size_t end) -> vector<size_t> {
+
+            std::sort(intervals.begin(), intervals.end());
+
+            // map from (end index -> (total coverage, final interval))
+            map<size_t, pair<size_t, size_t>> dp;
+            vector<size_t> backpointer(intervals.size(), numeric_limits<size_t>::max());
+
+            for (size_t i = 0; i < intervals.size(); ++i) {
+
+                const auto& interval = intervals[i];
+
+                // look for the best feasible previous DP entry
+                // TODO: this could have better worst-case guarantees with a key-value RMQ
+                auto max_it = dp.end();
+                for (auto it = dp.lower_bound(get<0>(interval) - max_separation); it != dp.end() && it->first <= get<0>(interval) + max_separation; ++it) {
+                    if (max_it == dp.end() || max_it->second.first - max(max_it->first - get<0>(interval), 0) < it->second.first) {
+                        max_it = it;
+                    }
+                }
+
+                if (max_it != dp.end() || get<0>(interval) <= begin + max_separation) {
+                    // we can make an entry into the DP structure
+
+                    size_t prev_idx = numeric_limits<size_t>::max();
+                    size_t cov = 0;
+                    if (max_it != dp.end()) {
+                        prev_idx = max_it->second.second;
+                        cov = max_it->second.first - max(max_it->second.first - get<0>(interval), 0);
+                    }
+                    cov += get<1>(interval) - get<0>(interval);
+                    backpointer[i] = prev_idx;
+
+                    auto dp_val = make_pair(cov, i);
+                    auto insert_it = dp.emplace(get<1>(interval), dp_val);
+                    if (!insert_it.second) {
+                        // there is already an interval ending at this index, take the max
+                        if (insert_it.first->second.first < dp_val.first) {
+                            insert_it.first->second = dp_val;
+                        }
+                    }
+                }
+            }
+
+            // traceback the optimum
+            vector<size_t> traceback;
+            auto final_it = dp.lower_bound(max(begin, end - max_separation));
+            if (final_it != dp.end()) {
+                // there is a feasible solution
+                traceback.emplace_back(final_it->second.second);
+                while (backpointer[traceback.back()] != numeric_limits<size_t>::max()) {
+                    traceback.emplace_back(backpointer[traceback.back()]);
+                }
+                reverse(traceback.begin(), traceback.end());
+            }
+
+            return traceback;
+        };
+
+        auto primary_interval = aligned_interval(alignments[primary_idx]);
+
+        if (primary_interval.second - primary_interval.first >= min_size && alignments[primary_idx].score() >= min_score) {
+
+            // records of (begin read pos, end read pos, idx of alignment)
+            vector<tuple<size_t, size_t, size_t>> left_side, right_side;
+
+            for (size_t i = 0; i < alignments.size(); ++i) {
+                if (i == primary_idx) {
+                    continue;
+                }
+                auto interval = aligned_interval(alignments[i]);
+                // filter to alignments that meet the minimum thresholds
+                if (alignments[i].score() >= min_score && interval.second - interval.first >= min_size) {
+                    if (interval.second <= primary_interval.first + max_separation) {
+                        left_side.emplace_back(interval.first, interval.second, i);
+                    }
+                    else if (interval.first >= primary_interval.second - max_separation) {
+                        right_side.emplace_back(interval.first, interval.second, i);
+                    }
+                }
+            }
+
+            // do DP on each side and combine the tracebacks
+            vector<tuple<size_t, size_t, size_t>> full_traceback;
+            for (auto i : do_dp(left_side, 0, primary_interval.first)) {
+                full_traceback.push_back(left_side[i]);
+            }
+            full_traceback.emplace_back(primary_interval.first, primary_interval.second, primary_idx);
+            for (auto i : do_dp(right_side, primary_interval.second, seq_size)) {
+                full_traceback.push_back(right_side[i]);
+            }
+
+            // compute the total read coverage with sweep line algorithm
+            if (!is_sorted(full_traceback.begin(), full_traceback.end())) {
+                // TODO: is this even possible? maybe in some weird cases where the max separation is larger than the min size
+                sort(full_traceback.begin(), full_traceback.end());
+            }
+            size_t total_cov = 0;
+            pair<size_t, size_t> curr_interval(0, 0);
+            for (const auto& interval : full_traceback) {
+                if (get<0>(interval) <= curr_interval.second) {
+                    curr_interval.second = max(curr_interval.second, get<1>(interval));
+                }
+                else {
+                    total_cov += (curr_interval.second - curr_interval.first);
+                    curr_interval.first = get<0>(interval);
+                    curr_interval.second = get<1>(interval);
+                }
+            }
+            total_cov += (curr_interval.second - curr_interval.first);
+            if (aligned_interval(alignments[get<2>(full_traceback.front())]).first <= max_separation &&
+                aligned_interval(alignments[get<2>(full_traceback.back())]).second >= max(seq_size - max_separation, 0) &&
+                total_cov >= min_total_cov) {
+                // the supplementaries and primary jointly cover the entire read, the result of DP is feasible
+
+                for (auto& interval : full_traceback) {
+                    if (get<2>(interval) != primary_idx) {
+                        supplementaries.push_back(get<2>(interval));
+                    }
+                }
+
+                sort(supplementaries.begin(), supplementaries.end());
+            }
+        }
+    }
+
+    return supplementaries;
+}
+
+
+
+
 }
 #endif
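A minimal sketch of how the helper added above might be invoked. The wrapper name is hypothetical, and the literal thresholds simply mirror the MinimizerMapper defaults introduced later in this diff (0.9 read coverage, 10 bp separation, 0.4 score fraction, 40 bp minimum size):

    #include <vector>
    #include "alignment.hpp"

    // Hypothetical wrapper: candidates holds the primary alignment at index 0 followed by
    // the secondary alignments of one read. The returned values index into candidates and
    // never include the primary itself.
    std::vector<size_t> pick_supplementaries(const std::vector<vg::Alignment>& candidates) {
        return vg::identify_supplementaries(candidates,
                                            0.9,  // min_read_coverage
                                            10,   // max_separation
                                            0.4,  // min_score_fraction
                                            40,   // min_size
                                            0);   // primary_idx
    }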
diff --git a/src/interval_union.cpp b/src/interval_union.cpp
new file mode 100644
index 0000000000..6cdd6270c9
--- /dev/null
+++ b/src/interval_union.cpp
@@ -0,0 +1,113 @@
+/**
+ * \file interval_union.cpp: contains the implementation of IntervalUnion
+ */
+
+
+#include "interval_union.hpp"
+
+#include <algorithm>
+#include <cassert>
+
+namespace vg {
+
+using namespace std;
+
+void IntervalUnion::add(size_t begin, size_t end) {
+
+    assert(begin <= end);
+    if (begin == end) {
+        // empty interval, can be ignored
+        return;
+    }
+
+    auto after = irvl_union.upper_bound(begin);
+    auto merge_from = irvl_union.end();
+    if (after != irvl_union.begin()) {
+        // get the nearest interval to the left, possibly at the same index
+        auto before = after;
+        --before;
+        if (before->second >= begin) {
+            // merge this interval into the earlier interval
+            if (end > before->second) {
+                union_size += end - before->second;
+                before->second = end;
+                merge_from = before;
+            }
+        }
+        else {
+            // the start of this interval is outside the one to the left
+            union_size += (end - begin);
+            merge_from = irvl_union.emplace_hint(after, begin, end);
+        }
+    }
+    else {
+        // there is no interval to the left
+        union_size += (end - begin);
+        merge_from = irvl_union.emplace_hint(after, begin, end);
+    }
+
+    // TODO: it might also be possible to do a lazy-update version of this data structure
+    // to get amortized O(log n) add
+
+    if (merge_from != irvl_union.end()) {
+        // try to merge this interval with the ones to the right
+        auto next = merge_from;
+        ++next;
+        while (next != irvl_union.end() && next->first <= merge_from->second) {
+            union_size -= (min(merge_from->second, next->second) - next->first);
+            merge_from->second = max(merge_from->second, next->second);
+            irvl_union.erase(next);
+            next = merge_from;
+            ++next;
+        }
+    }
+}
+
+void IntervalUnion::clear() {
+    union_size = 0;
+    irvl_union.clear();
+}
+
+size_t IntervalUnion::overlap(size_t begin, size_t end) const {
+
+    // start to the left of the query interval
+    auto iter = irvl_union.upper_bound(begin);
+    if (iter != irvl_union.begin()) {
+        --iter;
+    }
+
+    // TODO: there's a worst-case O(log n) solution using a sum-augmented range tree
+    // to get the contribution from strictly contained intervals, but rebalancing it
+    // would be a pain
+
+    // add up the length of the intervals that this overlaps
+    size_t total_overlap = 0;
+    while (iter != irvl_union.end() && iter->first < end) {
+        if (iter->second > begin && end > iter->first) {
+            total_overlap += (min(end, iter->second) - max(begin, iter->first));
+        }
+        ++iter;
+    }
+
+    return total_overlap;
+}
+
+size_t IntervalUnion::total_size() const {
+    return union_size;
+}
+
+size_t IntervalUnion::component_size() const {
+    return irvl_union.size();
+}
+
+vector<pair<size_t, size_t>> IntervalUnion::get_union() const {
+    vector<pair<size_t, size_t>> intervals;
+    intervals.reserve(irvl_union.size());
+    for (const auto& interval : irvl_union) {
+        intervals.push_back(interval);
+    }
+    return intervals;
+}
+
+}
+
diff --git a/src/interval_union.hpp b/src/interval_union.hpp
new file mode 100644
index 0000000000..bd6c6c7b1a
--- /dev/null
+++ b/src/interval_union.hpp
@@ -0,0 +1,67 @@
+/** \file
+ * interval_union.hpp: defines the union of a collection of intervals that can
+ * be constructed incrementally
+ */
+#ifndef VG_INTERVAL_UNION_HPP_INCLUDED
+#define VG_INTERVAL_UNION_HPP_INCLUDED
+
+#include <map>
+#include <utility>
+#include <vector>
+
+namespace vg {
+
+using namespace std;
+
+/**
+ * The union of a collection of intervals
+ */
+class IntervalUnion
+{
+public:
+    IntervalUnion() = default;
+    ~IntervalUnion() = default;
+
+    // add a new interval to the union
+    void add(size_t begin, size_t end);
+    inline void add(const pair<size_t, size_t>& interval);
+
+    // return the union to an empty starting state
+    void clear();
+
+    // query an interval's length of overlap with the union
+    size_t overlap(size_t begin, size_t end) const;
+    inline size_t overlap(const pair<size_t, size_t>& interval) const;
+
+    // get the total size of the intervals in the union
+    size_t total_size() const;
+
+    // return the number of disjoint intervals that comprise the union
+    size_t component_size() const;
+
+    // get the intervals that comprise the union
+    vector<pair<size_t, size_t>> get_union() const;
+
+private:
+
+    map<size_t, size_t> irvl_union;
+
+    size_t union_size = 0;
+
+};
+
+/*
+ * Inline implementations
+ */
+
+void IntervalUnion::add(const pair<size_t, size_t>& interval) {
+    add(interval.first, interval.second);
+}
+
+size_t IntervalUnion::overlap(const pair<size_t, size_t>& interval) const {
+    return overlap(interval.first, interval.second);
+}
+
+}
+
+#endif // VG_INTERVAL_UNION_HPP_INCLUDED
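A small self-contained sketch of how the IntervalUnion added above behaves; the interval values are arbitrary and the expected results are spelled out in the comments:

    #include <iostream>
    #include "interval_union.hpp"

    int main() {
        vg::IntervalUnion u;
        u.add(10, 20);   // union is [10, 20)
        u.add(15, 30);   // overlaps the first interval, so the union becomes [10, 30)
        u.add(40, 50);   // disjoint, adding a second component
        // prints "30 2": 30 covered positions in 2 disjoint intervals
        std::cout << u.total_size() << " " << u.component_size() << std::endl;
        // prints "10": [25, 30) contributes 5 and [40, 45) contributes 5
        std::cout << u.overlap(25, 45) << std::endl;
        return 0;
    }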
diff --git a/src/minimizer_mapper.cpp b/src/minimizer_mapper.cpp
index 5c24cdc4c2..fb27d363f5 100644
--- a/src/minimizer_mapper.cpp
+++ b/src/minimizer_mapper.cpp
@@ -13,6 +13,8 @@
 #include "split_strand_graph.hpp"
 #include "subgraph.hpp"
 #include "statistics.hpp"
+#include "utility.hpp"
+#include "interval_union.hpp"
 #include "algorithms/alignment_path_offsets.hpp"
 #include "algorithms/count_covered.hpp"
 #include "algorithms/intersect_path_offsets.hpp"
@@ -604,7 +606,7 @@ vector<Alignment> MinimizerMapper::map(Alignment& aln) {
 }
 
 vector<Alignment> MinimizerMapper::map_from_extensions(Alignment& aln) {
-    
+
     if (show_work) {
         #pragma omp critical (cerr)
         dump_debug_query(aln);
@@ -702,13 +704,36 @@ vector<Alignment> MinimizerMapper::map_from_extensions(Alignment& aln) {
 
     size_t kept_cluster_count = 0;
 
+    // For identifying supplementary alignments, keep track of which read intervals we have covered by clusters already
+    IntervalUnion current_read_coverage;
+
     //Process clusters sorted by both score and read coverage
-    process_until_threshold_c<double>(clusters.size(), [&](size_t i) -> double {
+    process_until_threshold_e<double>(clusters.size(), [&](size_t i) -> double {
             return clusters[i].coverage;
         }, [&](size_t a, size_t b) -> bool {
             return ((clusters[a].coverage > clusters[b].coverage) ||
                     (clusters[a].coverage == clusters[b].coverage && clusters[a].score > clusters[b].score));
-        }, cluster_coverage_threshold, min_extensions,
max_extensions, rng, [&](size_t cluster_num, size_t item_count) -> bool { + }, [&](size_t cluster_num) -> bool { + if (!find_supplementaries) { + // If we aren't augmenting the primary with supplementary alignments, we don't need to worry + // about covering the whole read + return false; + } + const Cluster& cluster = clusters[cluster_num]; + // TODO: Use sweep line for guaranteed O(n log n) to compute union + IntervalUnion cluster_intervals; + for (size_t seed_num : cluster.seeds) { + const Minimizer& minimizer = minimizers[seeds[seed_num].source]; + cluster_intervals.add(minimizer.agglomeration_start, minimizer.agglomeration_start + minimizer.agglomeration_length); + } + size_t total_overlap = 0; + for (const auto& interval : cluster_intervals.get_union()) { + total_overlap += current_read_coverage.overlap(interval); + } + // Could this cluster finish out a supplementary alignment? + return (total_overlap <= 2 * max_supplementary_separation && + max(cluster_intervals.total_size() - total_overlap, 0) >= min_supplementary_read_coverage); + }, cluster_coverage_threshold, min_extensions, max_extensions, rng, [&](size_t cluster_num, size_t item_count, bool escaped_threshold) -> bool { // Handle sufficiently good clusters in descending coverage order Cluster& cluster = clusters[cluster_num]; @@ -718,8 +743,8 @@ vector MinimizerMapper::map_from_extensions(Alignment& aln) { } // First check against the additional score filter - if (cluster_score_threshold != 0 && cluster.score < cluster_score_cutoff - && kept_cluster_count >= min_extensions) { + if (cluster_score_threshold != 0 && cluster.score < cluster_score_cutoff + && kept_cluster_count >= min_extensions && !escaped_threshold) { //If the score isn't good enough and we already kept at least min_extensions clusters, //ignore this cluster if (track_provenance) { @@ -747,6 +772,7 @@ vector MinimizerMapper::map_from_extensions(Alignment& aln) { cerr << log_name() << "Cluster " << cluster_num << endl; cerr << log_name() << "Covers " << cluster.coverage << "/best-" << cluster_coverage_threshold << " of read" << endl; cerr << log_name() << "Scores " << cluster.score << "/" << cluster_score_cutoff << endl; + cerr << log_name() << (escaped_threshold ? "Escaped" : "Did not escape") << " score and coverage thresholds" << endl; } } @@ -762,6 +788,12 @@ vector MinimizerMapper::map_from_extensions(Alignment& aln) { &funnel)); kept_cluster_count ++; + + if (find_supplementaries) { + for (const auto& extension : cluster_extensions.back()) { + current_read_coverage.add(extension.read_interval); + } + } return true; @@ -789,6 +821,7 @@ vector MinimizerMapper::map_from_extensions(Alignment& aln) { funnel.fail("cluster-coverage", cluster_num, clusters[cluster_num].coverage); } if (show_work) { + #pragma omp critical (cerr) { cerr << log_name() << "Cluster " << cluster_num << " fails cluster coverage cutoffs" << endl; @@ -811,11 +844,6 @@ vector MinimizerMapper::map_from_extensions(Alignment& aln) { // We will fill this with all computed alignments in estimated score order. vector alignments; alignments.reserve(cluster_extensions.size()); - // This maps from alignment index back to cluster extension index, for - // tracing back to minimizers for MAPQ. Can hold - // numeric_limits::max() for an unaligned alignment. - vector alignments_to_source; - alignments_to_source.reserve(cluster_extensions.size()); // Create a new alignment object to get rid of old annotations. 
aln.clear_refpos(); @@ -851,14 +879,37 @@ vector MinimizerMapper::map_from_extensions(Alignment& aln) { } } }; + + // Reset the extension read coverage so we can use it to track alignment coverage. + current_read_coverage.clear(); // Go through the gapless extension groups in score order. - process_until_threshold_b(cluster_extension_scores, - extension_set_score_threshold, min_extension_sets, max_alignments, rng, [&](size_t extension_num, size_t item_count) -> bool { + process_until_threshold_d(cluster_extension_scores, [&](size_t extension_num) -> bool { + // TODO: repetitive with the escape function for seeds + if (!find_supplementaries) { + // No need to find alignments covering the entire read + return false; + } + + const auto& extensions = cluster_extensions[extension_num]; + // TODO: Use sweep line for guaranteed O(n log n) to compute union + IntervalUnion extension_intervals; + for (const auto& extension : extensions) { + extension_intervals.add(extension.read_interval); + } + size_t total_overlap = 0; + for (const auto& interval : extension_intervals.get_union()) { + total_overlap += current_read_coverage.overlap(interval); + } + // Could this cluster finish out a supplementary alignment? + return (total_overlap <= 2 * max_supplementary_separation && + max(extension_intervals.total_size() - total_overlap, 0) >= min_supplementary_read_coverage); + }, + extension_set_score_threshold, min_extension_sets, max_alignments, rng, [&](size_t extension_num, size_t item_count, bool escaped_threshold) -> bool { // This extension set is good enough. // Called in descending score order. - if (cluster_extension_scores[extension_num] < extension_set_min_score) { + if (cluster_extension_scores[extension_num] < extension_set_min_score && !escaped_threshold) { // Actually discard by score discard_processed_cluster_by_score(extension_num); return false; @@ -867,7 +918,7 @@ vector MinimizerMapper::map_from_extensions(Alignment& aln) { if (show_work) { #pragma omp critical (cerr) { - cerr << log_name() << "gapless extension group " << extension_num << " is good enough (score=" << cluster_extension_scores[extension_num] << ")" << endl; + cerr << log_name() << "gapless extension group " << extension_num << " is good enough (score=" << cluster_extension_scores[extension_num] << "), " << (escaped_threshold ? "passed" : "did not pass") << " by escaping threshold" << endl; if (track_correctness && funnel.was_correct(extension_num)) { cerr << log_name() << "\tCORRECT!" 
<< endl; dump_debug_extension_set(gbwt_graph, aln, cluster_extensions[extension_num]); @@ -950,7 +1001,6 @@ vector MinimizerMapper::map_from_extensions(Alignment& aln) { // Have a function to process the best alignments we obtained auto observe_alignment = [&](Alignment& aln) { alignments.emplace_back(std::move(aln)); - alignments_to_source.push_back(extension_num); if (track_provenance) { @@ -1009,13 +1059,19 @@ vector MinimizerMapper::map_from_extensions(Alignment& aln) { if (alignments.size() == 0) { // Produce an unaligned Alignment alignments.emplace_back(aln); - alignments_to_source.push_back(numeric_limits::max()); if (track_provenance) { // Say it came from nowhere funnel.introduce(); } } + + vector supplementaries; + if (find_supplementaries && !alignments.empty()) { + // Check if any of the secondaries can be a supplementary of the primary before they + // would need to compete for MAPQ + supplementaries = std::move(identify_supplementary_alignments(alignments, funnel)); + } if (track_provenance) { // Now say we are finding the winner(s) @@ -1125,6 +1181,21 @@ vector MinimizerMapper::map_from_extensions(Alignment& aln) { // Make sure to clamp 0-60. mappings.front().set_mapping_quality(max(min(mapq, 60.0), 0.0)); + if (!supplementaries.empty()) { + // Estimate a mapping quality for the supplementaries + // TODO: only count the score of the overlapping portion of other alignments + for (auto& supp : supplementaries) { + double score_diff = mappings.front().score() - supp.score(); + scores[0] -= score_diff; + double supp_mapq = get_regular_aligner()->compute_first_mapping_quality(scores, false); + supp.set_mapping_quality(max(min(supp_mapq, mappings.front().mapping_quality()), 0)); + scores[0] += score_diff; + } + // Store them in an annotation on the primary + for (auto& supp : supplementaries) { + *mappings.front().add_supplementary() = std::move(supp); + } + } if (track_provenance) { funnel.substage_stop(); @@ -1322,7 +1393,7 @@ pair, vector> MinimizerMapper::map_paired(Alignment // TODO: Make these local classes when C++ learns to let you use template members in local classes. 
/// Type to point to an alignment of a known read -struct read_alignment_index_t { +struct MinimizerMapper::read_alignment_index_t { size_t fragment; size_t alignment; @@ -1341,19 +1412,20 @@ struct read_alignment_index_t { } // Allow comparison - inline bool operator==(const read_alignment_index_t& other) { + inline bool operator==(const read_alignment_index_t& other) const { return fragment == other.fragment && alignment == other.alignment; }; - inline bool operator!=(const read_alignment_index_t& other) { + inline bool operator!=(const read_alignment_index_t& other) const { return !(*this == other); }; }; + /// Represents an unset index -const read_alignment_index_t NO_READ_INDEX = {std::numeric_limits::infinity(), std::numeric_limits::infinity()}; +const MinimizerMapper::read_alignment_index_t MinimizerMapper::NO_READ_INDEX = {std::numeric_limits::infinity(), std::numeric_limits::infinity()}; /// Type to point to an alignment of either read /// -struct alignment_index_t { +struct MinimizerMapper::alignment_index_t { size_t fragment; size_t alignment; bool read; @@ -1371,15 +1443,15 @@ struct alignment_index_t { } // Allow comparison - inline bool operator==(const alignment_index_t& other) { + inline bool operator==(const alignment_index_t& other) const { return fragment == other.fragment && alignment == other.alignment && read == other.read; }; - inline bool operator!=(const alignment_index_t& other) { + inline bool operator!=(const alignment_index_t& other) const { return !(*this == other); }; }; /// Represents an unset index -const alignment_index_t NO_INDEX {std::numeric_limits::max(), std::numeric_limits::max(), std::numeric_limits::max()}; +const MinimizerMapper::alignment_index_t MinimizerMapper::NO_INDEX {std::numeric_limits::max(), std::numeric_limits::max(), std::numeric_limits::max()}; pair, vector> MinimizerMapper::map_paired(Alignment& aln1, Alignment& aln2) { @@ -1995,7 +2067,6 @@ pair, vector> MinimizerMapper::map_paired(Alignment fragment_distances.reserve(alignments.size()); //for each alignment pair, what type of pair is it - enum PairType {paired, unpaired, rescued_from_first, rescued_from_second}; vector pair_types; //For each pair of alignments in paired_alignments, how many equivalent or better fragment clusters @@ -2132,6 +2203,7 @@ pair, vector> MinimizerMapper::map_paired(Alignment } + array, 2> supplementaries; if (!unpaired_alignments.empty()) { //If we found some clusters that had no pair in a fragment cluster if (!found_pair) { @@ -2251,12 +2323,16 @@ pair, vector> MinimizerMapper::map_paired(Alignment } } + // Keep track of which entries from unpaired_alignments were used for rescue + vector attempted_rescue_from(unpaired_alignments.size(), false); + if (max_rescue_attempts != 0) { //Attempt rescue on unpaired alignments if either we didn't find any pairs or if the unpaired alignments are very good process_until_threshold_a(unpaired_alignments.size(), (std::function) [&](size_t i) -> double{ return (double) unpaired_alignments.at(i).lookup_in(alignments).score(); }, 0, 1, max_rescue_attempts, rng, [&](size_t i, size_t item_count) { + attempted_rescue_from[i] = true; auto& index = unpaired_alignments.at(i); size_t j = index.lookup_in(alignment_indices); if (track_provenance) { @@ -2369,9 +2445,12 @@ pair, vector> MinimizerMapper::map_paired(Alignment return; }); } - } - + if (find_supplementaries) { + supplementaries = std::move(identify_supplementary_alignments(alignments, paired_alignments, paired_scores, pair_types, + unpaired_alignments, 
attempted_rescue_from, funnels)); + } + } if (track_provenance) { // Now say we are finding the winner(s) @@ -2675,6 +2754,33 @@ pair, vector> MinimizerMapper::map_paired(Alignment } } } + + if (!supplementaries[0].empty() || !supplementaries[1].empty()) { + for (auto r : {0, 1}) { + if (supplementaries[r].empty()) { + continue; + } + vector suppl_alignments; + suppl_alignments.reserve(supplementaries[r].size()); + for (auto& index : supplementaries[r]) { + suppl_alignments.emplace_back(index.lookup_for_read_in(r, alignments)); + } + + // Estimate a mapping quality for the supplementaries + // TODO: only count the score of the overlapping portion of other alignments + for (auto& supp : suppl_alignments) { + double score_diff = mappings[r].front().score() - supp.score(); + scores[0] -= score_diff; + double supp_mapq = get_regular_aligner()->compute_first_mapping_quality(scores, false, multiplicities); + supp.set_mapping_quality(max(min(supp_mapq, mappings[r].front().mapping_quality()), 0)); + scores[0] += score_diff; + } + + for (auto& suppl_aln : suppl_alignments) { + *mappings[r].front().add_supplementary() = std::move(suppl_aln); + } + } + } //Annotate top pair with its fragment distance, properly-paired-ness, fragment length distrubution, and secondary scores bool properly_paired = distances.front() == std::numeric_limits::max() ? false : @@ -3437,6 +3543,305 @@ void MinimizerMapper::fix_dozeu_end_deletions(Alignment& alignment) const { } } +//----------------------------------------------------------------------------- + + +array, 2> +MinimizerMapper::identify_supplementary_alignments(vector, 2>>& alignments, + vector>& paired_alignments, + vector& paired_scores, + vector& pair_types, + const vector& unpaired_alignments, + const vector& attempted_rescue_from, + array& funnels) const { + + array, 2> supplementaries; + + struct ReadAlnIdxHash { + size_t operator()(const read_alignment_index_t& val) const { + size_t h = 0x2B4D; + hash_combine(h, val.fragment); + hash_combine(h, val.alignment); + return h; + } + }; + + // Check whether we can unambiguously identify the primary alignment + // before the next shuffled score-order iteration loop + size_t primary_idx = 0; + bool tied = false; + for (size_t i = 1; i < paired_scores.size(); ++i) { + if (paired_scores[i] > paired_scores[primary_idx]) { + tied = false; + primary_idx = i; + } + else if (paired_scores[i] == paired_scores[primary_idx]) { + tied = true; + } + } + + if (!tied && primary_idx < paired_scores.size()) { + + if (track_provenance) { + for (auto r : {0, 1}) { + funnels[r].stage("supplementary"); + } + } + + // The primary is unambiguous + + // For each read, the source indexes into paired_alignments for prospective supplementary alignments + array, 2> paired_suppl_source; + // For each read, source indexes into unpaired_alignments for prospective supplementary alignments + array, 2> unpaired_suppl_source; + // For each read, the prospective supplementary alignments (and the primary) + array, 2> candidates; + + // Put the primary in the vector with the candidates + for (auto r : {0, 1}) { + candidates[r].push_back(paired_alignments[primary_idx][r].lookup_for_read_in(r, alignments)); + } + + for (size_t i = 0; i < paired_alignments.size(); ++i) { + + // Does this pair contain a failed rescue? 
+ int r = -1; + if (pair_types[i] == rescued_from_first) { + const auto& rescued = paired_alignments[i][1].lookup_for_read_in(1, alignments); + if (rescued.path().mapping_size() == 0) { + r = 0; + } + } + else if (pair_types[i] == rescued_from_second) { + const auto& rescued = paired_alignments[i][0].lookup_for_read_in(0, alignments); + if (rescued.path().mapping_size() == 0) { + r = 1; + } + } + + if (r >= 0) { + // Rescue failed, the rescuer is still essentially unpaired and can be a candidate + paired_suppl_source[r].push_back(i); + candidates[r].push_back(paired_alignments[i][r].lookup_for_read_in(r, alignments)); + } + } + + for (size_t i = 0; i < unpaired_alignments.size(); ++i) { + if (!attempted_rescue_from[i]) { + // Rescue was not attempted, alignment is still uniquely represented in among the unpaired alignments + const auto& index = unpaired_alignments[i]; + unpaired_suppl_source[index.read].push_back(i); + candidates[index.read].push_back(index.lookup_in(alignments)); + } + } + + // Look for supplementaries on each read + for (auto r : {0, 1}) { + + auto& read_supplementaries = supplementaries[r]; + auto& read_suppl_candidates = candidates[r]; + + auto supplementary_idxs = identify_supplementaries(read_suppl_candidates, min_supplementary_read_coverage, + max_supplementary_separation, min_supplementary_score_fraction, + min_supplementary_size, 0); + + if (!supplementary_idxs.empty()) { + // We found supplementaries + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Identified supplementary alignments for read " << r << " primary alignment " << log_alignment(read_suppl_candidates[0]) << endl; + for (auto i : supplementary_idxs) { + cerr << log_name() << log_alignment(read_suppl_candidates[i]) << endl; + } + } + } + + // Translate back from vector indexes to alignment indexes + auto& read_supplementaries = supplementaries[r]; + for (auto i : supplementary_idxs) { + if (i <= paired_suppl_source.size()) { + const auto& source = paired_alignments[paired_suppl_source[r][i - 1]][r]; + read_supplementaries.push_back({source.fragment, source.alignment}); + } + else { + const auto& source = unpaired_alignments[unpaired_suppl_source[r][i - paired_suppl_source.size() - 1]]; + read_supplementaries.push_back({source.fragment, source.alignment}); + } + } + } + else { + #pragma omp critical (cerr) + { + cerr << log_name() << "Did not identify any supplementary alignments for read " << r << endl; + } + } + } + + if (!supplementaries[0].empty() || !supplementaries[1].empty()) { + // Which alignments were consumed as supplementaries + array, 2> consumed; + for (auto r : {0, 1}) { + for (const auto& suppl : supplementaries[r]) { + consumed[r].insert(suppl); + } + } + + // Filter the input pairs + vector merged; + size_t s = 0; + for (size_t i = 0; i < paired_alignments.size(); ++i) { + if (consumed[0].count(paired_alignments[i][0]) || consumed[1].count(paired_alignments[i][1])) { + // At least one end of this alignment was taken as a supplementary + ++s; + merged.push_back(i); + } + else if (s != 0) { + // Shift into the front of the vector + paired_alignments[i - s] = paired_alignments[i]; + paired_scores[i - s] = paired_scores[i]; + pair_types[i - s] = pair_types[i]; + + } + } + + if (track_provenance) { + // Record the supplementaries being merged into the primary + merged.push_back(primary_idx); + for (size_t i = 0, j = 0; i < paired_alignments.size(); ++i) { + for (auto r : {0, 1}) { + auto& funnel = funnels[r]; + if (i == primary_idx) { + 
funnel.merge(merged.begin(), merged.end()); + } + else if (i == merged[j]) { + ++j; + } + else { + funnel.project(i); + } + } + } + } + + paired_alignments.resize(paired_alignments.size() - s); + paired_scores.resize(paired_alignments.size()); + pair_types.resize(paired_alignments.size()); + } + else { + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Did not identify any supplementary alignments" << endl; + } + } + + if (track_provenance) { + // All alignments are carried forward unchanged + for (size_t i = 0; i < paired_alignments.size(); ++i) { + for (auto r : {0, 1}) { + funnels[r].project(i); + } + } + } + } + } + + return std::move(supplementaries); +} + +vector MinimizerMapper::identify_supplementary_alignments(vector& alignments, Funnel& funnel) const { + + + vector supplementaries; + + // Check whether we can unambiguously identify the primary alignment + // before the upcoming shuffled score-order iteration loop + size_t primary_idx = 0; + bool tied = false; + for (size_t i = 1; i < alignments.size(); ++i) { + if (alignments[i].score() > alignments[primary_idx].score()) { + tied = false; + primary_idx = i; + } + else if (alignments[i].score() == alignments[primary_idx].score()) { + tied = true; + } + } + + if (!tied && primary_idx < alignments.size()) { + + // The primary is unambiguous + + if (track_provenance) { + funnel.stage("supplementary"); + } + + auto supplementary_idxs = identify_supplementaries(alignments, min_supplementary_read_coverage, + max_supplementary_separation, min_supplementary_score_fraction, + min_supplementary_size, primary_idx); + + if (!supplementary_idxs.empty()) { + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Identified supplementary alignments for primary alignment " << log_alignment(alignments[primary_idx]) << endl; + for (auto i : supplementary_idxs) { + cerr << log_name() << log_alignment(alignments[i]) << endl; + } + } + } + + supplementaries.reserve(supplementary_idxs.size()); + for (size_t i = 0, s = 0; i < alignments.size(); ++i) { + + if (i == supplementary_idxs[s]) { + // Move this alignment to the supplementaries vector + supplementaries.emplace_back(std::move(alignments[i])); + ++s; + } + else { + if (s != 0) { + // Shift into the front of the vector + alignments[i - s] = std::move(alignments[i]); + } + + if (track_provenance) { + if (i == primary_idx) { + auto group = supplementary_idxs; + group.push_back(primary_idx); + funnel.merge(group.begin(), group.end()); + } + else { + funnel.project(i); + } + } + } + } + + alignments.resize(alignments.size() - supplementary_idxs.size()); + + } + else { + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Did not identify any supplementary alignments" << endl; + } + } + + if (this->track_provenance) { + for (size_t i = 0; i < alignments.size(); ++i) { + funnel.project(i); + } + } + } + } + + return supplementaries; +} //----------------------------------------------------------------------------- diff --git a/src/minimizer_mapper.hpp b/src/minimizer_mapper.hpp index f02428f20c..cb13324a6b 100644 --- a/src/minimizer_mapper.hpp +++ b/src/minimizer_mapper.hpp @@ -541,6 +541,27 @@ class MinimizerMapper : public AlignerClient { /// The algorithm used for rescue. 
     RescueAlgorithm rescue_algorithm = rescue_dozeu;
 
+    /// Identify secondary alignments as supplementaries if they partition the read with the primary
+    static constexpr bool default_find_supplementaries = false;
+    bool find_supplementaries = default_find_supplementaries;
+
+    /// The minimum length of aligned sequence required in order to report a supplementary alignment
+    static constexpr size_t default_min_supplementary_size = 40;
+    size_t min_supplementary_size = default_min_supplementary_size;
+
+    /// The maximum amount of read separation or overlap between supplementary alignment(s) and the primary alignment
+    static constexpr size_t default_max_supplementary_separation = 10;
+    size_t max_supplementary_separation = default_max_supplementary_separation;
+
+    /// The minimum score of a supplementary as a fraction of the primary alignment score
+    static constexpr double default_min_supplementary_score_fraction = 0.4;
+    double min_supplementary_score_fraction = default_min_supplementary_score_fraction;
+
+    /// The minimum fraction of the read that the primary and the supplementaries must jointly align in order for
+    /// supplementary alignments to be reported from disjoint graph regions
+    static constexpr double default_min_supplementary_read_coverage = 0.9;
+    double min_supplementary_read_coverage = default_min_supplementary_read_coverage;
+
     /// Apply this sample name
     string sample_name;
     /// Apply this read group name
@@ -639,6 +660,9 @@ class MinimizerMapper : public AlignerClient {
 
 protected:
 
+    /// Types of paired alignments in paired-end mapping
+    enum PairType {paired, unpaired, rescued_from_first, rescued_from_second};
+
     /// Convert an integer distance, with limits standing for no distance, to a
     /// double annotation that can safely be parsed back from JSON into an
     /// integer if it is integral.
@@ -1025,6 +1049,39 @@ class MinimizerMapper : public AlignerClient {
      */
     void fix_dozeu_end_deletions(Alignment& rescued_alignment) const;
+//-----------------------------------------------------------------------------
+
+    // Supplementary alignments
+
+    /**
+     * In a vector of single end alignments, identify alignments that should be considered supplementary
+     * alignments to the primary, remove them from the multimapping vector, and return them in a separate
+     * vector
+     */
+    vector<Alignment> identify_supplementary_alignments(vector<Alignment>& alignments, Funnel& funnel) const;
+
+    struct read_alignment_index_t;
+    struct alignment_index_t;
+    static const read_alignment_index_t NO_READ_INDEX;
+    static const alignment_index_t NO_INDEX;
+
+    /**
+     * For a paired read, identify, for each end, other alignments that should be considered
+     * supplementary alignments to the corresponding alignment in the primary pair. Candidates are identified
+     * from unpaired alignments that failed rescue attempts or for which rescue was not attempted. Supplementary
+     * alignments that are identified are removed from the paired read data structures and returned as an array
+     * of index vectors, one per read
+     */
+    array<vector<read_alignment_index_t>, 2> identify_supplementary_alignments(vector<array<vector<Alignment>, 2>>& alignments,
+                                                                               vector<array<read_alignment_index_t, 2>>& paired_alignments,
+                                                                               vector<double>& paired_scores,
+                                                                               vector<PairType>& pair_types,
+                                                                               const vector<alignment_index_t>& unpaired_alignments,
+                                                                               const vector<bool>& attempted_rescue_from,
+                                                                               array<Funnel, 2>& funnels) const;
+
+
+
 //-----------------------------------------------------------------------------
     // Helper functions.
@@ -1404,6 +1461,36 @@ class MinimizerMapper : public AlignerClient { const function& process_item, const function& discard_item_by_count, const function& discard_item_by_score) const; + + /** + * Same as the process_until_threshold_b, except that items can escape the score + * threshold by a function-specified condition. The process_item function receives + * an additional bool indicating whether an item passed by threshold escape + */ + template + void process_until_threshold_d(const vector& scores, + const function& threshold_escape, + double threshold, size_t min_count, size_t max_count, + LazyRNG& get_seed, + const function& process_item, + const function& discard_item_by_count, + const function& discard_item_by_score) const; + + /** + * Same as the process_until_threshold_c, except that items can escape the score + * threshold by a function-specified condition. The process_item function receives + * an additional bool indicating whether an item passed by threshold escape + */ + template + void process_until_threshold_e(size_t items, + const function& get_score, + const function& comparator, + const function& threshold_escape, + double threshold, size_t min_count, size_t max_count, + LazyRNG& get_seed, + const function& process_item, + const function& discard_item_by_count, + const function& discard_item_by_score) const; // Internal debugging functions @@ -1500,7 +1587,7 @@ void MinimizerMapper::process_until_threshold_b(const vector& scores, return scores[i]; }, [&](size_t a, size_t b) -> bool { return (scores[a] > scores[b]); - },threshold, min_count, max_count, rng, process_item, discard_item_by_count, discard_item_by_score); + }, threshold, min_count, max_count, rng, process_item, discard_item_by_count, discard_item_by_score); } template @@ -1512,6 +1599,38 @@ void MinimizerMapper::process_until_threshold_c(size_t items, const function& discard_item_by_count, const function& discard_item_by_score) const { + process_until_threshold_e(items, get_score, comparator, [](size_t){return false;}, threshold, min_count, + max_count, rng, [&](size_t a, size_t b, bool e){return process_item(a, b);}, discard_item_by_count, discard_item_by_score); +} + +template +void MinimizerMapper::process_until_threshold_d(const vector& scores, + const function& threshold_escape, + double threshold, size_t min_count, size_t max_count, + LazyRNG& rng, + const function& process_item, + const function& discard_item_by_count, + const function& discard_item_by_score) const { + + process_until_threshold_e(scores.size(), (function)[&](size_t i) -> Score { + return scores[i]; + }, [&](size_t a, size_t b) -> bool { + return (scores[a] > scores[b]); + }, threshold_escape, threshold, min_count, max_count, rng, process_item, discard_item_by_count, discard_item_by_score); +} + + +template +void MinimizerMapper::process_until_threshold_e(size_t items, const function& get_score, + const function& comparator, + const function& threshold_escape, + double threshold, size_t min_count, size_t max_count, + LazyRNG& rng, + const function& process_item, + const function& discard_item_by_count, + const function& discard_item_by_score) const { + + // Sort item indexes by item score vector indexes_in_order; indexes_in_order.reserve(items); @@ -1551,13 +1670,15 @@ void MinimizerMapper::process_until_threshold_c(size_t items, const function get_options() { MinimizerMapper::default_max_multimaps, "produce up to INT alignments for each read" ); + result_opts.add_flag( + "supplementary", + &MinimizerMapper::find_supplementaries, + 
MinimizerMapper::default_find_supplementaries, + "identify and report supplementary alignments" + ); // Configure normal Giraffe mapping computation auto& comp_opts = parser->add_group("computational parameters"); @@ -787,7 +793,6 @@ void help_giraffe(char** argv, const BaseOptionGroup& parser, const std::map surjected1, surjected2; - if (multimap) { - surjected1 = std::move(surjector.multi_surject(src1, paths, subpath_global, spliced)); - surjected2 = std::move(surjector.multi_surject(src2, paths, subpath_global, spliced)); - } - else { - surjected1 = std::move(surjector.surject(src1, paths, subpath_global, spliced)); - surjected2 = std::move(surjector.surject(src2, paths, subpath_global, spliced)); - } + // Surject + auto surjected1 = surjector.surject(src1, paths, subpath_global, spliced); + auto surjected2 = surjector.surject(src2, paths, subpath_global, spliced); // pair up non-supplementary alignments unordered_map, size_t> strand_idx1, strand_idx2; @@ -660,12 +654,7 @@ int main_surject(int argc, char** argv) { set_metadata(src); // Surject and emit the single read. - if (multimap) { - alignment_emitter->emit_singles(surjector.multi_surject(src, paths, subpath_global, spliced)); - } - else { - alignment_emitter->emit_singles(surjector.surject(src, paths, subpath_global, spliced)); - } + alignment_emitter->emit_singles(surjector.surject(src, paths, subpath_global, spliced)); total_reads_surjected++; if (watchdog) { watchdog->check_out(thread_num); @@ -745,17 +734,10 @@ int main_surject(int argc, char** argv) { // TODO: highly repetitive with the version above for Alignments // surject and record path positions - vector surjected1, surjected2; vector> positions1, positions2; - if (multimap) { - surjected1 = std::move(surjector.multi_surject(mp_src1, paths, positions1, subpath_global, spliced)); - surjected2 = std::move(surjector.multi_surject(mp_src2, paths, positions2, subpath_global, spliced)); - } - else { - surjected1 = std::move(surjector.surject(mp_src1, paths, positions1, subpath_global, spliced)); - surjected2 = std::move(surjector.surject(mp_src2, paths, positions2, subpath_global, spliced)); + auto surjected1 = surjector.surject(mp_src1, paths, positions1, subpath_global, spliced); + auto surjected2 = surjector.surject(mp_src2, paths, positions2, subpath_global, spliced); - } // pair up non-supplementary alignments unordered_map, size_t> strand_idx1, strand_idx2; for (size_t i = 0; i < surjected1.size(); ++i) { @@ -869,15 +851,10 @@ int main_surject(int argc, char** argv) { // surject and record path positions vector> positions; - vector surjected; vector> surject_positions; - if (multimap) { - surjected = std::move(surjector.multi_surject(mp_src, paths, surject_positions, subpath_global, spliced)); - } - else { - surjected = std::move(surjector.surject(mp_src, paths, surject_positions, subpath_global, spliced)); - } + auto surjected = surjector.surject(mp_src, paths, surject_positions, subpath_global, spliced); + // positions are in different orders in these two interfaces for (auto& position : surject_positions) { positions.emplace_back(std::move(get<0>(position)), get<2>(position), get<1>(position)); diff --git a/src/surjecting_alignment_emitter.cpp b/src/surjecting_alignment_emitter.cpp index 60d53075e7..76eaad4985 100644 --- a/src/surjecting_alignment_emitter.cpp +++ b/src/surjecting_alignment_emitter.cpp @@ -39,27 +39,49 @@ void SurjectingAlignmentEmitter::surject_paired_alignments_in_place(vector(surjected1[j], "supplementary")) { + if (surjected1.size() > 1 || 
surjected2.size() > 1) {
+
+            // find the primaries
+            size_t primary_idx1 = -1, primary_idx2 = -1;
+            for (size_t j = 0; j < surjected1.size(); ++j) {
+                if (!has_annotation(surjected1[j], "supplementary") || !get_annotation<bool>(surjected1[j], "supplementary")) {
+                    primary_idx1 = j;
+                    break;
+                }
+            }
+            for (size_t j = 0; j < surjected2.size(); ++j) {
+                if (!has_annotation(surjected2[j], "supplementary") || !get_annotation<bool>(surjected2[j], "supplementary")) {
+                    primary_idx2 = j;
+                    break;
+                }
+            }
+
+            // annotate supplementaries with primary mate info
+            const auto& primary_pos1 = surjected1[primary_idx1].refpos(0);
+            const auto& primary_pos2 = surjected2[primary_idx2].refpos(0);
+            for (size_t j = 0; j < surjected1.size(); ++j) {
+                if (j == primary_idx1) {
+                    continue;
+                }
                 supplementary_alns.emplace_back(std::move(surjected1[j]));
-                const auto& mate_pos = surjected2.front().refpos(0);
                 set_annotation(supplementary_alns.back(), "mate_info",
-                               mate_info(mate_pos.name(), mate_pos.offset(), mate_pos.is_reverse(), false));
-            }
-            else {
-                alns1[i] = std::move(surjected1[j]);
+                               mate_info(primary_pos2.name(), primary_pos2.offset(), primary_pos2.is_reverse(), false));
             }
-        }
-        for (size_t j = 0; j < surjected2.size(); ++j) {
-            if (has_annotation(surjected2[j], "supplementary") && get_annotation<bool>(surjected2[j], "supplementary")) {
+            for (size_t j = 0; j < surjected2.size(); ++j) {
+                if (j == primary_idx2) {
+                    continue;
+                }
                 supplementary_alns.emplace_back(std::move(surjected2[j]));
-                const auto& mate_pos = surjected1.front().refpos(0);
                 set_annotation(supplementary_alns.back(), "mate_info",
-                               mate_info(mate_pos.name(), mate_pos.offset(), mate_pos.is_reverse(), true));
-            }
-            else {
-                alns2[i] = std::move(surjected2[j]);
+                               mate_info(primary_pos1.name(), primary_pos1.offset(), primary_pos1.is_reverse(), true));
             }
+
+            alns1[i] = std::move(surjected1[primary_idx1]);
+            alns2[i] = std::move(surjected2[primary_idx2]);
+        }
+        else {
+            alns1[i] = std::move(surjected1.front());
+            alns2[i] = std::move(surjected2.front());
         }
     }
 }
diff --git a/src/surjector.cpp b/src/surjector.cpp
index 48b4848737..28c71662fc 100644
--- a/src/surjector.cpp
+++ b/src/surjector.cpp
@@ -137,47 +137,68 @@ using namespace std;
         return surjected;
     }
 
-    vector<Alignment> Surjector::multi_surject(const Alignment& source,
-                                               const unordered_set<path_handle_t>& paths,
-                                               bool allow_negative_scores,
-                                               bool preserve_deletions) const {
-
-        // TODO: redundant with non-multi now that it can emit multiple alignment records
-        vector<Alignment> surjected;
-        vector<tuple<string, int64_t, bool>> positions;
-        surject_internal(&source, nullptr, &surjected, nullptr, paths, positions,
-                         true, allow_negative_scores, preserve_deletions);
-
-        for (size_t i = 0; i < surjected.size(); ++i) {
-            surjected[i].clear_refpos();
-            auto* pos = surjected[i].add_refpos();
-            pos->set_name(get<0>(positions[i]));
-            pos->set_offset(get<1>(positions[i]));
-            pos->set_is_reverse(get<2>(positions[i]));
-        }
-
-        return surjected;
-    }
-
     vector<Alignment> Surjector::surject(const Alignment& source,
                                          const unordered_set<path_handle_t>& paths,
                                          vector<tuple<string, int64_t, bool>>& positions_out,
                                          bool allow_negative_scores,
                                          bool preserve_deletions) const {
-        vector<Alignment> surjected;
-        surject_internal(&source, nullptr, &surjected, nullptr, paths, positions_out,
-                         false, allow_negative_scores, preserve_deletions);
-        return surjected;
-    }
-
-    vector<Alignment> Surjector::multi_surject(const Alignment& source,
-                                               const unordered_set<path_handle_t>& paths,
-                                               vector<tuple<string, int64_t, bool>>& positions_out,
-                                               bool allow_negative_scores,
-                                               bool preserve_deletions) const {
-        // TODO: redundant with non-multi now that it can emit multiple alignment records
         vector<Alignment> surjected;
-
surject_internal(&source, nullptr, &surjected, nullptr, paths, positions_out, - true, allow_negative_scores, preserve_deletions); - + + if (source.supplementary_size() == 0 || !report_supplementary) { + // no supplementaries or not interested in supplementaries, we only need to worry about the primary + surject_internal(&source, nullptr, &surjected, nullptr, paths, positions_out, + allow_negative_scores, preserve_deletions); + } + else { + // the mapper identified supplementaries + + auto hard_clipped_alns = generate_hard_clipped_alignments(source); + + for (const auto& clipped : hard_clipped_alns) { + + // do the surjection + vector clipped_surjected; + vector> clipped_positions_out; + surject_internal(&get<0>(clipped), nullptr, &clipped_surjected, nullptr, paths, clipped_positions_out, + allow_negative_scores, preserve_deletions); + + + restore_hard_clips(clipped_surjected, source, get<1>(clipped), get<2>(clipped)); + +#ifdef debug_anchored_surject + cerr << "after restoring soft-clips:" << endl; + for (const auto& surj : clipped_surjected) { + cerr << pb2json(surj) << endl; + } +#endif + for (size_t j = 0; j < clipped_surjected.size(); ++j) { + surjected.emplace_back(std::move(clipped_surjected[j])); + positions_out.emplace_back(std::move(clipped_positions_out[j])); + } + } + } + + add_SA_tag(surjected, positions_out, *graph, preserve_deletions); + +#ifdef debug_anchored_surject + cerr << "added SA tags (if any):" << endl; + for (size_t i = 0; i < surjected.size(); ++i) { + const auto& surj = surjected[i]; + cerr << '\t' << i << ": "; + if (has_annotation(surj, "tags")) { + cerr << get_annotation(surj, "tags"); + } + else { + cerr << "(none)"; + } + cerr << endl; + } +#endif + + + if (annotate_with_graph_alignment) { + annotate_graph_cigar(surjected, source, positions_out); + } + return surjected; } @@ -187,28 +208,21 @@ using namespace std; vector surjected; surject_internal(nullptr, &source, nullptr, &surjected, paths, positions_out, - false, allow_negative_scores, preserve_deletions); + allow_negative_scores, preserve_deletions); - return surjected; - } + add_SA_tag(surjected, positions_out, *graph, preserve_deletions); + + if (annotate_with_graph_alignment) { + annotate_graph_cigar(surjected, source, positions_out); + } - vector Surjector::multi_surject(const multipath_alignment_t& source, - const unordered_set& paths, - vector>& positions_out, - bool allow_negative_scores, - bool preserve_deletions) const { - // TODO: redundant with non-multi now that it can emit multiple alignment records - vector surjected; - surject_internal(nullptr, &source, nullptr, &surjected, paths, positions_out, - true, allow_negative_scores, preserve_deletions); - return surjected; } void Surjector::surject_internal(const Alignment* source_aln, const multipath_alignment_t* source_mp_aln, vector* alns_out, vector* mp_alns_out, const unordered_set& paths, - vector>& positions_out, bool all_paths, + vector>& positions_out, bool allow_negative_scores, bool preserve_deletions) const { @@ -248,8 +262,7 @@ using namespace std; cerr << "error[Surjector::surject]: offending alignemnt: " << pb2json(*source_aln) << endl; exit(1); } - } - + } // do we need to simplify a complicated multipath alignment? 
multipath_alignment_t simplified_source_mp_aln; @@ -417,15 +430,9 @@ using namespace std; // copy over annotations // TODO: also redundantly copies over sequence and quality transfer_read_metadata(*source_mp_aln, mp_alns_out->back()); - if (annotate_with_graph_alignment) { - annotate_graph_cigar(*mp_alns_out, *source_mp_aln, false); - } } else { alns_out->emplace_back(make_null_alignment(*source_aln)); - if (annotate_with_graph_alignment) { - annotate_graph_cigar(*alns_out, *source_aln, false); - } } return; } @@ -442,7 +449,7 @@ using namespace std; // choose which path strands we will output vector> strands_to_output; - if (all_paths) { + if (multimap_to_all_paths) { vector> path_strands; if (source_aln) { // give each path strand the alignment score if it is primary or 0 if it is supplementary (to deprioritize) @@ -545,10 +552,6 @@ using namespace std; if (annotate_with_all_path_scores) { set_annotation(alns_out->back(), "all_scores", annotation_string); } - - if (annotate_with_graph_alignment) { - annotate_graph_cigar(*alns_out, *source_aln, path_strand.second); - } } else { auto& surjection = mp_aln_surjections[path_strand][j]; @@ -569,9 +572,6 @@ using namespace std; if (annotate_with_all_path_scores) { mp_alns_out->back().set_annotation("all_scores", annotation_string); } - if (annotate_with_graph_alignment) { - annotate_graph_cigar(*mp_alns_out, *source_mp_aln, path_strand.second); - } #ifdef debug_anchored_surject cerr << "outputting surjected mp aln: " << debug_string(mp_alns_out->back()) << endl; #endif @@ -587,16 +587,6 @@ using namespace std; cerr << "chose path " << get<0>(positions_out.back()) << " at position " << get<1>(positions_out.back()) << (get<2>(positions_out.back()) ? "-" : "+") << endl; #endif } - - // annotate the primary with an SA tag for supplementaries - if (num_suppls > 1) { - if (source_aln) { - add_SA_tag(*alns_out, positions_out, memoizing_graph, preserve_deletions); - } - else { - add_SA_tag(*mp_alns_out, positions_out, memoizing_graph, preserve_deletions); - } - } } } @@ -3704,6 +3694,7 @@ using namespace std; surj.first.set_read_group(source.read_group()); surj.first.set_sample_name(source.sample_name()); surj.first.set_mapping_quality(source.mapping_quality()); + surj.first.set_is_secondary(source.is_secondary()); if (source.has_fragment_next()) { *surj.first.mutable_fragment_next() = source.fragment_next(); } @@ -5264,28 +5255,26 @@ using namespace std; return null; } - void Surjector::annotate_graph_cigar(vector& surjections, const Alignment& source, bool rev_strand) const { - auto cigar = graph_cs_cigar(source, *graph, rev_strand); - if (cigar.empty()) { + void Surjector::annotate_graph_cigar(vector& surjections, const Alignment& source, const vector>& positions) const { + if (source.path().mapping_size() == 0) { return; } - for (auto& surjection : surjections) { - set_annotation(surjection, "graph_cigar", cigar); + for (size_t i = 0; i < surjections.size(); ++i) { + set_annotation(surjections[i], "graph_cigar", graph_cs_cigar(source, *graph, get<2>(positions[i]))); } } - void Surjector::annotate_graph_cigar(vector& surjections, const multipath_alignment_t& source, bool rev_strand) const { + void Surjector::annotate_graph_cigar(vector& surjections, const multipath_alignment_t& source, + const vector>& positions) const { + // TODO: repetitive with Alignment version Alignment opt; optimal_alignment(source, opt, false); - - // TODO: repetitive with Alignment version - auto cigar = graph_cs_cigar(opt, *graph, rev_strand); - if (cigar.empty()) { + if 
(opt.path().mapping_size() == 0) { return; } - for (auto& surjection : surjections) { - surjection.set_annotation("graph_cigar", cigar); + for (size_t i = 0; i < surjections.size(); ++i) { + surjections[i].set_annotation("graph_cigar", graph_cs_cigar(opt, *graph, get<2>(positions[i]))); } } @@ -5355,6 +5344,197 @@ using namespace std; return cigar_against_path(surjected, get<0>(position), get<2>(position), get<1>(position), graph, min_splice_length); } + vector> Surjector::generate_hard_clipped_alignments(const Alignment& source) const { + +#ifdef debug_anchored_surject + cerr << "alignment has mapper-identified supplementaries:" << endl; + for (const auto& aln : source.supplementary()) { + cerr << pb2json(aln) << endl; + } +#endif + + vector> clipped_alns; + clipped_alns.reserve(source.supplementary_size() + 1); + + // collect the supplementary intervals (begin, end, idx) + vector> suppl_intervals; + suppl_intervals.reserve(source.supplementary_size() + 1); + for (size_t i = 0; i < source.supplementary_size(); ++i) { + const auto& suppl = source.supplementary(i); + auto interval = aligned_interval(suppl); + if (interval.second < interval.first) { + // unaligned -- TODO: this shouldn't ever happen though, remove this? + continue; + } + suppl_intervals.emplace_back(interval.first, interval.second, i); + } + auto primary_interval = aligned_interval(source); + suppl_intervals.emplace_back(primary_interval.first, primary_interval.second, -1); + sort(suppl_intervals.begin(), suppl_intervals.end()); + + // make hard-clipped Alignments + for (size_t i = 0; i < suppl_intervals.size(); ++i) { + + const auto& interval = suppl_intervals[i]; + const auto& unclipped = get<2>(interval) < source.supplementary_size() ? source.supplementary(get<2>(interval)) : source; + + clipped_alns.emplace_back(); + auto& clipped = get<0>(clipped_alns.back()); + + // copy over metadata + clipped.set_score(unclipped.score()); + clipped.set_mapping_quality(unclipped.mapping_quality()); + clipped.set_name(source.name()); + clipped.set_read_group(source.read_group()); + clipped.set_sample_name(source.sample_name()); + clipped.set_is_secondary(source.is_secondary()); + if (source.has_fragment_next()) { + *clipped.mutable_fragment_next() = source.fragment_next(); + } + if (source.has_fragment_prev()) { + *clipped.mutable_fragment_prev() = source.fragment_prev(); + } + *clipped.mutable_annotation() = source.annotation(); + + if (&unclipped != &source) { + set_annotation(clipped, "supplementary", true); + } + + // possibly expand the sequence interval into "unclaimed" adjacent sequence + size_t left_bnd = i != 0 ? get<1>(suppl_intervals[i - 1]) : 0; + size_t right_bnd = i + 1 < suppl_intervals.size() ? 
get<0>(suppl_intervals[i + 1]) : source.sequence().size(); + size_t subseq_begin = min(left_bnd, get<0>(interval)); + size_t subseq_end = max(right_bnd, get<1>(interval)); + get<1>(clipped_alns.back()) = subseq_begin; + get<2>(clipped_alns.back()) = subseq_end; + +#ifdef debug_anchored_surject + cerr << "hard clipping "; + if (get<2>(interval) >= source.supplementary_size()) { + cerr << "primary alignment"; + } + else { + cerr << "supplementary " << get<2>(interval); + } + cerr << " to the interval " << subseq_begin << ":" << subseq_end << " for surjection" << endl; +#endif + + // make the clipped sequence + clipped.set_sequence(source.sequence().substr(subseq_begin, subseq_end - subseq_begin)); + if (!source.quality().empty()) { + clipped.set_quality(source.quality().substr(subseq_begin, subseq_end - subseq_begin)); + } + + auto clipped_path = clipped.mutable_path(); + const auto& unclipped_path = unclipped.path(); + for (size_t m = 0; m < unclipped_path.mapping_size(); ++m) { + const auto& unclipped_mapping = unclipped_path.mapping(m); + auto clipped_mapping = clipped_path->add_mapping(); + *clipped_mapping->mutable_position() = unclipped_mapping.position(); + if (m != 0 && m + 1 != unclipped_path.mapping_size()) { + // internal mapping + *clipped_mapping = unclipped_mapping; + clipped_mapping->set_rank(clipped_path->mapping_size()); + } + else { + // first and/or last mapping + for (size_t e = 0; e < unclipped_mapping.edit_size(); ++e) { + if (m == 0 && e == 0 && subseq_begin != 0) { + // initial hard-clip + const auto& softclip_edit = unclipped_mapping.edit(e); + assert(softclip_edit.from_length() == 0); + if (subseq_begin < softclip_edit.to_length()) { + auto clipped_edit = clipped_mapping->add_edit(); + clipped_edit->set_from_length(0); + clipped_edit->set_to_length(softclip_edit.to_length() - subseq_begin); + clipped_edit->set_sequence(softclip_edit.sequence().substr(softclip_edit.to_length() - clipped_edit->to_length(), clipped_edit->to_length())); + } + } + else if (m + 1 == unclipped_path.mapping_size() && e + 1 == unclipped_mapping.edit_size() + && subseq_end != source.sequence().size()) { + // final hard-clip + const auto& softclip_edit = unclipped_mapping.edit(e); + assert(softclip_edit.from_length() == 0); + if (source.sequence().size() - softclip_edit.to_length() < subseq_end) { + auto clipped_edit = clipped_mapping->add_edit(); + clipped_edit->set_from_length(0); + clipped_edit->set_to_length(subseq_end + softclip_edit.to_length() - source.sequence().size()); + clipped_edit->set_sequence(softclip_edit.sequence().substr(0, clipped_edit->to_length())); + } + } + else { + // internal edit + *clipped_mapping->add_edit() = unclipped_mapping.edit(e); + } + } + + if (clipped_mapping->edit_size() == 0) { + // the only edits were filtered out, don't leave an empty mapping + clipped_path->mutable_mapping()->RemoveLast(); + } + else { + clipped_mapping->set_rank(clipped_path->mapping_size()); + } + } + } +#ifdef debug_anchored_surject + cerr << "hard-clipped surjection input:" << endl; + cerr << pb2json(clipped) << endl; +#endif + } + + return clipped_alns; + } + + void Surjector::restore_hard_clips(vector& clipped_surjected, const Alignment& source, size_t subseq_begin, size_t subseq_end) const { + // restore soft-clips + for (size_t j = 0; j < clipped_surjected.size(); ++j) { + auto& clipped_surj = clipped_surjected[j]; + + clipped_surj.set_sequence(source.sequence()); + clipped_surj.set_quality(source.quality()); + + if (clipped_surj.path().mapping_size() != 0) { + + if 
(subseq_begin != 0) { + // restore a softclip on the front + auto first_mapping = clipped_surj.mutable_path()->mutable_mapping(0); + auto first_edit = first_mapping->mutable_edit(0); + if (first_edit->from_length() == 0) { + first_edit->set_sequence(clipped_surj.sequence().substr(0, subseq_begin) + first_edit->sequence()); + first_edit->set_to_length(first_edit->sequence().size()); + } + else { + auto softclip_edit = first_mapping->add_edit(); + softclip_edit->set_from_length(0); + softclip_edit->set_to_length(subseq_begin); + softclip_edit->set_sequence(clipped_surj.sequence().substr(0, subseq_begin)); + // move it to the front + for (size_t k = first_mapping->edit_size() - 1; k > 0; --k) { + first_mapping->mutable_edit()->SwapElements(k, k - 1); + } + } + } + if (subseq_end != source.sequence().size()) { + // restore a softclip on the back + auto final_mapping = clipped_surj.mutable_path()->mutable_mapping(clipped_surj.path().mapping_size() - 1); + auto final_edit = final_mapping->mutable_edit(final_mapping->edit_size() - 1); + if (final_edit->from_length() == 0) { + final_edit->set_sequence(final_edit->sequence() + + clipped_surj.sequence().substr(subseq_end, clipped_surj.sequence().size() - subseq_end)); + final_edit->set_to_length(final_edit->sequence().size()); + } + else { + auto softclip_edit = final_mapping->add_edit(); + softclip_edit->set_from_length(0); + softclip_edit->set_sequence(clipped_surj.sequence().substr(subseq_end, clipped_surj.sequence().size() - subseq_end)); + softclip_edit->set_to_length(softclip_edit->sequence().size()); + } + } + } + } + } + string Surjector::update_tag_string_for_SA(const string& tags, const string& sa_val) { string return_val; if (tags.empty()) { @@ -5376,7 +5556,7 @@ using namespace std; } else { // add onto SA existing tag - split[sa_idx] += ";" + sa_val; + split[sa_idx] += sa_val; stringstream strm; for (size_t i = 0; i < split.size(); ++i) { if (i) { diff --git a/src/surjector.hpp b/src/surjector.hpp index 552fb504ad..ad329a8396 100644 --- a/src/surjector.hpp +++ b/src/surjector.hpp @@ -61,13 +61,6 @@ using namespace std; vector>& positions_out, bool allow_negative_scores = false, bool preserve_deletions = false) const; - - /// Same as above, but include alignments to all paths instead of only the optimal one - vector multi_surject(const Alignment& source, - const unordered_set& paths, - vector>& positions_out, - bool allow_negative_scores = false, - bool preserve_deletions = false) const; /// Extract the portions of an alignment that are on a chosen set of /// paths and try to align realign the portions that are off of the @@ -88,12 +81,6 @@ using namespace std; bool allow_negative_scores = false, bool preserve_deletions = false) const; - /// Same as above, but include alignments to all paths instead of only the optimal one - vector multi_surject(const Alignment& source, - const unordered_set& paths, - bool allow_negative_scores = false, - bool preserve_deletions = false) const; - /// Same semantics as with alignments except that connections are always /// preserved as splices. 
The output consists of a multipath alignment with /// a single path, separated by splices (either from large deletions or from @@ -104,13 +91,6 @@ using namespace std; bool allow_negative_scores = false, bool preserve_deletions = false) const; - /// Same as above, but include alignments to all paths instead of only the optimal one - vector multi_surject(const multipath_alignment_t& source, - const unordered_set& paths, - vector>& positions_out, - bool allow_negative_scores = false, - bool preserve_deletions = false) const; - /// a local type that represents a read interval matched to a portion of the alignment path using path_chunk_t = pair, path_t>; @@ -172,6 +152,9 @@ using namespace std; /// anchors with the oppsite parity (even if this is odd, or odd if /// this is even) 1bp longer. int64_t pad_suspicious_anchors_to_length = 12; + + /// Return alignments to overlapping paths as secondaries instead of just the best one + bool multimap_to_all_paths = false; // A function for computing band padding std::function choose_band_padding; @@ -203,7 +186,7 @@ using namespace std; void surject_internal(const Alignment* source_aln, const multipath_alignment_t* source_mp_aln, vector* alns_out, vector* mp_alns_out, const unordered_set& paths, - vector>& positions_out, bool all_paths, + vector>& positions_out, bool allow_negative_scores, bool preserve_deletions) const; vector>> @@ -292,7 +275,11 @@ using namespace std; function annotate_supplementary = [](multipath_alignment_t& mp_aln) { mp_aln.set_annotation("supplementary", true); }; choose_primary_internal(surjections, annotate_supplementary); } + + vector> generate_hard_clipped_alignments(const Alignment& source) const; + void restore_hard_clips(vector& clipped_surjected, const Alignment& source, size_t subseq_begin, size_t subseq_end) const; + // helpers to add the SA tag template void add_SA_tag_internal(vector& surjected, const vector>& positions, @@ -323,6 +310,7 @@ using namespace std; add_SA_tag_internal(surjected, positions, graph, spliced, update_sa); } + // calculate the total length of overlaps between alignments, measured in terms of read sequence template int64_t total_overlap(const vector>>& surjections) const; @@ -394,9 +382,11 @@ using namespace std; static multipath_alignment_t make_null_mp_alignment(const string& src_sequence, const string& src_quality); - void annotate_graph_cigar(vector& surjections, const Alignment& source, bool rev_strand) const; + void annotate_graph_cigar(vector& surjections, const Alignment& source, + const vector>& positions) const; - void annotate_graph_cigar(vector& surjections, const multipath_alignment_t& source, bool rev_strand) const; + void annotate_graph_cigar(vector& surjections, const multipath_alignment_t& source, + const vector>& positions) const; template static int32_t get_score(const AlnType& aln); @@ -510,14 +500,19 @@ using namespace std; return; } size_t primary_idx = -1; - for (size_t i = 0; i < surjected.size(); ++i) { + bool has_supplementary = false; + for (size_t i = 0; i < surjected.size() && (!has_supplementary || primary_idx == -1); ++i) { if (!is_supplementary(surjected[i])) { - primary_idx = i; - break; + if (primary_idx == -1) { + primary_idx = i; + } + } + else { + has_supplementary = true; } } - if (primary_idx == -1) { - // there is no primary + if (primary_idx == -1 || !has_supplementary) { + // there is no primary or no supplementaries return; } @@ -532,7 +527,7 @@ using namespace std; const auto& pos = positions[i]; // note: convert to 1-based - strm << get<0>(pos) << 
',' << (get<1>(pos) + 1) << ',' << (get<2>(pos) ? '-' : '+') << ','; + strm << (get<0>(pos).empty() ? string("*") : get<0>(pos)) << ',' << (get<1>(pos) >= 0 ? get<1>(pos) + 1 : (int64_t) 0) << ',' << (get<2>(pos) ? '-' : '+') << ','; auto cigar = get_cigar(surj, pos, graph, spliced); if (cigar.empty()) { strm << '*'; @@ -542,7 +537,7 @@ strm << op.first << op.second; } } - strm << ',' << surj.mapping_quality() << ',' << count_mismatches(surj); + strm << ',' << surj.mapping_quality() << ',' << count_mismatches(surj) << ';'; SA_entry[i] = std::move(strm.str()); } @@ -554,16 +549,11 @@ continue; } // this is a primary or a supplementary that needs the SA tag - bool first = true; stringstream strm; for (size_t j = 0; j < surjected.size(); ++j) { - if ((j == i) || (j != primary_idx && !is_supplementary(surjected[j]))) { + if (j == i || (j != primary_idx && !is_supplementary(surjected[j]))) { continue; } - if (!first) { - strm << ';'; - } - first = false; strm << SA_entry[j]; } diff --git a/src/unittest/alignment.cpp b/src/unittest/alignment.cpp index ac5d0d430f..4e41083b22 100644 --- a/src/unittest/alignment.cpp +++ b/src/unittest/alignment.cpp @@ -5,13 +5,19 @@ #include #include +#include + #include "vg/io/json2pb.h" #include #include "../alignment.hpp" #include "../vg.hpp" #include "../xg.hpp" +#include "../handle.hpp" +#include "../annotation.hpp" #include "catch.hpp" +#include "bdsg/hash_graph.hpp" + namespace vg { namespace unittest { using namespace std; @@ -464,6 +470,286 @@ TEST_CASE("Conversion to GAF removes an unused final node", "[alignment][gaf]") } } +TEST_CASE("Supplementary alignments can be identified and processed", "[alignment][supplementary]") { + + string seq = "CAAGTTCTGCTTTCTGCGAT"; + string qual(seq.size(), 40); + string name = "read"; + string group = "group"; + string samp = "samp"; + + Alignment primary, supp1, supp2, irrelevant; + for (auto aln : {&primary, &supp1, &supp2, &irrelevant}) { + aln->set_sequence(seq); + aln->set_quality(qual); + aln->set_name(name); + aln->set_read_group(group); + aln->set_sample_name(samp); + } + + { + // CAAGTTCTGCTTTCTGCGAT + // ......CTGCTTTC...... + auto path = primary.mutable_path(); + auto m = path->add_mapping(); + m->set_rank(1); + auto p = m->mutable_position(); + p->set_node_id(1); + p->set_offset(4); + p->set_is_reverse(false); + auto e1 = m->add_edit(); + e1->set_from_length(0); + e1->set_to_length(6); + e1->set_sequence("CAAGTT"); + auto e2 = m->add_edit(); + e2->set_from_length(8); + e2->set_to_length(8); + auto e3 = m->add_edit(); + e3->set_from_length(0); + e3->set_to_length(6); + e3->set_sequence("TGCGAT"); + primary.set_score(8); + primary.set_mapping_quality(40); + } + + { + // C-AAGTTCTGCTTTCTGCGAT + // CAAAGGT............. 
+ auto path = supp1.mutable_path(); + auto m = path->add_mapping(); + m->set_rank(1); + auto p = m->mutable_position(); + p->set_node_id(4); + p->set_offset(2); + p->set_is_reverse(true); + auto e1 = m->add_edit(); + e1->set_from_length(1); + e1->set_to_length(1); + auto e2 = m->add_edit(); + e2->set_from_length(1); + e2->set_to_length(0); + auto e3 = m->add_edit(); + e3->set_from_length(3); + e3->set_to_length(3); + auto e4 = m->add_edit(); + e4->set_from_length(1); + e4->set_to_length(1); + e4->set_sequence("T"); + auto e5 = m->add_edit(); + e5->set_from_length(1); + e5->set_to_length(1); + auto e6 = m->add_edit(); + e6->set_from_length(0); + e6->set_to_length(14); + e6->set_sequence("CTGCTTTCTGCGAT"); + supp1.set_score(5); + supp1.set_mapping_quality(15); + } + + { + // CAAGTTCTGCTTTCTGCGAT + // ..............TGC-AT + auto path = supp2.mutable_path(); + auto m1 = path->add_mapping(); + m1->set_rank(1); + auto p1 = m1->mutable_position(); + p1->set_node_id(10); + p1->set_offset(4); + p1->set_is_reverse(false); + auto e0 = m1->add_edit(); + e0->set_from_length(0); + e0->set_to_length(14); + e0->set_sequence("CAAGTTCTGCTTTC"); + auto e1 = m1->add_edit(); + e1->set_from_length(3); + e1->set_to_length(3); + auto m2 = path->add_mapping(); + m2->set_rank(2); + auto p2 = m2->mutable_position(); + p2->set_node_id(11); + p2->set_offset(0); + p2->set_is_reverse(true); + auto e2 = m2->add_edit(); + e2->set_from_length(0); + e2->set_to_length(1); + e2->set_sequence("G"); + auto e3 = m2->add_edit(); + e3->set_from_length(2); + e3->set_to_length(2); + supp2.set_score(5); + supp2.set_mapping_quality(10); + } + + { + // CAAGTTCTGCTTTCTGCGAT + // .......TGCTTT....... + auto path = irrelevant.mutable_path(); + auto m = path->add_mapping(); + m->set_rank(1); + auto p = m->mutable_position(); + p->set_node_id(20); + p->set_offset(0); + p->set_is_reverse(false); + auto e1 = m->add_edit(); + e1->set_from_length(0); + e1->set_to_length(7); + e1->set_sequence("CAAGTTC"); + auto e2 = m->add_edit(); + e2->set_from_length(6); + e2->set_to_length(6); + auto e3 = m->add_edit(); + e3->set_from_length(0); + e3->set_to_length(7); + e3->set_sequence("CTGCGAT"); + irrelevant.set_score(5); + irrelevant.set_mapping_quality(1); + } + + bdsg::HashGraph graph; + auto h1 = graph.create_handle("ATATCTGCTTTC", 1); + auto h4 = graph.create_handle(reverse_complement("GGCAAAGGT"), 4); + auto h10 = graph.create_handle("CCGCTGC", 10); + auto h11 = graph.create_handle(reverse_complement("ATTC"), 11); + auto h20 = graph.create_handle("TGCTTT", 20); + graph.create_edge(h1, graph.flip(h11)); + + SECTION("Supplementaries can be correctly identified among a list of Alignments") { + + vector<Alignment> alns{primary, supp1, supp2, irrelevant}; + + size_t min_size = 5; + size_t separation = 1; + double read_coverage = 0.95; + double score_fraction = 0.5; + + SECTION("Can identify a complete partition of the read") { + + auto supps = identify_supplementaries(alns, read_coverage, separation, score_fraction, min_size); + sort(supps.begin(), supps.end()); + REQUIRE(supps == vector<size_t>{1, 2}); + } + + SECTION("Respects the minimum size constraint") { + min_size = 6; + auto supps = identify_supplementaries(alns, read_coverage, separation, score_fraction, min_size); + REQUIRE(supps.size() == 2); + min_size = 7; + supps = identify_supplementaries(alns, read_coverage, separation, score_fraction, min_size); + REQUIRE(supps.empty()); + } + + SECTION("Respects the minimum score constraint") { + score_fraction = 5.0 / 8.0; + auto supps = identify_supplementaries(alns, 
read_coverage, separation, score_fraction, min_size); + REQUIRE(supps.size() == 2); + score_fraction = 0.7; + supps = identify_supplementaries(alns, read_coverage, separation, score_fraction, min_size); + REQUIRE(supps.empty()); + } + + SECTION("Respects read coverage constraint") { + + read_coverage = 1.0; + auto supps = identify_supplementaries(alns, read_coverage, separation, score_fraction, min_size); + REQUIRE(supps.size() == 2); + + auto m = alns[2].mutable_path()->mutable_mapping(0); + + // move an aligned base into softclip + auto e1 = m->mutable_edit(0); + auto e2 = m->mutable_edit(1); + e1->set_sequence("CAAGTTCTGCTTTCT"); + e1->set_to_length(15); + e2->set_from_length(2); + e2->set_to_length(2); + + read_coverage = 0.95; + supps = identify_supplementaries(alns, read_coverage, separation, score_fraction, min_size); + REQUIRE(supps.size() == 2); + + read_coverage = 0.96; + supps = identify_supplementaries(alns, read_coverage, separation, score_fraction, min_size); + REQUIRE(supps.size() == 0); + } + + SECTION("Respects separation constraint") { + + separation = 0; + auto supps = identify_supplementaries(alns, read_coverage, separation, score_fraction, min_size); + REQUIRE(supps.size() == 2); + + auto m = alns[2].mutable_path()->mutable_mapping(0); + + // move an aligned base into softclip + auto e1 = m->mutable_edit(0); + auto e2 = m->mutable_edit(1); + e1->set_sequence("CAAGTTCTGCTTTCT"); + e1->set_to_length(15); + e2->set_from_length(2); + e2->set_to_length(2); + + supps = identify_supplementaries(alns, read_coverage, separation, score_fraction, min_size); + REQUIRE(supps.size() == 0); + + separation = 1; + supps = identify_supplementaries(alns, read_coverage, separation, score_fraction, min_size); + REQUIRE(supps.size() == 2); + + // add a base of overlap + e1->set_sequence("CAAGTTCTGCTTT"); + e1->set_to_length(13); + e2->set_from_length(4); + e2->set_to_length(4); + m->mutable_position()->set_offset(3); + + separation = 0; + supps = identify_supplementaries(alns, read_coverage, separation, score_fraction, min_size); + REQUIRE(supps.size() == 0); + + separation = 1; + supps = identify_supplementaries(alns, read_coverage, separation, score_fraction, min_size); + REQUIRE(supps.size() == 2); + + // return it to the original state + e1->set_sequence("CAAGTTCTGCTTTC"); + e1->set_to_length(14); + e2->set_from_length(3); + e2->set_to_length(3); + + // split it into two alignments + + alns.emplace_back(alns[2]); + auto& split1 = alns[2]; + auto& split2 = alns.back(); + split1.mutable_path()->mutable_mapping()->RemoveLast(); + auto m2 = split2.mutable_path()->mutable_mapping(0); + *m2 = split2.path().mapping(1); + split2.mutable_path()->mutable_mapping()->RemoveLast(); + + // fix up the softclips + auto e3 = m->add_edit(); + e3->set_from_length(0); + e3->set_to_length(3); + e3->set_sequence("GAT"); + auto e4 = m2->mutable_edit(0); + e4->set_from_length(0); + e4->set_to_length(18); + e4->set_sequence("CAAGTTCTGCTTTCTGCG"); + + min_size = 1; + score_fraction = 0.1; + + separation = 1; + supps = identify_supplementaries(alns, read_coverage, separation, score_fraction, min_size); + REQUIRE(supps.size() == 3); + + separation = 0; + supps = identify_supplementaries(alns, read_coverage, separation, score_fraction, min_size); + REQUIRE(supps.size() == 0); + } + } +} + TEST_CASE("Unaligned sequences survive round-trip to GAF", "[alignment][gaf]") { VG graph; Alignment aln; diff --git a/src/unittest/interval_union.cpp b/src/unittest/interval_union.cpp new file mode 100644 index 
0000000000..15be0d82c2 --- /dev/null +++ b/src/unittest/interval_union.cpp @@ -0,0 +1,114 @@ +/// \file interval_union.cpp +/// +/// unit tests for IntervalUnion +/// + +#include "../interval_union.hpp" +#include "catch.hpp" + +#include "bdsg/hash_graph.hpp" + +namespace vg { +namespace unittest { + + TEST_CASE("IntervalUnion produces correct results on a small example", "[interval]") { + + IntervalUnion iunion; + + REQUIRE(iunion.total_size() == 0); + REQUIRE(iunion.component_size() == 0); + REQUIRE(iunion.overlap(50, 100) == 0); + + // (50, 100) + iunion.add(50, 100); + REQUIRE(iunion.total_size() == 50); + REQUIRE(iunion.component_size() == 1); + REQUIRE(iunion.overlap(0, 40) == 0); + REQUIRE(iunion.overlap(0, 50) == 0); + REQUIRE(iunion.overlap(100, 150) == 0); + REQUIRE(iunion.overlap(110, 150) == 0); + REQUIRE(iunion.overlap(50, 100) == 50); + REQUIRE(iunion.overlap(75, 150) == 25); + REQUIRE(iunion.overlap(25, 75) == 25); + REQUIRE(iunion.overlap(60, 70) == 10); + + // (50, 100) + iunion.add(60, 80); + REQUIRE(iunion.total_size() == 50); + REQUIRE(iunion.component_size() == 1); + REQUIRE(iunion.overlap(0, 40) == 0); + REQUIRE(iunion.overlap(0, 50) == 0); + REQUIRE(iunion.overlap(100, 150) == 0); + REQUIRE(iunion.overlap(110, 150) == 0); + REQUIRE(iunion.overlap(50, 100) == 50); + REQUIRE(iunion.overlap(75, 150) == 25); + REQUIRE(iunion.overlap(25, 75) == 25); + REQUIRE(iunion.overlap(60, 70) == 10); + + // (50, 100), (200, 250) + iunion.add(200, 250); + REQUIRE(iunion.total_size() == 100); + REQUIRE(iunion.component_size() == 2); + REQUIRE(iunion.overlap(50, 250) == 100); + REQUIRE(iunion.overlap(40, 260) == 100); + REQUIRE(iunion.overlap(75, 225) == 50); + REQUIRE(iunion.overlap(40, 225) == 75); + REQUIRE(iunion.overlap(70, 270) == 80); + REQUIRE(iunion.overlap(20, 75) == 25); + REQUIRE(iunion.overlap(75, 120) == 25); + REQUIRE(iunion.overlap(225, 270) == 25); + REQUIRE(iunion.overlap(20, 110) == 50); + REQUIRE(iunion.overlap(190, 260) == 50); + + // (40, 100), (200, 250) + iunion.add(40, 60); + REQUIRE(iunion.total_size() == 110); + REQUIRE(iunion.overlap(30, 260) == 110); + REQUIRE(iunion.overlap(30, 100) == 60); + REQUIRE(iunion.overlap(50, 100) == 50); + REQUIRE(iunion.overlap(60, 110) == 40); + + // (40, 110), (200, 250) + iunion.add(50, 110); + REQUIRE(iunion.total_size() == 120); + REQUIRE(iunion.component_size() == 2); + REQUIRE(iunion.overlap(30, 260) == 120); + REQUIRE(iunion.overlap(30, 100) == 60); + REQUIRE(iunion.overlap(60, 120) == 50); + + // (40, 250) + iunion.add(100, 210); + REQUIRE(iunion.total_size() == 210); + REQUIRE(iunion.component_size() == 1); + REQUIRE(iunion.overlap(30, 260) == 210); + REQUIRE(iunion.overlap(30, 50) == 10); + REQUIRE(iunion.overlap(200, 300) == 50); + REQUIRE(iunion.overlap(80, 130) == 50); + REQUIRE(iunion.overlap(0, 10) == 0); + + // (10, 20), (40, 250) + iunion.add(10, 20); + REQUIRE(iunion.total_size() == 220); + REQUIRE(iunion.component_size() == 2); + REQUIRE(iunion.overlap(0, 30) == 10); + REQUIRE(iunion.overlap(10, 100) == 70); + + // (10, 20), (40, 250), (280, 300) + iunion.add(280, 300); + REQUIRE(iunion.total_size() == 240); + REQUIRE(iunion.component_size() == 3); + REQUIRE(iunion.overlap(15, 295) == 230); + + + // (10, 300) + iunion.add(20, 280); + REQUIRE(iunion.total_size() == 290); + REQUIRE(iunion.component_size() == 1); + REQUIRE(iunion.overlap(0, 30) == 20); + REQUIRE(iunion.overlap(0, 10) == 0); + REQUIRE(iunion.overlap(280, 310) == 20); + REQUIRE(iunion.overlap(300, 310) == 0); + REQUIRE(iunion.overlap(150, 200) == 
50); + } +} +} diff --git a/src/utility.hpp b/src/utility.hpp index 0bb32fa7c2..8026c346c5 100644 --- a/src/utility.hpp +++ b/src/utility.hpp @@ -558,6 +558,45 @@ class VectorViewInverse { } }; +/** + * A vector of pointers that is wrapped so that the vector elements are returned + * as references instead of pointers + */ +template<typename Item> +class IndirectVectorView { +public: + IndirectVectorView() = default; + ~IndirectVectorView() = default; + IndirectVectorView(const IndirectVectorView& other) = default; + IndirectVectorView(IndirectVectorView&& other) = default; + IndirectVectorView& operator=(const IndirectVectorView& other) = default; + IndirectVectorView& operator=(IndirectVectorView&& other) = default; + + void push_back(Item& item) { + items.push_back(&item); + } + + const Item& operator[](size_t index) const { + return *items[index]; + } + + Item& operator[](size_t index) { + return *items[index]; + } + + size_t size() const { + return items.size(); + } + + bool empty() const { + return items.empty(); + } + +private: + + vector<Item*> items; +}; + /// Vector containing positive integer values in [begin, end) vector<size_t> range_vector(size_t begin, size_t end); diff --git a/test/graphs/disconnected.gfa b/test/graphs/disconnected.gfa new file mode 100644 index 0000000000..e3ebfdc1b2 --- /dev/null +++ b/test/graphs/disconnected.gfa @@ -0,0 +1,7 @@ +H VN:Z:1.1 +S 1 TCACGCTGGCAATACGGAGTACGAATTTACGGAGTCGCGTGGTGGCGAGT +S 2 CTTGACGATCCAATGTTTGACACCATTTGCTGCCATCACCCGGTGAGGAGATGGCGGACCTAACTCACTT +S 3 TGAGCGTGATCAGTTCGGGCTTCTCCCGTCCGTCCGAGACTCCACTACCGGTATGATTTAGGGACTTGGCCTTCACAACTGCGGCAGTGATGGAGTATTT +P x 1+ * +P y 2+ * +P z 3+ * diff --git a/test/reads/disconnected.fq b/test/reads/disconnected.fq new file mode 100644 index 0000000000..2fb5e85360 --- /dev/null +++ b/test/reads/disconnected.fq @@ -0,0 +1,4 @@ +@disc +TCACGCTGGCAATACGGAGTACGAATTTACGGAGTCGCGTGGTGGCGAGTCTTGACGATCCAATGTTTGACACCATTTGCTGCCATCACCCGGTGAGGAGATGGCGGACCTAACTCACTT ++ +HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH diff --git a/test/reads/disconnected_pair.fq b/test/reads/disconnected_pair.fq new file mode 100644 index 0000000000..4e55c80ee1 --- /dev/null +++ b/test/reads/disconnected_pair.fq @@ -0,0 +1,8 @@ +@disc/1 +TCACGCTGGCAATACGGAGTACGAATTTACGGAGTCGCGTGGTGGCGAGTTGAGCGTGATCAGTTCGGGCTTCTCCCGTCCGTCCGAGACTCCACTACCG ++ +HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH +@disc/2 +AAGTGAGTTAGGTCCGCCATCTCCTCACCGGGTGATGGCAGCAAATGGTGTCAAACATTGGATCGTCAAGAAATACTCCATCACTGCCGCAGTTGTGAAGGCCAAGTCCCTAAATCATAC ++ +HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH diff --git a/test/t/50_vg_giraffe.t b/test/t/50_vg_giraffe.t index c540ed7605..47747b0887 100644 --- a/test/t/50_vg_giraffe.t +++ b/test/t/50_vg_giraffe.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 71 +plan tests 80 vg construct -a -r small/x.fa -v small/x.vcf.gz >x.vg vg index -x x.xg x.vg @@ -251,6 +251,24 @@ rm -f x.gam xy.bam rm -f xy.giraffe.gbz rm -f xy.vg xy.gbwt xy.xg xy.shortread.zipcodes xy.shortread.withzip.min xy.dist xy.fa xy.fa.fai xy.vcf.gz xy.vcf.gz.tbi +vg autoindex -w giraffe -p x -g graphs/disconnected.gfa +vg giraffe -Z x.giraffe.gbz -d x.dist -m x.shortread.withzip.min -z x.shortread.zipcodes --supplementary -f reads/disconnected.fq > x.gam +is "$(vg view -aj x.gam | jq -c .supplementary | jq length)" "1" "graph alignments can be 
annotated with supplementary alignments in GAM format" +vg giraffe -Z x.giraffe.gbz -d x.dist -m x.shortread.withzip.min -z x.shortread.zipcodes --supplementary -f reads/disconnected.fq -o GAF > x.gaf +is "$(grep -E 'sa:Z:[a-zA-Z0-9=,<>]+;' x.gaf | wc -l | sed 's/^[[:space:]]*//')" "1" "graph alignments can be annotated with supplementary alignments in GAF format" +vg giraffe -Z x.giraffe.gbz -d x.dist -m x.shortread.withzip.min -z x.shortread.zipcodes -o BAM --supplementary -f reads/disconnected.fq > x.bam +is "$(samtools view -f 2048 x.bam | wc -l | sed 's/^[[:space:]]*//')" "1" "BAM output converts supplementary alignment annotation into BAM records" +is "$(samtools view -F 2048 x.bam | wc -l | sed 's/^[[:space:]]*//')" "1" "BAM output has a primary when converting supplementary alignment annotation" +is "$(samtools view x.bam | grep 'SA:Z:' | wc -l | sed 's/^[[:space:]]*//')" "2" "Supplementary and primary BAM records have SA tag" +vg giraffe -Z x.giraffe.gbz -d x.dist -m x.shortread.withzip.min -z x.shortread.zipcodes --supplementary -f reads/disconnected_pair.fq -i --fragment-mean 200 --fragment-stdev 50 > x.gam +is "$(vg view -aj x.gam | jq -c .supplementary | jq length | uniq)" "1" "paired graph alignments can be annotated with supplementary alignments" +vg giraffe -Z x.giraffe.gbz -d x.dist -m x.shortread.withzip.min -z x.shortread.zipcodes -o BAM --supplementary -f reads/disconnected_pair.fq -i --fragment-mean 200 --fragment-stdev 50 > x.bam +is "$(samtools view -f 2048 x.bam | wc -l | sed 's/^[[:space:]]*//')" "2" "paired BAM output converts supplementary alignment annotation into BAM records" +is "$(samtools view -F 2048 x.bam | wc -l | sed 's/^[[:space:]]*//')" "2" "paired BAM output has a primary when converting supplementary alignment annotation" +is "$(samtools view x.bam | grep 'SA:Z:' | wc -l | sed 's/^[[:space:]]*//')" "4" "Supplementary and primary BAM records have SA tag for paired reads" + +rm x.giraffe.gbz x.dist x.shortread.withzip.min x.shortread.zipcodes x.gam x.bam x.gaf + vg autoindex -p brca -w giraffe -g graphs/cactus-BRCA2.gfa vg sim -s 100 -x brca.giraffe.gbz -n 200 -a > reads.gam vg giraffe -Z brca.giraffe.gbz -m brca.shortread.withzip.min -z brca.shortread.zipcodes -d brca.dist -G reads.gam --named-coordinates > mapped.gam
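
The read-interval expansion in generate_hard_clipped_alignments is easier to see in isolation. Below is a minimal standalone sketch, assuming aligned read intervals are (begin, end) pairs sorted by start; the helper name clipped_interval, the two-interval example, and the 20 bp read length are illustrative assumptions, not vg code.

    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <utility>
    #include <vector>

    // Sketch only: pick the clipped subsequence for the i-th aligned interval the same way
    // the diff does -- extend left to the previous interval's end and right to the next
    // interval's start, but never shrink below the interval itself.
    std::pair<size_t, size_t> clipped_interval(const std::vector<std::pair<size_t, size_t>>& intervals,
                                               size_t i, size_t read_length) {
        size_t left_bnd = (i != 0) ? intervals[i - 1].second : 0;
        size_t right_bnd = (i + 1 < intervals.size()) ? intervals[i + 1].first : read_length;
        return {std::min(left_bnd, intervals[i].first), std::max(right_bnd, intervals[i].second)};
    }

    int main() {
        // e.g. a 20 bp read where a supplementary covers [0, 6) and the primary covers [6, 14)
        std::vector<std::pair<size_t, size_t>> intervals{{0, 6}, {6, 14}};
        for (size_t i = 0; i < intervals.size(); ++i) {
            auto sub = clipped_interval(intervals, i, 20);
            std::cout << i << ": " << sub.first << ":" << sub.second << std::endl;
        }
        // prints "0: 0:6" and "1: 6:20" -- the unclaimed tail [14, 20) goes with the primary
        return 0;
    }

Each clipped alignment then gets the subsequence for its expanded interval, and restore_hard_clips later turns the clipped-away ends back into softclips against the full read.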
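
The SA entry format assembled in add_SA_tag_internal, now terminated with ';' per entry, can also be written as a minimal sketch. The helper make_sa_entry and its parameters are illustrative stand-ins for the stream code above, assuming positions are (path name, offset, is_reverse) tuples and that the CIGAR and mismatch count come from get_cigar() and count_mismatches().

    #include <cstdint>
    #include <sstream>
    #include <string>
    #include <tuple>

    // Sketch only: one "rname,pos,strand,CIGAR,mapQ,NM;" entry, mirroring the stream
    // operations in add_SA_tag_internal.
    std::string make_sa_entry(const std::tuple<std::string, int64_t, bool>& pos,
                              const std::string& cigar, int32_t mapq, int64_t num_mismatches) {
        std::stringstream strm;
        strm << (std::get<0>(pos).empty() ? std::string("*") : std::get<0>(pos)) << ','
             << (std::get<1>(pos) >= 0 ? std::get<1>(pos) + 1 : (int64_t) 0) << ','  // convert to 1-based, clamp unknown to 0
             << (std::get<2>(pos) ? '-' : '+') << ','
             << (cigar.empty() ? std::string("*") : cigar) << ','
             << mapq << ',' << num_mismatches << ';';                                // each entry carries its own ';'
        return strm.str();
    }

    // e.g. make_sa_entry({"x", 27, false}, "6S8M6S", 40, 0) returns "x,28,+,6S8M6S,40,0;"

Because every entry ends in ';', the SA:Z: value for a record is just the concatenation of the other alignments' entries, which is why the diff drops the old first/';' bookkeeping and no longer prepends ";" when appending to an existing SA tag in update_tag_string_for_SA.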