Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion deps/libvgio
2 changes: 2 additions & 0 deletions src/alignment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3898,4 +3898,6 @@ Alignment target_alignment(const PathPositionHandleGraph* graph, const path_hand
}
return aln;
}


}
156 changes: 156 additions & 0 deletions src/alignment.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -475,6 +475,162 @@ Alignment target_alignment(const PathPositionHandleGraph* graph, const path_hand
Alignment target_alignment(const PathPositionHandleGraph* graph, const path_handle_t& path, size_t pos1, size_t pos2,
const string& feature, bool is_reverse, Mapping& cigar_mapping);


/// Returns indexes into an vector-like container of Alignments that correspond to supplementary alignments to the primary
/// min_read_coverage Require the supplementaries and primary to cover this fraction of the read
/// max_separation Require the separation or overlap between supplementaries to be at most this many bases
/// min_score_fraction Require each supplementary to have this fraction of the primary's score
/// min_size Require each supplementary to align at least this many bases
template<class AlignmentVector>
vector<size_t> identify_supplementaries(const AlignmentVector& alignments, double min_read_coverage, size_t max_separation,
double min_score_fraction, size_t min_size, size_t primary_idx = 0) {

assert(min_read_coverage >= 0.0 && min_read_coverage <= 1.0 && min_score_fraction >= 0.0 && min_score_fraction <= 1.0);

vector<size_t> supplementaries;

if (alignments.size() > 1) {

size_t seq_size = alignments[0].sequence().size();

size_t min_total_cov = ceil(min_read_coverage * seq_size);
int64_t min_score = ceil(min_score_fraction * alignments[primary_idx].score());

// do sparse DP over part of the read to determine which set of intervals achieves the highest read coverage, subject
// to the constraints
auto do_dp = [&](vector<tuple<size_t, size_t, size_t>>& intervals, size_t begin, size_t end) -> vector<size_t> {

std::sort(intervals.begin(), intervals.end());

// map from (end index -> (total coverage, final interval))
map<size_t, pair<size_t, size_t>> dp;
vector<size_t> backpointer(intervals.size(), numeric_limits<size_t>::max());

for (size_t i = 0; i < intervals.size(); ++i) {

const auto& interval = intervals[i];

// look for the best feasible previous DP entry
// TODO: this could have better worst-case guarantees with a key-value RMQ
auto max_it = dp.end();
for (auto it = dp.lower_bound(get<0>(interval) - max_separation); it != dp.end() && it->first <= get<0>(interval) + max_separation; ++it) {
if (max_it == dp.end() || max_it->second.first - max<int64_t>(max_it->first - get<0>(interval), 0) < it->second.first) {
max_it = it;
}
}

if (max_it != dp.end() || get<0>(interval) <= begin + max_separation) {
// we can make an entry into the DP structure

size_t prev_idx = numeric_limits<size_t>::max();
size_t cov = 0;
if (max_it != dp.end()) {
prev_idx = max_it->second.second;
cov = max_it->second.first - max<int64_t>(max_it->second.first - get<0>(interval), 0);
}
cov += get<1>(interval) - get<0>(interval);
backpointer[i] = prev_idx;

auto dp_val = make_pair(cov, i);
auto insert_it = dp.emplace(get<1>(interval), dp_val);
if (!insert_it.second) {
// there is already an interval ending at this index, take the max
if (insert_it.first->second.first < dp_val.first) {
insert_it.first->second = dp_val;
}
}
}
}

// traceback the optimum
vector<size_t> traceback;
auto final_it = dp.lower_bound(max<int64_t>(begin, end - max_separation));
if (final_it != dp.end()) {
// there is a feasible solution
traceback.emplace_back(final_it->second.second);
while (backpointer[traceback.back()] != numeric_limits<size_t>::max()) {
traceback.emplace_back(backpointer[traceback.back()]);
}
reverse(traceback.begin(), traceback.end());
}

return traceback;
};

auto primary_interval = aligned_interval(alignments[primary_idx]);

if (primary_interval.second - primary_interval.first >= min_size && alignments[primary_idx].score() >= min_score) {

// records of (begin read pos, end read pos, idx of alignment)
vector<tuple<size_t, size_t, size_t>> left_side, right_side;

for (size_t i = 0; i < alignments.size(); ++i) {
if (i == primary_idx) {
continue;
}
auto interval = aligned_interval(alignments[i]);
// filter to alignments that meet the minimum thresholds
if (alignments[i].score() >= min_score && interval.second - interval.first >= min_size) {
if (interval.second <= primary_interval.first + max_separation) {
left_side.emplace_back(interval.first, interval.second, i);
}
else if (interval.first >= primary_interval.second - max_separation) {
right_side.emplace_back(interval.first, interval.second, i);
}
}
}

// do DP on each side and combine the tracebacks
vector<tuple<size_t, size_t, size_t>> full_traceback;
for (auto i : do_dp(left_side, 0, primary_interval.first)) {
full_traceback.push_back(left_side[i]);
}
full_traceback.emplace_back(primary_interval.first, primary_interval.second, primary_idx);
for (auto i : do_dp(right_side, primary_interval.second, seq_size)) {
full_traceback.push_back(right_side[i]);
}

// compute the total read coverage with sweep line algorithm
if (!is_sorted(full_traceback.begin(), full_traceback.end())) {
// TODO: is this even possible? maybe in some weird cases where the max separation is larger than the min size
sort(full_traceback.begin(), full_traceback.end());
}
size_t total_cov = 0;
pair<size_t, size_t> curr_interval(0, 0);
for (const auto& interval : full_traceback) {
if (get<0>(interval) <= curr_interval.second) {
curr_interval.second = max(curr_interval.second, get<1>(interval));
}
else {
total_cov += (curr_interval.second - curr_interval.first);
curr_interval.first = get<0>(interval);
curr_interval.second = get<1>(interval);
}
}
total_cov += (curr_interval.second - curr_interval.first);
if (aligned_interval(alignments[get<2>(full_traceback.front())]).first <= max_separation &&
aligned_interval(alignments[get<2>(full_traceback.back())]).second >= max<int64_t>(seq_size - max_separation, 0) &&
total_cov >= min_total_cov) {
// the supplementaries and primary jointly cover the entire read, the result of DP is feasible

for (auto& interval : full_traceback) {
if (get<2>(interval) != primary_idx) {
supplementaries.push_back(get<2>(interval));
}
}

sort(supplementaries.begin(), supplementaries.end());
}
}
}

return supplementaries;
}





}

#endif
113 changes: 113 additions & 0 deletions src/interval_union.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
/**
* \file interval_union.cpp: contains the implementation of IntervalUnion
*/


#include "interval_union.hpp"

#include <cassert>
#include <iostream>

namespace vg {

using namespace std;

void IntervalUnion::add(size_t begin, size_t end) {

assert(begin <= end);
if (begin == end) {
// empty interval, can be ignore
return;
}

auto after = irvl_union.upper_bound(begin);
auto merge_from = irvl_union.end();
if (after != irvl_union.begin()) {
// get the nearest interval to the left, possibly at the same index
auto before = after;
--before;
if (before->second >= begin) {
// merge this interval into the earlier interval
if (end > before->second) {
union_size += end - before->second;
before->second = end;
merge_from = before;
}
}
else {
// the start of this interval is outside the one to the left
union_size += (end - begin);
merge_from = irvl_union.emplace_hint(after, begin, end);
}
}
else {
// there is no interval to the left
union_size += (end - begin);
merge_from = irvl_union.emplace_hint(after, begin, end);
}

// TODO: it might also be possible to do a lazy-update version of this data structure
// to get amortized O(log n) add

if (merge_from != irvl_union.end()) {
// try to merge this interval with the ones to the right
auto next = merge_from;
++next;
while (next != irvl_union.end() && next->first <= merge_from->second) {
union_size -= (min(merge_from->second, next->second) - next->first);
merge_from->second = max(merge_from->second, next->second);
irvl_union.erase(next);
next = merge_from;
++next;
}
}
}

void IntervalUnion::clear() {
union_size = 0;
irvl_union.clear();
}

size_t IntervalUnion::overlap(size_t begin, size_t end) const {

// start to the left of the query interval
auto iter = irvl_union.upper_bound(begin);
if (iter != irvl_union.begin()) {
--iter;
}

// TODO: there's a worst-case O(log n) solution using a sum-augmented range tree
// to get the contribution from strictly contained intervals, but rebalancing it
// would be a pain

// add up the length of the intervals that this overlaps
size_t total_overlap = 0;
while (iter != irvl_union.end() && iter->first < end) {
if (iter->second > begin && end > iter->first) {
total_overlap += (min(end, iter->second) - max(begin, iter->first));
}
++iter;
}

return total_overlap;
}

size_t IntervalUnion::total_size() const {
return union_size;
}

size_t IntervalUnion::component_size() const {
return irvl_union.size();
}

vector<pair<size_t, size_t>> IntervalUnion::get_union() const {
vector<pair<size_t, size_t>> intervals;
intervals.reserve(irvl_union.size());
for (const auto& interval : irvl_union) {
intervals.push_back(interval);
}
return intervals;
}

}

67 changes: 67 additions & 0 deletions src/interval_union.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/** \file
* interval_union.hpp: defines the union of a collection of intervals that can
* be constructed incrementally
*/
#ifndef VG_INTERVAL_UNION_HPP_INCLUDED
#define VG_INTERVAL_UNION_HPP_INCLUDED

#include <map>
#include <vector>
#include <utility>

namespace vg {

using namespace std;

/**
* The union of a collection of intervals
*/
class IntervalUnion
Comment on lines +16 to +19
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a candidate for living in structures.

{
public:
IntervalUnion() = default;
~IntervalUnion() = default;

// add a new interval to the union
void add(size_t begin, size_t end);
inline void add(const pair<size_t, size_t>& interval);

// return the union to an empy starting state
void clear();

// query an interval's length of overlap with the union
size_t overlap(size_t begin, size_t end) const;
inline size_t overlap(const pair<size_t, size_t>& interval) const;

// get the total size of the intervals in the union
size_t total_size() const;

// return the number of disjoint intervals that comprise the union
size_t component_size() const;

// get the intervals that comprise the union
vector<pair<size_t, size_t>> get_union() const;

private:

map<size_t, size_t> irvl_union;

size_t union_size = 0;

};

/*
* Inline implementations
*/

void IntervalUnion::add(const pair<size_t, size_t>& interval) {
add(interval.first, interval.second);
}

size_t IntervalUnion::overlap(const pair<size_t, size_t>& interval) const {
return overlap(interval.first, interval.second);
}

}

#endif // VG_INTERVAL_UNION_HPP_INCLUDED
Loading
Loading