diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 1f23b782..9d28e777 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -39,7 +39,6 @@ jobs: hash -r conda config --set always_yes yes --set changeps1 no conda config --add channels conda-forge - conda config --add channels defaults conda config --add channels bioconda conda config --get @@ -48,8 +47,11 @@ jobs: conda info -a # Create test environment and install deps - conda create -q -n test-environment python=${{ matrix.python-version }} setuptools pip cython numpy pandas nose samtools pysam scipy pyyaml bioframe + conda create -q -n test-environment -c anaconda python=${{ matrix.python-version }} setuptools pip numpy scipy pandas nose source activate test-environment + conda install -c conda-forge cython pyyaml + conda install -c bioconda samtools + pip install pysam bioframe # workaround: incompatible with python3.10 if installed by conda pip install click python setup.py build_ext -i diff --git a/README.md b/README.md index e5d5a276..f9746f9e 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![Documentation Status](https://readthedocs.org/projects/pairtools/badge/?version=latest)](http://pairtools.readthedocs.org/en/latest/) [![Build Status](https://travis-ci.org/mirnylab/pairtools.svg?branch=master)](https://travis-ci.org/mirnylab/pairtools) -[![Join the chat at https://gitter.im/mirnylab/distiller](https://badges.gitter.im/mirnylab/distiller.svg)](https://gitter.im/mirnylab/distiller?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) +[![Join the chat at Open2C Slack](https://join.slack.com/t/open2c/shared_invite/zt-1buzt2uyt-FLGp8~~MvbTCig5sQZTSNQ)](https://join.slack.com/t/open2c/shared_invite/zt-1buzt2uyt-FLGp8~~MvbTCig5sQZTSNQ) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.1490831.svg)](https://doi.org/10.5281/zenodo.1490831) ## Process Hi-C pairs with pairtools @@ -19,9 +19,12 @@ operations: - generate extensive statistics of Hi-C datasets - select Hi-C pairs given flexibly defined criteria - restore .sam alignments from Hi-C pairs +- annotate restriction digestion sites +- get the mutated positions in Hi-C pairs To get started: -- Take a look at a [quick example](https://github.com/open2c/pairtools#quick-example) +- Visit [pairtools tutorials](https://pairtools.readthedocs.io/en/latest/examples/pairtools_walkthrough.html), +- Take a look at a [quick example](https://github.com/open2c/pairtools#quick-example), - Check out the detailed [documentation](http://pairtools.readthedocs.io). ## Data formats @@ -32,18 +35,23 @@ format defined by the [4D Nucleome Consortium](https://www.4dnucleome.org/). All pairtools properly manage file headers and keep track of the data processing history. -Additionally, `pairtools` define the .pairsam format, an extension of .pairs that includes the SAM alignments +Additionally, `pairtools` define the [.pairsam format](https://pairtools.readthedocs.io/en/latest/formats.html#pairsam), an extension of .pairs that includes the SAM alignments of a sequenced Hi-C molecule. .pairsam complies with the .pairs format, and can be processed by any tool that operates on .pairs files. +`pairtools` produces a set of additional extra columns, which describe properties of alignments, phase, mutations, restriction and complex walks. +The full list of possible extra columns is provided in the [`pairtools` format specification](https://pairtools.readthedocs.io/en/latest/formats.html#extra-columns). + ## Installation Requirements: - Python 3.x -- Python packages `cython`, `numpy` and `click`. +- Python packages `cython`, `pysam`, `bioframe`, `pyyaml`, `numpy`, `scipy`, `pandas` and `click`. - Command-line utilities `sort` (the Unix version), `bgzip` (shipped with `samtools`) and `samtools`. If available, `pairtools` can compress outputs with `pbgzip` and `lz4`. +For the full list of recommended versions, see [requirements in the the GitHub repo](https://github.com/open2c/pairtools/blob/detect_mutations/requirements.txt). + We highly recommend using the `conda` package manager to install `pairtools` together with all its dependencies. To get it, you can either install the full [Anaconda](https://www.continuum.io/downloads) Python distribution or just the standalone [conda](http://conda.pydata.org/miniconda.html) package manager. With `conda`, you can install `pairtools` and all of its dependencies from the [bioconda](https://bioconda.github.io/index.html) channel. @@ -91,21 +99,27 @@ Hi-C data analysis workflow, based on `pairtools` and [nextflow](https://www.nex ## Tools -- `parse`: read .sam files produced by bwa and form Hi-C pairs +- `parse`: read .sam/.bam files produced by bwa and form Hi-C pairs - form Hi-C pairs by reporting the outer-most mapped positions and the strand on the either side of each molecule; - report unmapped/multimapped (ambiguous alignments)/chimeric alignments as chromosome "!", position 0, strand "-"; - - identify and rescue chrimeric alignments produced by singly-ligated Hi-C - molecules with a sequenced ligation junction on one of the sides; - - annotate chimeric alignments by restriction fragments and report true junctions and hops (One-Read-Based Interactions Annotation, ORBITA); - perform upper-triangular flipping of the sides of Hi-C molecules such that the first side has a lower sorting index than the second side; - form hybrid pairsam output, where each line contains all available data for one Hi-C molecule (outer-most mapped positions on the either side, read ID, pair type, and .sam entries for each alignment); + - report .sam tags or mutations of the alignments; - print the .sam header as #-comment lines at the start of the file. +- `parse2`: read .sam/.bam files with long paired-and or single-end reads and form Hi-C pairs from complex walks + - identify and rescue chrimeric alignments produced by singly-ligated Hi-C + molecules with a sequenced ligation junction on one of the sides; + - annotate chimeric alignments by restriction fragments and report true junctions and hops (One-Read-Based Interactions Annotation, ORBITA); + - perform intra-molecule deduplication of paired-end data when one side reads through the DNA on the other side of the read; + - report index of the pair in the complex walk; + - make combinatorial expansion of pairs produced from the same walk; + - `sort`: sort pairs files (the lexicographic order for chromosomes, the numeric order for the positions, the lexicographic order for pair types). @@ -125,7 +139,10 @@ Hi-C data analysis workflow, based on `pairtools` and [nextflow](https://www.nex - `dedup`: remove PCR duplicates from a sorted triu-flipped .pairs file - remove PCR duplicates by finding pairs of entries with both sides mapped to similar genomic locations (+/- N bp); - - optionally output the PCR duplicate entries into a separate file. + - optionally output the PCR duplicate entries into a separate file; + - detect optical duplicates from the original Illumina read ids; + - apply filtering by various properties of pairs (MAPQ; orientation; distance) together with deduplication; + - output yaml or convenient tsv deduplication stats into text file. - NOTE: in order to remove all PCR duplicates, the input must contain \*all\* mapped read pairs from a single experimental replicate; @@ -136,10 +153,20 @@ Hi-C data analysis workflow, based on `pairtools` and [nextflow](https://www.nex - `split`: split a .pairsam file into .pairs and .sam. +- `flip`: flip pairs to get an upper-triangular matrix + +- `header`: manipulate the .pairs/.pairsam header + - generate new header for headerless .pairs file + - transfer header from one .pairs file to another + - set column names for the .pairs file + - validate that the header corresponds to the information stored in .pairs file + - `stats`: calculate various statistics of .pairs files - `restrict`: identify the span of the restriction fragment forming a Hi-C junction +- `phase`: phase pairs mapped to a diploid genome + ## Contributing [Pull requests](https://akrabat.com/the-beginners-guide-to-contributing-to-a-github-project/) are welcome. diff --git a/doc/formats.rst b/doc/formats.rst index 8ca364d4..a778c994 100644 --- a/doc/formats.rst +++ b/doc/formats.rst @@ -148,3 +148,37 @@ Finally, sam1 and sam2 can store multiple .sam alignments, separated by a string .. |check| unicode:: U+2714 .. check .. |cross| unicode:: U+274C .. cross +Extra columns +---------------- + +`pairtools` can operate on `.pairs/.pairsam` with extra columns. +Extra columns are specified in the order defined by the order their addition by various tools. +Column names can be checked in the header of `.pairs/.pairsam` file. +We provide `pairtools header` utilities for manipulating and verifying compatibility of headers and their columns. + +The list of additional columns used throughout `pairtools` modules: + +=================================== =================== ====================== ================================================== ================= +extra column generating module format how to add it description +=================================== =================== ====================== ================================================== ================= +mapq1, mapq2 `parse/parse2` number from 0 to 255 `pairtools parse --add-columns mapq` `Mapping quality `_, as reported in .sam/.bam, $-10 log_{10}(P_{error})$ +pos51, pos52 `parse/parse2` genomic coordinate `pairtools parse --add-columns pos5` 5' position of alignment (closer to read start) +pos31, pos32 `parse/parse2` genomic coordinate `pairtools parse --add-columns pos3` 3' position of alignment (further from read start) +cigar1, cigar2 `parse/parse2` string `pairtools parse --add-columns cigar` `CIGAR, or Compact Idiosyncratic Gapped Alignment Report `_ of alignment, as reported in .sam/.bam +read_len1, read_len2 `parse/parse2` number `pairtools parse --add-columns read_len` read length +matched_bp1, matched_bp2 `parse/parse2` number `pairtools parse --add-columns matched_bp` number of matched alignment basepairs to the reference +algn_ref_span1, algn_ref_span2 `parse/parse2` number `pairtools parse --add-columns algn_ref_span` basepairs of reference covered by alignment +algn_read_span1, algn_read_span2 `parse/parse2` number `pairtools parse --add-columns algn_read_span` basepairs of read covered by alignment +dist_to_51, dist_to_52 `parse/parse2` number `pairtools parse --add-columns dist_to_5` distance to 5'-end of read +dist_to_31, dist_to_32 `parse/parse2` number `pairtools parse --add-columns dist_to_3` distance to 3'-end of read +seq1, seq2 `parse/parse2` string `pairtools parse --add-columns seq` sequence of alignment +mismatches1, mismatches2 `parse/parse2` string `pairtools parse --add-columns mismatches` comma-separated list of mismatches relative to the reference, "{ref_letter}:{mut_letter}:{phred}:{ref_position}:{read_position}" +XB1/2,AS1/2,XS1/2 or any sam tag `parse/parse2` `pairtools parse --add-columns XA,XB,NM` format depends on `tag specification `_ +walk_pair_type `parse/parse2` string `pairtools parse2 --add-pair-index` Type of the pair relative to R1 and R2 reads of paired-end sequencing, see `pasring docs `_ +walk_pair_index `parse/parse2` number `pairtools parse2 --add-pair-index` Order of the pair in the complex walk, starting from 5'-end of left read, see `pasring docs `_ +phase `phase` 0, 1 or "." `pairtools phase` Phase of alignment (haplotype 1, 2, on unphased), see `phasing walkthrough `_ +rfrag1, rfrag2 `restrict` number `pairtools restrict` Unique index of the restriction fragment after annotating pairs positions, see `restriction walkthrough `_ +rfrag_start1, rfrag_start2 `restrict` number `pairtools restrict` Coordinate of the start of restriction fragment +rfrag_end1, rfrag_end2 `restrict` number `pairtools restrict` Coordinate of the end of restriction fragment +=================================== =================== ====================== ================================================== ================= + diff --git a/doc/parsing.rst b/doc/parsing.rst index b38f04a9..d7d63935 100644 --- a/doc/parsing.rst +++ b/doc/parsing.rst @@ -265,7 +265,7 @@ position: Sometimes it is important to restore the sequence of ligation events (e.g., for MC-3C data). For that, you can add special columns ``walk_pair_index`` and ``walk_pair_type`` by setting ``--add-pair-index`` option of ``parse2``, that will keep the order and type of pair in the whole walk in the output .pairs file. -- ``walk_pair_index`` contains information on the order of the pair in the recovered walk, starting from 5'-end of left read +- ``walk_pair_index`` contains information on the order of the pair in the complex walk, starting from 5'-end of left read - ``walk_pair_type`` describes the type of the pair relative to R1 and R2 reads of paired-end sequencing: - "R1-2" - unconfirmed pair, right and left alignments in the pair originate from different reads (left or right). This might be indirect ligation (mediated by other DNA fragments). diff --git a/pairtools/cli/parse.py b/pairtools/cli/parse.py index 567e5e2f..fc34cc01 100644 --- a/pairtools/cli/parse.py +++ b/pairtools/cli/parse.py @@ -1,5 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- + import click import sys @@ -13,20 +14,6 @@ UTIL_NAME = "pairtools_parse" -EXTRA_COLUMNS = [ - "mapq", - "pos5", - "pos3", - "cigar", - "read_len", - "matched_bp", - "algn_ref_span", - "algn_read_span", - "dist_to_5", - "dist_to_3", - "seq", -] - @cli.command() @click.argument("sam_path", type=str, required=False) @@ -96,7 +83,7 @@ help="Report extra columns describing alignments " "Possible values (can take multiple values as a comma-separated " "list): a SAM tag (any pair of uppercase letters) or {}.".format( - ", ".join(EXTRA_COLUMNS) + ", ".join(pairsam_format.EXTRA_COLUMNS) ), ) @click.option( @@ -228,7 +215,7 @@ def parse_py( add_columns = kwargs.get("add_columns", []) add_columns = [col for col in add_columns.split(",") if col] for col in add_columns: - if not ((col in EXTRA_COLUMNS) or (len(col) == 2 and col.isupper())): + if not ((col in pairsam_format.EXTRA_COLUMNS) or (len(col) == 2 and col.isupper())): raise Exception("{} is not a valid extra column".format(col)) columns = pairsam_format.COLUMNS + ( diff --git a/pairtools/cli/parse2.py b/pairtools/cli/parse2.py index 37fb6cdd..725a5bec 100644 --- a/pairtools/cli/parse2.py +++ b/pairtools/cli/parse2.py @@ -13,21 +13,6 @@ UTIL_NAME = "pairtools_parse2" -EXTRA_COLUMNS = [ - "mapq", - "pos5", - "pos3", - "cigar", - "read_len", - "matched_bp", - "algn_ref_span", - "algn_read_span", - "dist_to_5", - "dist_to_3", - "seq", -] - - @cli.command() @click.argument("sam_path", type=str, required=False) # Parsing options: @@ -145,9 +130,11 @@ @click.option( "--flip/--no-flip", is_flag=True, - default=True, - help="If specified, do not flip pairs in genomic order and instead preserve " - "the order in which they were sequenced.", + default=False, + help="If specified, flip pairs in genomic order and instead preserve " + "the order in which they were sequenced. Note that no flip is recommended for analysis of walks because it will " + "override the order of alignments in pairs. Flip is required for appropriate deduplication of sorted pairs. " + "Flip is not required for cooler cload, which runs flipping internally. ", ) @click.option( "--drop-readid/--keep-readid", @@ -192,7 +179,7 @@ help="Report extra columns describing alignments " "Possible values (can take multiple values as a comma-separated " "list): a SAM tag (any pair of uppercase letters) or {}.".format( - ", ".join(EXTRA_COLUMNS) + ", ".join(pairsam_format.EXTRA_COLUMNS) ), ) @click.option( @@ -281,7 +268,7 @@ def parse2_py( add_columns = kwargs.get("add_columns", []) add_columns = [col for col in add_columns.split(",") if col] for col in add_columns: - if not ((col in EXTRA_COLUMNS) or (len(col) == 2 and col.isupper())): + if not ((col in pairsam_format.EXTRA_COLUMNS) or (len(col) == 2 and col.isupper())): raise Exception("{} is not a valid extra column".format(col)) columns = pairsam_format.COLUMNS + ( diff --git a/pairtools/lib/pairsam_format.py b/pairtools/lib/pairsam_format.py index dfbd95ec..96c13f01 100644 --- a/pairtools/lib/pairsam_format.py +++ b/pairtools/lib/pairsam_format.py @@ -62,3 +62,18 @@ UNMAPPED_STRAND = "-" UNANNOTATED_RFRAG = -1 + +EXTRA_COLUMNS = [ + "mapq", + "pos5", + "pos3", + "cigar", + "read_len", + "matched_bp", + "algn_ref_span", + "algn_read_span", + "dist_to_5", + "dist_to_3", + "seq", + "mismatches" # Format: "{ref_letter}:{mut_letter}:{phred}:{ref_position}:{read_position}" +] \ No newline at end of file diff --git a/pairtools/lib/parse.py b/pairtools/lib/parse.py index ce0a1a27..55f5acbb 100644 --- a/pairtools/lib/parse.py +++ b/pairtools/lib/parse.py @@ -35,7 +35,7 @@ """ from . import pairsam_format - +from .parse_pysam import get_mismatches_c def streaming_classify( instream, outstream, chromosomes, out_alignments_stream, out_stat, **kwargs @@ -125,6 +125,7 @@ def streaming_classify( walks_policy=kwargs["walks_policy"], sam_tags=sam_tags, store_seq=store_seq, + report_mismatches=True if 'mismatches' in add_columns else False, ) else: # parse2 parser: pairstream, all_algns1, all_algns2 = parse2_read( @@ -141,6 +142,7 @@ def streaming_classify( store_seq=store_seq, expand=kwargs["expand"], max_expansion_depth=kwargs["max_expansion_depth"], + report_mismatches=True if 'mismatches' in add_columns else False, ) ### Write: @@ -240,18 +242,26 @@ def empty_alignment(): "clip5_ref": 0, "read_len": 0, "type": "N", + "mismatches": "" } def parse_pysam_entry( - sam, min_mapq, sam_tags=None, store_seq=False, report_3_alignment_end=False + sam, + min_mapq, + sam_tags=None, + store_seq=False, + report_3_alignment_end=False, + report_mismatches=False ): """Parse alignments from pysam AlignedSegment entry :param sam: input pysam AlignedSegment entry :param min_mapq: minimal MAPQ to consider as a proper alignment :param sam_tags: list of sam tags to store :param store_seq: if True, the sequence will be parsed and stored in the output - :param report_3_alignment_end: if True, 3'-end of alignment will be reported as position (will be deprecated) + :param report_3_alignment_end: if True, 3'-end of alignment will be + reported as position (will be deprecated) + :param report_mismatches: if True, mismatches will be parsed from MD field :return: parsed aligned entry (dictionary) """ @@ -282,11 +292,23 @@ def parse_pysam_entry( # Note that pysam output is zero-based, thus add +1: pos3 = sam.reference_start + 1 + # Get number of matches: + if not sam.has_tag("MD") or not report_mismatches: + mismatches = "" + else: + seq = sam.query_sequence.upper() + quals = sam.query_qualities + aligned_pairs = sam.get_aligned_pairs(with_seq=True, matches_only=True) + mismatches = get_mismatches_c(seq, quals, aligned_pairs) + mismatches = ",".join([f"{original}:{mutated}:{phred}:{ref}:{read}" for original, mutated, phred, ref, read in mismatches]) + #n_matches = len(aligned_pairs) + else: chrom = pairsam_format.UNMAPPED_CHROM strand = pairsam_format.UNMAPPED_STRAND pos5 = pairsam_format.UNMAPPED_POS pos3 = pairsam_format.UNMAPPED_POS + mismatches = "" else: chrom = pairsam_format.UNMAPPED_CHROM strand = pairsam_format.UNMAPPED_STRAND @@ -295,6 +317,7 @@ def parse_pysam_entry( dist_to_5 = 0 dist_to_3 = 0 + mismatches = "" algn = { "chrom": chrom, @@ -308,6 +331,7 @@ def parse_pysam_entry( "dist_to_5": dist_to_5, "dist_to_3": dist_to_3, "type": ("N" if not is_mapped else ("M" if not is_unique else "U")), + "mismatches": mismatches } algn.update(cigar) @@ -393,6 +417,7 @@ def parse_read( walks_policy, sam_tags, store_seq, + report_mismatches=False, ): """ Parse sam entries corresponding to a single read (or Hi-C molecule) @@ -426,8 +451,8 @@ def parse_read( return iter([(algns1[0], algns2[0], pair_index)]), algns1, algns2 # Generate a sorted, gap-filled list of all alignments - algns1 = [parse_pysam_entry(sam, min_mapq, sam_tags, store_seq) for sam in sams1] - algns2 = [parse_pysam_entry(sam, min_mapq, sam_tags, store_seq) for sam in sams2] + algns1 = [parse_pysam_entry(sam, min_mapq, sam_tags, store_seq, report_mismatches=report_mismatches) for sam in sams1] + algns2 = [parse_pysam_entry(sam, min_mapq, sam_tags, store_seq, report_mismatches=report_mismatches) for sam in sams2] if len(algns1) > 0: algns1 = sorted(algns1, key=lambda algn: algn["dist_to_5"]) @@ -545,6 +570,7 @@ def parse2_read( sam_tags=[], dedup_max_mismatch=3, store_seq=False, + report_mismatches=False, expand=False, max_expansion_depth=None, ): @@ -567,7 +593,7 @@ def parse2_read( if single_end: # Generate a sorted, gap-filled list of all alignments algns1 = [ - parse_pysam_entry(sam, min_mapq, sam_tags, store_seq) + parse_pysam_entry(sam, min_mapq, sam_tags, store_seq, report_mismatches=report_mismatches) for sam in sams2 # note sams2, that's how these reads are typically parsed ] algns1 = sorted(algns1, key=lambda algn: algn["dist_to_5"]) @@ -617,10 +643,10 @@ def parse2_read( # Generate a sorted, gap-filled list of all alignments algns1 = [ - parse_pysam_entry(sam, min_mapq, sam_tags, store_seq) for sam in sams1 + parse_pysam_entry(sam, min_mapq, sam_tags, store_seq, report_mismatches=report_mismatches) for sam in sams1 ] algns2 = [ - parse_pysam_entry(sam, min_mapq, sam_tags, store_seq) for sam in sams2 + parse_pysam_entry(sam, min_mapq, sam_tags, store_seq, report_mismatches=report_mismatches) for sam in sams2 ] # Sort alignments by the distance to the 5'-end: diff --git a/pairtools/lib/parse_pysam.pyx b/pairtools/lib/parse_pysam.pyx index a65d3ba6..efdc6dfe 100644 --- a/pairtools/lib/parse_pysam.pyx +++ b/pairtools/lib/parse_pysam.pyx @@ -91,5 +91,35 @@ cdef class AlignedSegmentPairtoolized(AlignedSegment): "algn_ref_span": algn_ref_span, "algn_read_span": algn_read_span, "read_len": read_len, - "matched_bp": matched_bp, + "matched_bp": matched_bp } + + +from cpython cimport array +import cython +cimport cython + +cpdef list get_mismatches_c(str seq, array.array quals, list aligned_pairs): + ''' + This function takes a SAM alignment and, for every mismatch between the read and reference sequences, + returns a tuple (the reference bp, the read bp, PHRED quality of the bp, reference position, read position). + + Reference: https://github.com/gerlichlab/scshic_pipeline/blob/master/bin/seq_mismatches.pyx + ''' + + cdef cython.int read_pos, ref_pos + cdef str orig_bp, orig_bp_upper + cdef list mismatches = [] + + for read_pos, ref_pos, orig_bp in aligned_pairs: + orig_bp_upper = orig_bp.upper() + if (seq[read_pos] != orig_bp_upper): + mismatches.append( + (orig_bp_upper, + seq[read_pos], + quals[read_pos], + ref_pos, + read_pos) + ) + + return mismatches \ No newline at end of file diff --git a/tests/test_parse2.py b/tests/test_parse2.py index cef2c912..37075ae1 100644 --- a/tests/test_parse2.py +++ b/tests/test_parse2.py @@ -22,6 +22,7 @@ def test_mock_pysam_parse2_read(): "-c", mock_chroms_path, "--add-pair-index", + "--flip", "--report-position", "junction", "--report-orientation", @@ -81,6 +82,7 @@ def test_mock_pysam_parse2_pair(): "-c", mock_chroms_path, "--add-pair-index", + "--flip", "--report-position", "outer", "--report-orientation",