Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ build
dist
.cache
.coverage
.pytest_cache
6 changes: 5 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
language: python
addons:
apt:
packages:
- bowtie2
python:
- "2.7"
- "3.5"
Expand All @@ -10,7 +14,7 @@ matrix:
- pip install pytest-flake8
- pip install .
script:
- pytest --flake8 -m flake8
- flake8 nuc_processing test setup.py
after_success: []
install:
- pip install pytest-cov codecov
Expand Down
2 changes: 1 addition & 1 deletion nuc_processing/NucContactMap.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,7 +439,7 @@ def nuc_contact_map(ncc_path, svg_tag='_contact_map', svg_width=1000, bin_size=5
ni = _get_num_isolated(contact_list)
s_a, n_a = chromo_offsets[chr_a]
s_b, n_b = chromo_offsets[chr_b]

for p_a, p_b, ag in contact_list:
if chr_a != chr_b:
if ('.' in chr_a) and ('.' in chr_b) and (chr_a.split('.')[0] == chr_b.split('.')[0]):
Expand Down
100 changes: 22 additions & 78 deletions nuc_processing/NucProcess.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,28 +42,30 @@
from collections import defaultdict
from shutil import move
from subprocess import Popen, PIPE, call
from functools import partial
from .NucSvg import SvgDocument
from .NucContactMap import nuc_contact_map
from .common import open_file_r, strip_ext, merge_file_names

PROG_NAME = 'nuc_process'
VERSION = '1.1.1'
DESCRIPTION = 'Chromatin contact paired-read Hi-C processing module for Nuc3D and NucTools'
RE_CONF_FILE = 'enzymes.conf'
RE_SITES = {'MboI' : '^GATC_',
'DpnII' : '^GATC_',
'AluI' : 'AG^CT',
'BglII' : 'A^GATC_T',
'HindIII': 'A^AGCT_T'}
RE_SITES = {'MboI' : '^GATC_', # noqa: E203
'DpnII' : '^GATC_', # noqa: E203
'AluI' : 'AG^CT', # noqa: E203
'BglII' : 'A^GATC_T', # noqa: E203
'HindIII': 'A^AGCT_T'} # noqa: E203
QUAL_SCHEMES = ['phred33', 'phred64', 'solexa']
DEFAULT_MIN_QUAL = 10
QUAL_ZERO_ORDS = {'phred33':33, 'phred64':64, 'solexa':64}
FASTQ_READ_CHUNK = 1048576
READ_BUFFER = 2**16
open_file_r = partial(open_file_r, buffering=READ_BUFFER)
MIN_READ_LEN = 20
NUM_MAP_FASTAS = 10
SCORE_TAG = re.compile(r'\sAS:i:(\S+)')
SCORE_TAG_SEARCH = SCORE_TAG.search
FILENAME_SPLIT_PATT = re.compile('[_\.]')
NCC_FORMAT = '%s %d %d %d %d %s %s %d %d %d %d %s %d %d %d\n'
LOG_FILE_PATH = None
STAT_FILE_PATH = None
Expand Down Expand Up @@ -94,16 +96,6 @@
RE_SITES[name] = site


def open_file_r(file_path):
    """
    Open a file for reading as text, transparently handling gzip.

    Paths ending in '.gz' are opened with gzip in text mode; any other path
    is opened as a plain text file using the module-level READ_BUFFER size.

    Returns the opened file object; the caller is responsible for closing it.
    """
    import sys  # local import so this block does not rely on the module header

    if file_path.endswith('.gz'):
        return gzip.open(file_path, 'rt')

    # The 'U' mode flag was removed in Python 3.11 (ValueError). Python 3 text
    # mode already uses universal newlines, so 'U' is only needed on Python 2.
    mode = 'r' if sys.version_info[0] >= 3 else 'rU'

    return open(file_path, mode, READ_BUFFER)


def compress_file(file_path):

in_file_obj = open(file_path, 'rU', READ_BUFFER)
Expand Down Expand Up @@ -211,54 +203,6 @@ def write_sam_file(ncc_file_path, ref_sam_file_1, ref_sam_file_2):
return sam_file_path


def merge_file_names(file_path1, file_path2, sep='_'):
    """
    Build a single file path whose name combines the names of two files.

    Both paths must share the same directory and the same file extension,
    otherwise an Exception is raised. The extension-less names are split with
    FILENAME_SPLIT_PATT and merged position by position: a token common to
    both names appears once, differing tokens appear as first-then-second,
    and any surplus tokens of the longer name are appended. The merged tokens
    are joined with `sep` and the shared extension is re-attached.
    """
    folder_a, base_a = os.path.split(file_path1)
    folder_b, base_b = os.path.split(file_path2)

    # Same directory is required; the full (non truncated) names are merged
    if folder_a != folder_b:
        raise Exception('Attempt to merge file names for file from different directories')

    stem_a, ext_a = os.path.splitext(base_a)
    stem_b, ext_b = os.path.splitext(base_b)

    if ext_a != ext_b:
        raise Exception('Attempt to merge file names with different file extensions')

    tokens_a = FILENAME_SPLIT_PATT.split(stem_a)
    tokens_b = FILENAME_SPLIT_PATT.split(stem_b)
    n_common = min(len(tokens_a), len(tokens_b))

    merged = []
    for tok_a, tok_b in zip(tokens_a, tokens_b):
        merged.append(tok_a)
        if tok_a != tok_b:
            merged.append(tok_b)

    # At most one of these tails is non-empty (the longer name's surplus)
    merged.extend(tokens_a[n_common:])
    merged.extend(tokens_b[n_common:])

    return os.path.join(folder_a, sep.join(merged) + ext_a)


def remove_promiscuous(ncc_file, num_copies=1, keep_files=True, zip_files=False,
resolve_limit=1e3, close_cis=1e4, ambig=False):
"""
Expand Down Expand Up @@ -898,10 +842,10 @@ def count_write(tag, line):

# Remove complete excluded ambiguity groups at the end:
# - Removes a whole ambiguity group if only one possibility was suspect as it could be the real contact
out_file_objs['accepted'].close()

out_file_objs['accepted'].close()
del out_file_objs['accepted']

out_file_obj = open(filter_file, 'w')
with open_file_r(out_file_names['accepted']) as file_obj:
write = out_file_obj.write
Expand Down Expand Up @@ -1505,7 +1449,7 @@ def map_reads(fastq_file, genome_index, align_exe, num_cpu, ambig, qual_scheme,
patt_4 = re.compile('(\d+) \(.+\) aligned >1 times')

cmd_args = [align_exe,
'-D', '20', '-R', '3', '-N', '0', '-L', '20', '-i', 'S,1,0.50',
'-D', '20', '-R', '3', '-N', '0', '-L', '20', '-i', 'S,1,0.50',
'-x', genome_index,
'-k', '2',
'--reorder',
Expand Down Expand Up @@ -1855,7 +1799,7 @@ def get_chromo_re_fragments(fasta_file_objs, contig, sequence, re_site, cut_offs

fasta_write = [fo.write for fo in fasta_file_objs]

step = mappability_length/2
step = mappability_length // 2
site_len = len(re_site)
offset_start = site_len - cut_offset
frag_start = offset_start
Expand Down Expand Up @@ -2185,9 +2129,9 @@ def index_genome(base_name, file_names, output_dir, indexer_exe='bowtie2-build',
file_name = uncompress_gz_file(file_name)
fasta_files.append(file_name)

fasta_file_str = ','.join(fasta_files)
fasta_file_str = ','.join(map(os.path.abspath, fasta_files))

cmd_args = [indexer_exe, '-f']
cmd_args = [indexer_exe, '-f', '-c']

if quiet:
cmd_args.append('-q')
Expand All @@ -2199,6 +2143,11 @@ def index_genome(base_name, file_names, output_dir, indexer_exe='bowtie2-build',

call(cmd_args, cwd=output_dir)

# Bowtie2 always returns 0, so check if output is produced
status, error = check_index_file(os.path.join(output_dir, base_name))
if not status:
raise RuntimeError("Bowtie2 failed: " + error)


def get_ligation_junction(re_site):
"""
Expand Down Expand Up @@ -2705,7 +2654,7 @@ def pie_values(data, names):
hist, edges = frag_sizes
data_set = [(int(edges[i]), int(val)) for i, val in enumerate(hist)]
svg_doc.graph(x1, y+chart_height, table_width/2, th-chart_height-40, [data_set], 'Size', 'Count',
names=None, colors=None, graph_type='line',
names=None, colors=None, graph_type='line',
symbols=None, line_widths=None, symbol_sizes=None,
legend=False, title=None, plot_offset=(80, 20))
y += th
Expand Down Expand Up @@ -3030,12 +2979,7 @@ def nuc_process(fastq_paths, genome_index, genome_index2, re1, re2=None, sizes=(
break

else:
file_paths = []
for fastq_path in fastq_paths:
if fastq_path.lower().endswith('.gz'):
fastq_path = fastq_path[:-3]

file_paths.append(fastq_path)
file_paths = list(map(partial(strip_ext, ext=".gz"), fastq_paths))

merged_path = merge_file_names(file_paths[0], file_paths[1])
file_root = os.path.splitext(merged_path)[0]
Expand Down
2 changes: 1 addition & 1 deletion nuc_processing/NucSvg.py
Original file line number Diff line number Diff line change
Expand Up @@ -465,7 +465,7 @@ def color_func(x, p=pos_color, n=neg_color, b=bg_color):
self.text(y_axis_label, (x3, y), anchor='middle', size=font_size, bold=False, font=font, color=line_color, angle=270, vert_align=None)

def graph(self, x, y, width, height, data_lists, x_label, y_label,
names=None, colors=None, graph_type=LINE_TYPE,
names=None, colors=None, graph_type=LINE_TYPE,
symbols=None, line_widths=None, symbol_sizes=None,
legend=False, title=None, x_labels=None, plot_offset=(100, 50),
axis_color='black', bg_color='#F0F0F0', font=None, font_size=16, line_width=1,
Expand Down
58 changes: 58 additions & 0 deletions nuc_processing/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import gzip
import re
import os
try:
from itertools import zip_longest
except ImportError:
# Python 2
from itertools import izip_longest as zip_longest
from functools import partial


def open_file_r(file_path, buffering=-1):
    """
    Open a file for reading as text, transparently handling gzip.

    file_path: path of the file to read; a name ending in '.gz' is opened
               through gzip in text mode.
    buffering: buffering argument passed to the built-in open() for plain
               files (ignored for gzip files); -1 selects the default.

    Returns the opened file object; the caller is responsible for closing it.
    """
    import sys  # local import: the module header does not import sys

    if file_path.endswith('.gz'):
        return gzip.open(file_path, 'rt')

    # The 'U' mode flag was removed in Python 3.11 (ValueError). Python 3 text
    # mode already uses universal newlines, so 'U' is only needed on Python 2.
    mode = 'rt' if sys.version_info[0] >= 3 else 'rtU'

    return open(file_path, mode, buffering)


def strip_ext(name, ext):
    """
    Return `name` with a trailing `ext` removed, if present.

    name: the file name or path to process.
    ext:  the suffix to strip, including any leading dot (e.g. '.gz').

    The match is case-sensitive. `name` is returned unchanged when it does not
    end with `ext` or when `ext` is empty.
    """
    # An empty ext always "matches" and name[:-0] would yield '', so guard it.
    if ext and name.endswith(ext):
        return name[:-len(ext)]
    return name


def nwise_longest(iterable, n=2, fillvalue=None):
    """
    Group consecutive items of `iterable` into tuples of length `n`.

    The final tuple is padded with `fillvalue` when the number of items is not
    a multiple of `n`. Returns the iterator produced by zip_longest.
    """
    # The same iterator is passed n times, so each output tuple consumes
    # n successive items from it.
    shared = iter(iterable)
    columns = (shared for _ in range(n))
    return zip_longest(*columns, fillvalue=fillvalue)


def merge_file_names(file_path1, file_path2, sep='_'):
    """
    Build a single file path whose name combines the names of two files.

    file_path1, file_path2: paths that must share the same directory and the
                            same file extension.
    sep: accepted for backward compatibility but not used; each name segment
         keeps the separator ('_' or '.') that followed it in its own name.

    The extension-less names are split on '_' and '.' and walked segment by
    segment: a segment common to both names is emitted once, differing
    segments are emitted as first-then-second. The last segment of each name
    is followed by '.', which also serves as the dot before the extension.
    For example 'sample_r_1.fq' and 'sample_r_2.fq' give 'sample_r_1.2.fq'.

    Raises Exception if the directories or the extensions differ.
    """
    dir_name1, file_name1 = os.path.split(file_path1)
    dir_name2, file_name2 = os.path.split(file_path2)

    if dir_name1 != dir_name2:
        msg = 'Attempt to merge file names for file from different directories'
        raise Exception(msg)

    file_root1, file_ext1 = os.path.splitext(file_name1)
    file_root2, file_ext2 = os.path.splitext(file_name2)

    if file_ext1 != file_ext2:
        msg = 'Attempt to merge file names with different file extensions'
        raise Exception(msg)

    # Raw string: '\.' in a plain literal is an invalid escape sequence.
    # The capturing group makes split() keep the separators in its output.
    split_patt = re.compile(r'([_\.])')

    def segment_pairs(root):
        # split() yields segment, separator, segment, ... ending on a segment;
        # pair each segment with its following separator, '.' for the last.
        tokens = split_patt.split(root)
        return zip_longest(tokens[0::2], tokens[1::2], fillvalue='.')

    parts = []
    pairs = zip_longest(segment_pairs(file_root1), segment_pairs(file_root2),
                        fillvalue=('', ''))

    for (a, sep_a), (b, sep_b) in pairs:
        parts.extend([a, sep_a])
        if a != b:
            parts.extend([b, sep_b])

    # parts already ends with the '.' that precedes the extension, so the
    # extension's own leading dot is dropped.
    return os.path.join(dir_name1, ''.join(parts) + file_ext1[1:])
Loading