tjs23 · kwohlfahrt · Dec 18, 2017 · Dec 18, 2017 · Dec 18, 2017 · Dec 18, 2017
diff --git a/.gitignore b/.gitignore
@@ -5,3 +5,4 @@ build
 dist
 .cache
 .coverage
+.pytest_cache
diff --git a/.travis.yml b/.travis.yml
@@ -1,4 +1,8 @@
 language: python
+addons:
+  apt:
+    packages:
+      - bowtie2
 python:
   - "2.7"
   - "3.5"
@@ -10,7 +14,7 @@ matrix:
         - pip install pytest-flake8
         - pip install .
       script:
-        - pytest --flake8 -m flake8
+        - flake8 nuc_processing test setup.py
       after_success: []
 install:
   - pip install pytest-cov codecov

diff --git a/nuc_processing/NucContactMap.py b/nuc_processing/NucContactMap.py
@@ -439,7 +439,7 @@ def nuc_contact_map(ncc_path, svg_tag='_contact_map', svg_width=1000, bin_size=5
       ni = _get_num_isolated(contact_list)
       s_a, n_a = chromo_offsets[chr_a]
       s_b, n_b = chromo_offsets[chr_b]
-      
+
       for p_a, p_b, ag in contact_list:
         if chr_a != chr_b:
           if ('.' in chr_a) and ('.' in chr_b) and (chr_a.split('.')[0] == chr_b.split('.')[0]):

diff --git a/nuc_processing/NucProcess.py b/nuc_processing/NucProcess.py
@@ -42,28 +42,30 @@
 from collections import defaultdict
 from shutil import move
 from subprocess import Popen, PIPE, call
+from functools import partial
 from .NucSvg import SvgDocument
 from .NucContactMap import nuc_contact_map
+from .common import open_file_r, strip_ext, merge_file_names
 
 PROG_NAME = 'nuc_process'
 VERSION = '1.1.1'
 DESCRIPTION = 'Chromatin contact paired-read Hi-C processing module for Nuc3D and NucTools'
 RE_CONF_FILE = 'enzymes.conf'
-RE_SITES = {'MboI'   : '^GATC_',
-            'DpnII'  : '^GATC_',
-            'AluI'   : 'AG^CT',
-            'BglII'  : 'A^GATC_T',
-            'HindIII': 'A^AGCT_T'}
+RE_SITES = {'MboI'   : '^GATC_',    # noqa: E203
+            'DpnII'  : '^GATC_',    # noqa: E203
+            'AluI'   : 'AG^CT',     # noqa: E203
+            'BglII'  : 'A^GATC_T',  # noqa: E203
+            'HindIII': 'A^AGCT_T'}  # noqa: E203
 QUAL_SCHEMES = ['phred33', 'phred64', 'solexa']
 DEFAULT_MIN_QUAL = 10
 QUAL_ZERO_ORDS = {'phred33':33, 'phred64':64, 'solexa':64}
 FASTQ_READ_CHUNK = 1048576
 READ_BUFFER = 2**16
+open_file_r = partial(open_file_r, buffering=READ_BUFFER)
 MIN_READ_LEN = 20
 NUM_MAP_FASTAS = 10
 SCORE_TAG = re.compile(r'\sAS:i:(\S+)')
 SCORE_TAG_SEARCH = SCORE_TAG.search
-FILENAME_SPLIT_PATT = re.compile('[_\.]')
 NCC_FORMAT = '%s %d %d %d %d %s %s %d %d %d %d %s %d %d %d\n'
 LOG_FILE_PATH = None
 STAT_FILE_PATH = None
@@ -94,16 +96,6 @@
     RE_SITES[name] = site
 
 
-def open_file_r(file_path):
-
-  if file_path.endswith('.gz'):
-    file_obj = gzip.open(file_path, 'rt')
-  else:
-    file_obj = open(file_path, 'rU', READ_BUFFER)
-
-  return file_obj
-
-
 def compress_file(file_path):
 
   in_file_obj = open(file_path, 'rU', READ_BUFFER)
@@ -211,54 +203,6 @@ def write_sam_file(ncc_file_path, ref_sam_file_1, ref_sam_file_2):
    return sam_file_path
 
 
-def merge_file_names(file_path1, file_path2, sep='_'):
-
-  # same dir, need non truncated name
-
-  dir_name1, file_name1 = os.path.split(file_path1)
-  dir_name2, file_name2 = os.path.split(file_path2)
-
-  if dir_name1 != dir_name2:
-    msg = 'Attempt to merge file names for file from different directories'
-    raise Exception(msg)
-
-  file_root1, file_ext1 = os.path.splitext(file_name1)
-  file_root2, file_ext2 = os.path.splitext(file_name2)
-
-  if file_ext1 != file_ext2:
-    msg = 'Attempt to merge file names with different file extensions'
-    raise Exception(msg)
-
-  parts1 = FILENAME_SPLIT_PATT.split(file_root1)
-  parts2 = FILENAME_SPLIT_PATT.split(file_root2)
-  parts3 = []
-
-  n1 = len(parts1)
-  n2 = len(parts2)
-  n = max(n1, n2)
-
-  for i in range(n):
-
-    if (i < n1) and (i < n2):
-      a = parts1[i]
-      b = parts2[i]
-
-      parts3.append(a)
-      if a != b:
-        parts3.append(b)
-
-    elif i < n1:
-      parts3.append(parts1[i])
-    else:
-      parts3.append(parts2[i])
-
-  file_root3 = sep.join(parts3)
-
-  file_path3 = os.path.join(dir_name1, file_root3 + file_ext1)
-
-  return file_path3
-
-
 def remove_promiscuous(ncc_file, num_copies=1, keep_files=True, zip_files=False,
                        resolve_limit=1e3, close_cis=1e4, ambig=False):
   """
@@ -898,10 +842,10 @@ def count_write(tag, line):
 
   # Remove complete excluded ambiguity groups at the end:
   # - Removes a whole ambiguity group if only one possibilty was suspect as it could be the real contact
-  
-  out_file_objs['accepted'].close()  
+
+  out_file_objs['accepted'].close()
   del out_file_objs['accepted']
-  
+
   out_file_obj = open(filter_file, 'w')
   with open_file_r(out_file_names['accepted']) as file_obj:
     write = out_file_obj.write
@@ -1505,7 +1449,7 @@ def map_reads(fastq_file, genome_index, align_exe, num_cpu, ambig, qual_scheme,
   patt_4 = re.compile('(\d+) \(.+\) aligned >1 times')
 
   cmd_args = [align_exe,
-              '-D', '20', '-R', '3', '-N', '0',  '-L', '20',  '-i', 'S,1,0.50',
+              '-D', '20', '-R', '3', '-N', '0', '-L', '20', '-i', 'S,1,0.50',
               '-x', genome_index,
               '-k', '2',
               '--reorder',
@@ -1855,7 +1799,7 @@ def get_chromo_re_fragments(fasta_file_objs, contig, sequence, re_site, cut_offs
 
   fasta_write = [fo.write for fo in fasta_file_objs]
 
-  step = mappability_length/2
+  step = mappability_length // 2
   site_len = len(re_site)
   offset_start = site_len - cut_offset
   frag_start = offset_start
@@ -2185,9 +2129,9 @@ def index_genome(base_name, file_names, output_dir, indexer_exe='bowtie2-build',
     file_name = uncompress_gz_file(file_name)
     fasta_files.append(file_name)
 
-  fasta_file_str = ','.join(fasta_files)
+  fasta_file_str = ','.join(map(os.path.abspath, fasta_files))
 
-  cmd_args = [indexer_exe, '-f']
+  cmd_args = [indexer_exe, '-f', '-c']
 
   if quiet:
     cmd_args.append('-q')
@@ -2199,6 +2143,11 @@ def index_genome(base_name, file_names, output_dir, indexer_exe='bowtie2-build',
 
   call(cmd_args, cwd=output_dir)
 
+  # Bowtie2 always returns 0, so check if output is produced
+  status, error = check_index_file(os.path.join(output_dir, base_name))
+  if not status:
+    raise RuntimeError("Bowtie2 failed: " + error)
+
 
 def get_ligation_junction(re_site):
   """
@@ -2705,7 +2654,7 @@ def pie_values(data, names):
   hist, edges = frag_sizes
   data_set = [(int(edges[i]), int(val)) for i, val in enumerate(hist)]
   svg_doc.graph(x1, y+chart_height, table_width/2, th-chart_height-40, [data_set], 'Size', 'Count',
-                names=None, colors=None,  graph_type='line',
+                names=None, colors=None, graph_type='line',
                 symbols=None, line_widths=None, symbol_sizes=None,
                 legend=False, title=None, plot_offset=(80, 20))
   y += th
@@ -3030,12 +2979,7 @@ def nuc_process(fastq_paths, genome_index, genome_index2, re1, re2=None, sizes=(
       break
 
   else:
-    file_paths = []
-    for fastq_path in fastq_paths:
-      if fastq_path.lower().endswith('.gz'):
-        fastq_path = fastq_path[:-3]
-
-      file_paths.append(fastq_path)
+    file_paths = list(map(partial(strip_ext, ext=".gz"), fastq_paths))
 
     merged_path = merge_file_names(file_paths[0], file_paths[1])
     file_root = os.path.splitext(merged_path)[0]

diff --git a/nuc_processing/NucSvg.py b/nuc_processing/NucSvg.py
@@ -465,7 +465,7 @@ def color_func(x, p=pos_color, n=neg_color, b=bg_color):
       self.text(y_axis_label, (x3, y), anchor='middle', size=font_size, bold=False, font=font, color=line_color, angle=270, vert_align=None)
 
   def graph(self, x, y, width, height, data_lists, x_label, y_label,
-            names=None, colors=None,  graph_type=LINE_TYPE,
+            names=None, colors=None, graph_type=LINE_TYPE,
             symbols=None, line_widths=None, symbol_sizes=None,
             legend=False, title=None, x_labels=None, plot_offset=(100, 50),
             axis_color='black', bg_color='#F0F0F0', font=None, font_size=16, line_width=1,

diff --git a/nuc_processing/common.py b/nuc_processing/common.py
@@ -0,0 +1,58 @@
+import gzip
+import re
+import os
+try:
+  from itertools import zip_longest
+except ImportError:
+  # Python 2
+  from itertools import izip_longest as zip_longest
+from functools import partial
+
+
+def open_file_r(file_path, buffering=-1):
+  """
+  Open a file for reading as text, transparently handling gzip compression.
+
+  Paths ending in '.gz' are opened with gzip in text mode; the buffering
+  argument is not used for them. Any other path is opened with the built-in
+  open() using the given buffering value.
+
+  Returns the opened file object; the caller is responsible for closing it.
+  """
+  import sys
+
+  if file_path.endswith('.gz'):
+    return gzip.open(file_path, 'rt')
+
+  # The 'U' flag is only needed on Python 2. Python 3 text mode already uses
+  # universal newlines, and Python 3.11+ rejects 'U' with a ValueError.
+  mode = 'rtU' if sys.version_info[0] < 3 else 'rt'
+  return open(file_path, mode, buffering)
+
+
+def strip_ext(name, ext):
+  """
+  Return name with the trailing extension ext removed, if it is present.
+
+  The comparison is case-sensitive and ext must include any leading dot,
+  e.g. strip_ext('reads.fq.gz', '.gz') gives 'reads.fq'. When name does not
+  end with ext, or ext is empty, name is returned unchanged.
+  """
+  # Every string ends with '' and name[:-0] is '', so an empty ext must be
+  # excluded explicitly or the whole name would be discarded.
+  if ext and name.endswith(ext):
+    return name[:-len(ext)]
+  return name
+
+
+def nwise_longest(iterable, n=2, fillvalue=None):
+  """
+  Group consecutive items of iterable into tuples of length n.
+
+  The final tuple is padded with fillvalue when the number of items is not a
+  multiple of n, e.g. nwise_longest('abcde', 2, 'x') yields
+  ('a', 'b'), ('c', 'd'), ('e', 'x'). The result is a lazy iterator.
+  """
+  # The same iterator object is used for every column, so each output tuple
+  # consumes the next n items from the input.
+  source = iter(iterable)
+  columns = (source,) * n
+  return zip_longest(*columns, fillvalue=fillvalue)
+
+
+def merge_file_names(file_path1, file_path2, sep='_'):
+  """
+  Combine two file paths from the same directory into a single path whose
+  file name contains the segments of both input names.
+
+  The file roots (names without the final extension) are split on '_' and '.'
+  and compared segment by segment. A segment shared by both names is written
+  once; differing segments are written one after the other, each followed by
+  the separator that followed it in its own name. The last segment of each
+  root is followed by '.', so 'a_1.fq' and 'a_2.fq' give 'a_1.2.fq'. Names
+  without an extension therefore give a result that ends in '.'.
+
+  The sep argument is accepted but not used; separators are taken from the
+  input names.
+
+  Raises an Exception if the two directories or the two extensions differ.
+  """
+  # Raw string: '\.' in an ordinary string literal is an invalid escape
+  # sequence. The capturing group makes split() keep the separators.
+  split_patt = re.compile(r'([_.])')
+
+  dir_name1, file_name1 = os.path.split(file_path1)
+  dir_name2, file_name2 = os.path.split(file_path2)
+
+  if dir_name1 != dir_name2:
+    msg = 'Attempt to merge file names for file from different directories'
+    raise Exception(msg)
+
+  file_root1, file_ext1 = os.path.splitext(file_name1)
+  file_root2, file_ext2 = os.path.splitext(file_name2)
+
+  if file_ext1 != file_ext2:
+    msg = 'Attempt to merge file names with different file extensions'
+    raise Exception(msg)
+
+  # Pair every segment with the separator that follows it; the final segment
+  # has none, so it is paired with '.' ready for the extension to be appended.
+  paired_names = [nwise_longest(split_patt.split(root), n=2, fillvalue='.')
+                  for root in (file_root1, file_root2)]
+
+  parts = []
+  # The shorter name is padded with empty strings, which add nothing to the
+  # joined result. Segments and fill values are never None.
+  for (a, sep_a), (b, sep_b) in zip_longest(*paired_names, fillvalue=('', '')):
+    parts.extend([a, sep_a])
+    if a != b:
+      parts.extend([b, sep_b])
+
+  return os.path.join(dir_name1, ''.join(parts) + file_ext1[1:])
-Original file line number
+Diff line change
@@ Expand Up / @@ -5,3 +5,4 @@ build @@
     dist
     .cache
     .coverage
+    .pytest_cache