diff --git a/.gitignore b/.gitignore
index cbfb72c..967c89c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -33,3 +33,6 @@ code/*/scratch/*
code/setup/*
.coverage
*.swp
+*egg-info
+tags
+*.log
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..773d6cb
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,25 @@
+language: python
+python:
+  - "3.6"
+before_install:
+  cd code
+install:
+  - sudo apt-get update
+  - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh;
+  - bash miniconda.sh -b -p $HOME/miniconda
+  - export PATH="$HOME/miniconda/bin:$PATH"
+  - hash -r
+  - conda config --set always_yes yes --set changeps1 no
+  - conda update -q conda
+  # Useful for debugging any issues with conda
+  - conda info -a
+
+  - conda env create -q -n test-environment --file environment.yml
+  - source activate test-environment
+  - pip install codecov
+
+script:
+  pytest --cov --cov-config .coveragerc --flake8
+
+after_success:
+  - codecov
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..70ca603
--- /dev/null
+++ b/README.md
@@ -0,0 +1,199 @@
+[![Build Status](https://travis-ci.com/troycomi/introgression.svg?branch=master)](https://travis-ci.com/troycomi/introgression)
+[![codecov](https://codecov.io/gh/troycomi/introgression/branch/master/graph/badge.svg)](https://codecov.io/gh/troycomi/introgression)
+
+# introgression
+> Discovering yeast admixture through sequencing
+
+## Background
+TBA
+
+## Installation
+All required packages are specified in the conda environment located in
+`code/environment.yml`. The introgression environment can be generated with
+```bash
+conda env create -f environment.yml
+```
+To access the command line bindings of the main analyze class,
+install the package in editable mode with pip:
+```bash
+conda activate introgression
+pip install --editable .
+```
+while in the `code` directory.
+
+## Usage
+
+### Configuration
+A set of initial parameters is provided in `code/config.yaml`; these need to
+be set specifically for your system and dataset.
+
+Strings of the form \_\_KEY\_\_
+are substituted during execution and are used as a shortcut. For example,
+with `output_root` set to `/data/results`, the value `__OUTPUT_ROOT__/genes/`
+becomes `/data/results/genes/`.
+
+Strings of the form `{state}` are used as wildcards within the code. Their
+location and surrounding characters can change, but the wildcard must stay the
+same. For example, `blocks_{state}.txt` can be changed to
+`{state}_with-block.txt` but not `blocks_{st}.txt`.
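+
+As an illustration, the two mechanisms combine as follows. This is a
+hypothetical config excerpt; only `output_root` and the substitution
+behavior described above come from this documentation, and the other
+key names are made up:
+```yaml
+output_root: /data/results
+# __OUTPUT_ROOT__ is replaced during execution:
+genes: __OUTPUT_ROOT__/genes/               # becomes /data/results/genes/
+# {state} is kept as a wildcard and filled in by the code for each state:
+blocks: __OUTPUT_ROOT__/blocks_{state}.txt
+```
+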
+### Command Line
+With the package installed and the conda environment activated, main methods
+are accessed with the `introgression` command. Some documentation is provided
+by adding the argument `--help` to `introgression` or any of its subcommands.
+
+#### introgression
+Options include:
+- --config: specify one or more configuration files. Files are evaluated in
+order. Conflicting values are overwritten by the most recently listed file.
+This allows a base configuration for the system, with analysis-specific
+configurations added as needed.
+- verbosity: set by varying the number of v's attached to the option, with
+`-v` indicating a log level of critical and `-vvvvv` indicating debug logging.
+- --log-file: Optional location to store log information. Default is stdout.
+If set and run on an interactive shell, some commands will display progress
+bars.
+
+Most subcommand options will overwrite corresponding values in the config
+file. Leaving an option unset without supplying a value in the config file
+will raise an error. Some values can only be set through the config file,
+including the list of chromosomes and the known states.
+
+Available subcommands are:
+##### predict
+The predict subcommand uses an HMM to predict regions of introgression from
+alignment files. Several outputs are used in subsequent steps, which refine
+the predicted introgressed regions.
+
+Test strains on which to predict introgression can be supplied in the config
+file under the name `strains` or pulled from the directory structure of
+`test_strains`.
+
+Available options are:
+- --alignment: input alignment file location with wildcards for
+{prefix} (optional), {strain} and {chrom}.
+- --prefix: An optional wildcard value for alignment files. If left blank,
+will default to the known states joined with an underscore. Leaving the
+{prefix} wildcard out of the alignment file will prevent its use as well.
+- --blocks: An output file containing the regions predicted to belong to the
+given state. Must contain the {state} wildcard, which will be populated with a
+known state during analysis. Columns are the strain, chromosome, the
+predicted state, start position, end position, and the number of sites
+supporting the assignment.
+- --test-strains: If strains are not provided in the config, this file with
+{strain} and {chrom} wildcards will be used to populate the strains for
+prediction.
+- --hmm-initial: Output file with the initial parameters of the HMM for each
+strain.
+- --hmm-trained: Output file with HMM parameters following Baum-Welch training.
+- --positions: Output file with indices of non-gapped, sequenced sites
+which differ between reference alignments.
+- --probabilities: Output file with the probability of each position belonging
+to the master reference strain.
+- --threshold: The threshold value to apply when filtering the predicted HMM
+path through the test sequence. Either a float, indicating a cutoff
+probability, or 'viterbi' to indicate the Viterbi algorithm should be used to
+find the most likely sequence of states.
+- --only-poly-sites/--all-sites: A switch to indicate if all non-gapped,
+sequenced sites should be considered during HMM training, or only polymorphic
+sites. Default is only polymorphic sites.
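+
+A sample predict invocation combining the global options with the predict
+options above (a sketch only; the file paths are hypothetical, and wildcards
+are quoted to keep the braces literal):
+```bash
+introgression --config config.yaml -vvv \
+    predict \
+    --alignment '{prefix}_{strain}_chr{chrom}.maf' \
+    --blocks 'blocks_{state}.txt' \
+    --hmm-initial hmm_init.txt \
+    --hmm-trained hmm_trained.txt \
+    --positions positions.txt.gz \
+    --probabilities probabilities.txt.gz \
+    --threshold viterbi
+```
+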
+##### id-regions
+id-regions prepends a column to block files with a unique region id of the
+form 'r#'. Regions are sorted by their start position. Changing the states
+to label will affect the region numbers, as a different set of regions will
+be considered.
+
+Available options are:
+- --blocks: The input file to label, with the wildcard {state}. This is the
+file produced by predict in the previous step.
+- --labeled: The output file, also containing the {state} wildcard.
+- --state: May be specified multiple times to indicate which states to add
+labels to. Leaving unset will use the states in the config file (recommended).
+
+##### summarize-regions
+Analyzes the regions predicted to be introgressed. Several columns are added
+to the block file containing information about each region, including the
+number of sites matching each state.
+
+Available options are:
+- --state: May be set multiple times for each state to summarize. Leaving
+unset will default to all states in the config file.
+- --labeled: The labeled block file with {state} wildcard created in the
+previous step.
+- --masks: Sequence mask files with {strain} and {chrom} wildcards.
+- --alignment: The input alignment file, as in the predict option.
+- --positions: The position file created during predict.
+- --quality: The output file with a {state} wildcard.
+- --region: The alignment for each region in the labeled file, with {state}
+wildcard. Each state file contains all regions for that state.
+- --region-index: A pickled python dictionary used for random access into the
+region file. Must have the {state} wildcard.
+
+##### filter-regions
+From the quality files produced in `summarize-regions`, filter regions based
+on several criteria, removing those with weak support for the alternative
+hypothesis and those which can be assigned to multiple alternative states.
+
+Regions passing the 'introgressed filter' satisfy all of the following:
+- fraction of gapped or masked sites in reference <= 0.5
+- fraction of gapped or masked sites in predicted state <= 0.5
+- number of matches to predicted >= 7
+- number of matches to predicted > number of matches to reference
+- sequence identity with predicted state is higher than with reference
+- sequence identity with reference is >= 0.7
+
+Regions passing the 'ambiguous filter' match only the predicted state. No
+other state has both:
+- sequence identity >= sequence identity with predicted state * threshold
+- matching bases >= matching bases with predicted state * threshold
+
+Available options are:
+- --region: The region file from summarize-regions.
+- --region-index: The region index file from summarize-regions.
+- --quality: The quality file produced by summarize-regions, with {state}
+wildcard.
+- --introgress-filter: The output file with only regions passing the
+introgression filter. Must contain {state} wildcard.
+- --introgress-inter: An output file with all regions. Includes the reason
+a region was removed by the introgression filter, or blank if it passed.
+Must contain {state} wildcard.
+- --ambiguous-filter: Output file containing only regions which pass the
+ambiguous filter after passing the introgression filter.
+Must contain {state} wildcard.
+- --ambiguous-inter: Contains all regions from the introgression filter with a
+column for the reason the region failed ambiguous filtering. Must contain
+{state} wildcard.
+- --threshold: The threshold to apply to the ambiguous filter.
+- --filter-sweep: If set and threshold values are supplied as arguments,
+will output summary information for applying the ambiguous filter with various
+threshold values.
+
+`filter-regions` accepts multiple threshold values as arguments to test
+and output to the `filter-sweep` file. Sample usage would be
+```bash
+introgression --config config.yml \
+    filter-regions \
+    --threshold 0.995 \
+    --filter-sweep sweep.txt \
+    0.99 0.98 0.8  # these are the sweep arguments
+```
+where 0.99, 0.98 and 0.8 are used as test threshold values and summarized in
+sweep.txt. Note that the ambiguous filter will only use the threshold 0.995
+in this example.
+
+##### summarize-strains
+summarize-strains produces summary information for each test strain, including
+the number of regions and bases assigned to each hidden state, filtered at
+each stage, and ambiguous between states.
+
+Available options are:
+- --introgress-inter: The introgressed filter file as used in `filter-regions`.
+- --ambiguous-inter: The ambiguous filter file as used in `filter-regions`.
+- --strain-info: Tab-separated table with information on the strain to include
+with the summary output. Columns should be the strain name, alternate name,
+location, environment, and population.
+- --state-counts: The summary output file.
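+
+The subcommands are designed to run in sequence, with each step consuming
+output from the previous one. A minimal end-to-end sketch, assuming every
+input and output location is supplied through the config file:
+```bash
+introgression --config config.yaml -vvv predict
+introgression --config config.yaml id-regions
+introgression --config config.yaml summarize-regions
+introgression --config config.yaml filter-regions --threshold 0.995
+introgression --config config.yaml summarize-strains
+```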
+ +## License +TBD diff --git a/code/align/aggregate_alignment_stats.py b/code/align/aggregate_alignment_stats.py index 290179a..a3e72bd 100644 --- a/code/align/aggregate_alignment_stats.py +++ b/code/align/aggregate_alignment_stats.py @@ -1,14 +1,17 @@ import os -import sys -sys.path.insert(0, '..') import global_params as gp gp_dir = '../' -stats_files = [gp_dir + gp.alignments_dir + x for x in filter(\ - lambda x: 'stats' in x and 'summary' not in x, os.listdir(gp_dir + gp.alignments_dir))] +stats_files = [gp_dir + gp.alignments_dir + x for x in filter( + lambda x: 'stats' in x and 'summary' not in x, + os.listdir(gp_dir + gp.alignments_dir))] # goal is to generate file for R (e.g. for two references and test strain): -# chromosome strain frac_S288c_S288c frac_S288c_CBS432 frac_S288c_x frac_CBS432_S288c frac_CBS432_CBS432 frac_CBS432_x frac_x_S288c frac_x_CBS432 frac_x_x aligned_length_S288c aligned_length_CBS432 aligned_length_x num_align_columns_0 num_align_columns_1 num_align_columns_2 num_align_columns_3 +# chromosome strain frac_S288c_S288c frac_S288c_CBS432 frac_S288c_x +# frac_CBS432_S288c frac_CBS432_CBS432 frac_CBS432_x frac_x_S288c +# frac_x_CBS432 frac_x_x aligned_length_S288c aligned_length_CBS432 +# aligned_length_x num_align_columns_0 num_align_columns_1 +# num_align_columns_2 num_align_columns_3 f = open(gp_dir + gp.alignments_dir + 'mafft_stats_summary.txt', 'w') @@ -25,15 +28,15 @@ for i in range(0, len(gp.alignment_ref_order) + 2): f.write('\t' + 'num_align_columns_' + str(i)) - + f.write('\n') all_strains = gp.alignment_ref_order + ['x'] # one line for each of these files for fn in stats_files: - print fn - + print(fn) + lines = [line.strip() for line in open(fn, 'r').readlines()] # histogram of number of number of strains aligned @@ -43,10 +46,10 @@ c.append(float(lines[i + offset].split(',')[1])) # aligned lengths - l = [] + lengths = [] offset += len(all_strains) + 1 + 2 for i in range(len(all_strains)): - l.append(float(lines[i + offset].split(',')[1])) + lengths.append(float(lines[i + offset].split(',')[1])) sx = lines[offset + len(all_strains) - 1].split(',')[0] @@ -67,7 +70,7 @@ for j in range(len(all_strains)): f.write('\t' + str(fr[i][j])) for i in range(len(all_strains)): - f.write('\t' + str(l[i])) + f.write('\t' + str(lengths[i])) for i in range(len(all_strains) + 1): f.write('\t' + str(c[i])) f.write('\n') diff --git a/code/align/align_helpers.py b/code/align/align_helpers.py index be216de..a1f44ee 100644 --- a/code/align/align_helpers.py +++ b/code/align/align_helpers.py @@ -1,12 +1,24 @@ import os import global_params as gp +from typing import List, Tuple -def flatten(l): +def flatten(l: List[List]) -> List: + ''' + Flatten list of lists into a single list + ''' return [item for sublist in l for item in sublist] -def get_strains(dirs): +def get_strains(dirs: List[str]) -> List[Tuple[str, str]]: + ''' + Find all strains in the provided list of directories + Returns a sorted list of tuples with (strain_name, directory) entries + Checks for files with the fasta_suffix and contain _chr + strain_name is the name of the file up to _chr. + Raises assertion error if the number of files found is < number of strains + * the number of chromosomes + ''' # get all non-reference strains of cerevisiae and paradoxus; could # generalize this someday... 
@@ -15,10 +27,11 @@ def get_strains(dirs): for d in dirs: fns = os.listdir(d) # only look at fasta files in the directory - fns = filter(lambda x: x.endswith(gp.fasta_suffix), fns) # only look at files containing '_chr' which should be chromosome # sequence files - fns = list(filter(lambda x: '_chr' in x, fns)) + fns = list( + filter(lambda x: x.endswith(gp.fasta_suffix) and '_chr' in x, + fns)) num_files = len(fns) if num_files == 0: print(f'found no chromosome sequence files in {d} ' @@ -33,7 +46,13 @@ def get_strains(dirs): return sorted(s) -def concatenate_fasta(input_files, names, output_file): +def concatenate_fasta(input_files: List[str], + names: List[str], + output_file: str) -> None: + ''' + Combines several fasta files together into a single output + Adds header between each input fasta as > name[i] filename + ''' with open(output_file, 'w') as output: for i, file in enumerate(input_files): with open(file, 'r') as input: diff --git a/code/align/alignment_stats.py b/code/align/alignment_stats.py index 9a06939..58f5ac3 100644 --- a/code/align/alignment_stats.py +++ b/code/align/alignment_stats.py @@ -1,8 +1,6 @@ -import sys -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../misc') -import read_fasta +from misc import read_fasta + # count sites where n, ..., 3, 2, 1 genomes aligned, etc. def num_strains_aligned_by_site(seqs): @@ -17,7 +15,8 @@ def num_strains_aligned_by_site(seqs): return num_strains_hist -# fraction of each strain's sequence contained in alignment + +# fraction of each strain's sequence contained in alignment # (should be 1) def fraction_strains_aligned(headers, seqs): nseqs = len(seqs) @@ -34,6 +33,7 @@ def fraction_strains_aligned(headers, seqs): return fracs_aligned, seq_lengths + # using each genome as reference, percentage of other genomes aligned def frac_aligned_to_reference(seqs, seq_lengths): nseqs = len(seqs) @@ -47,7 +47,8 @@ def frac_aligned_to_reference(seqs, seq_lengths): else: total = 0 for i in range(nsites): - if seqs[ref][i] != gp.gap_symbol and seqs[other][i] != gp.gap_symbol: + if seqs[ref][i] != gp.gap_symbol and \ + seqs[other][i] != gp.gap_symbol: total += 1 r.append(float(total) / seq_lengths[other]) fracs_aligned_to_ref.append(r) diff --git a/code/align/average_alignment_stats.py b/code/align/average_alignment_stats.py index 718cd85..b6c04af 100644 --- a/code/align/average_alignment_stats.py +++ b/code/align/average_alignment_stats.py @@ -1,14 +1,12 @@ import os -import sys -sys.path.insert(0, '..') import global_params as gp import numpy gp_dir = '../' -stats_files = [gp_dir + gp.alignments_dir + x for x in filter(\ +stats_files = [gp_dir + gp.alignments_dir + x for x in filter( lambda x: 'stats' in x, os.listdir(gp_dir + gp.alignments_dir))] -#avg_frac_aligned_by_chrm = dict(zip(gp.chrms, [0]*len(gp.chrms))) +# avg_frac_aligned_by_chrm = dict(zip(gp.chrms, [0]*len(gp.chrms))) avg_frac_aligned_p = 0 avg_frac_aligned_x = 0 total_p = 0 @@ -26,18 +24,18 @@ avg_frac_aligned_x += fx * lx total_p += lp total_x += lx - #print fn[fn.find('chr')-8:], fx, lx, lc + # print fn[fn.find('chr')-8:], fx, lx, lc a.append(fx) avg_frac_aligned_p /= total_p avg_frac_aligned_x /= total_x -print len(stats_files) -print avg_frac_aligned_p -print avg_frac_aligned_x +print(len(stats_files)) +print(avg_frac_aligned_p) +print(avg_frac_aligned_x) hist, edges = numpy.histogram(a, bins=30) -print hist -print edges -print sum(hist[:-1]) +print(hist) +print(edges) +print(sum(hist[:-1])) diff --git a/code/align/convert_coordinates.py 
b/code/align/convert_coordinates.py index e38f872..a14ded6 100644 --- a/code/align/convert_coordinates.py +++ b/code/align/convert_coordinates.py @@ -1,13 +1,11 @@ -import sys import gzip -sys.path.insert(0, '..') import global_params as gp def convert(s1, s2): i2 = -1 i2d = 0 - l = [] + result = [] for i in range(len(s1)): if s2[i] == gp.gap_symbol: i2d += 1 @@ -16,14 +14,14 @@ def convert(s1, s2): i2d = 0 if s1[i] != gp.gap_symbol: if i2d == 0: - l.append(str(i2)) + result.append(str(i2)) else: - l.append(str(i2) + '.' + str(i2d)) - return l + result.append(str(i2) + '.' + str(i2d)) + return result -def write_coordinates(l, fn): + +def write_coordinates(coords, fn): f = gzip.open(fn, 'wb') - f.write('\n'.join([str(x) for x in l])) + f.write('\n'.join([str(x) for x in coords])) f.write('\n') f.close() - diff --git a/code/align/convert_coordinates_main.py b/code/align/convert_coordinates_main.py index 22ff28e..08cc90a 100644 --- a/code/align/convert_coordinates_main.py +++ b/code/align/convert_coordinates_main.py @@ -1,33 +1,30 @@ -import sys import os -from convert_coordinates import * -sys.path.insert(0, '..') +from convert_coordinates import (write_coordinates, convert) import global_params as gp -sys.path.insert(0, '../misc/') -import read_fasta +from misc import read_fasta gp_dir = '../' fns = os.listdir(gp_dir + gp.alignments_dir) fns = filter(lambda fn: fn.endswith(gp.alignment_suffix), fns) for fn in fns: - print fn + print(fn) x = fn.split('_') chrm = x[-2] strain_names = x[0:-2] headers, seqs = read_fasta.read_fasta(gp_dir + gp.alignments_dir + fn) - + # for each index in cer reference, get index in other strain # (either par reference for 2-way alignment or cer strain for # 3-way) - coord_fn = gp.analysis_out_dir_absolute + 'coordinates/' + \ - strain_names[0] + '_to_' + strain_names[-1] + \ - '_' + chrm + '.txt.gz' + coord_fn = (gp.analysis_out_dir_absolute + 'coordinates/' + + strain_names[0] + '_to_' + strain_names[-1] + + '_' + chrm + '.txt.gz') write_coordinates(convert(seqs[0], seqs[-1]), coord_fn) # for each index in other strain, get index in cer reference - coord_fn = gp.analysis_out_dir_absolute + 'coordinates/' + \ - strain_names[-1] + '_to_' + strain_names[0] + \ - '_' + chrm + '.txt.gz' + coord_fn = (gp.analysis_out_dir_absolute + 'coordinates/' + + strain_names[-1] + '_to_' + strain_names[0] + + '_' + chrm + '.txt.gz') write_coordinates(convert(seqs[-1], seqs[0]), coord_fn) diff --git a/code/align/mask.py b/code/align/mask.py index df0ffa4..4d5ae74 100644 --- a/code/align/mask.py +++ b/code/align/mask.py @@ -1,7 +1,6 @@ import sys import os -from mask_helpers import * -import align_helpers +from mask_helpers import mask from analyze import read_args import global_params as gp @@ -33,25 +32,22 @@ current_strain_fn = d + strain_fn.replace('*', strain) current_strain_masked_fn = d + strain_masked_fn.replace('*', strain) - current_strain_intervals_fn = intervals_d + intervals_fn.replace('*', strain) + current_strain_intervals_fn = intervals_d + intervals_fn.replace('*', + strain) for chrm in gp.chrms: in_fn = current_strain_fn.replace('?', chrm) out_fn = current_strain_intervals_fn.replace('?', chrm) - + # get dustmasker intervals - cmd_string = gp.blast_install_path + 'dustmasker' + \ - ' -in ' + in_fn + \ - ' -out ' + out_fn + \ - ' -outfmt interval' - + cmd_string = (gp.blast_install_path + 'dustmasker' + + ' -in ' + in_fn + + ' -out ' + out_fn + + ' -outfmt interval') + os.system(cmd_string) # replace those intervals with Ns and write to masked fasta file masked_fn 
= current_strain_masked_fn.replace('?', chrm) mask(in_fn, masked_fn, out_fn) - - - - diff --git a/code/align/mask_helpers.py b/code/align/mask_helpers.py index 8bd7dc6..b71ede7 100644 --- a/code/align/mask_helpers.py +++ b/code/align/mask_helpers.py @@ -1,14 +1,11 @@ -import sys -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../misc') -import read_fasta -import write_fasta +from misc import read_fasta +from misc import write_fasta -def read_intervals(fn): +def read_intervals(fn): f = open(fn, 'r') - f.readline() # header + f.readline() # header line = f.readline() intervals = [] while line != '': @@ -18,8 +15,8 @@ def read_intervals(fn): f.close() return intervals -def mask(fn, masked_fn, intervals_fn): +def mask(fn, masked_fn, intervals_fn): headers, seqs = read_fasta.read_fasta(fn) seq = list(seqs[0]) intervals = read_intervals(intervals_fn) @@ -28,4 +25,3 @@ def mask(fn, masked_fn, intervals_fn): seq[i] = gp.unsequenced_symbol seq = ''.join(seq) write_fasta.write_fasta(headers, [seq], masked_fn) - diff --git a/code/align/master_alignment.py b/code/align/master_alignment.py index 7309012..aff6cc9 100644 --- a/code/align/master_alignment.py +++ b/code/align/master_alignment.py @@ -1,61 +1,60 @@ -# combine all chromosomal alignments into one master, indexed relative to cerevisiae reference - -import sys -import os -import copy -sys.path.insert(0, '../misc') -import read_maf -sys.path.insert(0, '..') +# combine all chromosomal alignments into one master +# indexed relative to cerevisiae reference + +from misc import read_maf import global_params as gp -complement = {'A':'T', 'T':'A', 'G':'C', 'C':'G', \ - 'a':'t', 't':'a', 'g':'c', 'c':'g', \ - 'N':'N', 'n':'n', '-':'-'} +complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', + 'a': 't', 't': 'a', 'g': 'c', 'c': 'g', + 'N': 'N', 'n': 'n', '-': '-'} + +flip = {'-': '+', '+': '-'} -flip = {'-':'+', '+':'-'} def reverse_start(start, length, total_length): return total_length - start - length -def reverse_complement(s): +def reverse_complement(s): r = [] for b in s[::-1]: r.append(complement[b]) return r -def forward_index(blocks): +def forward_index(blocks): # go through all blocks and add a field for start relative to # forward strand, and sequence in forward direction for label in blocks.keys(): - block = blocks[label] for strain in blocks[label]['strains'].keys(): start = blocks[label]['strains'][strain]['start'] seq = blocks[label]['strains'][strain]['sequence'] - + blocks[label]['strains'][strain]['forward_start'] = start blocks[label]['strains'][strain]['forward_sequence'] = seq if blocks[label]['strains'][strain]['strand'] == '-': - blocks[label]['strains'][strain]['forward_sequence'] = seq[::-1] + blocks[label]['strains'][strain]['forward_sequence'] = \ + seq[::-1] blocks[label]['strains'][strain]['forward_start'] = \ - reverse_start(start, blocks[label]['strains'][strain]['length'], \ - blocks[label]['strains'][strain]['aligned_length']) + reverse_start( + start, + blocks[label]['strains'][strain]['length'], + blocks[label]['strains'][strain]['aligned_length']) return blocks -# make all master sequences go in forward direction (+) and flip -# others as necessary -def master_forward(blocks, master): +def master_forward(blocks, master): + # make all master sequences go in forward direction (+) and flip + # others as necessary for label in blocks.keys(): - block = blocks[label] - if blocks[label]['strains'].has_key(master): + if master in blocks[label]['strains']: if blocks[label]['strains'][master]['strand'] == 
'-': for strain in blocks[label]['strains'].keys(): - aligned_length = blocks[label]['strains'][strain]['aligned_length'] + aligned_length = \ + blocks[label]['strains'][strain]['aligned_length'] seq = blocks[label]['strains'][strain]['sequence'] start = blocks[label]['strains'][strain]['start'] length = blocks[label]['strains'][strain]['length'] @@ -70,11 +69,12 @@ def master_forward(blocks, master): return blocks + def make_master(fn, master): # keyed by block label; most of info in each keyed by ['strains'][strain] blocks = read_maf.read_mugsy(fn) - + # flip all blocks so that master sequence is on + strand blocks = master_forward(blocks, master) # add fields giving index and sequence relative to + strand @@ -83,13 +83,13 @@ def make_master(fn, master): # make sequences with alignment columns present in master n = blocks['1']['strains'][master]['aligned_length'] all_strains = blocks['1']['strains'].keys() - a = dict(zip(all_strains, [[gp.unaligned_symbol] * n for s in all_strains])) + a = dict(zip(all_strains, + [[gp.unaligned_symbol] * n for s in all_strains])) # loop through all blocks for label in blocks.keys(): - block = blocks[label] # only care about aligned blocks that include master sequence - if blocks[label]['strains'].has_key(master): + if master in blocks[label]['strains']: absolute_ind = blocks[label]['strains'][master]['start'] master_seq = blocks[label]['strains'][master]['sequence'] block_length = len(master_seq) @@ -104,25 +104,23 @@ def make_master(fn, master): # apparently mugsy sometimes aligns the same part # of one genome to multiple parts of another # genome. this is a problem. - assert a[master][absolute_ind] == gp.unaligned_symbol, absolute_ind + assert a[master][absolute_ind] == gp.unaligned_symbol,\ + absolute_ind # loop through all the strains in this block for strain in strains: a[strain][absolute_ind] = \ - blocks[label]['strains'][strain]\ - ['forward_sequence'][relative_ind] + blocks[label]['strains'][strain][ + 'forward_sequence'][relative_ind] absolute_ind += 1 for strain in all_strains: a[strain] = ''.join(a[strain]) - print strain, a[strain].count(gp.unaligned_symbol) - - #assert total_aligned_master == n - a[master].count(gp.unaligned_symbol), \ - # str(total_aligned_master) + ' ' + str(n - a[master].count(gp.unaligned_symbol)) + print(strain, a[strain].count(gp.unaligned_symbol)) return a -def write_master(fn, a): +def write_master(fn, a): f = open(fn, 'w') for strain in a.keys(): f.write('> ' + strain + '\n') diff --git a/code/align/master_alignment_main.py b/code/align/master_alignment_main.py index 52d2126..f0c5a10 100644 --- a/code/align/master_alignment_main.py +++ b/code/align/master_alignment_main.py @@ -2,9 +2,8 @@ # to one reference import sys -from master_alignment import * -sys.path.insert(0, '..') -from align_helpers import * +from align.master_alignment import (make_master, write_master) +from align.align_helpers import (get_strains, flatten) import global_params as gp strains = get_strains(flatten(gp.non_ref_dirs.values())) @@ -19,9 +18,9 @@ '_'.join(gp.alignment_ref_order) + \ '_' + strain for chrm in gp.chrms: - print chrm + print(chrm) alignment_fn = alignment_prefix + '_chr' + chrm + gp.alignment_suffix - master_alignment_fn = alignment_prefix + '_chr' + chrm + '_master' + gp.fasta_suffix + master_alignment_fn = (alignment_prefix + '_chr' + + chrm + '_master' + gp.fasta_suffix) a = make_master(alignment_fn, gp.master_ref) write_master(master_alignment_fn, a) - diff --git a/code/align/polymorphism.py b/code/align/polymorphism.py 
index 566f2fe..1f191c4 100644 --- a/code/align/polymorphism.py +++ b/code/align/polymorphism.py @@ -1,6 +1,8 @@ # calculate polymorphism rate between reference genomes in 100-bp # windows across each chromosome +import sys +from misc import read_fasta headers, seqs = read_fasta.read_fasta(sys.argv[1]) a = dict(zip(headers, seqs)) diff --git a/code/align/ref_ids_main.py b/code/align/ref_ids_main.py index f6cf7b8..58812fd 100644 --- a/code/align/ref_ids_main.py +++ b/code/align/ref_ids_main.py @@ -1,14 +1,8 @@ -import re -import sys -import os -import copy from collections import defaultdict -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../misc/') -import mystats -import seq_functions -import read_fasta +from misc import mystats +from misc import seq_functions +from misc import read_fasta # get pairwise identities between all aligned references: # - overall average @@ -23,28 +17,28 @@ pair_chrm_ids = defaultdict(lambda: defaultdict(list)) for chrm in gp.chrms: - print chrm - fn = gp_dir + gp.alignments_dir + \ - '_'.join(gp.alignment_ref_order) + \ - '_chr' + chrm + '_mafft' + gp.alignment_suffix + print(chrm) + fn = (gp_dir + gp.alignments_dir + + '_'.join(gp.alignment_ref_order) + + '_chr' + chrm + '_mafft' + gp.alignment_suffix) headers, seqs = read_fasta.read_fasta(fn) for i in range(nrefs): ref1 = gp.alignment_ref_order[i] for j in range(i+1, nrefs): - print i, j + print(i, j) ref2 = gp.alignment_ref_order[j] ids = seq_functions.seq_id_windowed(seqs[i], seqs[j], window) - + pair_chrm_ids[(ref1, ref2)][chrm] = ids -fs = open(gp.analysis_out_dir_absolute + 'ref_ids_summary_' + \ - '_'.join(gp.alignment_ref_order) + '.txt', 'w') +fs = open(gp.analysis_out_dir_absolute + 'ref_ids_summary_' + + '_'.join(gp.alignment_ref_order) + '.txt', 'w') fs.write('pair\tchromosome\tmean\tmedian\n') -f = open(gp.analysis_out_dir_absolute + 'ref_ids_' + \ +f = open(gp.analysis_out_dir_absolute + 'ref_ids_' + '_'.join(gp.alignment_ref_order) + '.txt', 'w') f.write('pair\tid\n') @@ -53,14 +47,14 @@ pair_string = ','.join(pair) for chrm in gp.chrms: ids = pair_chrm_ids[pair][chrm] - fs.write(pair_string + '\t' + \ - chrm + '\t' + \ - str(mystats.mean(ids)) + '\t' + \ + fs.write(pair_string + '\t' + + chrm + '\t' + + str(mystats.mean(ids)) + '\t' + str(mystats.median(ids)) + '\n') all_ids += ids - fs.write(pair_string + '\t' + \ - 'all' + '\t' + \ - str(mystats.mean(all_ids)) + '\t' + \ + fs.write(pair_string + '\t' + + 'all' + '\t' + + str(mystats.mean(all_ids)) + '\t' + str(mystats.median(all_ids)) + '\n') for i in ids: diff --git a/code/align/run_alignment_stats.py b/code/align/run_alignment_stats.py index 0fd7052..7f99b73 100644 --- a/code/align/run_alignment_stats.py +++ b/code/align/run_alignment_stats.py @@ -1,8 +1,11 @@ +import sys import os -from alignment_stats import * -from align_helpers import * -sys.path.insert(0, '..') +from align.alignment_stats import (num_strains_aligned_by_site, + fraction_strains_aligned, + frac_aligned_to_reference) +from align.align_helpers import (flatten, get_strains) import global_params as gp +from misc import read_fasta # gives info related to how good an alignment is: # - number of sites where n, ..., 3, 2, 1, genomes aligned @@ -14,10 +17,11 @@ strain, d = s[int(sys.argv[1])] gp_dir = '../' -fn_start = gp_dir + gp.alignments_dir + '_'.join(gp.alignment_ref_order) + '_' + strain + '_chr' +fn_start = (gp_dir + gp.alignments_dir + '_'.join(gp.alignment_ref_order) + + '_' + strain + '_chr') for chrm in gp.chrms: - print chrm + print(chrm) 
sys.stdout.flush() if not os.path.isfile(fn_start + chrm + '_mafft.maf'): @@ -30,8 +34,9 @@ # number of sites where n,...,3,2,1 genomes aligned num_strains_by_site = num_strains_aligned_by_site(seqs) - f_out.write(\ - '# histogram of number of strains aligned across all alignment columns\n') + f_out.write( + '# histogram of number of strains ' + 'aligned across all alignment columns\n') for n in range(len(num_strains_by_site)): f_out.write(str(n) + ',' + str(num_strains_by_site[n]) + '\n') f_out.write('\n') @@ -44,7 +49,8 @@ # length of chromosomes f_out.write('chromosome aligned lengths\n') for n in range(len(seqs)): - f_out.write(headers[n][1:].strip().split(' ')[0] + ',' + str(seq_lengths[n]) + '\n') + f_out.write(headers[n][1:].strip().split(' ')[0] + + ',' + str(seq_lengths[n]) + '\n') f_out.write('\n') # using each genome as reference, fraction of other genomes aligned @@ -52,7 +58,7 @@ frac_aligned_to_ref = frac_aligned_to_reference(seqs, seq_lengths) for ref in range(len(seqs)): f_out.write(headers[ref][1:].strip().split(' ')[0]) - for other in range(len(seqs)): + for other in range(len(seqs)): f_out.write(',' + str(frac_aligned_to_ref[ref][other])) f_out.write('\n') f_out.write('\n') diff --git a/code/align/run_mafft.py b/code/align/run_mafft.py index 4681f77..e5f6681 100644 --- a/code/align/run_mafft.py +++ b/code/align/run_mafft.py @@ -1,6 +1,6 @@ import sys import os -from align_helpers import * +from align.align_helpers import (concatenate_fasta) from analyze import read_args import global_params as gp @@ -19,8 +19,8 @@ if os.stat(args['alignments_directory'] + fn).st_size != 0: a.append(fn) ref_prefix = '_'.join(args['references']) + '_' -ref_fns = [args['reference_directories'][r] + r + '_chr' + '?' + \ - mask_suffix + gp.fasta_suffix \ +ref_fns = [args['reference_directories'][r] + r + '_chr' + '?' 
+ + mask_suffix + gp.fasta_suffix for r in args['references']] if ref_only: @@ -30,16 +30,16 @@ ref_fns_chrm = [x.replace('?', chrm) for x in ref_fns] combined_fn = 'run_mafft_' + chrm + '.temp' - concatenate_fasta(ref_fns_chrm, \ + concatenate_fasta(ref_fns_chrm, args['references'], combined_fn) - - align_fn = ref_prefix + 'chr' + chrm + \ - '_mafft' + gp.alignment_suffix + + align_fn = (ref_prefix + 'chr' + chrm + + '_mafft' + gp.alignment_suffix) align_fn_abs = args['alignments_directory'] + align_fn - cmd_string = gp.mafft_install_path + '/mafft ' + \ - combined_fn + ' > ' + align_fn_abs + '; ' - + cmd_string = (gp.mafft_install_path + '/mafft ' + + combined_fn + ' > ' + align_fn_abs + '; ') + cmd_string += 'rm ' + combined_fn + ';' print(cmd_string) @@ -60,7 +60,6 @@ # shell instance every time (I think there's a limit on the # command character count or something which is why we're not # making a single string for all strains) -#cmd_string = '' current_strain_fn = d + strain_fn.replace('*', strain) @@ -74,7 +73,7 @@ # if we don't already have an alignment for this strain/chromosome # (or that alignment file is empty), then make one - #if (align_fn not in a) or (os.stat(align_fn_abs).st_size == 0): + # if (align_fn not in a) or (os.stat(align_fn_abs).st_size == 0): if align_fn not in a: cmd_string = '' @@ -82,18 +81,18 @@ ref_fns_chrm = [x.replace('?', chrm) for x in ref_fns] current_strain_fn_chrm = current_strain_fn.replace('?', chrm) combined_fn = 'run_mafft_' + strain + chrm + '.temp' - - concatenate_fasta(ref_fns_chrm + [current_strain_fn_chrm], \ + + concatenate_fasta(ref_fns_chrm + [current_strain_fn_chrm], args['references'] + [strain], combined_fn) - + # add --ep 0.123 to maybe get shorter alignment - #cmd_string += gp.mafft_install_path + '/mafft --ep 0.123 ' + \ + # cmd_string += gp.mafft_install_path + '/mafft --ep 0.123 ' + \ # combined_fn + ' > ' + align_fn_abs + '; ' - #cmd_string += gp.mafft_install_path + '/mafft --retree 1 ' + \ + # cmd_string += gp.mafft_install_path + '/mafft --retree 1 ' + \ # combined_fn + ' > ' + align_fn_abs + '; ' cmd_string += gp.mafft_install_path + '/mafft ' + \ combined_fn + ' > ' + align_fn_abs + '; ' - + cmd_string += 'rm ' + combined_fn + ';' print(cmd_string) @@ -109,7 +108,3 @@ else: print("already did this alignment: " + strain + ' chromosome ' + chrm) sys.stdout.flush() - -#print cmd_string -#sys.stdout.flush() -#os.system(cmd_string) diff --git a/code/align/run_mafft_refs.py b/code/align/run_mafft_refs.py index 21105c1..1242531 100644 --- a/code/align/run_mafft_refs.py +++ b/code/align/run_mafft_refs.py @@ -2,8 +2,7 @@ import sys import os -from align_helpers import * -sys.path.insert(0, '..') +from align.align_helpers import concatenate_fasta import global_params as gp masked = False @@ -16,9 +15,9 @@ if gp.resume_alignment: a = os.listdir(gp_dir + gp.alignments_dir) -ref_prefix = '_'.join(gp.alignment_ref_order) -ref_fns = [gp.ref_dir[r] + gp.ref_fn_prefix[r] + '_chr' + '?' + \ - mask_suffix + gp.fasta_suffix \ +ref_prefix = '_'.join(gp.alignment_ref_order) +ref_fns = [gp.ref_dir[r] + gp.ref_fn_prefix[r] + '_chr' + '?' 
+ + mask_suffix + gp.fasta_suffix for r in gp.alignment_ref_order] @@ -26,11 +25,11 @@ # shell instance every time (I think there's a limit on the # command character count or something which is why we're not # making a single string for all strains) -#cmd_string = '' +# cmd_string = '' chrm = gp.chrms[int(sys.argv[1])] -print chrm +print(chrm) sys.stdout.flush() align_fn = ref_prefix + '_chr' + chrm + \ @@ -44,28 +43,24 @@ # first put all sequences in same (temporary) file ref_fns_chrm = [x.replace('?', chrm) for x in ref_fns] combined_fn = 'run_mafft_' + chrm + '.temp' - - concatenate_fasta(ref_fns_chrm, \ + + concatenate_fasta(ref_fns_chrm, gp.alignment_ref_order, combined_fn) - - cmd_string += gp.mafft_install_path + '/mafft ' + \ - combined_fn + ' > ' + align_fn_abs + '; ' - + + cmd_string += (gp.mafft_install_path + '/mafft ' + + combined_fn + ' > ' + align_fn_abs + '; ') + cmd_string += 'rm ' + combined_fn + ';' - - print cmd_string + + print(cmd_string) sys.stdout.flush() os.system(cmd_string) # want some kind of indication if alignment fails (due to # running out of memory probably) if os.stat(align_fn_abs).st_size == 0: - print 'alignment failed:' + ' chromosome ' + chrm + print('alignment failed: chromosome ' + chrm) sys.stdout.flush() sys.exit() else: - print "already did this alignment:" + ' chromosome ' + chrm - -#print cmd_string -#sys.stdout.flush() -#os.system(cmd_string) + print('already did this alignment: chromosome ' + chrm) diff --git a/code/align/run_mugsy.py b/code/align/run_mugsy.py index e858769..7af436b 100644 --- a/code/align/run_mugsy.py +++ b/code/align/run_mugsy.py @@ -1,7 +1,5 @@ -import sys import os -sys.path.insert(0, '..') -from align_helpers import * +from align.align_helpers import get_strains, flatten import global_params as gp # get all non-reference strains of cerevisiae and paradoxus @@ -22,23 +20,25 @@ ref_dirs = [gp.ref_dir[ref] for ref in gp.alignment_ref_order] for strain, d in s: - print strain + print(strain) cmd_string = cmd_string_start - + for chrm in [gp.chrms[-1]]: align_fn = ref_prefix + strain + '_chr' + chrm + gp.alignment_suffix - # if we don't already have an alignment for this strain/chromosome, then make one + # if we don't already have an alignment for this strain/chromosome, + # then make one if align_fn not in a: cmd_string += gp.mugsy_install_path + '/mugsy ' + \ '--directory ' + gp_dir + gp.alignments_dir + ' ' + \ '--prefix ' + ref_prefix + strain + '_chr' + chrm for ref in gp.alignment_ref_order: - cmd_string += ' ' + gp.ref_dir[ref] + '/' + \ - gp.ref_fn_prefix[ref] + '_chr' + chrm + gp.fasta_suffix - cmd_string += ' ' + d + '/' + strain + '_chr' + chrm + gp.fasta_suffix + '; ' - + cmd_string += (' ' + gp.ref_dir[ref] + '/' + + gp.ref_fn_prefix[ref] + '_chr' + + chrm + gp.fasta_suffix) + cmd_string += (' ' + d + '/' + strain + + '_chr' + chrm + gp.fasta_suffix + '; ') # commands can only be up to a certain length so break it up this way - print cmd_string + print(cmd_string) os.system(cmd_string) diff --git a/code/align/run_tcoffee.py b/code/align/run_tcoffee.py index c94a3af..dcf48bb 100644 --- a/code/align/run_tcoffee.py +++ b/code/align/run_tcoffee.py @@ -1,7 +1,6 @@ import sys import os -sys.path.insert(0, '..') -from align_helpers import * +from align.align_helpers import (get_strains, flatten, concatenate_fasta) import global_params as gp # get all non-reference strains of cerevisiae and paradoxus @@ -14,13 +13,14 @@ ref_prefix = '_'.join(gp.alignment_ref_order) + '_' -ref_fns = [gp.ref_dir[r] + gp.ref_fn_prefix[r] + 
'_chr' + '?' + gp.fasta_suffix \ - for r in gp.alignment_ref_order] +ref_fns = [gp.ref_dir[r] + gp.ref_fn_prefix[r] + + '_chr' + '?' + gp.fasta_suffix + for r in gp.alignment_ref_order] strain_fn = '*_chr?' + gp.fasta_suffix for strain, d in s: - print strain + print(strain) # building up one command string so that we don't create a new # shell instance every time (I think there's a limit on the @@ -29,9 +29,9 @@ cmd_string = '' current_strain_fn = d + strain_fn.replace('*', strain) - + for chrm in gp.chrms[:2]: - print chrm + print(chrm) align_fn = ref_prefix + strain + '_chr' + chrm + \ '_tcoffee' + gp.alignment_suffix # if we don't already have an alignment for this @@ -42,14 +42,15 @@ current_strain_fn_chrm = current_strain_fn.replace('?', chrm) combined_fn = 'run_tcoffee_' + strain + chrm + '.temp' - concatenate_fasta(ref_fns_chrm + [current_strain_fn_chrm], combined_fn) + concatenate_fasta(ref_fns_chrm + [current_strain_fn_chrm], + combined_fn) cmd_string += gp.tcoffee_install_path + '/t_coffee ' + \ combined_fn + '; ' - #cmd_string += 'rm ' + combined_fn + ';' + # cmd_string += 'rm ' + combined_fn + ';' # commands can only be up to a certain length so break it up this way - print cmd_string + print(cmd_string) os.system(cmd_string) sys.exit() diff --git a/code/analyze/combine_chromosome_files_main.py b/code/analyze/combine_chromosome_files_main.py deleted file mode 100644 index 1ec1814..0000000 --- a/code/analyze/combine_chromosome_files_main.py +++ /dev/null @@ -1,31 +0,0 @@ -import sys -import os -import gzip -import predict -import global_params as gp - -args = predict.process_predict_args(sys.argv[1:]) - -header = open(gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'blocks_' + args['known_states'][0] + \ - '_' + args['tag'] + '_chr' + gp.chrms[0] + '_quality.txt', 'r').readline() - -for species_from in args['known_states']: - fn = gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'blocks_' + species_from + \ - '_' + args['tag'] + '_quality.txt' - f = open(fn, 'w') - f.write(header) - for chrm in gp.chrms: - fn_chrm = gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'blocks_' + species_from + \ - '_' + args['tag'] + '_chr' + chrm + '_quality.txt' - try: - fc = open(fn_chrm, 'r') - except: - continue - fc.readline() - for line in fc.readlines(): - f.write(line) - f.close() - diff --git a/code/analyze/extract_region.py b/code/analyze/extract_region.py index 476e4f0..e7649d7 100644 --- a/code/analyze/extract_region.py +++ b/code/analyze/extract_region.py @@ -1,21 +1,23 @@ #!/usr/bin/env python3 import argparse -import os -import pickle -import gzip -import sys +from misc.region_reader import Region_Reader +from typing import List, Tuple def main(): + ''' + Main method to read in arguments from stdin and perform lookup with + Region_Reader + ''' args = parse_args() - args = validate_args(args) - index = pickle.load(open(args['pickle'], 'rb')) - locations = decode_regions(args['regions'], index, args['list_sort']) - with gzip.open(args['filename'], 'rt') as reader: - write_regions(reader, locations, args['suppress_header']) + args, reader = validate_args(args) + with reader: + locations = decode_regions(args['regions'], + reader, args['list_sort']) + write_regions(reader, locations) -def parse_args(args=None): +def parse_args(args: List[str] = None) -> argparse.Namespace: ''' Read in input arguments or the supplied list of strings Returns a dictionary of options @@ -40,44 +42,31 @@ def parse_args(args=None): return vars(parser.parse_args(args)) -def 
validate_args(args): +def validate_args(args: argparse.Namespace) -> Tuple[argparse.Namespace, + Region_Reader]: ''' Performs checks and conversions of input, raises ValueErrors if invalid ''' - if not os.path.exists(args['filename']): - raise ValueError(f'{args["filename"]} not found') - - if args['filename'][-6:] != '.fa.gz': - raise ValueError(f'{args["filename"]} expected to be .fa.gz') + reader = Region_Reader(args['filename'], + as_fa=False, + suppress_header=args['suppress_header'], + num_lines=15) - args['pickle'] = args['filename'][:-6] + '.pkl' - if not os.path.exists(args['pickle']): - raise ValueError(f'{args["pickle"]} not found with region file') + args['regions'] = [reader.convert_region(r) for r in args['regions']] - parsed_regions = [] - for region in args['regions']: - r = region - if r[0] == 'r': - r = r[1:] - if not r.isdigit(): - raise ValueError(f'{region} could not be parsed') - parsed_regions.append(int(r)) - args['regions'] = parsed_regions + return args, reader - return args - -def decode_regions(regions, index, retain_sort): +def decode_regions(regions: List[int], + reader: Region_Reader, + retain_sort: bool) -> List[int]: ''' Converts list of regions to file locations based on index dictionary Retain_sort controls if the output list order is determined by the region order or the disk location (i.e. values of index dict) ''' - try: - result = [index[r] for r in regions] - except KeyError as e: - raise KeyError(f'r{e} not found in index') + result = [reader.decode_region(r) for r in regions] if retain_sort: return result @@ -85,25 +74,13 @@ def decode_regions(regions, index, retain_sort): return sorted(result) -def write_regions(reader, locations, suppress_header, num_lines=15): +def write_regions(reader: Region_Reader, locations: List[int]) -> None: ''' Writes the regions specified by index to stdout If print_header is false, ignore first line after location ''' - if suppress_header is True: - num_lines -= 1 - for location in locations: - reader.seek(location) - if suppress_header is True: - reader.readline() - for i in range(num_lines): - line = reader.readline() - if line == '': - print(f'{location} outside of file', file=sys.stderr) - break - else: - print(line, end='') + reader.read_location(location) if __name__ == '__main__': diff --git a/code/analyze/filter_1_main.py b/code/analyze/filter_1_main.py deleted file mode 100644 index acebb7d..0000000 --- a/code/analyze/filter_1_main.py +++ /dev/null @@ -1,72 +0,0 @@ -# two levels of filtering: -# 1. remove regions that don't look confidently introgressed at all, -# based on fraction gaps/masked, number of matches to S288c and not S288c -# --> _filtered1 -# 2. 
remove regions that we can't confidently pin on a specific reference, -# based on whether it matches similarly to other reference(s) -# --> _filtered2 - -# just do the first level here, then run filter_2_thresholds_main.py -# to choose filtering thresholds for next level - - -import re -import sys -import os -import copy -import read_args -from filter_helpers import * -import summarize_region_quality -import global_params as gp -from misc import read_table -from misc import read_fasta - -args = read_args.process_predict_args(sys.argv[1:]) - -for species_from in args['known_states'][1:]: - - print(species_from) - - fn = gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'blocks_' + species_from + \ - '_' + args['tag'] + '_quality.txt' - region_summary, fields = read_table.read_table_rows(fn, '\t') - - fields1i = fields + ['reason'] - fields1 = fields - - fn_out1i = gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'blocks_' + species_from + \ - '_' + args['tag'] + '_filtered1intermediate.txt' - - fn_out1 = gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'blocks_' + species_from + \ - '_' + args['tag'] + '_filtered1.txt' - - f_out1i = open(fn_out1i, 'w') - f_out1i.write('\t'.join(fields1i) + '\n') - - f_out1 = open(fn_out1, 'w') - f_out1.write('\t'.join(fields1) + '\n') - - regions_fn = gp.analysis_out_dir_absolute + args['tag'] + '/regions/' + \ - species_from + gp.fasta_suffix + '.gz' - region_seqs = summarize_region_quality.read_region_file(regions_fn) - - for region_id in region_summary: - - region = region_summary[region_id] - - info_string = region_seqs[region_id]['info']['seq'] - - # filtering stage 1: things that we're confident in calling not - # S288c - p, reason = passes_filters1(region, info_string, args['known_states'][0]) - region['reason'] = reason - write_filtered_line(f_out1i, region_id, region, fields1i) - - if p: - write_filtered_line(f_out1, region_id, region, fields1) - - f_out1i.close() - f_out1.close() diff --git a/code/analyze/filter_2_main.py b/code/analyze/filter_2_main.py deleted file mode 100644 index d1d290f..0000000 --- a/code/analyze/filter_2_main.py +++ /dev/null @@ -1,83 +0,0 @@ -# two levels of filtering: -# 1. remove regions that don't look confidently introgressed at all, -# based on fraction gaps/masked, number of matches to S288c and not S288c -# --> _filtered1 -# 2. 
remove regions that we can't confidently pin on a specific reference, -# based on whether it matches similarly to other reference(s) -# --> _filtered2 - -# do second level of filtering here, based on previously selected -# thresholds - -import re -import sys -import os -import copy -import numpy as np -import read_args -import summarize_region_quality -from filter_helpers import * -import global_params as gp -from misc import read_table -from misc import read_fasta - -args = read_args.process_predict_args(sys.argv[2:]) -threshold = float(sys.argv[1]) - -for species_from in args['known_states'][1:]: - - print(species_from) - - fn = gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'blocks_' + species_from + \ - '_' + args['tag'] + '_filtered1.txt' - region_summary, fields = read_table.read_table_rows(fn, '\t') - - fields2i = fields + ['predicted_species_original', 'alternative_ids', \ - 'alternative_P_counts'] - fields2 = fields - - fn_out2i = gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'blocks_' + species_from + \ - '_' + args['tag'] + '_filtered2intermediate.txt' - - fn_out2 = gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'blocks_' + species_from + \ - '_' + args['tag'] + '_filtered2.txt' - - f_out2i = open(fn_out2i, 'w') - f_out2i.write('\t'.join(fields2i) + '\n') - - f_out2 = open(fn_out2, 'w') - f_out2.write('\t'.join(fields2) + '\n') - - regions_fn = gp.analysis_out_dir_absolute + args['tag'] + '/regions/' + \ - species_from + gp.fasta_suffix + '.gz' - region_seqs = summarize_region_quality.read_region_file(regions_fn) - - for region_id in region_summary: - - region = region_summary[region_id] - - info_string = region_seqs[region_id]['info']['seq'] - seqs = np.asarray([list(region_seqs[region_id][ref]['seq']) \ - for ref in args['known_states']]) - - # filtering stage 2: things that we're confident in calling - # introgressed from one species specifically - p, alt_states, alt_ids, alt_P_counts = passes_filters2(region, seqs, \ - threshold, \ - args['known_states']) - region['alternative_states'] = '/'.join(alt_states) - region['alternative_ids'] = '/'.join([str(x) for x in alt_ids]) - region['alternative_P_counts'] = '/'.join([str(x) for x in alt_P_counts]) - - region['predicted_species_original'] = region['predicted_species'] - region['predicted_species'] = region['alternative_states'] - write_filtered_line(f_out2i, region_id, region, fields2i) - - if p: - write_filtered_line(f_out2, region_id, region, fields2) - - f_out2i.close() - f_out2.close() diff --git a/code/analyze/filter_2_thresholds_main.py b/code/analyze/filter_2_thresholds_main.py deleted file mode 100644 index 7a62fe8..0000000 --- a/code/analyze/filter_2_thresholds_main.py +++ /dev/null @@ -1,65 +0,0 @@ -# explore different thresholds for calling introgressions for specific -# strains - -# specifically, try a range of thresholds, and for each one, calculate -# fraction of introgressions we've classified as 1 strain or every -# possible combination of strains - -# then we'll make some plots in R to see if there's a sort of obvious -# place to draw the line - -import re -import sys -import os -import copy -from collections import defaultdict -import predict -from filter_helpers import * -sys.path.insert(0, '..') -import global_params as gp -sys.path.insert(0, '../misc/') -import read_table -import read_fasta - -args = predict.process_predict_args(sys.argv[1:]) - -#thresholds = [.99, .98, .97, .96, .95, .94, .93, .92, .91, .9, .88, .85, .82, .8, .75, .7, .6, .5] -#thresholds = [.999, .995, .985, 
.975, .965, .955, .945, .935, .925, .915, .905, .89, .87, .86] -thresholds = [1] - -open_mode = 'a' -f = open(gp.analysis_out_dir_absolute + args['tag'] + \ - '/filter_2_thresholds_' + args['tag'] + '.txt', open_mode) -if open_mode == 'w': - f.write('threshold\tpredicted_state\talternative_states\tcount\n') -for threshold in thresholds: - print threshold - for species_from in args['known_states'][1:]: - - print '*', species_from - - fn = gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'blocks_' + species_from + \ - '_' + args['tag'] + '_filtered1.txt' - region_summary, fields = read_table.read_table_rows(fn, '\t') - - d = defaultdict(int) - for region_id in region_summary: - #print region_id, '****' - region = region_summary[region_id] - headers, seqs = read_fasta.read_fasta(gp.analysis_out_dir_absolute + \ - args['tag'] + \ - '/regions/' + region_id + '.fa.gz', \ - gz = True) - info_string = seqs[-1] - seqs = seqs[:-1] - - p, alt_states, alt_ids, alt_P_counts = \ - passes_filters2(region, seqs, threshold) - - d[','.join(sorted(alt_states))] += 1 - - for key in d: - f.write(str(threshold) + '\t' + species_from + '\t' + \ - key + '\t' + str(d[key]) + '\n') -f.close() diff --git a/code/analyze/filter_helpers.py b/code/analyze/filter_helpers.py deleted file mode 100644 index 6174401..0000000 --- a/code/analyze/filter_helpers.py +++ /dev/null @@ -1,136 +0,0 @@ -import re -import sys -import os -import copy -import global_params as gp -from misc import mystats -from misc import read_table -from misc import seq_functions - -def write_filtered_line(f, region_id, region, fields): - f.write(region_id + '\t' + '\t'.join([str(region[field]) for field in fields[1:]])) - f.write('\n') - -def passes_filters(region): - - # fraction gaps + masked filter - fraction_gaps_masked_threshold = .5 - fraction_gaps_masked = \ - (float(region['number_gaps']) + float(region['number_masked_non_gap'])) / \ - (int(region['end']) - int(region['start']) + 1) - if fraction_gaps_masked > fraction_gaps_masked_threshold: - return False - - # number sites match only par filter - number_match_only_threshold = 7 - number_match_only = int(region['number_match_ref2_not_ref1']) - if number_match_only < number_match_only_threshold: - return False - - # divergence from cer filter (idea is that poor alignments will - # result in much larger divergence than we'd expect) - id_ref1_threshold = .7 - id_ref1 = float(region['number_match_ref1']) / \ - (float(region['aligned_length']) - float(region['number_gaps'])) - if id_ref1 < id_ref1_threshold: - return False - - return True - -def passes_filters1(region, info_string, r): - # filtering out things that we can't call introgressed in general - # with confidence (i.e. 
doesn't seem like a strong case against - # being S288c) - - s = region['predicted_species'] - - aligned_length = (int(region['end']) - int(region['start']) + 1) - - # FILTER: fraction gaps + masked - fraction_gaps_masked_threshold = .5 - # num_sites_nonmask_x is number of sites at which neither - # reference x nor the test sequence is masked or has a gap or - # unsequenced character - fraction_gaps_masked_r = \ - 1 - float(region['num_sites_nonmask_' + r]) / aligned_length - fraction_gaps_masked_s = \ - 1 - float(region['num_sites_nonmask_' + s]) / aligned_length - - if fraction_gaps_masked_r > fraction_gaps_masked_threshold: - return False, 'fraction gaps/masked in master = ' + \ - str(fraction_gaps_masked_r) - if fraction_gaps_masked_s > fraction_gaps_masked_threshold: - return False, 'fraction gaps/masked in predicted = ' + \ - str(fraction_gaps_masked_s) - - # FILTER: number sites analyzed by HMM that match predicted - # reference - count_P = info_string.count('P') - count_C = info_string.count('C') - number_match_only_threshold = 7 - if count_P < number_match_only_threshold: - return False, 'count_P = ' + str(count_P) - if count_P <= count_C: - return False, 'count_P = ' + str(count_P) + ' and count_C = ' + str(count_C) - - # FILTER: divergence with predicted reference and master reference - # (S288c) - id_predicted = float(region['match_nongap_' + s]) / \ - float(region['num_sites_nongap_' + s]) - id_master = float(region['match_nongap_' + r]) / \ - float(region['num_sites_nongap_' + r]) - - if id_master >= id_predicted: - return False, 'id with master = ' + str(id_master) + \ - ' and id with predicted = ' + str(id_predicted) - #if id_predicted < .7: - # return False, 'id with predicted = ' + str(id_predicted) - if id_master < .7: - return False, 'id with master = ' + str(id_master) - - return True, '' - -def passes_filters2(region, seqs, threshold, refs): - # filter out things we can't assign to one species specifically; - # also return the other reasonable alternatives if we're filtering - # it out - - n = len(seqs[0]) - s = region['predicted_species'] - - ids = {} - totals = {} - P_counts = {} - skip = [gp.gap_symbol, gp.unsequenced_symbol] - - for ri in range(1, len(refs)): - r_match, r_total = seq_functions.seq_id(seqs[-1], seqs[ri]) - if r_total != 0: - ids[refs[ri]] = float(r_match) / r_total - totals[refs[ri]] = r_total - P_count = 0 - for i in range(n): - if seqs[ri][i] in skip or seqs[0][i] in skip or seqs[-1][i] in skip: - continue - if seqs[-1][i] == seqs[ri][i] and seqs[-1][i] != seqs[0][i]: - P_count += 1 - P_counts[refs[ri]] = P_count - - alts = {} - for r in ids.keys(): - #if float(totals[r]) / totals[s] > .75 and \ - # ids[r] >= threshold * ids[s] and \ - # P_counts[r] >= threshold * P_counts[s]: - # TODO should threshold be the same for both? 
- if ids[r] >= threshold * ids[s] and \ - P_counts[r] >= threshold * P_counts[s]: - alts[r] = (ids[r], P_counts[r]) - - alt_states = sorted(alts.keys(), key=lambda x: alts[x][0], reverse=True) - alt_ids = [alts[state][0] for state in alt_states] - alt_P_counts = [alts[state][1] for state in alt_states] - - if len(alts) > 1: - return False, alt_states, alt_ids, alt_P_counts - - return True, alt_states, alt_ids, alt_P_counts diff --git a/code/analyze/filter_regions.py b/code/analyze/filter_regions.py new file mode 100644 index 0000000..4d1695f --- /dev/null +++ b/code/analyze/filter_regions.py @@ -0,0 +1,409 @@ +from misc import seq_functions +import numpy as np +from typing import List, Dict, TextIO, Tuple +import click +import logging as log +from contextlib import ExitStack, contextmanager +from misc import read_table +from misc.region_reader import Region_Reader +from analyze.introgression_configuration import Configuration + + +class Filterer(): + def __init__(self, configuration: Configuration): + self.config = configuration + + def filter_introgressed(self, + region: Dict, + info: np.ndarray, + reference_species: str) -> Tuple[bool, str]: + ''' + filtering out things that we can't call introgressed in general + with confidence (i.e. doesn't seem like a strong case against + being S288c) + Return true if the region passes the filter, or false with a string + specifying which filter failed + Fails if: + -fraction of gaps/masked in the reference > 0.5 + -fraction of gaps/masked in the predicted species > 0.5 + -fewer than 7 analyzed sites match only the predicted species + -no more sites match the predicted species than the reference + -identity with the reference >= identity with the predicted species + -identity with the reference < 0.7 + ''' + + predicted_species = region['predicted_species'] + + aligned_length = (int(region['end']) - int(region['start']) + 1) + + # FILTER: fraction gaps + masked + fraction_gaps_masked_threshold = .5 + # num_sites_nonmask_x is number of sites at which neither + # reference x nor the test sequence is masked or has a gap or + # unsequenced character + fraction_gaps_masked_r = \ + 1 - int(region['num_sites_nonmask_' + + reference_species])/aligned_length + fraction_gaps_masked_s = \ + 1 - int(region['num_sites_nonmask_' + + predicted_species])/aligned_length + + if fraction_gaps_masked_r > fraction_gaps_masked_threshold: + return False, f'fraction gaps/masked in master = '\ + f'{fraction_gaps_masked_r}' + if fraction_gaps_masked_s > fraction_gaps_masked_threshold: + return False, f'fraction gaps/masked in predicted = '\ + f'{fraction_gaps_masked_s}' + + # FILTER: number sites analyzed by HMM that match predicted (P) + # reference (C) + count_P = np.sum(info == 'P') + count_C = np.sum(info == 'C') + number_match_only_threshold = 7 + if count_P < number_match_only_threshold: + return False, f'count_P = {count_P}' + if count_P <= count_C: + return False, f'count_P = {count_P} and count_C = {count_C}' + + # FILTER: divergence with predicted reference and master reference + # (S288c) + id_predicted = float(region['match_nongap_' + predicted_species]) / \ + float(region['num_sites_nongap_' + predicted_species]) + id_master = float(region['match_nongap_' + reference_species]) / \ + float(region['num_sites_nongap_' + reference_species]) + + if id_master >= id_predicted: + return False, f'id with master = {id_master} '\ + f'and id with predicted = {id_predicted}' + if id_master < .7: + return False, f'id with master = {id_master}' + + return True, ''
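+ + # Worked example (values invented): for a 200 bp region with + # num_sites_nonmask_ref = 180, the gaps/masked fraction is + # 1 - 180/200 = 0.10, which passes the 0.5 cutoff; with 10 'P' and + # 5 'C' sites among those the HMM analyzed, the count filters also + # pass (10 >= 7 and 10 > 5).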
+ + def filter_ambiguous(self, + region: Dict, + seqs: np.array, + threshold: float, + refs: List[str]) -> Tuple[bool, + List[str], + List[float], + List[int]]: + ''' + filter out things we can't assign to one species specifically; + return the other reasonable alternatives if we're filtering + it out + Returns: + True if the region passes the filter + Fails the filter if the number of matches and fraction matching + are above threshold for more than one state for the region + Region is updated with: + A list of likely species for the region + A list of fraction of matching sequence for each species + A list of total matching sites + ''' + + s = region['predicted_species'] + + ids = {} + P_counts = {} + + seqs = np.asarray(seqs) + # skip any gap or unsequenced in ref or test + # also skip if ref and test equal (later test ri == test but not ref) + symbols = self.config.symbols + skip = np.any( + (seqs[0] == symbols['gap'], + seqs[0] == symbols['unsequenced'], + seqs[-1] == symbols['gap'], + seqs[-1] == symbols['unsequenced'], + seqs[0] == seqs[-1]), + axis=0) + + for ri, ref in enumerate(refs): + if ri == 0: + continue + r_match, r_total = seq_functions.seq_id(seqs[-1], seqs[ri]) + if r_total != 0: + ids[ref] = r_match / r_total + P_counts[ref] = np.sum( + np.logical_and( + np.logical_not(skip), + seqs[ri] == seqs[-1])) + + alts = {} + for r in ids.keys(): + # TODO should threshold be the same for both? + if ids[r] >= threshold * ids[s] and \ + P_counts[r] >= threshold * P_counts[s]: + alts[r] = (ids[r], P_counts[r]) + + alt_states = sorted(alts.keys(), + key=lambda x: alts[x][0], + reverse=True) + region['alternative_states'] = ','.join(alt_states) + + alt_ids = [alts[state][0] for state in alt_states] + region['alternative_ids'] = ','.join( + [str(x) for x in alt_ids]) + + alt_P_counts = [alts[state][1] for state in alt_states] + region['alternative_P_counts'] = ','.join( + [str(x) for x in alt_P_counts]) + + return len(alts) <= 1, alt_states + + def validate_arguments(self): + args = [ + 'introgressed', + 'introgressed_intermediate', + 'ambiguous', + 'ambiguous_intermediate', + 'filter_threshold', + 'known_states', + 'regions', + 'region_index', + 'symbols', + 'quality_blocks' + ] + variables = self.config.__dict__ + for arg in args: + if arg not in variables or variables[arg] is None: + err = ('Failed to validate Filterer, required argument ' + f"'{arg}' was unset") + log.exception(err) + raise ValueError(err) + + if 'filter_sweep' not in variables or \ + variables['filter_sweep'] is None: + log.warning("'filter_sweep' was unset and will not be run") + + def run(self, thresholds=[]): + ''' + Filter region files based on the threshold in the config and sweep + with the supplied threshold list + ''' + self.validate_arguments() + known_states = self.config.known_states + log.debug(f'Known states: {known_states}') + + with Filter_Sweep(self.config.filter_sweep, thresholds) as sweeper,\ + ExitStack() as stack: + + progress_bar = None + if self.config.log_file: + progress_bar = stack.enter_context( + click.progressbar( + length=len(known_states[1:]), + label='Filtering')) + + sweeper.write_header() + writers = Filter_Writers(self.config) + + for species_from in known_states[1:]: + + log.info(species_from) + + region_summary, fields = read_table.read_table_rows( + self.config.quality_blocks.format(state=species_from), + '\t') + + with writers.open_state(species_from, fields) as writers,\ + Region_Reader(self.config.regions.format( + state=species_from), as_fa=True) as region_reader: + + writers.write_headers() + + for region_id, _, seqs in region_reader.yield_fa(): + region = region_summary[region_id] + seqs, info_string = seqs[:-1], 
seqs[-1] + + # filtering stage 1: things that we're confident in + # calling not S288c + passes, reason = self.filter_introgressed( + region, + info_string, + known_states[0]) + region['reason'] = reason + + writers.write_introgressed(region_id, region, passes) + + if passes: + sweeper.record( + species_from, + lambda thresh: self.filter_ambiguous( + region, seqs, thresh, known_states)) + + passes, _ = self.filter_ambiguous( + region, seqs, + self.config.filter_threshold, known_states) + writers.write_ambiguous(region_id, region, passes) + + if progress_bar: + progress_bar.update(1) + + sweeper.write_results(known_states[1:]) + + +class Filter_Sweep(): + def __init__(self, + sweep_file: str, + thresholds: List[float]): + self.sweep_file = sweep_file + self.sweep_writer = None + self.thresholds = thresholds + self.data_table = {} + + def __enter__(self): + if self.sweep_file is not None and self.thresholds != []: + self.sweep_writer = open(self.sweep_file, 'w') + + return self + + def __exit__(self, type, value, traceback): + if self.sweep_writer: + self.sweep_writer.close() + + return traceback is None + + def write_header(self): + ''' + Write the header for the sweep filter file + ''' + if self.sweep_writer: + self.sweep_writer.write( + 'threshold\tpredicted_state\talternative_states\tcount\n') + + def record(self, species_from, thresh_lambda): + ''' + Record the thresholds for this filter sweep object. + The thresh lambda is an anonymous function that takes a threshold + and returns a tuple with the value at index 1 being the alternative + states. Filter_ambiguous is what this is meant for. + ''' + if self.sweep_writer is None: + return + + for thresh in self.thresholds: + _, states = thresh_lambda(thresh) + self.record_data_hit(thresh, species_from, states) + + def record_data_hit(self, threshold: float, species: str, states: List): + ''' + adds an entry to the data table or increments if exists + ''' + key = ','.join(sorted(states)) + if threshold not in self.data_table: + self.data_table[threshold] = {} + + if species not in self.data_table[threshold]: + self.data_table[threshold][species] = {} + + if key not in self.data_table[threshold][species]: + self.data_table[threshold][species][key] = 0 + + self.data_table[threshold][species][key] += 1 + + def write_results(self, states): + if self.sweep_writer is None: + return + + for thresh in self.thresholds: + for species in states: + if thresh in self.data_table and \ + species in self.data_table[thresh]: + d = self.data_table[thresh][species] + for key, value in d.items(): + self.sweep_writer.write( + f'{thresh}\t{species}\t{key}\t{value}\n') + + +class Filter_Writers(): + ''' + Writes the filter and intermediate files + ''' + def __init__(self, config): + self.files = { + 'introgressed': config.introgressed, + 'introgressed_int': config.introgressed_intermediate, + 'ambiguous': config.ambiguous, + 'ambiguous_int': config.ambiguous_intermediate + } + self.headers = None + self.writers = None + + @contextmanager + def open_state(self, state: str, fields: List): + ''' + Open output files for the particular state + ''' + self.headers = { + 'introgressed': fields, + 'introgressed_int': fields + ['reason'], + 'ambiguous': fields, + 'ambiguous_int': fields + ['alternative_states', + 'alternative_ids', + 'alternative_P_counts'] + } + + self.writers = {k: open(v.format(state=state), 'w') + for k, v in self.files.items()} + + yield self + + for writer in self.writers.values(): + writer.close() + + self.headers = None + self.writers = None + + 
def write_headers(self): + if self.headers is None or self.writers is None: + return + + for key, writer in self.writers.items(): + writer.write('\t'.join(self.headers[key]) + '\n') + + def write_filtered_line(self, + writer: TextIO, + region_id: str, + region: Dict, + fields: List) -> None: + ''' + Write the region id and values in "region" dict to open file writer + ''' + writer.write(f'{region_id}\t') + writer.write('\t'.join([str(region[field]) for field in fields[1:]])) + writer.write('\n') + + def write_introgressed(self, + region_id: str, + region: Dict, + passes: bool): + self.write_filtered_line( + self.writers['introgressed_int'], + region_id, + region, + self.headers['introgressed_int']) + + if passes: + self.write_filtered_line( + self.writers['introgressed'], + region_id, + region, + self.headers['introgressed']) + + def write_ambiguous(self, + region_id: str, + region: Dict, + passes: bool): + self.write_filtered_line( + self.writers['ambiguous_int'], + region_id, + region, + self.headers['ambiguous_int']) + + if passes: + self.write_filtered_line( + self.writers['ambiguous'], + region_id, + region, + self.headers['ambiguous']) diff --git a/code/analyze/id_regions.py b/code/analyze/id_regions.py new file mode 100644 index 0000000..f964881 --- /dev/null +++ b/code/analyze/id_regions.py @@ -0,0 +1,92 @@ +from contextlib import ExitStack +from operator import itemgetter +from analyze.introgression_configuration import Configuration +from analyze.predict import read_blocks +import click +import logging as log + + +class ID_producer(): + ''' + ID_producer + Adds unique region id to block files + ''' + def __init__(self, configuration: Configuration): + self.config = configuration + + def add_ids(self): + ''' + Adds a unique region id to block files, producing labeled text files + ''' + self.validate_arguments() + regions = dict(zip(self.config.chromosomes, + [[] for _ in self.config.chromosomes])) + with ExitStack() as stack: + writers = {} + + # Progress bars don't seem to show since these complete too fast + progress_bar = None + if self.config.log_file: + progress_bar = stack.enter_context( + click.progressbar( + length=len(self.config.states), + label='Reading in states')) + + for state in self.config.states: + # read in region as dict keyed by strain, chromosome: + # (start, end, number non gapped) + region = read_blocks(self.config.blocks.format(state=state)) + for strain, d_strain in region.items(): + for chrm, d_chrm in d_strain.items(): + for start, end, num in d_chrm: + regions[chrm].append( + (start, end, num, strain, state)) + + # open writer + writers[state] = stack.enter_context( + open(self.config.labeled_blocks.format(state=state), 'w')) + writers[state].write( + 'region_id\tstrain\tchromosome\tpredicted_species\t' + 'start\tend\tnum_sites_hmm\n') + + if progress_bar: + progress_bar.update(1) + id_counter = 1 + + if progress_bar: + progress_bar = stack.enter_context( + click.progressbar( + length=len(regions.keys()), + label='Adding regions')) + + for chrm, entries in regions.items(): + # sort by start, then strain + for start, end, num, strain, state in \ + sorted(entries, key=itemgetter(0, 3)): + writers[state].write( + f'r{id_counter}\t{strain}\t{chrm}\t{state}\t' + f'{start}\t{end}\t{num}\n') + id_counter += 1 + if progress_bar: + progress_bar.update(1) + + def validate_arguments(self): + ''' + Check that all required instance variables are set to perform a + id producer run. 
Returns true if valid, raises value error otherwise + ''' + args = [ + 'chromosomes', + 'blocks', + 'labeled_blocks', + 'states', + ] + variables = self.config.__dict__ + for arg in args: + if arg not in variables or variables[arg] is None: + err = ('Failed to validate ID Producer, required argument ' + f"'{arg}' was unset") + log.exception(err) + raise ValueError(err) + + return True diff --git a/code/analyze/id_regions_main.py b/code/analyze/id_regions_main.py deleted file mode 100644 index 951f124..0000000 --- a/code/analyze/id_regions_main.py +++ /dev/null @@ -1,54 +0,0 @@ -import sys -from analyze import predict -from analyze import read_args -from operator import itemgetter -import global_params as gp - - -def main(): - args = read_args.process_predict_args(sys.argv[1:]) - - # order regions by chromosome, start (break ties alphabetically by strain) - all_regions_by_chrm = dict(zip(gp.chrms, [[] for chrm in gp.chrms])) - output_files = {} - base_dir = gp.analysis_out_dir_absolute + args['tag'] - for species_from in args['states']: - - # strain chromosome predicted_species start end number_non_gap - fn = f'{base_dir}/blocks_{species_from}_{args["tag"]}.txt' - - # introgressed regions keyed by strain and then chromosome: - # (start, end, number_non_gap) - regions = predict.read_blocks(fn) - - for strain in regions: - for chrm in regions[strain]: - for entry in regions[strain][chrm]: - start, end, number_non_gap = entry - all_regions_by_chrm[chrm].append( - (start, end, number_non_gap, strain, species_from)) - - output_files[species_from] = f'{fn[:-4]}_labeled.txt' - - writers = {} - for species_from in args['states']: - writers[species_from] = open(output_files[species_from], 'w') - writers[species_from].write( - 'region_id\tstrain\tchromosome\tpredicted_species\t' - 'start\tend\tnum_sites_hmm\n') - - idc = 1 - for chrm in gp.chrms: - for entry in sorted(all_regions_by_chrm[chrm], key=itemgetter(0, 3)): - (start, end, number_non_gap, strain, species_from) = entry - writers[species_from].write( - f'r{idc}\t{strain}\t{chrm}\t{species_from}\t' - f'{start}\t{end}\t{number_non_gap}\n') - idc += 1 - - for species_from in args['states']: - writers[species_from].close() - - -if __name__ == "__main__": - main() diff --git a/code/analyze/introgression_configuration.py b/code/analyze/introgression_configuration.py new file mode 100644 index 0000000..61a50ff --- /dev/null +++ b/code/analyze/introgression_configuration.py @@ -0,0 +1,399 @@ +import glob +import re +from typing import Tuple, Dict, List +import logging as log +from misc.config_utils import (get_nested, clean_config, merge_dicts, + print_dict, validate, check_wildcards) + + +class Configuration(): + def __init__(self): + self.config = {} + self.log_file = None + + # these are very regular variables with state as a wildcard + state_files = [ + 'blocks', + 'labeled_blocks', + 'quality_blocks', + 'introgressed', + 'introgressed_intermediate', + 'ambiguous', + 'ambiguous_intermediate', + 'regions', + 'region_index', + ] + # no wildcards, non nullable + nonwild_files = [ + 'hmm_initial', + 'hmm_trained', + 'positions', + 'probabilities', + 'strain_info', + 'state_counts', + ] + var_list = [ + Variable('chromosomes'), + Threshold_Variable(), + Convergence_Variable(), + Symbols_Variable(), + Filter_Threshold_Variable(), + Variable('log_file', 'paths.log_file', nullable=True), + Variable('filter_sweep', 'paths.analysis.filter_sweep', + nullable=True), + Variable('masks', 'paths.analysis.masked_intervals', + wildcards='strain,chrom'), + ] + [ + 
Variable(n, f'paths.analysis.{n}', wildcards='state') + for n in state_files + ] + [ + Variable(n, f'paths.analysis.{n}') + for n in nonwild_files + ] + + self.variables = {v.name: v for v in var_list} + # these require too much state from configuration to split out + self.other_parsers = { + 'states': self._set_states, + 'prefix': self._set_prefix, + 'strains': self._set_strains, + 'alignment': self._set_alignment + } + + def add_config(self, configuration: Dict): + ''' + merge the provided configuration dictionary with this object. + Cleans configuration + ''' + self.config = clean_config( + merge_dicts(self.config, configuration)) + + def set(self, *args, **kwargs): + ''' + Set the supplied variable to the value provided. + If just a name is provided, set the value with a value of None + ''' + kwargs.update({a: None for a in args}) + for key, value in kwargs.items(): + if key in self.variables: + variable = self.variables[key] + self.__dict__[key] = variable.parse(value, self.config) + + elif key in self.other_parsers: + self.other_parsers[key](value) + + else: + err = f'Unknown variable to set: {key}' + log.exception(err) + raise ValueError(err) + + def get_states(self) -> Tuple[List, List]: + ''' + Build lists of known and unknown states from the analysis params + ''' + + ref = get_nested(self.config, 'analysis_params.reference.name') + if ref is None: + ref = [] + else: + ref = [ref] + + known = get_nested(self.config, 'analysis_params.known_states') + if known is None: + known = [] + + known_states = ref + [s['name'] for s in known] + + unknown = get_nested(self.config, 'analysis_params.unknown_states') + if unknown is None: + unknown = [] + + unknown_states = [s['name'] for s in unknown] + + return known_states, unknown_states + + def get_interval_states(self) -> List: + ''' + Build list of interval states, typically just known names + but if the state has an interval name, use that + ''' + ref = get_nested(self.config, 'analysis_params.reference') + + # set with name or empty list + if ref is None: + ref = [] + else: + ref = [ref] + + known = get_nested(self.config, 'analysis_params.known_states') + if known is None: + known = [] + + return [s['interval_name'] + if 'interval_name' in s + else s['name'] + for s in ref + known] + + def _set_states(self, states: List[str] = None): + ''' + Set the states for which to perform region naming + ''' + if states is None or states == []: + self.known_states, self.unknown_states = self.get_states() + self.states = self.known_states + self.unknown_states + else: + self.states = states + + self.interval_states = self.get_interval_states() + + if self.states == []: + err = 'No states specified' + log.exception(err) + raise ValueError(err) + + def _set_prefix(self, prefix: str = ''): + ''' + Set prefix string of the predictor to the supplied value or + build it from the known states + ''' + if not prefix: + if self.known_states == []: + err = 'Unable to build prefix, no known states provided' + log.exception(err) + raise ValueError(err) + + self.prefix = '_'.join(self.known_states) + else: + self.prefix = prefix + + def _set_strains(self, test_strains: str = ''): + ''' + build the strains to perform prediction on + ''' + if not test_strains: + test_strains = get_nested(self.config, 'paths.test_strains') + else: + # need to support list for test strains + test_strains = [test_strains] + + if test_strains is not None: + for test_strain in test_strains: + check_wildcards(test_strain, 'strain,chrom') + + self.find_strains(test_strains) + + def 
find_strains(self, test_strains: List[str] = None): + ''' + Helper method to get strains supplied in config, or from test_strains + ''' + strains = get_nested(self.config, 'strains') + self.test_strains = test_strains + + if strains is None: + if test_strains is None: + err = ('Unable to find strains in config and ' + 'no test_strains provided') + log.exception(err) + raise ValueError(err) + + # try to build strains from wildcards in test_strains + strains = {} + for test_strain in test_strains: + # find matching files + strain_glob = test_strain.format( + strain='*', + chrom='*') + log.info(f'searching for {strain_glob}') + for fname in glob.iglob(strain_glob): + # extract wildcard matches + match = re.match( + test_strain.format( + strain='(?P<strain>.*?)', + chrom='(?P<chrom>[^_]*?)' + ), + fname) + if match: + log.debug( + f'matched with {match.group("strain", "chrom")}') + strain, chrom = match.group('strain', 'chrom') + if strain not in strains: + strains[strain] = set() + strains[strain].add(chrom) + + if len(strains) == 0: + err = ('Found no chromosome sequence files ' + f'in {test_strains}') + log.exception(err) + raise ValueError(err) + + # check if requested chromosomes are within the list of chroms + chrom_set = set(self.chromosomes) + for strain, chroms in strains.items(): + if not chrom_set.issubset(chroms): + not_found = chrom_set.difference(chroms).pop() + err = (f'Strain {strain} is missing chromosomes. ' + f'Unable to find chromosome \'{not_found}\'') + log.exception(err) + raise ValueError(err) + + self.strains = list(sorted(strains.keys())) + + else: # strains set in config + self.strains = list(sorted(set(strains))) + + def _set_alignment(self, alignment: str): + ''' + Set the alignment file, checking wildcards prefix, strain and chrom. + If prefix is present, it is substituted, otherwise checks just + strain and chrom + ''' + alignment = validate(self.config, + 'paths.analysis.alignment', + 'No alignment provided', + alignment) + + check_wildcards(alignment, 'strain,chrom') + if '{prefix}' in alignment: + self.alignment = alignment.replace('{prefix}', self.prefix) + else: + self.alignment = alignment + + def get(self, key: str): + ''' + Get nested key from underlying dictionary. 
Returning none if any + key is not in dict + ''' + return get_nested(self.config, key) + + def __repr__(self): + return ('Config file:\n' + + print_dict(self.config) + + '\nSettings:\n' + + print_dict({k: v for k, v in self.__dict__.items() + if k != 'config' and k != 'variables' + and k != 'other_parsers'}) + ) + + +class Variable(): + def __init__(self, name, config_path=None, nullable=False, wildcards=None): + self.name = name + if config_path: + self.config_path = config_path + else: + self.config_path = name + + self.nullable = nullable + self.wildcards = wildcards + + def parse(self, value, config={}): + if self.nullable: + if not value: + value = get_nested(config, self.config_path) + + else: + value = validate(config, self.config_path, + f'No {self.name} provided', value) + + if self.wildcards: + check_wildcards(value, self.wildcards) + + return value + + +class Threshold_Variable(Variable): + def __init__(self): + super().__init__('threshold', 'analysis_params.threshold') + + def parse(self, value, config={}): + value = super().parse(value, config) + + try: + value = float(value) + + except ValueError: + if value != 'viterbi': + err = f'Unsupported threshold value: {value}' + log.exception(err) + raise ValueError(err) + + return value + + +class Filter_Threshold_Variable(Variable): + def __init__(self): + super().__init__('filter_threshold', + 'analysis_params.filter_threshold') + + def parse(self, value, config={}): + value = super().parse(value, config) + + try: + value = float(value) + + except (ValueError, TypeError): + err = 'Filter threshold is not a valid number' + log.exception(err) + raise ValueError(err) + + return value + + +class Convergence_Variable(Variable): + def __init__(self): + super().__init__('convergence', + 'analysis_params.convergence_threshold', + nullable=True) + + def parse(self, value, config={}): + value = super().parse(value, config) + + try: + value = float(value) + + except (ValueError, TypeError): + log.warning('No value set for convergence_threshold, using ' + 'default of 0.001') + value = 0.001 + + return value + + +class Symbols_Variable(Variable): + def __init__(self): + super().__init__('symbols', '') + + def parse(self, value, config): + ''' + Set symbols based on config values, using defaults if unset + ''' + symbols = { + 'match': '+', + 'mismatch': '-', + 'unknown': '?', + 'unsequenced': 'n', + 'gap': '-', + 'unaligned': '?', + 'masked': 'x' + } + config_symbols = get_nested(config, 'HMM_symbols') + if config_symbols is not None: + for k, v in config_symbols.items(): + if k not in symbols: + log.warning("Unused symbol in configuration: " + f"{k} -> '{v}'") + else: + symbols[k] = v + log.debug(f"Overwriting default symbol for {k} with '{v}'") + + for k, v in symbols.items(): + if k not in config_symbols: + log.warning(f'Symbol for {k} unset in config, ' + f"using default '{v}'") + + else: + for k, v in symbols.items(): + log.warning(f'Symbol for {k} unset in config, ' + f"using default '{v}'") + + return symbols diff --git a/code/analyze/main.py b/code/analyze/main.py new file mode 100644 index 0000000..bff61bd --- /dev/null +++ b/code/analyze/main.py @@ -0,0 +1,337 @@ +import click +import yaml +import logging as log +import analyze.predict +from analyze.introgression_configuration import Configuration +from analyze.id_regions import ID_producer +from analyze.summarize_region_quality import Summarizer +from analyze.filter_regions import Filterer +from analyze.summarize_strain_states import Strain_Summarizer + + +# TODO also check for 
snakemake object? +@click.group(invoke_without_command=True) +@click.option('--config', '-c', + multiple=True, + type=click.File('r'), + help='Base configuration yaml.') +@click.option('-v', '--verbosity', count=True, default=3) +@click.option('--log-file', + default='', + help='Optional log file. If unset print to stdout.') +@click.pass_context +def cli(ctx, config, verbosity, log_file): + ''' + Main entry script to run analyze methods + ''' + + verbosity -= 1 + verbosity = 4 if verbosity > 4 else verbosity + levelstr, level = [ + ('CRITICAL', log.CRITICAL), + ('ERROR', log.ERROR), + ('WARNING', log.WARNING), + ('INFO', log.INFO), + ('DEBUG', log.DEBUG), + ][verbosity] + + ctx.ensure_object(Configuration) + + confs = len(config) + for path in config: + conf = yaml.safe_load(path) + ctx.obj.add_config(conf) + + ctx.obj.set(log_file=log_file) + if ctx.obj.log_file is not None: + log.basicConfig(level=level, filename=ctx.obj.log_file, filemode='w') + else: + log.basicConfig(level=level) + log.info(f'Verbosity set to {levelstr}') + + log.info(f'Read in {confs} config file{"" if confs == 1 else "s"}') + log.debug('Cleaned config:\n' + repr(ctx.obj)) + + if ctx.invoked_subcommand is None: + click.echo_via_pager( + click.style( + 'No command supplied. Read in the following config:\n', + fg='yellow') + repr(ctx.obj)) + + +@cli.command() +@click.option('--blocks', default='', help='Block file location with {state}') +@click.option('--prefix', default='', help='Prefix of test-strain files ' + 'default to list of states joined with _.') +@click.option('--test-strains', default='', + help='Test files location with {strain} and {chrom}') +@click.option('--hmm-initial', default='', + help='Initial hmm parameter text file') +@click.option('--hmm-trained', default='', + help='Trained hmm parameter text file') +@click.option('--positions', default='', + help='Positions file, gzipped') +@click.option('--probabilities', default='', + help='Probabilities file, gzipped') +@click.option('--threshold', default='', + help='Threshold to apply to estimated path. Valid values are ' + 'floats or `viterbi\'') +@click.option('--alignment', default='', + help='Alignment file location with ' + '{prefix}, {strain}, and {chrom}') +@click.option('--only-poly-sites/--all-sites', default=True, + help='Consider only polymorphic sites or all sites. 
' 'Default is only polymorphic.') +@click.pass_context +def predict(ctx, + blocks, + prefix, + test_strains, + hmm_initial, + hmm_trained, + positions, + probabilities, + threshold, + alignment, + only_poly_sites): + config = ctx.obj + + config.set('chromosomes') + log.info(f'Found {len(config.chromosomes)} chromosomes in config') + + config.set(threshold=threshold) + log.info(f'Threshold value is \'{config.threshold}\'') + + config.set(blocks=blocks) + log.info(f'Output blocks file is \'{config.blocks}\'') + + config.set('states') + config.set(prefix=prefix) + log.info(f'Prefix is \'{config.prefix}\'') + + config.set(strains=test_strains) + if config.test_strains is None: + log.info('No test_strains provided') + else: + str_len = len(config.test_strains) + log.info(f'Found {str_len} test strain' + f'{"" if str_len == 1 else "s"}') + str_len = len(config.strains) + log.info(f'Found {str_len} unique strain' + f'{"" if str_len == 1 else "s"}') + + config.set(hmm_initial=hmm_initial, + hmm_trained=hmm_trained, + positions=positions, + probabilities=probabilities, + alignment=alignment) + log.info(f'Hmm_initial file is \'{config.hmm_initial}\'') + log.info(f'Hmm_trained file is \'{config.hmm_trained}\'') + log.info(f'Positions file is \'{config.positions}\'') + log.info(f'Probabilities file is \'{config.probabilities}\'') + log.info(f'Alignment file is \'{config.alignment}\'') + + predictor = analyze.predict.Predictor(config) + if only_poly_sites: + log.info('Only considering polymorphic sites') + else: + log.info('Considering all sites') + predictor.run_prediction(only_poly_sites) + + +@cli.command() +@click.option('--blocks', default='', help='Input block file location with {state}') +@click.option('--labeled', default='', help='Output labeled block file location with {state}') +@click.option('--state', multiple=True, help='States to add ids to') +@click.pass_context +def id_regions(ctx, blocks, labeled, state): + config = ctx.obj + config.set('chromosomes') + log.info(f'Found {len(config.chromosomes)} chromosomes in config') + + state = list(state) + config.set(states=state) + log.info(f'Found {len(config.states)} states to process') + + config.set(blocks=blocks) + log.info(f'Input blocks file is \'{config.blocks}\'') + + config.set(labeled_blocks=labeled) + log.info(f'Output blocks file is \'{config.labeled_blocks}\'') + + id_producer = ID_producer(config) + id_producer.add_ids() + + +@cli.command() +@click.option('--state', multiple=True, help='States to summarize') +@click.option('--labeled', default='', + help='Labeled block file with {state}, ' + 'created during id_regions') +@click.option('--masks', default='', + help='Mask file with {strain} and {chrom}') +@click.option('--alignment', default='', + help='Alignment file with {prefix} [optional], ' + '{strain} and {chrom}') +@click.option('--positions', default='', + help='Position file created during prediction') +@click.option('--quality', default='', + help='Output quality file with {state}') +@click.option('--region', default='', + help='Output region file with {state}, gzipped') +@click.option('--region-index', default='', + help='Output region index file with {state}, pickled') +@click.pass_context +def summarize_regions(ctx, + state, + labeled, + quality, + masks, + alignment, + positions, + region, + region_index): + config = ctx.obj + + config.set('states', + 'chromosomes') + log.info(f'Found {len(config.chromosomes)} chromosomes in config') + + config.set(labeled_blocks=labeled) + log.info(f'Labeled blocks file is \'{config.labeled_blocks}\'')
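+ + # Each config.set call in this command resolves its CLI option against + # the loaded config files: a blank option falls back to the matching + # paths.analysis entry, and a value missing from both raises ValueError + # (see Variable.parse in introgression_configuration.py).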
+ + config.set(quality_blocks=quality) + log.info(f'Quality file is \'{config.quality_blocks}\'') + + config.set(masks=masks) + log.info(f'Mask file is \'{config.masks}\'') + + config.set('prefix') + config.set(alignment=alignment) + log.info(f'Alignment file is \'{config.alignment}\'') + + config.set(positions=positions) + log.info(f'Positions file is \'{config.positions}\'') + + config.set(regions=region, region_index=region_index) + log.info(f'Region file is \'{config.regions}\'') + log.info(f'Region index file is \'{config.region_index}\'') + + config.set('symbols') + + summarizer = Summarizer(config) + summarizer.run(list(state)) + + +@cli.command() +@click.option('--thresh', help='Threshold to apply to ambiguous filter', + default=None, type=float) +@click.option('--introgress-filter', default='', + help='Filtered block file location with {state}.' + ' Contains only regions passing introgression filter') +@click.option('--introgress-inter', default='', + help='Filtered block file location with {state}.' + ' Contains all regions with reasons they failed filtering') +@click.option('--ambiguous-filter', default='', + help='Filtered block file location with {state}.' + ' Contains only regions passing ambiguous filter') +@click.option('--ambiguous-inter', default='', + help='Filtered block file location with {state}.' + ' Contains all regions passing introgression filtering, ' + 'with reasons they failed ambiguous filtering') +@click.option('--filter-sweep', default='', + help='Contains summary results for applying ambiguous filter ' + 'with various threshold values supplied as arguments.') +@click.option('--region', default='', + help='Region file with {state}, gzipped') +@click.option('--region-index', default='', + help='Region index file with {state}, pickled') +@click.option('--quality', default='', + help='Quality file with {state}') +@click.argument('thresholds', nargs=-1, type=float) +@click.pass_context +def filter_regions(ctx, + thresh, + introgress_filter, + introgress_inter, + ambiguous_filter, + ambiguous_inter, + filter_sweep, + region, + region_index, + quality, + thresholds): + config = ctx.obj # type: Configuration + config.set('states') + + config.set(filter_threshold=thresh) + log.info(f"Filter threshold set to '{config.filter_threshold}'") + + config.set(introgressed=introgress_filter, + introgressed_intermediate=introgress_inter, + ambiguous=ambiguous_filter, + ambiguous_intermediate=ambiguous_inter, + filter_sweep=filter_sweep) + log.info(f"Introgressed filtered file is '{config.introgressed}'") + log.info('Introgressed intermediate file is ' + f"'{config.introgressed_intermediate}'") + log.info(f"Ambiguous filtered file is '{config.ambiguous}'") + log.info('Ambiguous intermediate file is ' + f"'{config.ambiguous_intermediate}'") + if config.filter_sweep is not None: + log.info(f"Filter sweep file is '{config.filter_sweep}'") + + config.set(regions=region, + region_index=region_index) + log.info(f'Region file is \'{config.regions}\'') + log.info(f'Region index file is \'{config.region_index}\'') + + config.set(quality_blocks=quality) + log.info(f'Quality file is \'{config.quality_blocks}\'') + + config.set('symbols') + + thresholds = list(thresholds) + log.info(f'Threshold sweep with: {thresholds}') + + filterer = Filterer(config) + filterer.run(thresholds)
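+ + # Hypothetical invocation (names and values invented): apply the + # configured cutoff and sweep three alternatives through the positional + # 'thresholds' argument, e.g. + # introgression -c base.yaml filter-regions --thresh 0.98 0.90 0.95 0.99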
+ + +@cli.command() +@click.option('--introgress-inter', default='', + help='Filtered block file location with {state}.' + ' Contains all regions with reasons they failed filtering') +@click.option('--ambiguous-inter', default='', + help='Filtered block file location with {state}.' + ' Contains all regions passing introgression filtering, ' + 'with reasons they failed ambiguous filtering') +@click.option('--strain-info', default='', + help='Tab separated table with strain name, alternate name, ' + 'location, environment, and population') +@click.option('--state-counts', default='', + help='Output state summary file') +@click.pass_context +def summarize_strains(ctx, + introgress_inter, + ambiguous_inter, + strain_info, + state_counts): + config = ctx.obj # type: Configuration + config.set('states') + config.set(introgressed_intermediate=introgress_inter, + ambiguous_intermediate=ambiguous_inter, + strain_info=strain_info, + state_counts=state_counts) + log.info('Introgressed intermediate file is ' + f"'{config.introgressed_intermediate}'") + log.info('Ambiguous intermediate file is ' + f"'{config.ambiguous_intermediate}'") + log.info(f"Strain information from '{config.strain_info}'") + log.info(f"State counts saved to '{config.state_counts}'") + strain_summarizer = Strain_Summarizer(config) + strain_summarizer.run() + + +if __name__ == '__main__': + cli()
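The new entry points also compose outside of click. Below is a minimal sketch of the filter step, assuming a `config.yaml` that supplies the `paths.analysis` and `analysis_params` sections; the file name and threshold values are invented, and the set calls mirror the `filter_regions` command above rather than documenting a separate API:
```python
import yaml

from analyze.filter_regions import Filterer
from analyze.introgression_configuration import Configuration

# Build a Configuration the way the cli group does, from one or more
# yaml files ('config.yaml' here is a placeholder).
config = Configuration()
with open('config.yaml') as reader:
    config.add_config(yaml.safe_load(reader))

# Resolve the variables the filter step needs; blank or None values
# fall back to the entries in the config file, as in the click command.
config.set('states', 'symbols')
config.set(filter_threshold=None,
           introgressed='',
           introgressed_intermediate='',
           ambiguous='',
           ambiguous_intermediate='',
           filter_sweep='',
           regions='',
           region_index='',
           quality_blocks='')

# Apply the configured threshold and sweep two alternative cutoffs.
Filterer(config).run(thresholds=[0.95, 0.99])
```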
diff --git a/code/analyze/plotting/format_for_plot_gene_region.py b/code/analyze/plotting/format_for_plot_gene_region.py index 306576e..176912f 100644 --- a/code/analyze/plotting/format_for_plot_gene_region.py +++ b/code/analyze/plotting/format_for_plot_gene_region.py @@ -7,29 +7,23 @@ # - for gap -import re -import sys -import os -import copy import gzip -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../align/') -import align_helpers -sys.path.insert(0, '../misc/') -import read_fasta -import read_table +from align import align_helpers +from misc import read_fasta # copy pasta + def try_int(s, default=-1): try: i = int(s) return i - except: + except ValueError: return default -def referize(strain_seq, ref_ind_to_strain_ind, skip_char = 'N'): + +def referize(strain_seq, ref_ind_to_strain_ind, skip_char='N'): s = [skip_char for r in ref_ind_to_strain_ind] for i in range(len(ref_ind_to_strain_ind)): si = ref_ind_to_strain_ind[i] @@ -39,31 +33,34 @@ def referize(strain_seq, ref_ind_to_strain_ind, skip_char = 'N'): s[i] = strain_seq[si] return s -#region_start = 787000 -#region_end = 794000 -#chrm = 'II' + +# region_start = 787000 +# region_end = 794000 +# chrm = 'II' region_start = 917571 - 100 region_end = 921647 + 100 chrm = 'IV' region_length = region_end - region_start + 1 -##====== +# ====== # get strains -##====== +# ====== -strain_dirs = align_helpers.get_strains(align_helpers.flatten(gp.non_ref_dirs.values())) +strain_dirs = align_helpers.get_strains( + align_helpers.flatten(gp.non_ref_dirs.values())) num_strains = len(strain_dirs) -##====== +# ====== # loop through all strains, getting appropriate sequence -##====== +# ====== # master reference and other reference seqs master_ref = gp.alignment_ref_order[0] master_fn = gp.ref_dir[master_ref] + gp.ref_fn_prefix[master_ref] + '_chr' + \ chrm + gp.fasta_suffix -master_seq = read_fasta.read_fasta(master_fn)[1][0][region_start:region_end+1].lower() +master_seq = read_fasta.read_fasta(master_fn)[1][0][ + region_start:region_end+1].lower() other_ref = gp.alignment_ref_order[1] @@ -74,30 +71,30 @@ def referize(strain_seq, ref_ind_to_strain_ind, skip_char = 'N'): ref_ind_to_strain_ind = [try_int(line[:-1]) for line in f_coord.readlines()] other_ref_fn = gp.ref_dir[other_ref] + gp.ref_fn_prefix[other_ref] + \ '_chr' + chrm + gp.fasta_suffix -other_ref_seq = referize(read_fasta.read_fasta(other_ref_fn)[1][0].lower(), \ +other_ref_seq = referize(read_fasta.read_fasta(other_ref_fn)[1][0].lower(), ref_ind_to_strain_ind)[region_start:region_end+1] # other strains seqs = {} for i in range(num_strains): strain, d = strain_dirs[i] - print strain - coord_fn = gp.analysis_out_dir_absolute + 'coordinates/' + \ - gp.master_ref + '_to_' + strain + \ - '_chr' + chrm + '.txt.gz' + print(strain) + coord_fn = (gp.analysis_out_dir_absolute + 'coordinates/' + + gp.master_ref + '_to_' + strain + + '_chr' + chrm + '.txt.gz') f_coord = gzip.open(coord_fn, 'rb') - ref_ind_to_strain_ind = [try_int(line[:-1]) for line in f_coord.readlines()] + ref_ind_to_strain_ind = [try_int(line[:-1]) + for line in f_coord.readlines()] strain_fn = d + strain + '_chr' + chrm + gp.fasta_suffix - seqs[strain] = referize(read_fasta.read_fasta(strain_fn)[1][0].lower(), \ + seqs[strain] = referize(read_fasta.read_fasta(strain_fn)[1][0].lower(), ref_ind_to_strain_ind)[region_start:region_end+1] # write file fn = 'gene_region_variants.txt' f = open(fn, 'w') -f.write('ps\t' + '\t'.join([x[0] for x in strain_dirs]) + '\n') +f.write('ps\t' + '\t'.join([x[0] for x in strain_dirs]) + '\n') for i in range(region_length): - f.write(str(region_start + i)) for strain, d in strain_dirs: x = seqs[strain][i] @@ -119,4 +116,3 @@ def referize(strain_seq, ref_ind_to_strain_ind, skip_char = 'N'): f.write('n') f.write('\n') f.close() - diff --git a/code/analyze/plotting/format_for_plotting.py b/code/analyze/plotting/format_for_plotting.py index fca2521..d0aca70 100644 --- a/code/analyze/plotting/format_for_plotting.py +++ b/code/analyze/plotting/format_for_plotting.py @@ -1,35 +1,29 @@ # format output files to be read easily and plotted in R -import re import sys -import os -import copy import gene_predictions -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../sim/') -import sim_analyze_hmm_bw as sim -sys.path.insert(0, '../misc/') -import mystats +from misc import mystats -##====== +# ====== # read in analysis parameters -##====== +# ====== suffix = '' if len(sys.argv == 3): suffix = sys.argv[1] -all_predict_args = [x.strip().split() for x in open(sys.argv[2], 'r').readlines()] -all_predict_args = [{'tag':x[0], 'improvement_frac':x[1], 'threshold':x[2], \ - 'expected_length':x[-2], 'expected_frac':x[-1]} \ +all_predict_args = [x.strip().split() + for x in open(sys.argv[2], 'r').readlines()] +all_predict_args = [{'tag': x[0], 'improvement_frac': x[1], 'threshold': x[2], + 'expected_length':x[-2], 'expected_frac':x[-1]} for x in all_predict_args] -l = range(0,36) -l.remove(19) -l.remove(25) -l = [0] -all_predict_args = [all_predict_args[i] for i in l] +arg_inds = list(range(0, 36)) +arg_inds.remove(19) +arg_inds.remove(25) +arg_inds = [0] +all_predict_args = [all_predict_args[i] for i in arg_inds] ''' finished = range(1,36) @@ -49,39 +43,42 @@ sep = '\t' -##====== +# ====== # for plot: lengths of all introgressed regions -##====== +# ====== # one table for each tag # strain chrm region_length # one table for all tags -# tag improvement_frac threshold expected_length expected_frac avg_length lower upper median min max total_num_regions +# tag improvement_frac threshold expected_length expected_frac +# avg_length lower upper median min max total_num_regions -print 'working on region lengths' +print('working on region lengths') f = open(gp.analysis_out_dir_absolute + 'plot_region_lengths.txt', 'w') for i in 
range(len(all_predict_args)): - print '-', i + print('-', i) args = all_predict_args[i] - f_tag = open(gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'plot_region_lengths' + suffix + '_' + args['tag'] + '.txt', 'w') - fn = gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'introgressed_blocks_par' + suffix + '_' + args['tag'] + '_summary_plus.txt' + f_tag = open(gp.analysis_out_dir_absolute + args['tag'] + '/' + + 'plot_region_lengths' + suffix + '_' + args['tag'] + + '.txt', 'w') + fn = (gp.analysis_out_dir_absolute + args['tag'] + '/' + + 'introgressed_blocks_par' + suffix + '_' + + args['tag'] + '_summary_plus.txt') region_summary = gene_predictions.read_region_summary(fn) lengths_all = [] for region in region_summary: length = int(region_summary[region]['end']) - \ int(region_summary[region]['start']) + 1 if int(region_summary[region]['number_match_ref2_not_ref1']) >= 5: - f_tag.write(region + sep + region_summary[region]['strain'] + sep + \ - region_summary[region]['chromosome'] + sep + \ + f_tag.write(region + sep + region_summary[region]['strain'] + sep + + region_summary[region]['chromosome'] + sep + str(length) + '\n') lengths_all.append(length) f_tag.close() - f.write(args['tag'] + sep + args['improvement_frac'] + sep + \ - args['threshold'] + sep + args['expected_length'] + sep + \ + f.write(args['tag'] + sep + args['improvement_frac'] + sep + + args['threshold'] + sep + args['expected_length'] + sep + args['expected_frac'] + sep) f.write(str(mystats.mean(lengths_all)) + sep) bs_lower, bs_upper = mystats.bootstrap(lengths_all) @@ -92,43 +89,44 @@ f.write(str(len(lengths_all)) + '\n') f.close() -print 'done' +print('done') sys.exit() -##====== +# ====== # for plot: number of genes per introgressed region -##====== +# ====== # one table for each tag # strain chrm region number_genes # one table for all tags -# tag improvement_frac threshold expected_length expected_frac avg_genes_per_region lower upper median min max +# tag improvement_frac threshold expected_length expected_frac +# avg_genes_per_region lower upper median min max -print 'working on number of genes for each region' +print('working on number of genes for each region') f = open(gp.analysis_out_dir_absolute + 'plot_number_genes_by_region.txt', 'w') for i in range(len(all_predict_args)): - print '-', i + print('-', i) args = all_predict_args[i] - f_tag = open(gp.analysis_out_dir_absolute + args['tag'] + '/' + \ + f_tag = open(gp.analysis_out_dir_absolute + args['tag'] + '/' + 'plot_number_genes_by_region_' + args['tag'] + '.txt', 'w') - fn = gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'genes_for_each_region_' + args['tag'] + '.txt' + fn = (gp.analysis_out_dir_absolute + args['tag'] + '/' + + 'genes_for_each_region_' + args['tag'] + '.txt') genes = gene_predictions.read_genes_for_each_region_summary(fn) - fn = gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'introgressed_blocks_par_' + args['tag'] + '_summary.txt' + fn = (gp.analysis_out_dir_absolute + args['tag'] + '/' + + 'introgressed_blocks_par_' + args['tag'] + '_summary.txt') region_summary = gene_predictions.read_region_summary(fn) num_genes_all = [] for region in genes: - f_tag.write(region + sep + region_summary[region]['strain'] + sep + \ - region_summary[region]['chromosome'] + sep + \ + f_tag.write(region + sep + region_summary[region]['strain'] + sep + + region_summary[region]['chromosome'] + sep + genes[region]['num_genes'] + '\n') num_genes_all.append(int(genes[region]['num_genes'])) f_tag.close() - f.write(args['tag'] + sep 
+ args['improvement_frac'] + sep + \ - args['threshold'] + sep + args['expected_length'] + sep + \ + f.write(args['tag'] + sep + args['improvement_frac'] + sep + + args['threshold'] + sep + args['expected_length'] + sep + args['expected_frac'] + sep) f.write(str(mystats.mean(num_genes_all)) + sep) bs_lower, bs_upper = mystats.bootstrap(num_genes_all) @@ -138,91 +136,96 @@ f.write(str(max(num_genes_all)) + '\n') f.close() -print 'done' +print('done') -##====== +# ====== # for plot: number of introgressed bases for each strain -##====== +# ====== # one table for all tags -# tag improvement_frac threshold expected_length expected_frac strain number_bases +# tag improvement_frac threshold expected_length +# expected_frac strain number_bases -print 'working on number of bases for each strain' +print('working on number of bases for each strain') -f = open(gp.analysis_out_dir_absolute + \ +f = open(gp.analysis_out_dir_absolute + 'plot_number_introgressed_bases_by_strain.txt', 'w') for i in range(len(all_predict_args)): - print '-', i + print('-', i) args = all_predict_args[i] - fn = gp.analysis_out_dir_absolute + '/' + args['tag'] + '/' + \ - 'regions_for_each_strain_' + args['tag'] + '.txt' + fn = (gp.analysis_out_dir_absolute + '/' + args['tag'] + '/' + + 'regions_for_each_strain_' + args['tag'] + '.txt') regions = gene_predictions.read_regions_for_each_strain(fn) for strain in regions: total = 0 for r in regions[strain]['region_list']: total += int(r[1]) - f.write(args['tag'] + sep + args['improvement_frac'] + sep + \ - args['threshold'] + sep + args['expected_length'] + sep + \ + f.write(args['tag'] + sep + args['improvement_frac'] + sep + + args['threshold'] + sep + args['expected_length'] + sep + args['expected_frac'] + sep + strain + sep + str(total) + '\n') f.close() -print 'done' +print('done') -##====== +# ====== # for plot: number of introgressed genes for each strain -##====== +# ====== # one table for all tags -# tag improvement_frac threshold expected_length expected_frac strain number_genes +# tag improvement_frac threshold expected_length +# expected_frac strain number_genes -print 'working on number of genes for each strain' +print('working on number of genes for each strain') -f = open(gp.analysis_out_dir_absolute + \ +f = open(gp.analysis_out_dir_absolute + 'plot_number_introgressed_genes_by_strain.txt', 'w') for i in range(len(all_predict_args)): - print '-', i + print('-', i) args = all_predict_args[i] - fn = gp.analysis_out_dir_absolute + '/' + args['tag'] + '/' + \ - 'genes_for_each_strain_' + args['tag'] + '.txt' + fn = (gp.analysis_out_dir_absolute + '/' + args['tag'] + '/' + + 'genes_for_each_strain_' + args['tag'] + '.txt') genes = gene_predictions.read_genes_for_each_strain(fn) for strain in genes: - f.write(args['tag'] + sep + args['improvement_frac'] + sep + \ - args['threshold'] + sep + args['expected_length'] + sep + \ - args['expected_frac'] + sep + strain + sep + \ + f.write(args['tag'] + sep + args['improvement_frac'] + sep + + args['threshold'] + sep + args['expected_length'] + sep + + args['expected_frac'] + sep + strain + sep + genes[strain]['num_genes'] + sep + '\n') f.close() -print 'done' +print('done') -##====== -# for plot: number of strains each gene introgressed in -##====== +# ====== +# for plot: number of strains each gene introgressed in +# ====== # one table for each tag # gene num_strains # one table for all tags -# tag improvement_frac threshold expected_length expected_frac avg_strains_per_gene lower upper median min max total_num_genes 
total_num_genes_1 total_num_genes_>1 +# tag improvement_frac threshold expected_length expected_frac +# avg_strains_per_gene lower upper median min max total_num_genes +# total_num_genes_1 total_num_genes_>1 -print 'working on number of strains for each gene' +print('working on number of strains for each gene') -f = open(gp.analysis_out_dir_absolute + 'plot_number_strains_by_genes.txt', 'w') +f = open(gp.analysis_out_dir_absolute + 'plot_number_strains_by_genes.txt', + 'w') for i in range(len(all_predict_args)): - print '-', i + print('-', i) args = all_predict_args[i] - f_tag = open(gp.analysis_out_dir_absolute + args['tag'] + '/' + \ + f_tag = open(gp.analysis_out_dir_absolute + args['tag'] + '/' + 'plot_number_strains_by_genes_' + args['tag'] + '.txt', 'w') - fn = gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'strains_for_each_gene_' + args['tag'] + '.txt' + fn = (gp.analysis_out_dir_absolute + args['tag'] + '/' + + 'strains_for_each_gene_' + args['tag'] + '.txt') strains = gene_predictions.read_strains_for_each_gene(fn) num_strains_all = [] for gene in strains: f_tag.write(gene + sep + strains[gene]['num_strains'] + '\n') num_strains_all.append(int(strains[gene]['num_strains'])) f_tag.close() - f.write(args['tag'] + sep + args['improvement_frac'] + sep + \ - args['threshold'] + sep + args['expected_length'] + sep + \ + f.write(args['tag'] + sep + args['improvement_frac'] + sep + + args['threshold'] + sep + args['expected_length'] + sep + args['expected_frac'] + sep) f.write(str(mystats.mean(num_strains_all)) + sep) bs_lower, bs_upper = mystats.bootstrap(num_strains_all) @@ -235,24 +238,25 @@ f.write(str(len(filter(lambda x: x > 1, num_strains_all))) + '\n') f.close() -print 'done' +print('done') -##====== -# for plot: average fraction of each (introgressed) gene that's introgressed -##====== +# ====== +# for plot: average fraction of each (introgressed) gene that's introgressed +# ====== # one table for each tag # gene avg_frac_introgressed lower upper median min max -print 'working on fraction of gene introgressed' +print('working on fraction of gene introgressed') for i in range(len(all_predict_args)): - print '-', i + print('-', i) args = all_predict_args[i] - f_tag = open(gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'plot_frac_introgressed_by_genes_' + args['tag'] + '.txt', 'w') - fn = gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'strains_for_each_gene_' + args['tag'] + '.txt' + f_tag = open(gp.analysis_out_dir_absolute + args['tag'] + '/' + + 'plot_frac_introgressed_by_genes_' + args['tag'] + '.txt', + 'w') + fn = (gp.analysis_out_dir_absolute + args['tag'] + '/' + + 'strains_for_each_gene_' + args['tag'] + '.txt') strains = gene_predictions.read_strains_for_each_gene(fn) for gene in strains: fracs = [float(x[1]) for x in strains[gene]['strain_list']] @@ -265,4 +269,4 @@ f_tag.write(str(max(fracs)) + '\n') f_tag.close() -print 'done' +print('done') diff --git a/code/analyze/plotting/format_for_plotting2.py b/code/analyze/plotting/format_for_plotting2.py index 3362e63..4d9eaca 100644 --- a/code/analyze/plotting/format_for_plotting2.py +++ b/code/analyze/plotting/format_for_plotting2.py @@ -1,14 +1,6 @@ -import re import sys -import os -import copy -import gene_predictions -sys.path.insert(0, '..') +from analyze.to_update import gene_predictions import global_params as gp -sys.path.insert(0, '../sim/') -import sim_analyze_hmm_bw as sim -sys.path.insert(0, '../misc/') -import mystats tag = sys.argv[1] @@ -16,15 +8,15 @@ if len(sys.argv == 3): suffix = 
sys.argv[2] -fn = gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'introgressed_blocks_par' + suffix + '_' + args['tag'] + '_summary_plus.txt' +fn = gp.analysis_out_dir_absolute + tag + '/' + \ + 'introgressed_blocks_par' + suffix + '_' + tag + '_summary_plus.txt' region_summary = gene_predictions.read_region_summary(fn) sep = '\t' -##====== +# ====== # for plot: lengths of all introgressed regions -##====== +# ====== # one table for each tag # strain chrm region_length diff --git a/code/analyze/plotting/format_for_plotting_region.py b/code/analyze/plotting/format_for_plotting_region.py index b6454dc..15cebad 100644 --- a/code/analyze/plotting/format_for_plotting_region.py +++ b/code/analyze/plotting/format_for_plotting_region.py @@ -1,24 +1,20 @@ -import gene_predictions +from analyze.to_update import gene_predictions import sys import os import gzip -sys.path.insert(0, '../misc/') -import read_fasta import global_params as gp -sys.path.insert(0, '../sim/') + def read_annotated_alignment(fn, nstrains): f = gzip.open(fn, 'rb') lines = f.readlines() f.close() - strains = [l[:-1] for l in lines[:nstrains]] genes = lines[nstrains + 2][len('genes:'):-1].split() - + x = 11 match_cer = '' match_par = '' gene = '' - gene_ind = -1 intd = '' while x < len(lines): @@ -38,13 +34,12 @@ def read_annotated_alignment(fn, nstrains): return match_cer, match_par, gene, genes, intd -def write_ps_annotated(match_cer, match_par, gene, glist, intd, region, fn): +def write_ps_annotated(match_cer, match_par, gene, glist, intd, region, fn): f = open(fn, 'w') f.write('ps\tmatch\tintd\tgene\n') - block_start = int(region['start']) - intd.index('I') - block_end = len(intd) - intd.rindex('I') + int(region['end']) + block_start = int(region['start']) - intd.index('I') out_of_gene = True gene_ind = -1 @@ -64,11 +59,13 @@ def write_ps_annotated(match_cer, match_par, gene, glist, intd, region, fn): f.write('\n') f.close() + tag = sys.argv[1] region = sys.argv[2] -blocks_fn = gp.analysis_out_dir_absolute + tag + '/' + \ - 'introgressed_blocks_filtered_' + 'par' + '_' + tag + '_summary.txt' +blocks_fn = (gp.analysis_out_dir_absolute + tag + '/' + + 'introgressed_blocks_filtered_' + 'par' + + '_' + tag + '_summary.txt') r = gene_predictions.read_region_summary(blocks_fn) strain = r[region]['strain'] chrm = r[region]['chromosome'] @@ -86,6 +83,5 @@ def write_ps_annotated(match_cer, match_par, gene, glist, intd, region, fn): write_ps_annotated(match_cer, match_par, gene, glist, intd, r[region], fn_out) -#probs_f = gzip.open(gp.analysis_out_dir_absolute + tag + '/' + \ +# probs_f = gzip.open(gp.analysis_out_dir_absolute + tag + '/' + \ # 'probs_' + tag + '.txt.gz', 'rb') - diff --git a/code/analyze/plotting/format_polymorphism_for_r.py b/code/analyze/plotting/format_polymorphism_for_r.py index 58d10ce..3665d0d 100644 --- a/code/analyze/plotting/format_polymorphism_for_r.py +++ b/code/analyze/plotting/format_polymorphism_for_r.py @@ -1,7 +1,5 @@ # lol because i'm so bad at R -import sys -sys.path.insert(0, '..') import global_params as gp tag = 'u3_i.001_tv_l1000_f.01' @@ -16,7 +14,7 @@ d2_sums = {} for line in lines[1:]: chrm = line[0] - if not d_sums.has_key(chrm): + if chrm not in d_sums: d_sums[chrm] = 0 d2_sums[chrm] = 0 d[chrm] = {} @@ -53,24 +51,24 @@ fab += c try: fo = str(float(fo)/d_sums[chrm]) - except: + except ValueError: fo = 'NaN' try: fob = str(float(fob)/d2_sums[chrm]) - except: + except ValueError: fob = 'NaN' try: fa = str(float(fa)/d_sums[chrm]) - except: + except ValueError: fa = 'NaN' try: fab = 
str(float(fab)/d2_sums[chrm]) - except: + except ValueError: fab = 'NaN' - f.write(chrm + '\tone\tpolymorphic\t' + fo + '\t' + str(d_sums[chrm]) + '\n') - f.write(chrm + '\tone\tbiallelic\t' + fob + '\t' + str(d2_sums[chrm]) + '\n') - f.write(chrm + '\tall\tpolymorphic\t' + fa + '\t' + str(d_sums[chrm]) + '\n') - f.write(chrm + '\tall\tbiallelic\t' + fab + '\t' + str(d2_sums[chrm]) + '\n') + f.write(f'{chrm}\tone\tpolymorphic\t{fo}\t{d_sums[chrm]}\n') + f.write(f'{chrm}\tone\tbiallelic\t{fob}\t{d2_sums[chrm]}\n') + f.write(f'{chrm}\tall\tpolymorphic\t{fa}\t{d_sums[chrm]}\n') + f.write(f'{chrm}\tall\tbiallelic\t{fab}\t{d2_sums[chrm]}\n') f.close() diff --git a/code/analyze/predict.py b/code/analyze/predict.py index f6eef19..1e48ef4 100644 --- a/code/analyze/predict.py +++ b/code/analyze/predict.py @@ -1,366 +1,700 @@ import copy import gzip +import os import itertools +import click from collections import defaultdict, Counter from hmm import hmm_bw from sim import sim_predict from sim import sim_process -import global_params as gp -from misc import read_fasta import numpy as np +from typing import List, Dict, Tuple, TextIO +from contextlib import ExitStack +import logging as log +from misc.read_fasta import read_fasta +from analyze.introgression_configuration import Configuration + + +class Predictor(): + ''' + Predictor class + Stores all variables needed to run an HMM prediction + ''' + def __init__(self, configuration: Configuration): + self.config = configuration + + def run_prediction(self, only_poly_sites=True): + ''' + Run prediction with this predictor object + ''' + self.validate_arguments() + + hmm_builder = HMM_Builder(self.config) + hmm_builder.set_expected_values() + self.emission_symbols = \ + hmm_builder.update_emission_symbols(len(self.config.known_states)) + + with open(self.config.hmm_initial, 'w') as initial, \ + open(self.config.hmm_trained, 'w') as trained, \ + gzip.open(self.config.probabilities, 'wt') as probabilities, \ + gzip.open(self.config.positions, 'wt') as positions, \ + ExitStack() as stack: + + self.write_hmm_header(initial) + self.write_hmm_header(trained) + + block_writers = {state: + stack.enter_context( + open(self.config.blocks.format( + state=state), 'w')) + for state in + self.config.states} + for writer in block_writers.values(): + self.write_blocks_header(writer) + + counter = 0 + total = len(self.config.chromosomes) * len(self.config.strains) + progress_bar = None + if self.config.log_file: # logging to file + progress_bar = stack.enter_context( + click.progressbar( + length=total, + label='Running prediction')) + + for chrom in self.config.chromosomes: + for strain in self.config.strains: + counter += 1 + log.info(f'working on: {strain} {chrom} ' + f'({counter} of {total})') + + # get sequences and encode + alignment_file = self.config.alignment.format( + strain=strain, chrom=chrom) + + if not os.path.exists(alignment_file): + log.info(f'skipping, file {alignment_file} not found') + else: + hmm_initial, hmm_trained, pos = hmm_builder.run_hmm( + alignment_file, only_poly_sites) + + self.write_hmm(hmm_initial, initial, strain, chrom) + self.write_hmm(hmm_trained, trained, strain, chrom) + + # process and threshold hmm result + predicted_states, probs = self.process_path( + hmm_trained) + state_blocks = self.convert_to_blocks(predicted_states) + + self.write_positions(pos, positions, strain, chrom) + + for state, block in state_blocks.items(): + self.write_blocks(block, + pos, + block_writers[state], + strain, + chrom, + state) + + 
self.write_state_probs(probs, probabilities, + strain, chrom) + + if progress_bar: + progress_bar.update(1) + + def validate_arguments(self): + ''' + Check that all required instance variables are set to perform a + prediction run. Returns true if valid, raises value error otherwise + ''' + args = [ + 'chromosomes', + 'blocks', + 'prefix', + 'strains', + 'hmm_initial', + 'hmm_trained', + 'probabilities', + 'positions', + 'alignment', + 'known_states', + 'unknown_states', + 'threshold', + ] + variables = self.config.__dict__ + for arg in args: + if arg not in variables or variables[arg] is None: + err = ('Failed to validate Predictor, required argument ' + f"'{arg}' was unset") + log.exception(err) + raise ValueError(err) + + # check the parameters for each state are present + known_states = self.config.get('analysis_params.known_states') + if known_states is None: + err = 'Configuration did not provide any known_states' + log.exception(err) + raise ValueError(err) + + for s in known_states: + if 'expected_length' not in s: + err = f'{s["name"]} did not provide an expected_length' + log.exception(err) + raise ValueError(err) + if 'expected_fraction' not in s: + err = f'{s["name"]} did not provide an expected_fraction' + log.exception(err) + raise ValueError(err) + + unknown_states = self.config.get('analysis_params.unknown_states') + if unknown_states is not None: + for s in unknown_states: + if 'expected_length' not in s: + err = f'{s["name"]} did not provide an expected_length' + log.exception(err) + raise ValueError(err) + if 'expected_fraction' not in s: + err = f'{s["name"]} did not provide an expected_fraction' + log.exception(err) + raise ValueError(err) + + reference = self.config.get('analysis_params.reference') + if reference is None: + err = f'Configuration did not specify a reference strain' + log.exception(err) + raise ValueError(err) + + return True + + def write_hmm_header(self, writer: TextIO) -> None: + ''' + Write the header line for an hmm file to the provided textIO object + Output is tab delimited with: + strain chromosome initial_probs emissions transitions + ''' + + writer.write('strain\tchromosome\t') + + states = self.config.states + + writer.write('\t'.join( + [f'init_{s}' for s in states] + # initial + [f'emis_{s}_{symbol}' + for s in states + for symbol in self.emission_symbols] + # emissions + [f'trans_{s1}_{s2}' + for s1 in states + for s2 in states])) # transitions -def read_aligned_seqs(fn, strain): - headers, seqs = read_fasta.read_fasta(fn) - d = {} - for i in range(len(seqs)): - name = headers[i][1:].split(' ')[0] - d[name] = seqs[i] - - ref_seqs = [] - for ref in gp.alignment_ref_order: - ref_seqs.append(d[ref]) - predict_seq = d[strain] - - return ref_seqs, predict_seq - - -def set_expectations(args, n): - - species_to = args['known_states'][0] - species_from = args['known_states'][1:] - - args['expected_num_tracts'] = {} - args['expected_bases'] = {} - for s in species_from: - args['expected_num_tracts'][s] = \ - args['expected_frac'][s] * n / args['expected_length'][s] - args['expected_bases'][s] = args['expected_num_tracts'][s] * \ - args['expected_length'][s] - - args['expected_bases'][species_to] = \ - n - sum([args['expected_bases'][s] for s in species_from]) - - args['expected_num_tracts'][species_to] = \ - sum([args['expected_num_tracts'][s] for s in species_from]) + 1 - - args['expected_length'][species_to] = \ - args['expected_bases'][species_to] / args['expected_num_tracts'][species_to] - - -def ungap_and_code(predict_seq, ref_seqs, 
index_ref=0): - # index_ref is index of reference strain to index relative to - # build character array - sequences = np.array([list(predict_seq)] + - [list(r) for r in ref_seqs]) - - isbase = sequences != gp.gap_symbol - - # make boolean for valid characters - isvalid = np.logical_and(sequences != gp.gap_symbol, - sequences != gp.unsequenced_symbol) - - # positions are where everything is valid, index where the reference is - # valid. The +1 removes the predict sequence at index 0 - positions = np.where( - np.all(isvalid[:, isbase[index_ref+1, :]], axis=0))[0] - - matches = np.where(sequences[0] == sequences[1:], - gp.match_symbol, - gp.mismatch_symbol) - - # 1: indexing removes currently examined sequence - matches = [''.join(row) - for row in np.transpose(matches[:, np.all(isvalid, axis=0)])] - - # NOTE list is for unit test comparisons - return matches, positions - - -def poly_sites(sequences, positions): - seq_len = len(sequences[0]) - # check if seq only contains match_symbol - retain = np.vectorize( - lambda x: x.count(gp.match_symbol) != seq_len)(sequences) - indices = np.where(retain)[0] - ps_poly = [positions[i] for i in indices] - seq_poly = [sequences[i] for i in indices] - - return seq_poly, ps_poly - - -def get_symbol_freqs(sequence): - - individual = [] - weighted = [] - - symbols = defaultdict(int, Counter(sequence)) - total = len(sequence) - for k in symbols: - symbols[k] /= total - - sequence = np.array([list(s) for s in sequence]) - - # look along species - for s in np.transpose(sequence): - s = ''.join(s) - counts = Counter(s) - weighted.append(counts[gp.match_symbol]) - total = sum(counts.values()) - for k in counts: - counts[k] /= total - individual.append(defaultdict(int, counts)) - - total = sum(weighted) - weighted = [w / total for w in weighted] - return individual, symbols, weighted - - -def initial_probabilities(known_states, unknown_states, - expected_frac, weighted_match_freqs): - - init = [] - expectation_weight = .9 - for s, state in enumerate(known_states): - expected = expected_frac[state] - estimated = weighted_match_freqs[s] - init.append(expected * expectation_weight + - estimated * (1 - expectation_weight)) - - for state in unknown_states: - expected_frac = expected_frac[state] - init.append(expected_frac) - - return init / np.sum(init) - - -def emission_probabilities(known_states, unknown_states, symbols): - - probabilities = { - gp.mismatch_symbol + gp.match_symbol: 0.9, - gp.match_symbol + gp.match_symbol: 0.09, - gp.mismatch_symbol + gp.mismatch_symbol: 0.009, - gp.match_symbol + gp.mismatch_symbol: 0.001, - } - - mismatch_bias = .99 - - num_per_category = 2 ** (len(known_states) - 2) - for key in probabilities: - probabilities[key] *= num_per_category - - # for known states - symbol_array = np.array([list(s) for s in symbols], dtype=' None: + ''' + Write header line to tab delimited block file: + strain chromosome predicted_species start end num_sites_hmm + ''' + # NOTE: num_sites_hmm represents the sites considered by the HMM, + # so it might exclude non-polymorphic sites in addition to gaps + writer.write('\t'.join(['strain', + 'chromosome', + 'predicted_species', + 'start', + 'end', + 'num_sites_hmm']) + + '\n') + + def write_blocks(self, + state_seq_blocks: List[Tuple[int, int]], + positions: np.array, + writer: TextIO, + strain: str, + chrm: str, + species_pred: str) -> None: + ''' + Write entry into tab delimited block file, with columns: + strain chromosome predicted_species start end num_sites_hmm + ''' + writer.write('\n'.join( + 
['\t'.join([strain,
+                        chrm,
+                        species_pred,
+                        str(positions[start]),
+                        str(positions[end]),
+                        str(end - start + 1)])
+             for start, end in state_seq_blocks]))
+        if state_seq_blocks:  # ensure ends with \n
+            writer.write('\n')
+
+    def write_positions(self,
+                        positions: np.array,
+                        writer: TextIO,
+                        strain: str,
+                        chrm: str) -> None:
+        '''
+        Write the positions of the specific strain, chromosome as a line to
+        the provided textIO object
+        '''
+        writer.write(f'{strain}\t{chrm}\t' +
+                     '\t'.join([str(x) for x in positions]) + '\n')
+
+    def write_state_probs(self,
+                          probs: Dict[str, List[float]],
+                          writer: TextIO,
+                          strain: str,
+                          chrm: str) -> None:
+        '''
+        Write the probability of each state to the supplied textIO object
+        Output is tab delimited with:
+        strain chrom state1:prob1,prob2,...,probn state2...
+        '''
+        writer.write(f'{strain}\t{chrm}\t')
+
+        writer.write('\t'.join(
+            [f'{state}:' +
+             ','.join([f'{site[i]:.5f}' for site in probs])
+             for i, state in enumerate(self.config.states)]))
-    # also should we care about number of tracts rather than fraction
-    # of genome? maybe theoretically, but that number is a lot more
-    # suspect
+        writer.write('\n')
-    states = known_states + unknown_states
+    def process_path(self, hmm: hmm_bw.HMM) -> Tuple[List[str], np.array]:
+        '''
+        Process the hmm path based on the predictor threshold value.
+        Return the predicted states and the probabilities of the master
+        reference sequence
+        '''
+        probabilities = hmm.posterior_decoding()[0]
+
+        # posterior
+        if type(self.config.threshold) is float:
+            path, path_probs = sim_process.get_max_path(probabilities,
+                                                        hmm.hidden_states)
+            path_t = sim_process.threshold_predicted(
+                path,
+                path_probs,
+                self.config.threshold,
+                self.config.known_states[0])
+
+            return path_t, probabilities
-    fractions = np.array([expected_frac[s] for s in states])
-    lengths = 1/np.array([expected_length[s] for s in states])
-
-    # general case,
-    # trans[i,j] = 1/ length[i] * expected[j] * 1 /(1 - fraction[i])
-    transitions = np.outer(
-        np.multiply(lengths, 1/(1-fractions)),
-        fractions)
-    # when i == j, trans[i,j] = 1 - 1/length[i]
-    np.fill_diagonal(transitions, 1-lengths)
-
-    # normalize
-    return transitions / transitions.sum(axis=1)[:, None]
-
-
-def initial_hmm_parameters(seq, known_states, unknown_states,
-                           expected_frac, expected_length):
-
-    # get frequencies of individual symbols (e.g. '+') and all full
-    # combinations of symbols (e.g.
'+++-') - individual_symbol_freqs, symbol_freqs, weighted_match_freqs = get_symbol_freqs(seq) - - init = initial_probabilities(known_states, unknown_states, - expected_frac, weighted_match_freqs) - emis = emission_probabilities(known_states, unknown_states, symbol_freqs.keys()) - trans = transition_probabilities(known_states, unknown_states, - expected_frac, expected_length) - - # new Hidden Markov Model - hmm = hmm_bw.HMM() - - hmm.set_initial_p(init) - hmm.set_emissions(emis) - hmm.set_transitions(trans) - return hmm - - -def predict_introgressed(ref_seqs, predict_seq, predict_args, - train=True, only_poly_sites=True, - return_positions=False): - - # code sequence by which reference it matches at each site; - # positions are relative to master (first) reference sequence - seq_coded, positions = ungap_and_code(predict_seq, ref_seqs) - if only_poly_sites: - seq_coded, positions = poly_sites(seq_coded, positions) - if return_positions: - return positions - - # sets expected number of tracts and bases for each reference - # based on expected length of introgressed tracts and expected - # total fraction of genome - set_expectations(predict_args, len(predict_seq)) - - # set initial hmm parameters based on combination of (1) initial - # expectations (length of introgressed tract and fraction of - # genome/total number tracts and bases) and (2) number of sites at - # which predict seq matches each reference - hmm = initial_hmm_parameters(seq_coded, - predict_args['known_states'], - predict_args['unknown_states'], - predict_args['expected_frac'], - predict_args['expected_length']) - - # make predictions - - # set states and initial probabilties - hmm.set_hidden_states(predict_args['states']) - - # copy before setting observations to save memory - hmm_init = copy.deepcopy(hmm) - - # set obs - hmm.set_observations([seq_coded]) - - # optional Baum-Welch parameter estimation - if train: - hmm.train(predict_args['improvement_frac']) - - p = hmm.posterior_decoding() - path, path_probs = sim_process.get_max_path(p[0], hmm.hidden_states) - - # posterior - if type(predict_args['threshold']) is float: - path_t = sim_process.threshold_predicted(path, path_probs, - predict_args['threshold'], - predict_args['states'][0]) - return path_t, p[0], hmm, hmm_init, positions - - else: - hmm.set_observations([seq_coded]) - predicted = sim_predict.convert_predictions(hmm.viterbi(), - predict_args['states']) - return predicted, p[0], hmm, hmm_init, positions - - -def convert_to_blocks(state_seq, states): - # single individual state sequence - blocks = {} - for state in states: - blocks[state] = [] - prev_species = state_seq[0] - block_start = 0 - block_end = 0 - for i in range(len(state_seq)): - if state_seq[i] == prev_species: - block_end = i else: - blocks[prev_species].append((block_start, block_end)) - block_start = i - block_end = i - prev_species = state_seq[i] - # add last block - if prev_species not in blocks: - blocks[prev_species] = [] - blocks[prev_species].append((block_start, block_end)) - - return blocks - - -def write_positions(ps, writer, strain, chrm): - writer.write(f'{strain}\t{chrm}\t' + - '\t'.join([str(x) for x in ps]) + '\n') - - -def read_positions(fn): - # dictionary keyed by strain and then chromosome - with gzip.open(fn, 'rb') as reader: + predicted = sim_predict.convert_predictions(hmm.viterbi(), + self.config.states) + return predicted, probabilities + + def convert_to_blocks(self, + state_seq: List[str]) -> Dict[ + str, List[Tuple[int, int]]]: + ''' + Convert a list of sequences into a 
structure of start and end positions.
+        The returned structure is a dict keyed on species, whose values are
+        Lists of blocks, each a tuple of start and end positions
+        '''
+        # single individual state sequence
+        blocks = {}
+        for state in self.config.states:
+            blocks[state] = []
+        prev_species = state_seq[0]
+        block_start = 0
+        block_end = 0
+        for i in range(len(state_seq)):
+            if state_seq[i] == prev_species:
+                block_end = i
+            else:
+                blocks[prev_species].append((block_start, block_end))
+                block_start = i
+                block_end = i
+                prev_species = state_seq[i]
+        # add last block
+        if prev_species not in blocks:
+            blocks[prev_species] = []
+        blocks[prev_species].append((block_start, block_end))
+
+        return blocks
+
+
+class HMM_Builder():
+    def __init__(self, configuration: Configuration):
+        self.config = configuration
+        self.config.set('symbols')
+        self.symbols = self.config.symbols
+        self.config.set('convergence')
+
+    def update_emission_symbols(self, repeats: int):
+        '''
+        Generate all permutations of match and mismatch symbols with
+        repeats number of characters, in lexicographical order.
+        Sets internal state and returns the emission symbols
+        '''
+        syms = [self.symbols['match'], self.symbols['mismatch']]
+        emis_symbols = [''.join(x) for x in
+                        itertools.product(syms,
+                                          repeat=repeats)]
+        emis_symbols.sort()
+        self.emission_symbols = emis_symbols
+        return emis_symbols
+
+    def get_symbol_freqs(self, sequence: np.array) -> Tuple[Dict, List]:
+        '''
+        Calculate metrics from the provided, coded sequence
+        Returns:
+        the fraction of each matching pattern (e.g. +--++)
+        the weighted fraction of matches for each species
+        '''
+
+        weighted = []
+
+        symbols = defaultdict(int, Counter(sequence))
+        total = len(sequence)
+        for k in symbols:
+            symbols[k] /= total
+
+        sequence = np.array([list(s) for s in sequence])
+
+        # look along species
+        for s in np.transpose(sequence):
+            s = ''.join(s)
+            counts = Counter(s)
+            weighted.append(counts[self.symbols['match']])
+
+        total = sum(weighted)
+        weighted = [w / total for w in weighted]
+        return symbols, weighted
+
+    def set_expected_values(self):
+        '''
+        Get expected lengths and fractions for each state.
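+        As an illustrative example (hypothetical values, not taken from any
+        shipped config): a known state entry
+        {'name': 'par', 'expected_length': 10000, 'expected_fraction': 0.025}
+        sets expected_lengths['par'] = 10000 and
+        expected_fractions['par'] = 0.025, and the reference state receives
+        the leftover fraction, 1 - sum of the other states' fractions.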
+ Assumes config has been validated by Predictor prior to running + ''' + self.expected_lengths = {} + self.expected_fractions = {} + known_states = self.config.get('analysis_params.known_states') + for state in known_states: + self.expected_lengths[state['name']] = state['expected_length'] + self.expected_fractions[state['name']] = state['expected_fraction'] + + unknown_states = self.config.get('analysis_params.unknown_states') + for state in unknown_states: + self.expected_lengths[state['name']] = state['expected_length'] + self.expected_fractions[state['name']] = state['expected_fraction'] + + reference = self.config.get('analysis_params.reference') + # expected fraction of reference is the remainder after other states + # are specified + self.expected_fractions[reference['name']] =\ + 1 - sum(self.expected_fractions.values()) + + self.ref_state = self.config.get('analysis_params.reference.name') + self.known_states = self.config.known_states + self.unknown_states = self.config.unknown_states + + # have to remove effect of unknown of these values for later + self.ref_fraction = self.expected_fractions[self.ref_state] + \ + sum([self.expected_fractions[s] for s in self.unknown_states]) + # sum of fraction / length, or 1 / tract length + self.other_sum = sum([self.expected_fractions[s['name']] / + self.expected_lengths[s['name']] + for s in known_states]) + + def update_expected_length(self, total_length: int): + ''' + Updates the expected length for the reference state + based on the provided total_length of the sequence. + This is the expected length of a single tract, determined as the sum + of the total length (sequence length * fraction) divided by the number + of tracts (sequence length * 1 / other's tracts). The + 1 assumes that + the sequence will start and end with the reference. + ''' + self.expected_lengths[self.ref_state] = ( + total_length * self.ref_fraction / + (total_length * self.other_sum + 1)) + + def initial_probabilities(self, + weighted_match_freqs: List[float]) -> np.array: + ''' + Estimate the initial probability of being in each state + based on the number of states and their expected fractions + Returns the initial probability of each state + ''' + + init = [] + expectation_weight = .9 + for s, state in enumerate(self.known_states): + expected = self.expected_fractions[state] + estimated = weighted_match_freqs[s] + init.append(expected * expectation_weight + + estimated * (1 - expectation_weight)) + + for state in self.unknown_states: + expected_frac = self.expected_fractions[state] + init.append(expected_frac) + + return init / np.sum(init) + + def emission_probabilities(self, + symbols: List[str]) -> List[Dict]: + ''' + Estimate initial emission probabilities + Return estimates as list of default dict of probabilities + ''' + + match = self.symbols['match'] + mismatch = self.symbols['mismatch'] + probabilities = { + mismatch + match: 0.9, + match + match: 0.09, + mismatch + mismatch: 0.009, + match + mismatch: 0.001, + } + + mismatch_bias = .99 + + num_per_category = 2 ** (len(self.known_states) - 2) + for key in probabilities: + probabilities[key] *= num_per_category + + # for known states + symbol_array = np.array([list(s) for s in symbols], dtype=' np.array: + ''' + Estimate initial transition probabilities + ''' + + # doesn't depend on sequence observations but maybe it should? + + # also should we care about number of tracts rather than fraction + # of genome? 
maybe theoretically, but that number is a lot more
+        # suspect
+
+        states = self.config.states
+
+        fractions = np.array([self.expected_fractions[s] for s in states])
+        lengths = 1/np.array([self.expected_lengths[s] for s in states])
+
+        # general case,
+        # trans[i,j] = 1/ length[i] * expected[j] * 1 /(1 - fraction[i])
+        transitions = np.outer(
+            np.multiply(lengths, 1/(1-fractions)),
+            fractions)
+        # when i == j, trans[i,j] = 1 - 1/length[i]
+        np.fill_diagonal(transitions, 1-lengths)
+
+        # normalize
+        return transitions / transitions.sum(axis=1)[:, None]
+
+    def build_initial_hmm(self, seq: np.array) -> hmm_bw.HMM:
+        '''
+        Build a HMM object initialized based on expected values and sequence
+        '''
+
+        # get frequencies of individual symbols (e.g. '+') and all full
+        # combinations of symbols (e.g. '+++-')
+        (symbol_freqs,
+         weighted_match_freqs) = self.get_symbol_freqs(seq)
+
+        # new Hidden Markov Model
+        hmm = hmm_bw.HMM()
+
+        hmm.set_initial_p(self.initial_probabilities(weighted_match_freqs))
+        hmm.set_emissions(self.emission_probabilities(symbol_freqs.keys()))
+        hmm.set_transitions(self.transition_probabilities())
+        return hmm
+
+    def run_hmm(self,
+                alignment_file: str,
+                only_poly_sites: bool = True) -> Tuple[hmm_bw.HMM,
+                                                       hmm_bw.HMM,
+                                                       np.array]:
+        '''
+        Runs the hmm training, returning the initial and trained HMM along
+        with the positions used by the HMM
+        '''
+        coded_sequence, positions, len_seq = \
+            self.encode_sequence(alignment_file, only_poly_sites)
+
+        self.update_expected_length(len_seq)
+        # set initial hmm parameters based on combination of (1) initial
+        # expectations (length of introgressed tract and fraction of
+        # genome/total number tracts and bases) and (2) number of sites at
+        # which predict seq matches each reference
+        hmm = self.build_initial_hmm(coded_sequence)
+
+        # set states and initial probabilities
+        hmm.set_hidden_states(self.known_states + self.unknown_states)
+
+        # copy before setting observations to save memory
+        hmm_init = copy.deepcopy(hmm)
+
+        # set obs
+        hmm.set_observations([coded_sequence])
+
+        # Baum-Welch parameter estimation
+        hmm.train(self.config.convergence)
+
+        return hmm_init, hmm, positions
+
+    def encode_sequence(self,
+                        alignment_file: str,
+                        only_poly_sites: bool = True) -> Tuple[
+                            np.array,
+                            np.array,
+                            int]:
+        '''
+        Open the supplied alignment file, encode, and return the coded
+        sequence along with the positions. If only_poly_sites is True,
+        also filter out non-polymorphic sites.
+        Returns the encoded sequence, positions, and length of original seq
+        '''
+        _, sequences = read_fasta(alignment_file)
+
+        references = sequences[:-1]
+        predicted = sequences[-1]
+
+        seq_coded, positions = self.ungap_and_code(predicted, references)
+        if only_poly_sites:
+            seq_coded, positions = self.poly_sites(seq_coded, positions)
+
+        return seq_coded, positions, len(predicted)
+
+    def ungap_and_code(self,
+                       predict_seq: str,
+                       ref_seqs: List[str],
+                       index_ref: int = 0) -> Tuple[np.array, np.array]:
+        '''
+        Remove any sequence locations where a gap is present and code
+        into matching or mismatching sequence.
+        Returns the coded sequences, by default an array of + where matching,
+        - where mismatching. Also return the positions where the sequences are
+        not gapped.
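+        Illustrative example, assuming '+' match, '-' mismatch, and '-' gap
+        symbols: ungap_and_code('at-g', ['atcg', 'aacg']) returns (as arrays)
+        (['++', '+-', '++'], [0, 1, 3]); the gapped column is dropped and
+        positions index into the ungapped master reference.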
+        '''
+        # index_ref is index of reference strain to index relative to
+        # build character array
+        sequences = np.array([list(predict_seq)] +
+                             [list(r) for r in ref_seqs])
+
+        isbase = sequences != self.symbols['gap']
+
+        # make boolean for valid characters
+        isvalid = np.logical_and(isbase,
+                                 sequences != self.symbols['unsequenced'])
+
+        # positions are where everything is valid, index where the reference is
+        # valid. The +1 removes the predict sequence at index 0
+        positions = np.where(
+            np.all(isvalid[:, isbase[index_ref+1, :]], axis=0))[0]
+
+        matches = np.where(sequences[0] == sequences[1:],
+                           self.symbols['match'],
+                           self.symbols['mismatch'])
+
+        matches = np.fromiter((''.join(row)
+                               for row in np.transpose(
+                                   matches[:, np.all(isvalid, axis=0)])),
+                              dtype=f'U{len(sequences) - 1}')
+
+        return matches, positions
+
+    def poly_sites(self,
+                   sequences: np.array,
+                   positions: np.array) -> Tuple[np.array, np.array]:
+        '''
+        Remove all sites where the coded sequence is entirely match symbols
+        Returns the filtered sequence and position
+        '''
+        seq_len = len(sequences[0])
+        # check if seq only contains match_symbol
+        retain = np.vectorize(
+            lambda x: x.count(self.symbols['match']) != seq_len)(sequences)
+        indices = np.where(retain)[0]
+
+        ps_poly = positions[indices]
+        seq_poly = sequences[indices]
+
+        return seq_poly, ps_poly
+
+
+def read_positions(filename: str) -> Dict[str, Dict[str, List[int]]]:
+    '''
+    Read in positions from the provided filename, returning a dictionary
+    keyed first by the strain, then chromosome. Returned positions are
+    lists of ints
+    '''
+    with gzip.open(filename, 'rt') as reader:
-        result = defaultdict({})
+        result = defaultdict(dict)
         for line in reader:
             line = line.split()
             strain, chrm = line[0:2]
-            ps = [int(x) for x in line[2:]]
-            result[strain][chrm] = ps
+            positions = [int(x) for x in line[2:]]
+            result[strain][chrm] = positions
     return result


-def write_blocks_header(writer):
-    # NOTE: num_sites_hmm represents the sites considered by the HMM,
-    # so it might exclude non-polymorphic sites in addition to gaps
-    writer.write('\t'.join(['strain',
-                            'chromosome',
-                            'predicted_species',
-                            'start',
-                            'end',
-                            'num_sites_hmm'])
-                 + '\n')
-
-
-# TODO: find source of all the newlines in output!!
-def write_blocks(state_seq_blocks, ps, writer, strain, chrm, species_pred):
-    # file format is:
-    # strain chrm predicted_species start end number_non_gap
-    writer.write('\n'.join(
-        ['\t'.join([strain,
-                    chrm,
-                    species_pred,
-                    str(ps[start]),
-                    str(ps[end]),
-                    str(end - start + 1)])
-         for start, end in state_seq_blocks]))
-    if state_seq_blocks:
-        writer.write('\n')
-
-
-def read_blocks(fn, labeled=False):
-    # return dictionary of (start, end, number_non_gap, [region_id]),
-    # keyed by strain and then chromosome
-    with open(fn, 'r') as reader:
+def read_blocks(filename: str,
+                labeled: bool = False) -> Dict[
+                    str, Dict[str, Tuple[int, int, int, str]]]:
+    '''
+    Read in the supplied block file, returning a dict keyed on strain,
+    then chromosome. Values are tuples of start, end, and number of positions
+    for the block.
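+    As a hypothetical example, the line 'strain1\tI\tpar\t100\t200\t42'
+    becomes result['strain1']['I'] == [(100, 200, 42)].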
+ If labeled is true, values contain the region_id as last element + ''' + with open(filename, 'r') as reader: reader.readline() # header result = defaultdict(lambda: defaultdict(list)) for line in reader: @@ -375,58 +709,3 @@ def read_blocks(fn, labeled=False): item = (int(start), int(end), int(number_non_gap)) result[strain][chrm].append(item) return result - - -def get_emis_symbols(known_states): - - symbols = [gp.match_symbol, gp.mismatch_symbol] - emis_symbols = [''.join(x) for x in - itertools.product(symbols, repeat=len(known_states))] - emis_symbols.sort() - return emis_symbols - - -def write_hmm_header(known_states, unknown_states, symbols, writer): - - writer.write('strain\tchromosome\t') - - states = known_states + unknown_states - - writer.write('\t'.join( - [f'init_{s}' for s in states] + # initial - [f'emis_{s}_{symbol}' - for s in states - for symbol in symbols] + # emissions - [f'trans_{s1}_{s2}' - for s1 in states - for s2 in states])) # transitions - - writer.write('\n') - - -def write_hmm(hmm, writer, strain, chrm, emis_symbols): - writer.write(f'{strain}\t{chrm}\t') - - states = len(hmm.hidden_states) - writer.write('\t'.join( - [f'{p}' for p in hmm.initial_p] + # initial - [f'{hmm.emissions[i, hmm.symbol_to_ind[symbol]]}' - if symbol in hmm.symbol_to_ind else '0.0' - for i in range(states) - for symbol in emis_symbols] + # emission - [f'{hmm.transitions[i, j]}' - for i in range(states) - for j in range(states)] # transition - )) - writer.write('\n') - - -def write_state_probs(probs, writer, strain, chrm, states): - writer.write(f'{strain}\t{chrm}\t') - - writer.write('\t'.join( - [f'{state}:' + - ','.join([f'{site[i]:.5f}' for site in probs]) - for i, state in enumerate(states)])) - - writer.write('\n') diff --git a/code/analyze/predict_main.py b/code/analyze/predict_main.py deleted file mode 100644 index f9566e9..0000000 --- a/code/analyze/predict_main.py +++ /dev/null @@ -1,110 +0,0 @@ -import sys -import os -import predict -import read_args -import gzip -import predict -import global_params as gp -from misc import read_fasta - -# read in analysis parameters - -args = read_args.process_predict_args(sys.argv[1:]) - -##====== -# output files and if and where to resume -##====== - -if not os.path.isdir(gp.analysis_out_dir_absolute + args['tag']): - os.makedirs(gp.analysis_out_dir_absolute + args['tag']) - -# positions -# TODO move this to more general location and make separate files for -# each strain x chrm -base_dir = f'{gp.analysis_out_dir_absolute}{args["tag"]}' - -# introgressed blocks -blocks_f = {} -for s in args['states']: - blocks_f[s] = open(f'{base_dir}/blocks_{s}_{args["tag"]}.txt', 'w') - predict.write_blocks_header(blocks_f[s]) - -# HMM parameters -emis_symbols = predict.get_emis_symbols(args['known_states']) - -hmm_init_f = open(f'{base_dir}/hmm_init_{args["tag"]}.txt', 'w') -predict.write_hmm_header(args['known_states'], args['unknown_states'], - emis_symbols, hmm_init_f) - -hmm_f = open(f'{base_dir}/hmm_{args["tag"]}.txt', 'w') -predict.write_hmm_header(args['known_states'], args['unknown_states'], - emis_symbols, hmm_f) - -# posterior probabilities - -write_ps = True -if write_ps: - ps_f = gzip.open(f'{base_dir}/positions_{args["tag"]}.txt.gz', 'wt') - -probs_f = gzip.open(f'{base_dir}/probs_{args["tag"]}.txt.gz', 'wt') - -# loop through all sequences and predict introgression - - -for chrm in gp.chrms: - - for strain, strain_dir in args['setup_args']['strain_dirs']: - - print(f'working on: {strain} {chrm}') - - ref_prefix = 
'_'.join(args['known_states']) - fn = (f'{args["setup_args"]["alignments_directory"]}{ref_prefix}_{strain}' - f'_chr{chrm}_mafft{gp.alignment_suffix}') - - if not os.path.exists(fn): - print(fn) - print(f'no alignment for {strain} {chrm}') - continue - - headers, seqs = read_fasta.read_fasta(fn) - - ref_seqs = seqs[:-1] - predict_seq = seqs[-1] - - # predict introgressed/non-introgressed tracts - - state_seq, probs, hmm, hmm_init, ps = \ - predict.predict_introgressed(ref_seqs, predict_seq, - args, train=True) - - state_seq_blocks = predict.convert_to_blocks(state_seq, args['states']) - - # output - - # the positions actually used in predictions - # (alignment columns with no gaps) - if write_ps: - predict.write_positions(ps, ps_f, strain, chrm) - - # blocks predicted to be introgressed, separate files for each species - for s in state_seq_blocks: - predict.write_blocks(state_seq_blocks[s], ps, blocks_f[s], - strain, chrm, s) - - # summary info about HMM (before training) - predict.write_hmm(hmm_init, hmm_init_f, strain, chrm, emis_symbols) - - # summary info about HMM (after training) - predict.write_hmm(hmm, hmm_f, strain, chrm, emis_symbols) - - # probabilities at each site - predict.write_state_probs(probs, probs_f, strain, - chrm, hmm.hidden_states) - -for k in blocks_f: - blocks_f[k].close() - -ps_f.close() -hmm_init_f.close() -hmm_f.close() -probs_f.close() diff --git a/code/analyze/read_args.py b/code/analyze/read_args.py index 26c6698..b78c022 100644 --- a/code/analyze/read_args.py +++ b/code/analyze/read_args.py @@ -1,10 +1,11 @@ import sys from align import align_helpers + def process_predict_args(args): - + d = {} - + i = 0 d['tag'] = args[i] @@ -14,7 +15,7 @@ def process_predict_args(args): i += 1 d['improvement_frac'] = float(args[i]) - + i += 1 d['threshold'] = args[i] if d['threshold'] != 'viterbi': @@ -34,8 +35,9 @@ def process_predict_args(args): d['expected_length'][state] = float(args[i]) i += 1 d['expected_frac'][state] = float(args[i]) - d['expected_frac'][d['known_states'][0]] = 1 - sum(d['expected_frac'].values()) - d['expected_length'][d['known_states'][0]] = 0 # calculate later + d['expected_frac'][d['known_states'][0]] = \ + 1 - sum(d['expected_frac'].values()) + d['expected_length'][d['known_states'][0]] = 0 # calculate later i += 1 while i < len(args): @@ -57,7 +59,8 @@ def process_predict_args(args): d['setup_args'] = setup_args return d - + + def read_setup_args(fn): x = {} @@ -72,7 +75,8 @@ def read_setup_args(fn): d = {} d['references'] = x['references'] - d['reference_directories'] = dict(zip(x['references'], x['reference_directories'])) + d['reference_directories'] = \ + dict(zip(x['references'], x['reference_directories'])) d['alignments_directory'] = x['alignments_directory'][0] d['strain_dirs'] = \ @@ -80,6 +84,7 @@ def read_setup_args(fn): return d + def get_predict_args_by_tag(fn, tag): f = open(fn, 'r') line = f.readline() @@ -90,4 +95,3 @@ def get_predict_args_by_tag(fn, tag): line = f.readline() print(f'tag not found: {tag}') return None - diff --git a/code/analyze/structure/structure_1_main.py b/code/analyze/structure/structure_1_main.py index 27e060b..bb6063d 100644 --- a/code/analyze/structure/structure_1_main.py +++ b/code/analyze/structure/structure_1_main.py @@ -3,24 +3,18 @@ import sys import os -import gzip import predict from collections import defaultdict -import gene_predictions -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../misc/') -import read_fasta -import read_table -import seq_functions +from misc 
import read_fasta args = predict.process_predict_args(sys.argv[2:]) chrm = gp.chrms[int(sys.argv[1])] # maybe getting strains should be simpler -strains = [line.split('\t')[0] for line in \ - open(gp.analysis_out_dir_absolute + args['tag'] + \ +strains = [line.split('\t')[0] for line in + open(gp.analysis_out_dir_absolute + args['tag'] + '/state_counts_by_strain.txt', 'r').readlines()[1:]] nucs = set(['a', 't', 'g', 'c']) @@ -32,10 +26,10 @@ gp_dir = '../' -##====== +# ====== # use program ldselect to find set of tag snps all in low LD for # specified chromosome -##====== +# ====== # input file for ldselect is formatted so that each row is a snp and # each column is the genotype for a strain, e.g. @@ -45,13 +39,13 @@ snps = defaultdict(list) # loop through all the strains for strain in strains: - print '-', strain + print('-', strain) # read multiple alignment file for this strain with the master # reference (and other references which we don't care about # here) - headers, seqs = read_fasta.read_fasta(gp_dir + gp.alignments_dir + \ - '_'.join(gp.alignment_ref_order) + \ - '_' + strain + '_chr' + chrm + \ + headers, seqs = read_fasta.read_fasta(gp_dir + gp.alignments_dir + + '_'.join(gp.alignment_ref_order) + + '_' + strain + '_chr' + chrm + '_mafft.maf') # look at all alignment columns, keeping track of the index in # the master reference @@ -66,11 +60,11 @@ # get reference sequence (unaligned, without gaps) # TODO correct alignment file location -ref_seq = read_fasta.read_fasta(gp_dir + gp.alignments_dir + \ - '_'.join(gp.alignment_ref_order) + \ - '_' + strains[0] + '_chr' + chrm + \ +ref_seq = read_fasta.read_fasta(gp_dir + gp.alignments_dir + + '_'.join(gp.alignment_ref_order) + + '_' + strains[0] + '_chr' + chrm + '_mafft.maf')[1][0].replace(gp.gap_symbol, '') -open(out_dir + 'chromosome_lengths.txt', 'a').write(chrm + '\t' + \ +open(out_dir + 'chromosome_lengths.txt', 'a').write(chrm + '\t' + str(len(ref_seq)) + '\n') # loop through all the sites we collected above @@ -83,13 +77,13 @@ # TODO do names have to be integers and/or equal in length? 
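+    # for reference, each ldSelect input row is snp_id, strain, genotype,
+    # e.g. (illustrative values) '1042\tyjm123\ta' -- one row for the master
+    # reference and then one per strain, written below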
snp_id = str(snp) # write row for master reference - f.write(snp_id + '\t' + \ - gp.alignment_ref_order[0] + '\t' + \ + f.write(snp_id + '\t' + + gp.alignment_ref_order[0] + '\t' + ref_seq[snp] + '\n') # and one row for each of the other strains for si in range(len(strains)): - f.write(snp_id + '\t' + \ - strains[si] + '\t' + \ + f.write(snp_id + '\t' + + strains[si] + '\t' + snps[snp][si] + '\n') f.close() @@ -97,7 +91,8 @@ """ # run ldselect on this input file fn_out = fn.replace('input', 'output') -os.system('perl ' + gp.ldselect_install_path + 'ldSelect.pl -pb ' + fn + ' > ' + fn_out) +os.system('perl ' + gp.ldselect_install_path + + 'ldSelect.pl -pb ' + fn + ' > ' + fn_out) # extract one tag snp from each set of equivalent tag snps from # ldselect output file diff --git a/code/analyze/structure/structure_2_main.py b/code/analyze/structure/structure_2_main.py index 9b22ed9..3298a44 100644 --- a/code/analyze/structure/structure_2_main.py +++ b/code/analyze/structure/structure_2_main.py @@ -1,15 +1,8 @@ import sys import os -import gzip import predict from collections import defaultdict -import gene_predictions -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../misc/') -import read_fasta -import read_table -import seq_functions args = predict.process_predict_args(sys.argv[2:]) @@ -23,18 +16,18 @@ os.makedirs(out_dir_run + '/population_ranges') # maybe getting strains should be simpler -strains = [line.split('\t')[0] for line in \ - open(gp.analysis_out_dir_absolute + args['tag'] + \ +strains = [line.split('\t')[0] for line in + open(gp.analysis_out_dir_absolute + args['tag'] + '/state_counts_by_strain.txt', 'r').readlines()[1:]] gp_dir = '../' -nuc_to_int = {'a':1, 't':2, 'g':3, 'c':4} +nuc_to_int = {'a': 1, 't': 2, 'g': 3, 'c': 4} -##====== +# ====== # use program structure to find population proportion using either # unlinked tagsnps from ldselect, or just all snps -##====== +# ====== use_all_snps = True @@ -93,12 +86,13 @@ f = open(out_dir_run + 'structure_input_run' + run_id + '.txt', 'w') for chrm in gp.chrms: - f.write('\t\t\t' + '\t'.join([chrm + '_' + str(x) \ - for x in sorted(all_snps[chrm].keys())])) + f.write('\t\t\t' + '\t'.join([chrm + '_' + str(x) + for x in sorted(all_snps[chrm].keys())])) f.write('\n') for chrm in gp.chrms: - f.write('\t\t\t' + '\t'.join([str(map_distances[chrm][x]) \ - for x in sorted(map_distances[chrm].keys())])) + f.write('\t\t\t' + '\t'.join( + [str(map_distances[chrm][x]) + for x in sorted(map_distances[chrm].keys())])) f.write('\n') for strain in strains: @@ -118,7 +112,8 @@ """ os.system(gp.structure_install_path + 'structure -L ' + str(num_snps) + \ ' -K 6 -i ' + out_dir_run + 'structure_input_run' + run_id + \ - '.txt -o ' + out_dir_run + 'structure_output_k6_run' + run_id + '.txt') + '.txt -o ' + out_dir_run + 'structure_output_k6_run' + + run_id + '.txt') os.system('mv ' + out_dir_run + 'structure_output_k6_run' + \ run_id + '.txt_ss ' + out_dir_run + \ @@ -136,7 +131,7 @@ line = f.readline() while line != "Inferred ancestry of individuals:\n": line = f.readline() -f.readline() # column headings +f.readline() # column headings line = f.readline() f_out.write('strain\tpopulation\tfraction\tindex\n') while line != "\n": @@ -149,7 +144,7 @@ ind = i break for i in range(len(fracs)): - f_out.write(strain + '\t' + str(i + 1) + '\t' + \ + f_out.write(strain + '\t' + str(i + 1) + '\t' + str(fracs[i]) + '\t' + str(ind + 1) + '\n') line = f.readline() f.close() @@ -161,7 +156,8 @@ f = open(out_dir_run + 
'structure_output_ss_k6_run' + run_id + '.txt', 'r') k = 6 -# read in posterior probabilities for each strain locus being in each population +# read in posterior probabilities for +# each strain locus being in each population line = f.readline() while line.strip() == '\n': line = f.readline() @@ -187,8 +183,6 @@ line = f.readline() f.close() - - # TODO at some point associate numbered populations with logical names # (i.e. ones from strope et al) @@ -198,17 +192,17 @@ # population_ranges_strain_chrX.txt # start end popx # start end popx/popy -# start end +# start end -chrm_lengths = [line[:-1].split('\t') for line in \ - open(out_dir + 'chromosome_lengths.txt', 'r').readlines()] -chrm_lengths = dict(zip([x[0] for x in chrm_lengths], \ +chrm_lengths = [line[:-1].split('\t') for line in + open(out_dir + 'chromosome_lengths.txt', 'r').readlines()] +chrm_lengths = dict(zip([x[0] for x in chrm_lengths], [int(x[1]) for x in chrm_lengths])) for strain in strains: for chrm in gp.chrms: ranges = [] - snps = sorted(strain_snp_pop[strain][chrm].keys()) + snps = sorted(strain_snp_pop[strain][chrm].keys()) start = snps[0] end = start previous_pop = strain_snp_pop[strain][chrm][start] @@ -221,7 +215,8 @@ else: ranges.append((start, end, previous_pop)) - ranges.append((end + 1, snp - 1, previous_pop + '/' + current_pop)) + ranges.append((end + 1, snp - 1, + previous_pop + '/' + current_pop)) start = snp end = snp previous_pop = current_pop @@ -231,7 +226,8 @@ ranges.append((end + 1, chrm_lengths[chrm], 'end')) # TODO file location - f = open(out_dir_run + 'population_ranges/population_ranges_' + strain + '_chr' + chrm + '_run' + run_id + '.txt', 'w') + f = open(out_dir_run + 'population_ranges/population_ranges_' + + strain + '_chr' + chrm + '_run' + run_id + '.txt', 'w') for r in ranges: f.write('\t'.join([str(x) for x in r]) + '\n') f.close() diff --git a/code/analyze/structure/structure_3_main.py b/code/analyze/structure/structure_3_main.py index 4446cdd..fc1403c 100644 --- a/code/analyze/structure/structure_3_main.py +++ b/code/analyze/structure/structure_3_main.py @@ -1,26 +1,19 @@ -## generate three files: +# generate three files: -## 1. introgressed regions annotated by which population background(s) -## they overlap +# 1. introgressed regions annotated by which population background(s) +# they overlap -## 2. population backgrounds annotated by how much introgression they -## have from each reference strain (or ambiguous strains) +# 2. population backgrounds annotated by how much introgression they +# have from each reference strain (or ambiguous strains) -## 3. counts of bases in for each strain x population background x -## introgresssing reference [or lack of introgression] +# 3. 
counts of bases for each strain x population background x
+# introgressing reference [or lack of introgression]

 import sys
-import os
-import gzip
 import predict
 from collections import defaultdict
-import gene_predictions
-sys.path.insert(0, '..')
 import global_params as gp
-sys.path.insert(0, '../misc/')
-import read_fasta
-import read_table
-import seq_functions
+from misc import read_table

 args = predict.process_predict_args(sys.argv[3:])
@@ -33,8 +26,8 @@
 # TODO maybe getting strains should be simpler...at least make this
 # not copy pasta
-strains = [line.split('\t')[0] for line in \
-           open(gp.analysis_out_dir_absolute + args['tag'] + \
+strains = [line.split('\t')[0] for line in
+           open(gp.analysis_out_dir_absolute + args['tag'] +
                 '/state_counts_by_strain.txt', 'r').readlines()[1:]]
@@ -58,8 +51,10 @@ def find_pops(start, end, pop_ranges):
             bases.append(r[1] - r[0] + 1)
     return pops, bases

+
 population_int_counts = defaultdict(lambda: defaultdict(int))
-strain_population_int_counts = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
+strain_population_int_counts = defaultdict(
+    lambda: defaultdict(lambda: defaultdict(int)))
 population_totals = defaultdict(int)
 strain_population_totals = defaultdict(lambda: defaultdict(int))
 all_alternative_states = set([])
@@ -74,24 +69,24 @@ def find_pops(start, end, pop_ranges):
         strain = regions[region_id]['strain']
         regions_strain_chrm[strain][chrm][region_id] = regions[region_id]
     new_regions_fn = gp.analysis_out_dir_absolute + args['tag'] + '/' + \
-                     'blocks_' + ref + \
-                     '_' + args['tag'] + '_populations.txt'
+        'blocks_' + ref + \
+        '_' + args['tag'] + '_populations.txt'
     f = open(new_regions_fn, 'w')
     labels = labels[1:] + ['population']
     f.write('region_id' + '\t' + '\t'.join(labels) + '\n')
-    #for chrm in regions_strain_chrm[strain]:
+    # for chrm in regions_strain_chrm[strain]:
 for strain in strains:
     for chrm in gp.chrms:
         # TODO get rid of run_id in filenames?
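+        # each population_ranges file (written by structure_2_main.py) has
+        # one tab-delimited line per range: start, end, population, e.g.
+        # (illustrative) '14524\t20000\tpop3/pop5' for a span falling between
+        # two population assignments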
pop_ranges_fn = out_dir_run + 'population_ranges/' + \ 'population_ranges_' + strain + \ '_chr' + chrm + '_run' + run_id + '.txt' - pop_ranges = [line[:-1].split('\t') for line in \ + pop_ranges = [line[:-1].split('\t') for line in open(pop_ranges_fn, 'r').readlines()] pop_ranges = [(int(x[0]), int(x[1]), x[2]) for x in pop_ranges] for pr in pop_ranges: - population_totals[pr[2]] += pr[1] - pr[0] + 1 + population_totals[pr[2]] += pr[1] - pr[0] + 1 strain_population_totals[strain][pr[2]] += pr[1] - pr[0] + 1 for region_id in regions_strain_chrm[strain][chrm]: @@ -101,40 +96,47 @@ def find_pops(start, end, pop_ranges): # find the population ranges that the region start and end # coordinates fall within - pops, overlaps = find_pops(int(r['start']), int(r['end']), pop_ranges) + pops, overlaps = find_pops(int(r['start']), int(r['end']), + pop_ranges) regions_strain_chrm[strain][chrm][region_id]['population'] = \ ','.join(pops) - f.write(region_id + '\t' + \ - '\t'.join([str(regions_strain_chrm[strain][chrm][region_id][x])\ - for x in labels]) + '\n') + f.write(region_id + '\t' + + '\t'.join( + [str(regions_strain_chrm[strain][ + chrm][region_id][x]) + for x in labels]) + '\n') for i in range(len(pops)): - population_int_counts[pops[i]][r['alternative_states']] += \ + population_int_counts[pops[i]][r['alternative_states']] +=\ overlaps[i] - strain_population_int_counts[strain][pops[i]]\ - [r['alternative_states']] += overlaps[i] + strain_population_int_counts[strain][pops[i]][ + r['alternative_states']] += overlaps[i] -f = open(out_dir_run + 'population_introgression_counts_run' + run_id + '.txt', 'w') -f.write('population\treference\tnum_bases_introgressed\tfrac_bases_introgressed\n') +f = open(out_dir_run + 'population_introgression_counts_run' + run_id + '.txt', + 'w') +f.write('population\treference\t' + 'num_bases_introgressed\tfrac_bases_introgressed\n') for i in population_int_counts.keys(): for ref in population_int_counts[i].keys(): - f.write(str(i) + '\t' + ref + '\t' + str(population_int_counts[i][ref]) + '\t' + \ - str(float(population_int_counts[i][ref])/population_totals[i]) + '\n') + f.write(str(i) + '\t' + ref + '\t' + + str(population_int_counts[i][ref]) + '\t' + + str(float(population_int_counts[i][ref])/population_totals[i]) + + '\n') f.close() -f = open(out_dir_run + 'strain_population_introgression_counts_run' + \ +f = open(out_dir_run + 'strain_population_introgression_counts_run' + run_id + '.txt', 'w') -f.write('strain\tpopulation\treference\tnum_bases_introgressed' + \ +f.write('strain\tpopulation\treference\tnum_bases_introgressed' + '\tfrac_bases_introgressed\n') for strain in strains: for i in strain_population_int_counts[strain].keys(): for ref in all_alternative_states: count = strain_population_int_counts[strain][i][ref] total = strain_population_totals[strain][i] - #frac = 0 - #if total > 0: + # frac = 0 + # if total > 0: frac = float(count)/total - f.write(strain + '\t' + str(i) + '\t' + ref + '\t' + + f.write(strain + '\t' + str(i) + '\t' + ref + '\t' + str(count) + '\t' + str(frac) + '\n') f.close() diff --git a/code/analyze/summarize_region_quality.py b/code/analyze/summarize_region_quality.py index d9754a2..c2a4160 100644 --- a/code/analyze/summarize_region_quality.py +++ b/code/analyze/summarize_region_quality.py @@ -1,8 +1,18 @@ +from __future__ import annotations import bisect import gzip -import global_params as gp -from misc import binary_search import numpy as np +import pickle +from contextlib import ExitStack +import click +import logging as log +from 
collections import Counter +from misc import read_fasta +from misc import read_table +from misc import seq_functions +from typing import List, Tuple, Dict +from analyze.introgression_configuration import Configuration + cen_starts = [151465, 238207, 114385, 449711, 151987, 148510, 496920, 105586, 355629, 436307, 440129, 150828, @@ -37,12 +47,16 @@ tel_right_starts = [tel_coords[i] for i in range(2, len(tel_coords), 4)] tel_right_ends = [tel_coords[i] for i in range(3, len(tel_coords), 4)] +chromosomes = ('I II III IV V ' + 'VI VII VIII IX X ' + 'XI XII XIII XIV XV XVI').split() + def distance_from_telomere(start, end, chrm): assert start <= end, str(start) + ' ' + str(end) - i = gp.chrms.index(chrm) + i = chromosomes.index(chrm) # region entirely on left arm if end <= cen_starts[i]: return start - tel_left_ends[i] @@ -52,11 +66,12 @@ def distance_from_telomere(start, end, chrm): # region overlaps centromere: return minimum distance from either telomere return min(start - tel_left_ends[i], tel_right_starts[i] - end) + def distance_from_centromere(start, end, chrm): assert start <= end, str(start) + ' ' + str(end) - i = gp.chrms.index(chrm) + i = chromosomes.index(chrm) # region entirely on left arm if end <= cen_starts[i]: return cen_starts[i] - end @@ -66,13 +81,15 @@ def distance_from_centromere(start, end, chrm): # region overlaps centromere: return 0 return 0 + def write_region_summary_plus(fn, regions, fields): f = open(fn, 'w') f.write('region_id\t' + '\t'.join(fields) + '\n') keys = sorted(regions.keys(), key=lambda x: int(x[1:])) for region_id in keys: f.write(region_id + '\t') - f.write('\t'.join([str(regions[region_id][field]) for field in fields])) + f.write('\t'.join([str(regions[region_id][field]) + for field in fields])) f.write('\n') f.close() @@ -81,11 +98,12 @@ def gap_columns(seqs): g = 0 for i in range(len(seqs[0])): for seq in seqs: - if seq[i] == gp.gap_symbol: + if seq[i] == '-': # gp.gap_symbol: g += 1 break return g + def longest_consecutive(s, c): max_consecutive = 0 current_consecutive = 0 @@ -114,9 +132,9 @@ def masked_columns(seqs): mask = False gap = False for s in range(num_seqs): - if seqs[s][ps] == gp.gap_symbol: + if seqs[s][ps] == '-': # gp.gap_symbol: gap = True - elif seqs[s][ps] == gp.masked_symbol: + elif seqs[s][ps] == 'x': # gp.masked_symbol: mask = True if mask: mask_total += 1 @@ -124,6 +142,7 @@ def masked_columns(seqs): mask_non_gap_total += 1 return mask_total, mask_non_gap_total + def index_by_reference(ref_seq, seq): # return dictionary keyed by reference index, with value the # corresponding index in non-reference sequence @@ -132,20 +151,14 @@ def index_by_reference(ref_seq, seq): ri = 0 si = 0 for i in range(len(ref_seq)): - if ref_seq[i] != gp.gap_symbol: + if ref_seq[i] != '-': # gp.gap_symbol: d[ri] = si ri += 1 - if seq[i] != gp.gap_symbol: + if seq[i] != '-': # gp.gap_symbol: si += 1 return d -def index_alignment_by_reference(ref_seq): - # want a way to go from reference sequence coordinate to index in - # alignment - return np.where(ref_seq != gp.gap_symbol)[0] - - def num_sites_between(sites, start, end): # sites are sorted i = bisect.bisect_left(sites, start) @@ -153,172 +166,850 @@ def num_sites_between(sites, start, end): return j - i, sites[i:j] -def read_masked_intervals(fn): - with open(fn, 'r') as reader: - reader.readline() # header - ints = [] - for line in reader: - line = line.split() - ints.append((int(line[0]), int(line[2]))) - - return ints - - -def convert_intervals_to_sites(ints): - sites = [] - for start, end in ints: 
- sites += range(start, end + 1) - return np.array(sites) - - -def seq_id_hmm(seq1, seq2, offset, include_sites): - sites = np.array(include_sites) - offset - - info_gap = np.logical_or(seq1 == gp.gap_symbol, - seq2 == gp.gap_symbol) - info_unseq = np.logical_or(seq1 == gp.unsequenced_symbol, - seq2 == gp.unsequenced_symbol) - info_match = seq1 == seq2 - info_hmm = np.zeros(info_match.shape, bool) - sites = sites[np.logical_and(sites < len(info_match), sites >= 0)] - info_hmm[sites] = True - - total_sites = np.sum(info_hmm) - total_moatch = np.sum(np.logical_and(info_hmm, info_match)) - - # check all included are not gapped or skipped - include_in_skip = np.logical_and( - info_hmm, np.logical_or( - info_unseq, info_gap)) - if np.any(include_in_skip): - ind = np.where(include_in_skip)[0][0] - raise AssertionError(f'{seq1[ind]} {seq2[ind]} {ind}') - - return total_match, total_sites, \ - {'gap_flag': info_gap, 'unseq_flag': info_unseq, - 'hmm_flag': info_hmm, 'match': info_match} - - -def seq_id_unmasked(seq1, seq2, offset, exclude_sites1, exclude_sites2): - # total_sites is number of sites at which neither sequence is - # masked or has a gap or unsequenced character; total_match is the - # number of those sites at which the two sequences match - # gapped and unsequenced locations - info_gap = np.logical_or(seq1 == gp.gap_symbol, - seq2 == gp.gap_symbol) - info_unseq = np.logical_or(seq1 == gp.unsequenced_symbol, - seq2 == gp.unsequenced_symbol) - - # convert offset excluded sites to boolean array - info_mask = np.zeros(seq1.shape, bool) - if exclude_sites1 != []: - sites1 = np.array(exclude_sites1) - offset - sites1 = sites1[np.logical_and(sites1 < len(info_gap), - sites1 >= 0)] - info_mask[sites1] = True - if exclude_sites2 != []: - sites2 = np.array(exclude_sites2) - offset - sites2 = sites2[np.logical_and(sites2 < len(info_gap), - sites2 >= 0)] - info_mask[sites2] = True - - # find sites that are not masked, gapped, or unsequenced - sites = np.logical_not( - np.logical_or( - info_mask, - np.logical_or( - info_gap, info_unseq))) - - # determine totals - total_sites = np.sum(sites) - total_match = np.sum( - np.logical_and( - seq1 == seq2, - sites)) - - return total_match, total_sites, {'mask_flag': info_mask} - - n = len(seq1) - total_sites = 0 - total_match = 0 - - skip = [gp.gap_symbol, gp.unsequenced_symbol] - info_mask = [False for i in range(n)] - for i in range(n): +class Summarizer(): + ''' + Summarize region quality of each region + ''' + def __init__(self, configuration: Configuration): + self.config = configuration + + def validate_arguments(self): + ''' + Check that all required instance variables are set to perform a + summarize run. 
Returns true if valid, raises value error otherwise + ''' + args = [ + 'chromosomes', + 'labeled_blocks', + 'quality_blocks', + 'masks', + 'alignment', + 'positions', + 'regions', + 'region_index', + 'known_states', + 'unknown_states', + 'states', + 'symbols' + ] + variables = self.config.__dict__ + for arg in args: + if arg not in variables or variables[arg] is None: + err = ('Failed to validate Summarizer, required argument ' + f"'{arg}' was unset") + log.exception(err) + raise ValueError(err) + + reference = self.config.get('analysis_params.reference') + if reference is None: + err = f'Configuration did not specify a reference strain' + log.exception(err) + raise ValueError(err) + + return True + + def run(self, states: List[str] = None): + ''' + Summarize region quality of each region for the states specified + ''' + ref_ind, states = self.states_to_process(states) + + log.debug(f'reference index: {ref_ind}') + log.debug(f'states to analyze: {states}') + + known_states = self.config.known_states + log.debug(f'known_states {known_states}') + + analyzer = Sequence_Analyzer( + self.config.masks, + self.config.alignment, + self.config.known_states, + self.config.interval_states, + self.config.chromosomes, + self.config.symbols) + + log.debug(f'Sequence_Analyzer init with:') + log.debug(f'masks: {self.config.masks}') + log.debug(f'alignment: {self.config.alignment}') + + analyzer.build_masked_sites() + + for ind, state in enumerate(states): + log.info(f'Working on state {state}') + state_ind = self.config.states.index(state) + + with Position_Reader( + self.config.positions + ) as positions,\ + Region_Writer( + self.config.regions.format(state=state), + self.config.region_index.format(state=state), + known_states + ) as region_writer,\ + Quality_Writer( + self.config.quality_blocks.format(state=state) + ) as quality_writer,\ + ExitStack() as stack: + + progress_bar = None + if self.config.log_file: + progress_bar = stack.enter_context( + click.progressbar( + length=len(self.config.chromosomes), + label=f'State {ind+1} of {len(states)}')) + + for chrm in self.config.chromosomes: + log.info(f'Working on chromosome {chrm}') + region = Region_Database( + self.config.labeled_blocks.format(state=state), + chrm, + known_states) + + for strain, ps in positions.get_positions(region, chrm): + log.debug(f'{strain} {chrm}') + + analyzer.process_alignment(ref_ind, + state_ind, + chrm, + strain, + ps, + region, + region_writer) + + quality_writer.write_quality(region) + + if progress_bar: + progress_bar.update(1) + + def states_to_process(self, + states: List[str] = None) -> Tuple[int, + List[str]]: + ''' + Set the states to summarize to the values passed in. + If no values are specified, run all states in config + Checks if states are in config, warning if a state is not + found and raising an error if none of the states are in config. 
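+        As an illustrative example, with configured states
+        ['cer', 'par', 'bay'] and reference 'cer' (hypothetical names),
+        states_to_process(['par', 'oops']) returns (0, ['par']) and logs a
+        warning for 'oops'.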
+        '''
+        reference = self.config.get('analysis_params.reference.name')
+        ref_ind = self.config.states.index(reference)
+
+        if states is None or states == []:
+            to_process = self.config.states
-        if binary_search.present(exclude_sites1, i + offset) or \
-                binary_search.present(exclude_sites2, i + offset):
-            info_mask[i] = True
-            continue
-        if seq1[i] not in skip and seq2[i] not in skip:
-            total_sites += 1
-            if seq1[i] == seq2[i]:
-                total_match += 1
-
-    # TODO: keep track of gapped/masked sites for master/predicted to
-    # incorporate into info string later
-    return total_match, total_sites, {'mask_flag': info_mask}
-
-
-def make_info_string_unknown(info, master_ind):
-
-    # used with indices to decode result
-    decoder = np.array(list('Xx._-'))
-    indices = np.zeros(info['gap_any_flag'].shape, int)
+        else:
+            to_process = []
+            for s in states:
+                if s in self.config.states:
+                    to_process.append(s)
+                else:
+                    log.warning(f"state '{s}' was not found in the "
+                                'configured states')
+
+        if to_process == []:
+            err = 'No valid states were found to process'
+            log.exception(err)
+            raise ValueError(err)
+
+        return ref_ind, to_process
+
+
+class Flag_Info():
+    '''
+    Collection of boolean flags for sequence summary
+    '''
+    def __init__(self):
+        self.gap_any = None
+        self.mask_any = None
+        self.unseq_any = None
+        self.hmm = None
+        self.gap = None
+        self.mask = None
+        self.unseq = None
+        self.match = None
+
+    def initialize_flags(self, number_sequences: int, number_states: int):
+        '''
+        Initialize internal flags to np arrays of false
+        '''
+        self.gap_any = np.zeros((number_sequences), bool)
+        self.mask_any = np.zeros((number_sequences), bool)
+        self.unseq_any = np.zeros((number_sequences), bool)
+        self.gap = np.zeros((number_sequences, number_states), bool)
+        self.mask = np.zeros((number_sequences, number_states), bool)
+        self.unseq = np.zeros((number_sequences, number_states), bool)
+        self.match = np.zeros((number_sequences, number_states), bool)
+
+    def add_sequence_flags(self, other: Flag_Info, state: int):
+        '''
+        Join the other flag info with this info by replacing values
+        in the gap, unseq, and match arrays and performing OR with anys
+        '''
+        # only write the first time
+        if state == 0:
+            self.hmm = other.hmm
+
+        self.gap_any = np.logical_or(self.gap_any, other.gap)
+        self.unseq_any = np.logical_or(self.unseq_any, other.unseq)
+
+        self.gap[:, state] = other.gap
+        self.unseq[:, state] = other.unseq
+        self.match[:, state] = other.match
+
+    def add_mask_flags(self, other: Flag_Info, state: int):
+        '''
+        Join the other flag info with this by replacing values in mask and
+        performing an OR with mask_any
+        '''
+        self.mask_any = np.logical_or(self.mask_any, other.mask)
+        self.mask[:, state] = other.mask
+
+    def encode_info(self,
+                    master_ind: int,
+                    predict_ind: int) -> str:
+        '''
+        Summarize info flags into a string. master_ind is the index of
+        the master reference state. predict_ind is the index of the predicted
+        state. The return string is encoded for each position as:
+        '-': either master or predict has a gap
+        '_': either master or predict is masked
+        '.': all states match
+        'b': both predict and master match
+        'c': master matches but not predict
+        'p': predict matches but not master
+        'x': no other condition applies
+        If the position is an HMM site, x, p, c, or b is capitalized.
+        Symbols are listed in order of precedence, e.g. if a position
+        satisfies both '-' and '.', it will be '-'.
+        '''
+
+        if predict_ind >= self.match.shape[1]:
+            return self.encode_unknown_info(master_ind)
+
+        decoder = np.array(list('xXpPcCbB._-'))
+        indices = np.zeros(self.match.shape[0], int)
+
+        indices[self.match[:, predict_ind]] += 2  # x to p if true
+        indices[self.match[:, master_ind]] += 4  # x to c, p to b
+        indices[self.hmm] += 1  # to upper
+
+        matches = np.all(self.match, axis=1)
+        indices[matches] = 8  # .
+        indices[np.any(
+            self.mask[:, [master_ind, predict_ind]],
+            axis=1)] = 9  # _
+        indices[np.any(
+            self.gap[:, [master_ind, predict_ind]],
+            axis=1)] = 10  # -
+
+        return ''.join(decoder[indices])
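The precedence arithmetic in `encode_info` is easy to misread, so here is a small self-contained sketch, separate from the patch itself, that replays the same decoding on an invented three-state flag matrix (state 0 is the master reference, state 1 the predicted state):

```python
import numpy as np

# Invented flags for six alignment columns over three known states.
decoder = np.array(list('xXpPcCbB._-'))

match = np.array([[True,  True,  False],   # master & predict match -> 'b'
                  [True,  False, False],   # master only            -> 'c'
                  [False, True,  False],   # predict only           -> 'p'
                  [False, False, False],   # neither                -> 'x'
                  [True,  True,  True],    # every state matches    -> '.'
                  [False, True,  False]])  # predict match, HMM site -> 'P'
hmm = np.array([False, False, False, False, False, True])
gap = np.zeros(match.shape, bool)
mask = np.zeros(match.shape, bool)
gap[3, 0] = True  # a gap in master outranks everything else -> '-'

indices = np.zeros(match.shape[0], int)
indices[match[:, 1]] += 2            # x -> p where predict matches
indices[match[:, 0]] += 4            # x -> c, p -> b where master matches
indices[hmm] += 1                    # uppercase positions used by the HMM
indices[np.all(match, axis=1)] = 8   # '.', all states match
indices[np.any(mask[:, [0, 1]], axis=1)] = 9   # '_', masked
indices[np.any(gap[:, [0, 1]], axis=1)] = 10   # '-', gapped

print(''.join(decoder[indices]))  # -> 'bcp-.P'
```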
+
+    def encode_unknown_info(self,
+                            master_ind: int) -> str:
+        '''
+        Summarize info flags into a string for the unknown state.
+        master_ind is the index of the master reference state.
+        The return string is encoded for each position as:
+        '-': any state has a gap
+        '_': any state is masked
+        '.': all states match
+        'x': master matches
+        'X': no other condition applies
+        Symbols are listed in order of precedence, e.g. if a position
+        satisfies both '-' and '.', it will be '-'.
+        '''
+
+        # used with indices to decode result
+        decoder = np.array(list('Xx._-'))
+        indices = np.zeros(self.gap_any.shape, int)
+
+        indices[self.match[:, master_ind]] = 1  # x
+        matches = np.all(self.match, axis=1)
+        indices[matches] = 2  # .
+        indices[self.mask_any] = 3  # _
+        indices[self.gap_any] = 4  # -
+
+        return ''.join(decoder[indices])
+
+
+class Sequence_Analyzer():
+    '''
+    Performs handling of masking, reading, and analyzing sequence data for
+    summarizing the sequences
+    '''
+    def __init__(self,
+                 mask_file: str,
+                 alignment_file: str,
+                 known_states: List,
+                 interval_states: List,
+                 chromosomes: List,
+                 symbols: Dict):
+        self.masks = mask_file
+        self.alignments = alignment_file
+        self.known_states = known_states
+        self.interval_states = interval_states
+        self.chromosomes = chromosomes
+        self.symbols = symbols
+
+    def build_masked_sites(self):
+        '''
+        Read in all interval files and store a dictionary of masked sites
+        on self.masked_sites, keyed first by chromosome, then state
+        '''
+        result = {}
+        for chrom in self.chromosomes:
+            result[chrom] = {}
+            for state, name in zip(self.known_states, self.interval_states):
+                result[chrom][state] = self.read_masked_sites(chrom, name)
+
+        self.masked_sites = result
+
+    def read_masked_sites(self, chrom: str, strain: str) -> np.array:
+        filename = self.masks.format(chrom=chrom, strain=strain)
+        intervals = self.read_masked_intervals(filename)
+        sites = self.convert_intervals_to_sites(intervals)
+        return sites
+
+    def convert_intervals_to_sites(self,
+                                   intervals: List[Tuple]) -> np.array:
+        '''
+        Given a list of (start, end) positions, returns a 1D np.array of
+        the sites contained in the intervals list, endpoints inclusive:
+        convert_intervals_to_sites([(1, 2), (4, 6)]) -> [1, 2, 4, 5, 6]
+        '''
+        sites = []
+        for start, end in intervals:
+            sites += range(start, end + 1)
+        return np.array(sites, dtype=int)
+
+    def read_masked_intervals(self,
+                              filename: str) -> List[Tuple[int, int]]:
+        '''
+        Read the interval file provided and return the start and end
+        positions as a list of tuples of 2 ints
+        '''
+        with open(filename, 'r') as reader:
+            reader.readline()  # header
+            intervals = []
+            for line in reader:
+                line = line.split()
+                intervals.append((int(line[0]), int(line[2])))
+
+        return intervals
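For reference, the interval round trip performed by the methods above can be sketched as follows; the file contents and header are hypothetical, but follow the column layout `read_masked_intervals` parses (start in column 0, inclusive end in column 2):

```python
import io
import numpy as np

# Hypothetical interval file: one header row, whitespace-delimited,
# start in column 0 and inclusive end in column 2 (middle column invented).
mask_file = io.StringIO('start stat end\n'
                        '10 0.9 12\n'
                        '40 0.8 41\n')

mask_file.readline()  # skip header, as read_masked_intervals does
intervals = [(int(f[0]), int(f[2]))
             for f in (line.split() for line in mask_file)]
print(intervals)  # [(10, 12), (40, 41)]

# Expand inclusive intervals to flat site indices,
# as convert_intervals_to_sites does.
sites = np.concatenate([np.arange(start, end + 1)
                        for start, end in intervals])
print(sites)  # [10 11 12 40 41]
```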
+
+    def get_stats(self,
+                  current_sequence,
+                  other_sequence,
+                  slice_start,
+                  aligned_index_positions,
+                  masked_site):
+        '''
+        Helper function to perform analyses on the sequences, returning
+        the results of seq_id_hmm, seq_id, and seq_id_unmasked
+        '''
+
+        # only alignment columns used by HMM (polymorphic, no
+        # gaps in any strain)
+        hmm_stats = self.seq_id_hmm(other_sequence,
+                                    current_sequence,
+                                    slice_start,
+                                    aligned_index_positions)
+
+        # all alignment columns, excluding ones with gaps in
+        # these two sequences
+        nongap_stats = seq_functions.seq_id(other_sequence,
+                                            current_sequence)
+
+        # all alignment columns, excluding ones with gaps or
+        # masked bases or unsequenced in *these two sequences*
+        nonmask_stats = self.seq_id_unmasked(other_sequence,
+                                             current_sequence,
+                                             slice_start,
+                                             masked_site[0],
+                                             masked_site[1])
+
+        return hmm_stats, nongap_stats, nonmask_stats
+
+    def seq_id_hmm(self,
+                   seq1: np.array,
+                   seq2: np.array,
+                   offset: int,
+                   include_sites: List[int]) -> Tuple[
+                       int, int, Flag_Info]:
+        '''
+        Compare two sequences and provide statistics of their overlap,
+        considering only the included sites.
+        Takes the two sequences to consider, an offset of the included sites,
+        and a list of the included sites.
+        Returns:
+        -the total number of matching sites, where seq1[i] == seq2[i] and
+        i is an element in include_sites - offset
+        -the total number of sites considered, i.e. where
+        include_sites - offset >= 0 and < len(seq)
+        -a Flag_Info object with:
+        -gap: true where seq1 or seq2 == gap_symbol
+        -unseq: true where seq1 or seq2 == unsequenced_symbol
+        -hmm: true at index i where i is an element of include_sites - offset
+        -match: true where seq1 == seq2, regardless of symbol
+        '''
+        sites = np.array(include_sites) - offset
+
+        info = Flag_Info()
+        info.gap = np.logical_or(seq1 == self.symbols['gap'],
+                                 seq2 == self.symbols['gap'])
+        info.unseq = np.logical_or(seq1 == self.symbols['unsequenced'],
+                                   seq2 == self.symbols['unsequenced'])
+        info.match = seq1 == seq2
+        info.hmm = np.zeros(info.match.shape, bool)
+        sites = sites[np.logical_and(sites < len(info.match), sites >= 0)]
+        info.hmm[sites] = True
+
+        total_sites = np.sum(info.hmm)
+        total_match = np.sum(np.logical_and(info.hmm, info.match))
+
+        # check all included are not gapped or skipped
+        include_in_skip = np.logical_and(
+            info.hmm, np.logical_or(
+                info.unseq, info.gap))
+        if np.any(include_in_skip):
+            ind = np.where(include_in_skip)[0][0]
+            err = ('Need to skip site specified as included '
+                   f'seq1: {seq1[ind]}, seq2: {seq2[ind]}, index: {ind}')
+            log.exception(err)
+            raise ValueError(err)
+
+        return total_match, total_sites, info
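A toy run of the `seq_id_hmm` bookkeeping, with invented sequences and coordinates; the HMM sites arrive in reference coordinates and are shifted into the alignment slice by the offset:

```python
import numpy as np

seq1 = np.array(list('ACGTAC'))
seq2 = np.array(list('ACCTAC'))
include_sites = [102, 104]  # HMM columns, in reference coordinates
offset = 100                # this slice starts at reference position 100

sites = np.array(include_sites) - offset          # -> [2, 4]
match = seq1 == seq2
hmm = np.zeros(match.shape, bool)
hmm[sites[(sites >= 0) & (sites < len(match))]] = True

total_sites = np.sum(hmm)          # 2 HMM columns fall in this slice
total_match = np.sum(hmm & match)  # 1: column 2 is a G/C mismatch
print(total_match, total_sites)    # 1 2
```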
+
+    def seq_id_unmasked(self,
+                        seq1: np.array,
+                        seq2: np.array,
+                        offset: int,
+                        exclude_sites1: List[int],
+                        exclude_sites2: List[int]) -> Tuple[
+                            int, int, Flag_Info]:
+        '''
+        Compare two sequences and provide statistics of their overlap,
+        excluding the specified sites.
+        Takes the two sequences and an offset applied to each excluded
+        site list.
+        Returns:
+        -total number of matching sites in non-excluded sites. A position is
+        excluded if it is an element of either excluded site list - offset,
+        or it is a gap or unsequenced symbol in either sequence.
+        -total number of non-excluded sites
+        -a Flag_Info object with:
+        -mask: a boolean array that is true if the position is in
+        either excluded list - offset
+        '''
+        info = Flag_Info()
+        info.gap = np.logical_or(seq1 == self.symbols['gap'],
+                                 seq2 == self.symbols['gap'])
+        info.unseq = np.logical_or(seq1 == self.symbols['unsequenced'],
+                                   seq2 == self.symbols['unsequenced'])
+        exclude_sites1 = np.array(exclude_sites1)
+        exclude_sites2 = np.array(exclude_sites2)
+
+        # convert offset excluded sites to boolean array
+        info.mask = np.zeros(seq1.shape, bool)
+        if exclude_sites1.size != 0:
+            sites1 = exclude_sites1 - offset
+            sites1 = sites1[np.logical_and(sites1 < len(info.gap),
+                                           sites1 >= 0)]
+            info.mask[sites1] = True
+
+        if exclude_sites2.size != 0:
+            sites2 = exclude_sites2 - offset
+            sites2 = sites2[np.logical_and(sites2 < len(info.gap),
+                                           sites2 >= 0)]
+            info.mask[sites2] = True
+
+        # find sites that are not masked, gapped, or unsequenced
+        sites = np.logical_not(
+            np.logical_or(
+                info.mask,
+                np.logical_or(
+                    info.gap, info.unseq)))
+
+        # determine totals
+        total_sites = np.sum(sites)
+        total_match = np.sum(
+            np.logical_and(
+                seq1 == seq2,
+                sites))
+
+        return total_match, total_sites, info
+
+    def process_alignment(self,
+                          reference_index: int,
+                          state_index: int,
+                          chromosome: str,
+                          strain: str,
+                          positions: np.array,
+                          region: Region_Database,
+                          region_writer: Region_Writer):
+        '''
+        Analyze the alignment of a given strain and chromosome at the
+        provided positions.
+        Results are stored in the provided region database
+        '''
+        sequences, alignments, masked_sites = self.get_indices(chromosome,
+                                                               strain)
+
+        # convert position indices from indices in master reference to
+        # indices in alignment
+        ps_align = alignments[reference_index][positions]
+
+        for i, (r_id, start, end) in enumerate(region.get_entries(strain)):
+            start, end = self.get_slice(start, end,
+                                        alignments[reference_index],
+                                        ps_align)
+
+            info = Flag_Info()
+            info.initialize_flags(
+                end - start + 1,
+                len(self.known_states))
+
+            for ind, state in enumerate(self.known_states):
+                hmm, nongap, nonmask = self.get_stats(
+                    sequences[-1][start:end + 1],
+                    sequences[ind][start:end + 1],
+                    start,
+                    ps_align,
+                    (masked_sites[ind],
+                     masked_sites[-1]))
+
+                region.set_region(strain, i, state,
+                                  hmm,
+                                  nongap,
+                                  nonmask)
+
+                info.add_sequence_flags(hmm[2], ind)
+                info.add_mask_flags(nonmask[2], ind)
+
+            info_string = info.encode_info(reference_index, state_index)
+
+            region_writer.write_header(r_id)
+            region_writer.write_sequences(
+                strain,
+                alignments,
+                sequences,
+                (start, end))
+            region_writer.write_info_string(info_string)
+
+            # and keep track of each symbol count
+            region.update_counts(strain, i, info_string)
+
+    def get_indices(self, chromosome: str, strain: str) -> Tuple:
+        '''
+        Get the sequences and different indices for the provided
+        chromosome and strain
+        Returned tuple contains:
+        -sequences as np.array
+        -alignment index arrays, one per sequence, mapping sequence
+        position to alignment column
+        -masked_sites, index aligned for each sequence
+        '''
+        _, sequences = read_fasta.read_fasta(
+            self.alignments.format(chrom=chromosome, strain=strain))
+
+        # to go from index in reference seq to index in alignment
+        alignments = [
+            self.index_alignment_by_reference(seq)
+            for seq in sequences
+        ]
+
+        masked = self.read_masked_sites(chromosome, strain)
+
+        masked_sites = [
+            alignments[ind][self.masked_sites[chromosome][state]]
+            for ind, state in enumerate(self.known_states)
+        ] + [alignments[-1][masked]]  # for strain
+
+        return sequences, alignments, masked_sites
+
+    def index_alignment_by_reference(self, sequence: np.array) -> np.array:
+        '''
+        Find the locations of non-gapped sites in the sequence, providing
+        a way to go from a reference sequence coordinate to an index in
+        the alignment
+        '''
+        return np.where(sequence != self.symbols['gap'])[0]
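The mapping `index_alignment_by_reference` builds is what lets `process_alignment` translate reference positions into alignment columns; a minimal sketch with an invented gapped sequence:

```python
import numpy as np

gap = '-'
aligned = np.array(list('AC--GT-A'))

# Column index of every ungapped base: alignment_index[k] answers
# "which alignment column holds reference base k?"
alignment_index = np.where(aligned != gap)[0]  # [0 1 4 5 7]

print(alignment_index[2])  # reference base 2 (the G) is alignment column 4

# Vectorized lookup, as in ps_align = alignments[reference_index][positions]
positions = np.array([0, 2, 3])
print(alignment_index[positions])  # [0 4 5]
```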
+
+    def get_slice(self,
+                  start: int,
+                  end: int,
+                  alignment: np.array,
+                  ps_align: np.array) -> Tuple[int, int]:
+        '''
+        Get the start and end positions of the index aligned sequence.
+        Checks that the positions are valid (in ps_align), and raises
+        a ValueError otherwise
+        '''
+        # index of start and end of region in aligned sequences
+        slice_start, slice_end = alignment[[start, end]]
+
+        if not np.in1d([slice_start, slice_end], ps_align).all():
+            err = 'Slice not found in position alignment'
+            log.exception(err)
+            raise ValueError(err)
+
+        return slice_start, slice_end
+
+
+class Region_Database():
+    '''
+    Contains the data and logic for region records during summarizing
+    '''
+    def __init__(self,
+                 labeled_file: str,
+                 chromosome: str,
+                 known_states: List[str]):
+        '''
+        Read in labeled file and store resulting table and labels
+        '''
+        self.info_string_symbols = list('.-_npbcxNPBCX')
+
+        self.label_prefixes = ['match_nongap',
+                               'num_sites_nongap',
+                               'match_hmm',
+                               'match_nonmask',
+                               'num_sites_nonmask']
+
+        self.data, self.labels = read_table.read_table_columns(
+            labeled_file,
+            sep='\t',
+            group_by='strain',
+            chromosome=chromosome)
+
+        if self.labels[0] != 'region_id':
+            err = 'Unexpected labeled format'
+            log.exception(err)
+            raise ValueError(err)
+
+        for strain, data in self.data.items():
+            n = len(data['region_id'])
+
+            for s in known_states:
+                for lbl in self.label_prefixes:
+                    data[f'{lbl}_{s}'] = [0] * n
+
+            for s in self.info_string_symbols:
+                data['count_' + s] = [0] * n
+
+        self.labels += [f'{lbl}_{st}' for lbl in self.label_prefixes
+                        for st in known_states]
+        self.labels += ['count_' + x for x in self.info_string_symbols]
+
+    def has_strain(self, strain: str) -> bool:
+        '''
+        Checks if the strain is in this database
+        '''
+        return strain in self.data
+
+    def get_entries(self, strain: str) -> Tuple[str, int, int]:
+        '''
+        Returns an iterator over the region entries of the strain,
+        yielding region id (string), start (int) and end (int) positions
+        '''
+        if not self.has_strain(strain):
+            err = f'Region Database does not contain strain {strain}'
+            log.exception(err)
+            raise ValueError(err)
+
+        r_ids = self.data[strain]['region_id']
+        starts = self.data[strain]['start']
+        ends = self.data[strain]['end']
+        for i in range(len(r_ids)):
+            yield (r_ids[i], int(starts[i]), int(ends[i]))
+
+    def set_region(self,
+                   strain: str,
+                   index: int,
+                   state: str,
+                   hmm, nongap, nonmask):
+        '''
+        Set the region state with the provided values.
+ hmm, nongap and nonmask are tuples of the (match, total) values + ''' + ds = self.data[strain] + MATCH, TOTAL = 0, 1 + if hmm[TOTAL] is not None: + ds['num_sites_hmm'][index] = hmm[TOTAL] + + ds[f'match_hmm_{state}'][index] = hmm[MATCH] + + ds[f'match_nongap_{state}'][index] = nongap[MATCH] + ds[f'num_sites_nongap_{state}'][index] = nongap[TOTAL] + + ds[f'match_nonmask_{state}'][index] = nonmask[MATCH] + ds[f'num_sites_nonmask_{state}'][index] = nonmask[TOTAL] + + def update_counts(self, + strain: str, + index: int, + info_string: str): + ''' + Update the counts variables based on the provided info string + ''' + counts = Counter(info_string) + for sym in self.info_string_symbols: + self.data[strain]['count_' + sym][index] = counts[sym] + + def generate_output(self): + ''' + Yield lines for writing to the quality output file. + To save memory, this effectively deletes the data structure! + Outputs are tab delimited, sorted by region_id + ''' + # reorganize output as list of tuples ordered by label + output = [] + # have to store this as dict changes during iterations + strains = list(self.data.keys()) + for strain in strains: + # pop to limit memory usage + d = self.data.pop(strain) + output += list(zip(*[d[l] for l in self.labels])) + + # sort by region id (index 0, remove r #[1:]) + for entry in sorted(output, key=lambda e: int(e[0][1:])): + yield '\t'.join([str(e) for e in entry]) + '\n' + + def generate_header(self): + ''' + Generate a header line for the region database + ''' + return '\t'.join(self.labels) + '\n' + + +class Region_Writer(): + ''' + Controls the writing of region files and indices + ''' + def __init__(self, + region_file: str, + index_file: str, + known_states: List[str]): + self.region_file = region_file + self.index_file = index_file + self.index = {} + self.known_states = known_states + + def __enter__(self): + self.region_writer = gzip.open(self.region_file, 'wt') + + return self + + def __exit__(self, type, value, traceback): + self.region_writer.close() + + if traceback is None: + # write index + with open(self.index_file, 'wb') as index_writer: + pickle.dump(self.index, index_writer) + return True - indices[info['match_flag'][:, master_ind]] = 1 # x - matches = np.all(info['match_flag'], axis=1) - indices[matches] = 2 # . 
- indices[info['mask_any_flag']] = 3 # _ - indices[info['gap_any_flag']] = 4 # - + else: + return False + + def write_header(self, region_id: str): + ''' + Add a header line with the region id + ''' + self.index[int(region_id[1:])] = self.region_writer.tell() + self.region_writer.write(f'#{region_id}\n') + + def write_sequences(self, + strain: str, + alignments: List, + sequences: np.array, + indices: Tuple): + ''' + Write sequences to region file + ''' + start, end = indices + names = self.known_states + [strain] + for sj, name in enumerate(names): + startj = bisect.bisect_left(alignments[sj], start) + endj = bisect.bisect_left(alignments[sj], end) + + self.region_writer.write(f'> {name} {startj} {endj}\n') + + self.region_writer.write(''.join( + sequences[sj][start:end+1]) + '\n') + + def write_info_string(self, info_string: str): + ''' + Write info string with header to region file + ''' + # write info string + self.region_writer.write('> info\n') + self.region_writer.write(info_string + '\n') + + +class Position_Reader(): + ''' + Read in position file, yielding positions until no longer on current + chromosome + ''' + + def __init__(self, position_file): + self.position_file = position_file + self.last_position = 0 + + def __enter__(self): + self.reader = gzip.open(self.position_file, 'rt') + return self + + def __exit__(self, type, value, traceback): + self.reader.close() + return traceback is None + + def get_positions(self, + region: Region_Database, + chromosome: str) -> Tuple[str, np.array]: + self.reader.seek(self.last_position) + line = self.next_line() + while line != '': + line = line.split('\t') + + chrm = line[1] + if chrm != chromosome: + break - return ''.join(decoder[indices]) + strain = line[0] + if not region.has_strain(strain): + line = self.next_line() + continue + yield strain, np.array(line[2:], dtype=int) -def make_info_string(info, master_ind, predict_ind): + line = self.next_line() - if predict_ind >= info['match_flag'].shape[1]: - return make_info_string_unknown(info, master_ind) + def next_line(self) -> str: + self.last_position = self.reader.tell() + line = self.reader.readline() + return line - decoder = np.array(list('xXpPcCbB._-')) - indices = np.zeros(info['match_flag'].shape[0], int) - indices[info['match_flag'][:, predict_ind]] += 2 # x to p if true - indices[info['match_flag'][:, master_ind]] += 4 # x to c, p to b - indices[info['hmm_flag']] += 1 # to upper +class Quality_Writer(): + ''' + Control writing of quality file from region database + ''' + def __init__(self, quality_filename): + self.filename = quality_filename + self.first_write = True - matches = np.all(info['match_flag'], axis=1) - indices[matches] = 8 # . 
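`write_header` records the byte offset of each region header with `tell()`, and `__exit__` pickles the resulting index; a minimal sketch of the write-then-seek pattern this enables, with hypothetical file names:

```python
import gzip
import pickle

# Record the offset of each '#r…' header while writing, as Region_Writer
# does, so regions can be read back individually later.
with gzip.open('regions.fa.gz', 'wt') as writer:
    index = {}
    for region_id, body in [('r1', 'ACGT'), ('r2', 'GGTA')]:
        index[int(region_id[1:])] = writer.tell()
        writer.write(f'#{region_id}\n{body}\n')

with open('regions.pkl', 'wb') as index_writer:
    pickle.dump(index, index_writer)

# Random access: jump straight to region 2 without scanning region 1.
with open('regions.pkl', 'rb') as f:
    index = pickle.load(f)
with gzip.open('regions.fa.gz', 'rt') as reader:
    reader.seek(index[2])
    print(reader.readline().strip())  # '#r2'
```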
- indices[np.any( - info['mask_flag'][:, [master_ind, predict_ind]], - axis=1)] = 9 # _ - indices[np.any( - info['gap_flag'][:, [master_ind, predict_ind]], - axis=1)] = 10 # - + def __enter__(self): + self.writer = open(self.filename, 'w') + return self - return ''.join(decoder[indices]) + def __exit__(self, type, value, traceback): + self.writer.close() + return traceback is None -def read_region_file(fn): - f = gzip.open(fn, 'rb') - d = {} - line = f.readline().decode() - while line != '': - region_id = line[1:-1] - line = f.readline().decode() - seqs = {} - while line[0] != '#': - line = line[:-1].split(' ') - strain = line[1] - seqs[strain] = {} - if len(line) > 2: - seqs[strain]['start'] = int(line[2]) - seqs[strain]['end'] = int(line[3]) - seqs[strain]['seq'] = f.readline().decode()[:-1] - line = f.readline().decode() - if line == '': - break - d[region_id] = seqs + def write_quality(self, region: Region_Database): + ''' + Writes header if needed and region database values + ''' + if self.first_write is True: + self.writer.write(region.generate_header()) + self.first_write = False - f.close() - return d - + for line in region.generate_output(): + self.writer.write(line) diff --git a/code/analyze/summarize_region_quality_main.py b/code/analyze/summarize_region_quality_main.py deleted file mode 100644 index 7e6c2a9..0000000 --- a/code/analyze/summarize_region_quality_main.py +++ /dev/null @@ -1,281 +0,0 @@ -import sys -import os -import gzip -from analyze import read_args -from analyze.summarize_region_quality import (convert_intervals_to_sites, - read_masked_intervals, - index_alignment_by_reference, - seq_id_hmm, - seq_id_unmasked, - make_info_string) -import global_params as gp -from misc import read_fasta -from misc import read_table -from misc import seq_functions -import numpy as np -import bisect -import pickle - - -def main(): - - args = read_args.process_predict_args(sys.argv[2:]) - - task_ind = int(sys.argv[1]) - species_ind = task_ind - - species_from = args['states'][species_ind] - - base_dir = gp.analysis_out_dir_absolute + args['tag'] - - regions_dir = f'{base_dir}/regions/' - if not os.path.isdir(regions_dir): - os.mkdir(regions_dir) - - quality_writer = None - positions = gzip.open(f'{base_dir}/positions_{args["tag"]}.txt.gz', 'rt') - line_number = 0 - - region_writer = gzip.open( - f'{regions_dir}{species_from}{gp.fasta_suffix}.gz', 'wt') - region_index = {} - - for chrm in gp.chrms: - # region_id strain chromosome predicted_species start end num_non_gap - regions_chrm, labels = read_table.read_table_columns( - f'{base_dir}/blocks_{species_from}_{args["tag"]}_labeled.txt', - '\t', - group_by='strain', - chromosome=chrm - ) - - for strain in regions_chrm: - n = len(regions_chrm[strain]['region_id']) - - for s in args['known_states']: - regions_chrm[strain]['match_nongap_' + s] = [0] * n - regions_chrm[strain]['num_sites_nongap_' + s] = [0] * n - regions_chrm[strain]['match_hmm_' + s] = [0] * n - regions_chrm[strain]['match_nonmask_' + s] = [0] * n - regions_chrm[strain]['num_sites_nonmask_' + s] = [0] * n - - info_string_symbols = list('.-_npbcxNPBCX') - for s in info_string_symbols: - regions_chrm[strain]['count_' + s] = [0] * n - - # get masked sites for all references, not just the current - # species_from we're considering regions from - masked_sites_refs = {} - for s, state in enumerate(args['known_states']): - masked_sites_refs[s] = \ - convert_intervals_to_sites( - read_masked_intervals( - f'{gp.mask_dir}{state}' - f'_chr{chrm}_intervals.txt')) - - # loop through 
chromosomes and strains, followed by species of - # introgression so that we only have to read each alignment in once - # move to last read chromosome - positions.seek(line_number) - line = positions.readline() - while line != '': - line = line.split('\t') - - current_chrm = line[1] - if current_chrm != chrm: - break - - strain = line[0] - if strain not in regions_chrm: - # record current position in case need to re read line - line_number = positions.tell() - line = positions.readline() - continue - - print(strain, chrm) - - # indices of alignment columns used by HMM - ps = np.array([int(x) for x in line[2:]]) - - headers, seqs = read_fasta.read_fasta( - args['setup_args']['alignments_directory'] + \ - '_'.join(args['known_states']) - + f'_{strain}_chr{chrm}_mafft{gp.alignment_suffix}') - - # to go from index in reference seq to index in alignment - ind_align = [] - for seq in seqs: - ind_align.append(index_alignment_by_reference(seq)) - - masked_sites = convert_intervals_to_sites( - read_masked_intervals( - f'{gp.mask_dir}{strain}_chr{chrm}_intervals.txt')) - - masked_sites_ind_align = [] - for s in range(len(args['known_states'])): - masked_sites_ind_align.append( - ind_align[s][masked_sites_refs[s]]) - - # add in sequence of query strain - masked_sites_ind_align.append( - ind_align[-1][masked_sites]) - - # convert position indices from indices in master reference to - # indices in alignment - ps_ind_align = ind_align[0][ps] - - # loop through all regions for the specified chromosome and the - # current strain - for i in range(len(regions_chrm[strain]['region_id'])): - r_id = regions_chrm[strain]['region_id'][i] - start = regions_chrm[strain]['start'][i] - end = regions_chrm[strain]['end'][i] - - # calculate: - # - identity with each reference - # - fraction of region that is gapped/masked - - # index of start and end of region in aligned sequences - slice_start = ind_align[0][int(start)] - slice_end = ind_align[0][int(end)] - assert slice_start in ps_ind_align, \ - f'{slice_start} {start} {r_id}' - assert slice_end in ps_ind_align, \ - f'{slice_end} {end} {r_id}' - - seqx = seqs[-1][slice_start:slice_end + 1] - len_seqx = slice_end - slice_start + 1 - len_states = len(args['known_states']) - - # . 
= all match - # - = gap in one or more sequences - # p = matches predicted reference - - info = {'gap_any_flag': np.zeros((len_seqx), bool), - 'mask_any_flag': np.zeros((len_seqx), bool), - 'unseq_any_flag': np.zeros((len_seqx), bool), - 'hmm_flag': np.zeros((len_seqx), bool), - 'gap_flag': np.zeros((len_seqx, len_states), bool), - 'mask_flag': np.zeros((len_seqx, len_states), bool), - 'unseq_flag': np.zeros((len_seqx, len_states), bool), - 'match_flag': np.zeros((len_seqx, len_states), bool)} - - for sj, statej in enumerate(args['known_states']): - seqj = seqs[sj][slice_start:slice_end+1] - - # only alignment columns used by HMM (polymorphic, no - # gaps in any strain) - total_match_hmm, total_sites_hmm, infoj = \ - seq_id_hmm(seqj, seqx, slice_start, ps_ind_align) - - if statej == species_from \ - or species_ind >= len(args['known_states']): - regions_chrm[strain]['num_sites_hmm'][i] = \ - total_sites_hmm - - # only write once, the first index - if sj == 0: - info['hmm_flag'] = infoj['hmm_flag'] - - info['gap_any_flag'] = np.logical_or( - info['gap_any_flag'], infoj['gap_flag']) - info['unseq_any_flag'] = np.logical_or( - info['unseq_any_flag'], infoj['unseq_flag']) - info['gap_flag'][:, sj] = infoj['gap_flag'] - info['unseq_flag'][:, sj] = infoj['unseq_flag'] - info['match_flag'][:, sj] = infoj['match'] - - regions_chrm[strain][f'match_hmm_{statej}'][i] = \ - total_match_hmm - - # all alignment columns, excluding ones with gaps in - # these two sequences - total_match_nongap, total_sites_nongap = \ - seq_functions.seq_id(seqj, seqx) - - regions_chrm[strain][f'match_nongap_{statej}'][i] =\ - total_match_nongap - regions_chrm[strain][f'num_sites_nongap_{statej}'][i] =\ - total_sites_nongap - - # all alignment columns, excluding ones with gaps or - # masked bases or unsequenced in *these two sequences* - total_match_nonmask, total_sites_nonmask, infoj = \ - seq_id_unmasked(seqj, seqx, slice_start, - masked_sites_ind_align[sj], - masked_sites_ind_align[-1]) - - info['mask_any_flag'] = np.logical_or( - info['mask_any_flag'], infoj['mask_flag']) - info['mask_flag'][:, sj] = infoj['mask_flag'] - - regions_chrm[strain][f'match_nonmask_{statej}'][i] = \ - total_match_nonmask - regions_chrm[strain][f'num_sites_nonmask_{statej}'][i] = \ - total_sites_nonmask - - region_index[int(r_id[1:])] = region_writer.tell() - region_writer.write(f'#{r_id}\n') - names = args['known_states'] + [strain] - for sj in range(len(names)): - # write sequence to region alignment file, along with - # start and end coordinates - startj = bisect.bisect_left(ind_align[sj], slice_start) - endj = bisect.bisect_left(ind_align[sj], slice_end) - - region_writer.write(f'> {names[sj]} {startj} {endj}\n') - region_writer.write( - ''.join(seqs[sj][slice_start:slice_end+1]) + '\n') - - # also write string with info about each site - info_string = make_info_string(info, 0, species_ind) - region_writer.write('> info\n') - region_writer.write(info_string + '\n') - - # TODO this can be made faster with numpy - # and keep track of each symbol count - for sym in info_string_symbols: - regions_chrm[strain]['count_' + sym][i] = \ - info_string.count(sym) - - # record current position in case need to re read line - line_number = positions.tell() - line = positions.readline() - sys.stdout.flush() - - labels += ['match_nongap_' + x for x in args['known_states']] - labels += ['num_sites_nongap_' + x for x in args['known_states']] - labels += ['match_hmm_' + x for x in args['known_states']] - labels += ['match_nonmask_' + x for x in 
args['known_states']] - labels += ['num_sites_nonmask_' + x for x in args['known_states']] - labels += ['count_' + x for x in info_string_symbols] - - assert labels[0] == 'region_id', 'Unexpected labeled format' - - # write on first execution - if quality_writer is None: - quality_writer = open(f'{base_dir}/blocks_{species_from}' - f'_{args["tag"]}_quality.txt', 'w') - - quality_writer.write('\t'.join(labels) + '\n') - - # reorganize output as list of tuples ordered by label - output = [] - strains = list(regions_chrm.keys()) - for strain in strains: - # pop to limit memory usage - d = regions_chrm.pop(strain) - output += list(zip(*[d[l] for l in labels])) - - # sort by region id (index 0, remove r) - for entry in sorted(output, key=lambda e: int(e[0][1:])): - quality_writer.write('\t'.join([str(e) for e in entry]) + '\n') - - quality_writer.close() - region_writer.close() - with open(f'{regions_dir}{species_from}.pkl', 'wb') as index: - pickle.dump(region_index, index) - - -if __name__ == '__main__': - main() diff --git a/code/analyze/summarize_strain_states.py b/code/analyze/summarize_strain_states.py new file mode 100644 index 0000000..24a1b44 --- /dev/null +++ b/code/analyze/summarize_strain_states.py @@ -0,0 +1,208 @@ +from analyze.introgression_configuration import Configuration +import logging as log +import itertools +from misc import read_table +from typing import List +from contextlib import ExitStack +import click + + +class Strain_Summarizer(): + def __init__(self, configuration: Configuration): + self.config = configuration + + def validate_arguments(self): + ''' + Check that all required instance variables are set to perform a + strain summary run. Returns true if valid, raises value error otherwise + ''' + args = [ + 'known_states', + 'introgressed_intermediate', + 'ambiguous_intermediate', + 'strain_info', + 'state_counts', + ] + variables = self.config.__dict__ + for arg in args: + if arg not in variables or variables[arg] is None: + err = ('Failed to validate strain summarizer,' + f" required argument '{arg}' was unset") + log.exception(err) + raise ValueError(err) + + return True + + def run(self): + ''' + Generate summary information for the state of + each position in the sequence + ''' + self.validate_arguments() + + summary = Summary_Table() + + states = self.config.known_states[1:] + with ExitStack() as stack: + progress_bar = None + if self.config.log_file: + progress_bar = stack.enter_context( + click.progressbar( + length=len(states), + label='State')) + for species_from in states: + + log.info(species_from) + + regions1, _ = read_table.read_table_rows( + self.config.introgressed_intermediate.format( + state=species_from), '\t') + regions2, _ = read_table.read_table_rows( + self.config.ambiguous_intermediate.format( + state=species_from), '\t') + + for region_id in regions1: + region1 = regions1[region_id] + + strain = region1['strain'] + length = int(region1['end']) - int(region1['start']) + 1 + + summary.set_region(strain, species_from, length) + summary.region_found() + + if region1['reason'] != '': # failed filter + continue + + summary.region_passes_filter1() + + region2 = regions2[region_id] + summary.record_alt_species( + region2['alternative_states'].split(',')) + + if progress_bar: + progress_bar.update(1) + + with open(self.config.strain_info, 'r') as reader: + summary.add_strain_info(reader) + + with open(self.config.state_counts, 'w') as writer: + summary.write_summary(states, writer) + + +class Summary_Table(): + def __init__(self): + self.table = {} 
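`Summary_Table` re-implements, with explicit dictionaries, the counting that the deleted script below did with a nested `defaultdict`; a sketch of the equivalence, with invented strain and field names:

```python
from collections import defaultdict

# The deleted script's approach: a nested defaultdict of counters.
table = defaultdict(lambda: defaultdict(int))
table['yjm978']['num_regions_par'] += 1
table['yjm978']['num_bases_par'] += 1500
table['yjm978']['num_bases_total'] += 1500

# Plain dicts let absent fields be emitted explicitly as 0 when the
# summary is written, as write_summary does.
row = table['yjm978']
fields = ['num_regions_par', 'num_regions_bay', 'num_bases_total']
print([row[f] if f in row else 0 for f in fields])  # [1, 0, 1500]
```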
+ + def set_region(self, strain, species, length): + self.strain = strain + self.species = species + self.length = length + + def record_element(self, + strain: str, + key: str, + count: int = 1): + ''' + Increment the count of table[strain][key], adding new values as needed + ''' + + if strain not in self.table: + self.table[strain] = {} + + t = self.table[strain] + if key not in t: + t[key] = 0 + + t[key] += count + + def record_region(self, + strain: str, + species: str, + length: int, + suffix: str = "", + update_total: bool = True): + ''' + Record a region of provided length. + ''' + if suffix and suffix[0] != '_': + suffix = '_' + suffix + + self.record_element(strain, f'num_regions_{species}{suffix}', 1) + self.record_element(strain, f'num_bases_{species}{suffix}', length) + if update_total: + self.record_element(strain, f'num_bases_total{suffix}', length) + self.record_element(strain, f'num_regions_total{suffix}', 1) + + def record_alt_species(self, alt_states: List): + for species in alt_states: + self.record_alt(species) + + if len(alt_states) == 1: + self.record_region(self.strain, self.species, + self.length, '_filtered2') + else: + self.record_element(self.strain, + ('num_bases_' + + '_or_'.join(sorted(alt_states)) + + '_filtered2i'), + self.length) + + self.record_element(self.strain, + f'num_bases_{len(alt_states)}_filtered2i', + self.length) + + def region_found(self): + self.record_region(self.strain, self.species, self.length) + + def region_passes_filter1(self): + self.record_region(self.strain, self.species, + self.length, '_filtered1') + + def record_alt(self, alt_species): + self.record_region(self.strain, alt_species, + self.length, '_filtered2_inclusive', + self.species == alt_species) + + def add_strain_info(self, reader): + for line in reader: + strain, _, _, geo, env, pop = line[:-1].split('\t') + strain = strain.lower() + if strain in self.table: + d = self.table[strain] + d['population'] = pop + d['geographic_origin'] = geo + d['environmental_origin'] = env + + def write_summary(self, states, writer): + fields = self.get_fields(states) + + # write header + writer.write('strain\t' + '\t'.join(fields) + '\n') + + for strain in sorted(self.table.keys()): + row = self.table[strain] + entries = [row[field] + if field in row + else 0 + for field in fields] + + entries = [str(s) for s in [strain] + entries] + + writer.write('\t'.join(entries) + '\n') + + def get_fields(self, states): + fields = ['population', 'geographic_origin', 'environmental_origin'] +\ + [f'num_{thing}_{state}{value}' + for thing in ('regions', 'bases') + for value in ('', '_filtered1', + '_filtered2', '_filtered2_inclusive') + for state in states + ['total'] + ] + + r = sorted(states) + for n in range(2, len(r)+1): + fields += [f'num_bases_{"_or_".join(combo)}_filtered2i' + for combo in itertools.combinations(r, n)] + fields += [f'num_bases_{n}_filtered2i'] + + return fields diff --git a/code/analyze/summarize_strain_states_main.py b/code/analyze/summarize_strain_states_main.py deleted file mode 100644 index d161570..0000000 --- a/code/analyze/summarize_strain_states_main.py +++ /dev/null @@ -1,117 +0,0 @@ -import re -import sys -import os -import copy -import itertools -import gene_predictions -import predict -from collections import defaultdict -from filter_helpers import * -sys.path.insert(0, '..') -import global_params as gp -sys.path.insert(0, '../misc/') -import read_table -import read_fasta - -args = predict.process_predict_args(sys.argv[1:]) - -d = defaultdict(lambda: defaultdict(int)) 
-for species_from in args['known_states'][1:]: - - print species_from - - fn_filtered1i = gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'blocks_' + species_from + \ - '_' + args['tag'] + '_filtered1intermediate.txt' - fn_filtered2i = gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'blocks_' + species_from + \ - '_' + args['tag'] + '_filtered2intermediate.txt' - - regions1, fields1 = read_table.read_table_rows(fn_filtered1i, '\t') - regions2, fields2 = read_table.read_table_rows(fn_filtered2i, '\t') - - for region_id in regions1: - - strain = regions1[region_id]['strain'] - length = int(regions1[region_id]['end']) - int(regions1[region_id]['start']) + 1 - d[strain]['num_regions_' + species_from] += 1 - d[strain]['num_regions_total'] += 1 - d[strain]['num_bases_' + species_from] += length - d[strain]['num_bases_total'] += length - if regions1[region_id]['reason'] == '': - d[strain]['num_regions_' + species_from + '_filtered1'] += 1 - d[strain]['num_regions_total_filtered1'] += 1 - d[strain]['num_bases_' + species_from + '_filtered1'] += length - d[strain]['num_bases_total_filtered1'] += length - - alt_states = regions2[region_id]['alternative_states'].split(',') - for species_from_alt in alt_states: - d[strain]['num_regions_' + species_from_alt + \ - '_filtered2_inclusive'] += 1 - d[strain]['num_bases_' + species_from_alt + \ - '_filtered2_inclusive'] += length - if species_from_alt == species_from: - d[strain]['num_regions_total_filtered2_inclusive'] += 1 - d[strain]['num_bases_total_filtered2_inclusive'] += length - - if len(alt_states) == 1: - d[strain]['num_regions_' + species_from + \ - '_filtered2'] += 1 - d[strain]['num_regions_total_filtered2'] += 1 - d[strain]['num_bases_' + species_from + \ - '_filtered2'] += length - d[strain]['num_bases_total_filtered2'] += length - - - else: - d[strain]['num_bases_' + '_or_'.join(sorted(alt_states)) + '_filtered2i'] += length - - d[strain]['num_bases_' + str(len(alt_states)) + '_filtered2i'] += length - - -strain_info = [line[:-1].split('\t') for line in open('../../100_genomes_info.txt', 'r')] -strain_origins = dict(zip([x[0].lower() for x in strain_info], \ - [(x[5], x[3], x[4]) for x in strain_info])) -for strain in d.keys(): - d[strain]['population'] = strain_origins[strain][0] - d[strain]['geographic_origin'] = strain_origins[strain][1] - d[strain]['environmental_origin'] = strain_origins[strain][2] - -fn = gp.analysis_out_dir_absolute + args['tag'] + '/' + 'state_counts_by_strain.txt' -f = open(fn, 'w') -fields = [] - -fields += ['population', 'geographic_origin', 'environmental_origin'] - -fields += ['num_regions_' + x for x in args['known_states'][1:]] -fields += ['num_regions_total'] -fields += ['num_regions_' + x + '_filtered1' for x in args['known_states'][1:]] -fields += ['num_regions_total_filtered1'] -fields += ['num_regions_' + x + '_filtered2' for x in args['known_states'][1:]] -fields += ['num_regions_total_filtered2'] -fields += ['num_regions_' + x + '_filtered2_inclusive' for x in args['known_states'][1:]] -fields += ['num_regions_total_filtered2_inclusive'] - -fields += ['num_bases_' + x for x in args['known_states'][1:]] -fields += ['num_bases_total'] -fields += ['num_bases_' + x + '_filtered1' for x in args['known_states'][1:]] -fields += ['num_bases_total_filtered1'] -fields += ['num_bases_' + x + '_filtered2' for x in args['known_states'][1:]] -fields += ['num_bases_total_filtered2'] -fields += ['num_bases_' + x + '_filtered2_inclusive' for x in args['known_states'][1:]] -fields += 
['num_bases_total_filtered2_inclusive']
-
-r = sorted(gp.alignment_ref_order[1:])
-for n in range(2, len(r)+1):
-    x = itertools.combinations(r, n)
-    for combo in x:
-        fields += ['num_bases_' + '_or_'.join(combo) + '_filtered2i']
-    fields += ['num_bases_' + str(n) + '_filtered2i']
-
-f.write('strain' + '\t' + '\t'.join(fields) + '\n')
-
-for strain in sorted(d.keys()):
-    f.write(strain + '\t')
-    f.write('\t'.join([str(d[strain][x]) for x in fields]))
-    f.write('\n')
-f.close()
diff --git a/code/analyze/to_update/aggregate_genes_by_strains_main.py b/code/analyze/to_update/aggregate_genes_by_strains_main.py
index e7ac91e..f7e008f 100644
--- a/code/analyze/to_update/aggregate_genes_by_strains_main.py
+++ b/code/analyze/to_update/aggregate_genes_by_strains_main.py
@@ -1,28 +1,19 @@
 import sys
-import os
-import gzip
-import predict
 from collections import defaultdict
-from summarize_region_quality import *
-import gene_predictions
-sys.path.insert(0, '..')
 import global_params as gp
-sys.path.insert(0, '../misc/')
-import read_fasta
-import read_table
-import seq_functions
+from misc import read_table
 
 tag = sys.argv[1]
 
 fn = gp.analysis_out_dir_absolute + tag + \
     '/introgressed_blocks_filtered_par_' + tag + '_summary_plus.txt'
-regions_filtered, l = read_table.read_table_rows(fn, "\t")
+regions_filtered, _ = read_table.read_table_rows(fn, "\t")
 
 gene_strains = defaultdict(set)
 strain_genes = defaultdict(lambda: defaultdict(set))
 
 for chrm in gp.chrms:
-    
+
     fn = gp.analysis_out_dir_absolute + tag + \
         '/genes_for_each_region_chr' + chrm + '_' + \
         tag + '.txt'
@@ -43,18 +34,18 @@
 for gene in gene_strains:
     gene_counts[gene] = len(gene_strains[gene])
 
-f_out = open(gp.analysis_out_dir_absolute + tag + \
-             '/genes_for_each_strain_filtered_' + \
+f_out = open(gp.analysis_out_dir_absolute + tag +
+             '/genes_for_each_strain_filtered_' +
             tag + '.txt', 'w')
 f_out.write('strain\tchromosome\tnum_genes\n')
 for chrm in gp.chrms:
     for strain in strain_genes[chrm]:
-        f_out.write(strain + '\t' + chrm + '\t' + \
+        f_out.write(strain + '\t' + chrm + '\t' +
                     str(len(strain_genes[chrm][strain])) + '\n')
 f_out.close()
 
-f_out = open(gp.analysis_out_dir_absolute + tag + \
-             '/genes_strain_hist_' + \
+f_out = open(gp.analysis_out_dir_absolute + tag +
+             '/genes_strain_hist_' +
             tag + '.txt', 'w')
 f_out.write('gene\tnum_strains\n')
 for gene in sorted(gene_counts.keys()):
diff --git a/code/analyze/to_update/annotate_positions.py b/code/analyze/to_update/annotate_positions.py
index 16b6378..0393c58 100644
--- a/code/analyze/to_update/annotate_positions.py
+++ b/code/analyze/to_update/annotate_positions.py
@@ -1,9 +1,8 @@
-import sys
 import re
 import gzip
-sys.path.insert(0, '../misc/')
-import overlap
-import read_fasta
+from misc import overlap
+from misc import read_fasta
+
 
 def get_genes(fn):
@@ -15,14 +14,17 @@ def get_genes(fn):
     f.close()
     return genes
 
+
 def get_orfs(fn):
    headers, seqs = read_fasta.read_fasta(fn)
    orfs = {}
    for h in headers:
-        m = re.search(' (?P<name>[a-zA-Z0-9]+)_(?P<chrm>[a-zA-Z0-9\.]+):(?P<start>[0-9]+):(?P<end>[0-9]+)', h)
+        m = re.search(r' (?P<name>[a-zA-Z0-9]+)_(?P<chrm>[a-zA-Z0-9\.]+)'
+                      ':(?P<start>[0-9]+):(?P<end>[0-9]+)', h)
        orfs[(int(m.group('start')), int(m.group('end')))] = m.group('name')
    return orfs
 
+
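The named groups in the pattern above had to be reconstructed: `name`, `start`, and `end` are confirmed by the `m.group` calls that follow, while `chrm` is only an assumption. A quick check of the reconstructed pattern against a hypothetical ORF header:

```python
import re

# Hypothetical ORF fasta header in the shape get_orfs appears to expect:
# a leading space, then <name>_<chrm-like token>:<start>:<end>.
header = '> orf1 YGR295C_chrVII.42:1000:1250'

m = re.search(r' (?P<name>[a-zA-Z0-9]+)_(?P<chrm>[a-zA-Z0-9\.]+)'
              ':(?P<start>[0-9]+):(?P<end>[0-9]+)', header)
print(m.group('name'), m.group('start'), m.group('end'))
# YGR295C 1000 1250
```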
 def write_annotated_file(coords, genes, orfs, fn):
     # could definitely do this all way more efficiently
     sep = '\t'
@@ -33,14 +35,13 @@ def write_annotated_file(coords, genes, orfs, fn):
         if int(coords[i]) == coords[i]:
             f.write(str(int(coords[i])) + sep)
             gene = overlap.contained_any_named(coords[i], genes)
-            if gene != None:
+            if gene is not None:
                 f.write(gene)
             f.write(sep)
         else:
             f.write(str(coords[i]) + sep + sep)
         orf = overlap.contained_any_named(i, orfs)
-        if orf != None:
+        if orf is not None:
             f.write(orf)
         f.write('\n')
     f.close()
-
diff --git a/code/analyze/to_update/annotate_positions_main.py b/code/analyze/to_update/annotate_positions_main.py
index c5647b0..f721acb 100644
--- a/code/analyze/to_update/annotate_positions_main.py
+++ b/code/analyze/to_update/annotate_positions_main.py
@@ -4,28 +4,24 @@
 # gene
 # in ORF?
 
-import re
 import sys
 import os
-import copy
 import gzip
-from annotate_positions import *
-sys.path.insert(0, '..')
+from annotate_positions import (get_genes, get_orfs, write_annotated_file)
 import global_params as gp
-sys.path.insert(0, '../align/')
-import align_helpers
+from align import align_helpers
 
-##======
+# ======
 # get strains
-##======
+# ======
 
 i = int(sys.argv[1])
 s = align_helpers.get_strains(align_helpers.flatten(gp.non_ref_dirs.values()))
 strain, d = s[i]
 
-##======
+# ======
 # get genes on each chromosome
-##======
+# ======
 
 genes_by_chrm = {}
 for chrm in gp.chrms:
@@ -33,10 +29,10 @@
         '_genes.txt'
     genes_by_chrm[chrm] = get_genes(fn)
 
-##======
+# ======
 # loop through all strains and chromosomes, generating annotated
 # position file for each
-##======
+# ======
 
 coord_dir = gp.analysis_out_dir_absolute + 'coordinates/'
 if not os.path.exists(coord_dir + 'annotated'):
@@ -44,37 +40,15 @@
 
 for chrm in gp.chrms:
 
-    print strain, chrm
+    print(strain, chrm)
 
     fn = strain + '_to_' + gp.master_ref + '_chr' + chrm + '.txt.gz'
     fn_orfs = d + 'orfs/' + strain + '_chr' + chrm + \
-              '_orfs' + gp.fasta_suffix
+        '_orfs' + gp.fasta_suffix
     orfs = get_orfs(fn_orfs)
 
     fn_out = coord_dir + 'annotated/' + fn
-    coords = [float(line) for line in gzip.open(coord_dir + fn, 'rb').readlines()]
+    coords = [float(line)
+              for line in gzip.open(coord_dir + fn, 'rb').readlines()]
 
     write_annotated_file(coords, genes_by_chrm[chrm], orfs, fn_out)
-
-
-
-
-
-#for strain, d in s:
-
-    #m = re.search('(?P<strain1>[a-zA-Z0-9]+)_to_(?P<strain2>[a-zA-Z0-9]+)_chr(?P<chrm>[IVXM]+)', fn)
-    #if m == None:
-    #    continue
-    #strain1 = m.group('strain1')
-    #strain2 = m.group('strain2')
-    #chrm = m.group('chrm')
-
-    #if strain1 == gp.master_ref:
-    #    continue
-
-    # don't deal with paradoxus just for now
-    #if strain1 in gp.alignment_ref_order or strain2 != gp.master_ref:
-    #    continue
-
-    #print fn
-
diff --git a/code/analyze/to_update/annotate_regions.py b/code/analyze/to_update/annotate_regions.py
index fd032a0..8758484 100644
--- a/code/analyze/to_update/annotate_regions.py
+++ b/code/analyze/to_update/annotate_regions.py
@@ -1,9 +1,6 @@
 import gzip
 import gene_predictions
-import sys
 import global_params as gp
-sys.path.insert(0, '../misc/')
-
 
 def get_block_by_site(all_regions, seq):
@@ -19,8 +16,8 @@ def get_block_by_site(all_regions, seq):
     return introgressed_by_site
 
-def write_predictions_annotated(alignment_headers, alignment_seqs, master, \
-                                strain_labels, match_by_site, \
+def write_predictions_annotated(alignment_headers, alignment_seqs, master,
+                                strain_labels, match_by_site,
                                 gene_by_site, block_by_site, masked, fn):
 
     f = gzip.open(fn, 'wb')
@@ -36,10 +33,10 @@
     individual_indices = [0] * num_seqs
 
     # header
-    f.write('ps_ref' + sep + 'ps_strain' + sep + \
-            sep.join(strain_labels) + sep + \
-            'match' + sep + \
-            'gene' + sep + 'block' + sep + \
+    f.write('ps_ref' + sep + 'ps_strain' + sep +
+            sep.join(strain_labels) + sep +
+            'match' + sep +
+            'gene' + sep + 'block' + sep +
             sep.join([lab +
'_masked' for lab in strain_labels]) + '\n') lines = [] @@ -57,7 +54,7 @@ def write_predictions_annotated(alignment_headers, alignment_seqs, master, \ ind_ref += 1 ps_ref = str(ind_ref) line += ps_ref + sep - + # index in strain ps_strain = None if alignment_seqs[-1][i] == gp.gap_symbol: @@ -76,7 +73,7 @@ def write_predictions_annotated(alignment_headers, alignment_seqs, master, \ line += match_by_site[r][i] line += sep - if gene_by_site[i] != None: + if gene_by_site[i] is not None: line += gene_by_site[i] line += sep @@ -87,17 +84,18 @@ def write_predictions_annotated(alignment_headers, alignment_seqs, master, \ line += sep if alignment_seqs[si][i] != gp.gap_symbol: # TODO update n to x - if masked[si][individual_indices[si]] == 'n': #gp.masked_symbol: + if masked[si][individual_indices[si]] == 'n': # masked line += gp.masked_symbol individual_indices[si] += 1 - + line += '\n' - + lines.append(line) f.writelines(lines) f.close() + # TODO give this a more general name/place def read_predictions_annotated(fn): sep = '\t' @@ -109,10 +107,7 @@ def read_predictions_annotated(fn): line = line[:-1].split(sep) for i in range(len(labels)): d[labels[i]].append(line[i]) - #d[line[0]] = dict(zip(labels[1:], line[1:])) + # d[line[0]] = dict(zip(labels[1:], line[1:])) line = f.readline() f.close() return d - - - diff --git a/code/analyze/to_update/annotate_regions_main.py b/code/analyze/to_update/annotate_regions_main.py index a06bb2c..3a60544 100644 --- a/code/analyze/to_update/annotate_regions_main.py +++ b/code/analyze/to_update/annotate_regions_main.py @@ -1,28 +1,25 @@ -# ps_cer ps_strain cer_ref par_ref strain gene introgressed_region cer_masked par_masked strain_masked +# ps_cer ps_strain cer_ref par_ref strain gene introgressed_region +# cer_masked par_masked strain_masked -import re import sys import os -import copy -import gene_predictions -from annotate_regions import * +import gene_predictions +from annotate_regions import (write_predictions_annotated, + get_block_by_site) import predict -import pickle -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../misc/') -import read_fasta +from misc import read_fasta -##====== +# ====== # read in analysis parameters -##====== +# ====== refs, strains, args = predict.process_args(sys.argv[1:]) chrm = sys.argv[1] -##====== +# ====== # read in introgressed/unknown regions and alignments -##====== +# ====== gp_dir = '../' @@ -40,9 +37,9 @@ fn_align_prefix = gp_dir + gp.alignments_dir fn_align_prefix += '_'.join([refs[s][0] for s in args['species']]) + '_' -##====== +# ====== # produce annotated files -##====== +# ====== # for keeping track of all genes introgressed in each strain, and the # fraction introgressed @@ -68,20 +65,20 @@ fn_genes = gp.analysis_out_dir_absolute + '/' + \ master_ref + '_chr' + chrm + '_genes.txt' -print 'reading genes on chromosome', chrm +print('reading genes on chromosome', chrm) # dictionary keyed by name: (start, end) genes = gene_predictions.read_genes(fn, fn_genes) -print 'done reading genes' +print('done reading genes') # loop through all strains that we've called introgression in, and # associate genes with the regions they overlap for strain in regions.keys(): - - print '***', strain, chrm + + print('***', strain, chrm) sys.stdout.flush() - fn_out = gp.analysis_out_dir_absolute + args['tag'] + '/site_summaries/' + \ - 'predictions_' + strain + '_chr' + chrm + '_site_summary.txt.gz' + fn_out = gp.analysis_out_dir_absolute + args['tag'] + '/site_summaries/' +\ + 'predictions_' + strain + '_chr' + 
chrm + '_site_summary.txt.gz'
 
     if not os.path.exists(os.path.dirname(fn_out)):
         os.makedirs(os.path.dirname(fn_out))
@@ -92,29 +89,30 @@
 
     # read alignment blocks for this strain and chromosome
     fn_align = fn_align_prefix + \
-               strain + '_chr' + chrm + '_mafft' + gp.alignment_suffix
+        strain + '_chr' + chrm + '_mafft' + gp.alignment_suffix
     alignment_headers, alignment_seqs = read_fasta.read_fasta(fn_align)
 
     # read masked (unaligned) sequences
     seq_masked_fns = [header.split()[-1] for header in alignment_headers]
-    seq_masked_fns = [mfn[:-len(gp.fasta_suffix)] + '_masked' + gp.fasta_suffix \
+    seq_masked_fns = [mfn[:-len(gp.fasta_suffix)] + '_masked' + gp.fasta_suffix
                       for mfn in seq_masked_fns]
     seqs_masked = [read_fasta.read_fasta(mfn)[1][0] for mfn in seq_masked_fns]
 
     labels = ref_labels + [strain]
-    
+
     # mark each site as matching each reference or not
-    ref_match_by_site = gene_predictions.get_ref_match_by_site(alignment_seqs, labels)
+    ref_match_by_site = gene_predictions.get_ref_match_by_site(alignment_seqs,
+                                                               labels)
 
     # mark each site as in a gene or not
-    genes_by_site = gene_predictions.get_genes_by_site(genes, alignment_seqs[0])
+    genes_by_site = gene_predictions.get_genes_by_site(genes,
+                                                       alignment_seqs[0])
 
     # mark each site as introgressed or not
     all_regions = [regions[strain][chrm]]
-    if regions_unk.has_key(strain) and regions_unk[strain].has_key(chrm):
+    if strain in regions_unk and chrm in regions_unk[strain]:
        all_regions.append(regions_unk[strain][chrm])
     block_by_site = get_block_by_site(all_regions, alignment_seqs[0])
 
-    write_predictions_annotated(alignment_headers, alignment_seqs, 0, \
-                                ref_labels + [strain], ref_match_by_site, \
-                                genes_by_site, block_by_site, seqs_masked, fn_out)
-
-
+    write_predictions_annotated(alignment_headers, alignment_seqs, 0,
+                                ref_labels + [strain], ref_match_by_site,
+                                genes_by_site, block_by_site,
+                                seqs_masked, fn_out)
diff --git a/code/analyze/to_update/check_paralogs_main.py b/code/analyze/to_update/check_paralogs_main.py
index 1bc39bd..bde7994 100644
--- a/code/analyze/to_update/check_paralogs_main.py
+++ b/code/analyze/to_update/check_paralogs_main.py
@@ -1,6 +1,6 @@
 # Loop through all introgressed genes (might be just a small part)
 # that have paralogs
-# Extract introgressed portion of gene 
+# Extract introgressed portion of gene
 # Blast that portion against:
 # - Cerevisiae gene
 # - Paradoxus gene (region aligned to cerevisiae gene)
@@ -13,23 +13,14 @@
 # - Paradoxus paralog -> interesting...
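A hedged sketch of the BLAST round trip the comments above describe, using `subprocess` in place of the `os.system` calls in the script below; the file names are placeholders, and `-outfmt 6` is BLAST's standard tabular format with the bit score in the final column:

```python
import subprocess

# Placeholder file names; the fasta inputs would come from the gene and
# region extraction steps described above.
db_fn = 'paralog_candidates.fa'
query_fn = 'introgressed_portion.fa'
out_fn = 'hits.tsv'

# Build a nucleotide database, then query it, as the script does with
# makeblastdb and blastn.
subprocess.run(['makeblastdb', '-in', db_fn, '-dbtype', 'nucl'], check=True)
subprocess.run(['blastn', '-db', db_fn, '-query', query_fn,
                '-out', out_fn, '-outfmt', '6'], check=True)

# outfmt 6 is tab-delimited with the bit score in the last column;
# keep the best-scoring subject, mirroring the best-hit logic below.
with open(out_fn) as f:
    hits = [line.rstrip('\n').split('\t') for line in f]
best = max(hits, key=lambda h: float(h[-1])) if hits else None
print(best[1] if best else 'none')  # subject id of the best hit
```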
-import re -import sys import os import math -import Bio.SeqIO -import copy import gzip -import gene_predictions -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../align/') -import align_helpers -sys.path.insert(0, '../misc/') -import read_table -import read_fasta -import write_fasta -import mystats +from align import align_helpers +from misc import read_table +from misc import read_fasta +from misc import write_fasta postprocess = False @@ -48,7 +39,7 @@ # dict of dicts keyed by region id and column names; includes filtered # and unfiltered regions region_to_genes = {} -f = open(gp.analysis_out_dir_absolute + tag + \ +f = open(gp.analysis_out_dir_absolute + tag + '/genes_for_each_region_' + tag + '.txt', 'r') line = f.readline() while line != '': @@ -60,10 +51,10 @@ f.close() # dict of lists keyed by region id -t_regions_filtered, l = \ - read_table.read_table_rows(gp.analysis_out_dir_absolute + tag + \ - '/introgressed_blocks_filtered_par_' + tag + \ - '_summary_plus.txt', \ +t_regions_filtered, _ = \ + read_table.read_table_rows(gp.analysis_out_dir_absolute + tag + + '/introgressed_blocks_filtered_par_' + tag + + '_summary_plus.txt', '\t', header=True) @@ -73,12 +64,12 @@ for region_id in region_to_genes: genes = region_to_genes[region_id] for gene in genes: - if not gene_to_regions.has_key(gene): + if gene not in gene_to_regions: gene_to_regions[gene] = [] gene_to_regions[gene].append(region_id) if region_id in t_regions_filtered: for gene in genes: - if not gene_to_regions_filtered.has_key(gene): + if gene not in gene_to_regions_filtered: gene_to_regions_filtered[gene] = [] gene_to_regions_filtered[gene].append(region_id) @@ -95,7 +86,7 @@ # read in all gene coordinates gene_coords = {} for chrm in gp.chrms: - f = open(gp.analysis_out_dir_absolute + \ + f = open(gp.analysis_out_dir_absolute + 'S288c_chr' + chrm + '_genes.txt', 'r') lines = [line.strip().split('\t') for line in f.readlines()] f.close() @@ -108,8 +99,9 @@ genes_to_analyze = gene_to_regions_filtered.keys() if postprocess: - genes_to_analyze = [line.split('\t')[0] for line in \ - open('check_paralogs_out_cer_paralog.tsv', 'r').readlines()] + genes_to_analyze = [line.split('\t')[0] for line in + open('check_paralogs_out_cer_paralog.tsv', + 'r').readlines()] genes_to_analyze = list(set(genes_to_analyze)) ip = 0 @@ -117,13 +109,13 @@ if gene not in paralogs: continue - print ip + print(ip) ip += 1 chrm, ref_gene_start, ref_gene_end = gene_coords[gene] gene_headers, gene_seqs = \ - read_fasta.read_fasta(gp.analysis_out_dir_absolute + tag + '/genes/' + \ + read_fasta.read_fasta(gp.analysis_out_dir_absolute + tag + '/genes/' + gene + '/' + gene + '_from_alignment.fa') gene_headers = [x[1:].strip() for x in gene_headers] strain_seqs = dict(zip(gene_headers, gene_seqs)) @@ -133,7 +125,7 @@ paralog = paralogs[gene] gene_headers, gene_seqs = \ - read_fasta.read_fasta(gp.analysis_out_dir_absolute + tag + '/genes/' + \ + read_fasta.read_fasta(gp.analysis_out_dir_absolute + tag + '/genes/' + paralog + '/' + paralog + '_from_alignment.fa') gene_headers = [x[1:].strip() for x in gene_headers] strain_paralog_seqs = dict(zip(gene_headers, gene_seqs)) @@ -153,8 +145,8 @@ f.close() cmd_string = gp.blast_install_path + 'makeblastdb' + \ - ' -in ' + db_fn + \ - ' -dbtype nucl' + ' -in ' + db_fn + \ + ' -dbtype nucl' os.system(cmd_string) strain_intd_seqs = {} @@ -164,25 +156,29 @@ ref_region_start = int(t_regions_filtered[region]['start']) ref_region_end = int(t_regions_filtered[region]['end']) - ref_to_strain_coords 
= [float(x[:-1]) for x in \ - gzip.open(gp.analysis_out_dir_absolute + \ - 'coordinates/S288c_to_' + strain + \ - '_chr' + chrm + '.txt.gz').readlines()] + ref_to_strain_coords = [float(x[:-1]) for x in + gzip.open(gp.analysis_out_dir_absolute + + 'coordinates/S288c_to_' + strain + + '_chr' + chrm + + '.txt.gz').readlines()] - gene_start = int(max(0, math.ceil(ref_to_strain_coords[ref_gene_start]))) + gene_start = int(max(0, math.ceil( + ref_to_strain_coords[ref_gene_start]))) gene_end = int(math.floor(ref_to_strain_coords[ref_gene_end])) - - region_start = int(max(0, math.ceil(ref_to_strain_coords[ref_region_start]))) + + region_start = int(max(0, math.ceil( + ref_to_strain_coords[ref_region_start]))) region_end = int(math.floor(ref_to_strain_coords[ref_region_end])) start = max(gene_start, region_start) end = min(gene_end, region_end) - chrom_seq = read_fasta.read_fasta(strain_dirs[strain] + strain + '_chr' + \ + chrom_seq = read_fasta.read_fasta(strain_dirs[strain] + + strain + '_chr' + chrm + gp.fasta_suffix)[1][0] seq = chrom_seq[start:end+1] - if not strain_intd_seqs.has_key(strain): + if strain not in strain_intd_seqs: strain_intd_seqs[strain] = chrom_seq[gene_start:gene_end+1].lower() relative_start = start - gene_start relative_end = end - gene_start @@ -196,11 +192,11 @@ f.close() cmd_string = gp.blast_install_path + 'blastn' + \ - ' -db ' + db_fn + \ - ' -query ' + query_fn + \ - ' -out ' + out_fn + \ - ' -outfmt ' + outfmt - print cmd_string + ' -db ' + db_fn + \ + ' -query ' + query_fn + \ + ' -out ' + out_fn + \ + ' -outfmt ' + outfmt + print(cmd_string) os.system(cmd_string) if os.stat(out_fn).st_size == 0: @@ -210,9 +206,9 @@ ' -out ' + out_fn + \ ' -task "blastn-short"' + \ ' -outfmt ' + outfmt - print cmd_string + print(cmd_string) os.system(cmd_string) - + lines = open(out_fn, 'r').readlines() best_key = 'none' if len(lines) != 0: @@ -236,10 +232,10 @@ # write reference genes and paralogs and all introgressed # genes to file and then align fn = gp.analysis_out_dir_absolute + tag + '/paralogs/' + \ - gene + gp.fasta_suffix - headers = ['S288c ' + gene, 'CBS432 ' + gene, \ + gene + gp.fasta_suffix + headers = ['S288c ' + gene, 'CBS432 ' + gene, 'S288c ' + paralog, 'CBS432 ' + paralog] - seqs = [cer_seq.lower(), par_seq.lower(), \ + seqs = [cer_seq.lower(), par_seq.lower(), cer_paralog_seq.lower(), par_paralog_seq.lower()] for strain in strain_intd_seqs: headers.append(strain + ' ' + gene) @@ -248,10 +244,10 @@ aligned_fn = fn.replace(gp.fasta_suffix, gp.alignment_suffix) cmd_string = gp.mafft_install_path + '/mafft ' + \ - ' --quiet --reorder --preservecase ' + \ - fn + ' > ' + aligned_fn + ' --quiet --reorder --preservecase ' + \ + fn + ' > ' + aligned_fn os.system(cmd_string) - + f = open('check_paralogs_out.tsv', 'w') f.write('category\tnum_total_genes\tnum_unique_genes\n') for key in keys: @@ -265,7 +261,5 @@ for item in all_rankings[key]: fk.write('\t'.join(item) + '\n') fk.close() - -f.close() - +f.close() diff --git a/code/analyze/to_update/combine_all_strains.py b/code/analyze/to_update/combine_all_strains.py index 947764b..fa331ca 100644 --- a/code/analyze/to_update/combine_all_strains.py +++ b/code/analyze/to_update/combine_all_strains.py @@ -3,19 +3,12 @@ import os import math import Bio.SeqIO -import copy -import gene_predictions -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../sim/') -import sim_analyze_hmm_bw as sim -sys.path.insert(0, '../misc/') -import seq_functions -import read_table -import read_fasta -import write_fasta 
-import mystats -import overlap +from misc import seq_functions +from misc import read_table +from misc import read_fasta +from misc import overlap + def get_range_seq(start, end, seq_fn): @@ -23,16 +16,17 @@ def get_range_seq(start, end, seq_fn): range_seq = chrm_seq[start:end+1] return range_seq + def get_ref_gene_seq(gene, gene_coords_fn, seq_fn): - d1, labels = read_table.read_table_rows(gene_coords_fn, '\t', \ + d1, labels = read_table.read_table_rows(gene_coords_fn, '\t', header=False, key_ind=0) d = {} for g in d1: if d1[g][0] == '""': d[g] = d1[g][1:] else: - d[d1[g][0]] = d1[g][1:] + d[d1[g][0]] = d1[g][1:] gene_start = int(d[gene][2]) - 1 gene_end = int(d[gene][3]) - 1 @@ -45,6 +39,7 @@ def get_ref_gene_seq(gene, gene_coords_fn, seq_fn): assert gene_start < gene_end return gene_seq, gene_start, gene_end, strand + def get_inds_from_alignment(fn, flip_ref, rind=0, sind=1): headers, seqs = read_fasta.read_fasta(fn) n = len(seqs[0]) @@ -63,29 +58,30 @@ def get_inds_from_alignment(fn, flip_ref, rind=0, sind=1): pr.append(str(ri)) ps.append(str(si)) if flip_ref: - return {'ps_ref':ps, 'ps_strain':pr} - return {'ps_ref':pr, 'ps_strain':ps} + return {'ps_ref': ps, 'ps_strain': pr} + return {'ps_ref': pr, 'ps_strain': ps} # by taking part of sequence aligned with reference coordinates -def get_range_seqs(strains, chrm, start, end, tag, gp_dir = '../'): +def get_range_seqs(strains, chrm, start, end, tag, gp_dir='../'): # TODO this shouldn't actually be dependent on tag strain_range_seqs = {} for strain, d in strains: - print strain + print(strain) fn = d + strain + '_chr' + chrm + gp.fasta_suffix chrm_seq = read_fasta.read_fasta(fn)[1][0] t = None try: - t, labels = read_table.read_table_columns(gp.analysis_out_dir_absolute + \ - tag + '/' + \ - 'site_summaries/predictions_' + \ - strain + \ - '_chr' + chrm + \ - '_site_summary.txt.gz', '\t') - except: + t, labels = read_table.read_table_columns( + gp.analysis_out_dir_absolute + + tag + '/' + + 'site_summaries/predictions_' + + strain + + '_chr' + chrm + + '_site_summary.txt.gz', '\t') + except FileNotFoundError: # for par reference which doesn't have site summary file align_fn = gp_dir + gp.alignments_dir + \ '_'.join(gp.alignment_ref_order) + '_chr' + chrm + \ @@ -97,13 +93,13 @@ def get_range_seqs(strains, chrm, start, end, tag, gp_dir = '../'): start_strain = int(math.ceil(float(ref_ind_to_strain_ind[str(start)]))) end_strain = int(math.floor(float(ref_ind_to_strain_ind[str(end)]))) - - strain_range_seqs[strain] = (chrm_seq[start_strain:end_strain+1], \ - start_strain, end_strain) + strain_range_seqs[strain] = (chrm_seq[start_strain:end_strain+1], + start_strain, end_strain) return strain_range_seqs -def choose_best_hit_rev(hits, query_fn, ref_chrm_fn, orf_headers, orf_seqs, start, end): +def choose_best_hit_rev(hits, query_fn, ref_chrm_fn, + orf_headers, orf_seqs, start, end): # choosing best hit by reciprocal blast -> not reliable tho if len(hits) == 1: return hits[0][0] @@ -124,10 +120,10 @@ def choose_best_hit_rev(hits, query_fn, ref_chrm_fn, orf_headers, orf_seqs, star f.write(seq + '\n') f.close() cmd_string = gp.blast_install_path + 'blastn' + \ - ' -db ' + ref_chrm_fn + \ - ' -query ' + orf_query_fn + \ - ' -out ' + out_fn + \ - ' -outfmt ' + outfmt + ' -db ' + ref_chrm_fn + \ + ' -query ' + orf_query_fn + \ + ' -out ' + out_fn + \ + ' -outfmt ' + outfmt os.system(cmd_string) f = open(out_fn, 'r') nhits = [line[:-1].split('\t') for line in f.readlines()] @@ -135,7 +131,7 @@ def choose_best_hit_rev(hits, query_fn, ref_chrm_fn, 
orf_headers, orf_seqs, star nstart = int(nhits[0][-2]) nend = int(nhits[0][-1]) # this division is hacky and unprincipled - o = overlap.overlap(start, end, nstart, nend) / float(hit[1]) + o = overlap.overlap(start, end, nstart, nend) / float(hit[1]) if o > greatest_overlap: greatest_overlap = o best_hit = hit[0] @@ -143,10 +139,11 @@ def choose_best_hit_rev(hits, query_fn, ref_chrm_fn, orf_headers, orf_seqs, star os.remove(out_fn) return best_hit -def choose_best_hit(hits, start, end, tag, strain, chrm, headers, seqs,\ + +def choose_best_hit(hits, start, end, tag, strain, chrm, headers, seqs, strain_ind_to_ref_ind, gp_dir='../'): - greatest_overlap = 0 # don't want to take overlaps of 0 + greatest_overlap = 0 # don't want to take overlaps of 0 best_hit = None x = None seq = None @@ -171,7 +168,7 @@ def choose_best_hit(hits, start, end, tag, strain, chrm, headers, seqs,\ c2 = chunk2.find(':', c1+1) seq = seqs[i] orf_start = int(chunk2[c1+1:c2]) - orf_end = int(chunk2[c2+1:]) + orf_end = int(chunk2[c2+1:]) strand = '1' if orf_start > orf_end: temp = orf_end @@ -179,8 +176,10 @@ def choose_best_hit(hits, start, end, tag, strain, chrm, headers, seqs,\ orf_start = temp strand = '-1' break - current_start = int(math.ceil(float(strain_ind_to_ref_ind[str(orf_start)]))) - current_end = int(math.floor(float(strain_ind_to_ref_ind[str(orf_end)]))) + current_start = int(math.ceil( + float(strain_ind_to_ref_ind[str(orf_start)]))) + current_end = int(math.floor( + float(strain_ind_to_ref_ind[str(orf_end)]))) o = overlap.overlap(start, end, current_start, current_end) if o > greatest_overlap: greatest_overlap = o @@ -190,16 +189,19 @@ def choose_best_hit(hits, start, end, tag, strain, chrm, headers, seqs,\ orf_start_max = orf_start orf_end_max = orf_end strand_max = strand - seq_max = seq # don't need to reverse complement (blast does this) + seq_max = seq # don't need to reverse complement (blast does this) - print greatest_overlap + print(greatest_overlap) return best_hit, x_max, seq_max, orf_start_max, orf_end_max, strand_max + # by blasting ORFs -def get_gene_seqs(query_fn, strains, chrm, ref_chrm_fn, start, end, strand, tag, +def get_gene_seqs(query_fn, strains, chrm, ref_chrm_fn, + start, end, strand, tag, strain_ind_to_ref_ind): - - #outfmt = '"6 qseqid sseqid slen qstart qend length mismatch gapopen gaps sseq"' + + # outfmt = '"6 qseqid sseqid slen qstart qend \ + # length mismatch gapopen gaps sseq"' outfmt = '"6 sseqid slen evalue bitscore"' strain_gene_seqs = {} @@ -208,38 +210,39 @@ def get_gene_seqs(query_fn, strains, chrm, ref_chrm_fn, start, end, strand, tag, if strain != 'yjm1332': continue - print '-', strain + print('-', strain) sys.stdout.flush() - fn = d + 'orfs/' + strain + '_chr' + chrm + '_orfs' + gp.fasta_suffix + fn = d + 'orfs/' + strain + '_chr' + chrm + '_orfs' + gp.fasta_suffix cmd_string = gp.blast_install_path + 'blastn' + \ - ' -db ' + fn + \ - ' -query ' + query_fn + \ - ' -out ' + out_fn + \ - ' -outfmt ' + outfmt - #print cmd_string + ' -db ' + fn + \ + ' -query ' + query_fn + \ + ' -out ' + out_fn + \ + ' -outfmt ' + outfmt + # print(cmd_string) os.system(cmd_string) - hits = [line[:-1].split('\t') for line in open(out_fn, 'r').readlines()] - num_hits = len(hits) + hits = [line[:-1].split('\t') + for line in open(out_fn, 'r').readlines()] if len(hits) == 0: strain_gene_seqs[strain] = ('nohit', '', -1, -1, '') continue - #best_orf_id = hits[0][0] + # best_orf_id = hits[0][0] headers, seqs = read_fasta.read_fasta(fn) best_orf_id, x, seq, orf_start, orf_end, orf_strand = \ 
-            choose_best_hit(hits, start, end, tag, strain, chrm, headers, seqs, \
+            choose_best_hit(hits, start, end, tag, strain,
+                            chrm, headers, seqs,
                             strain_ind_to_ref_ind[strain])
-        print hits
-        print best_orf_id
-        print orf_strand, strand
+        print(hits)
+        print(best_orf_id)
+        print(orf_strand, strand)
         sys.exit()
-        if best_orf_id == None or orf_strand != strand:
+        if best_orf_id is None or orf_strand != strand:
             strain_gene_seqs[strain] = ('nohit', '', -1, -1, '')
             continue
         strain_gene_seqs[strain] = (x, seq, orf_start, orf_end, orf_strand)
         os.remove(out_fn)
     return strain_gene_seqs
-    
+
 # can't actually count on annotations
 def get_gene_seqs_gb(fn, gene, chrm):
@@ -249,38 +252,40 @@
     strains = set([])
     for strain_chrm_record in gb_records:
         desc = strain_chrm_record.description
-        m = re.search(' (?P<strain>[a-zA-Z0-9]+) chromosome (?P<chrm>[IVXM]+)', \
+        m = re.search(' (?P<strain>[a-zA-Z0-9]+) chromosome (?P<chrm>[IVXM]+)',
                       desc)
         chrm_current = m.group('chrm')
         strain = m.group('strain').lower()
         strains.add(strain)
-        #if len(strain_gene_seqs) > 82:
-        #    break
-        print strain, chrm_current
+        # if len(strain_gene_seqs) > 82:
+        #     break
+        print(strain, chrm_current)
         if chrm_current != chrm:
             continue
         for feature in strain_chrm_record.features:
-            if feature.type == 'CDS' and feature.qualifiers.has_key('gene') and \
+            if feature.type == 'CDS' and 'gene' in feature.qualifiers and \
               feature.qualifiers['gene'][0] == gene:
                 desc = strain_chrm_record.description
-                m = re.search(\
-                    ' (?P<strain>[a-zA-Z0-9]+) chromosome (?P<chrm>[IVXM]+)', \
+                m = re.search(
+                    ' (?P<strain>[a-zA-Z0-9]+) '
+                    'chromosome (?P<chrm>[IVXM]+)',
                     desc)
                 seq = str(feature.extract(strain_chrm_record.seq).lower())
                 start = str(feature.location.start)
                 end = str(feature.location.end)
                 strand = str(feature.location.strand)
                 locus_tag = feature.qualifiers['locus_tag'][0]
-                strain_gene_seqs[strain] = {'seq':seq, \
-                                            'chrm':chrm, \
-                                            'start':start, \
-                                            'end':end, \
-                                            'strand':strand,\
-                                            'locus_tag':locus_tag}
-
-                print '- found gene in', strain
+                strain_gene_seqs[strain] = {'seq': seq,
+                                            'chrm': chrm,
+                                            'start': start,
+                                            'end': end,
+                                            'strand': strand,
+                                            'locus_tag': locus_tag}
+
+                print('- found gene in', strain)
     return strain_gene_seqs, list(strains)
 
+
 # because don't have gb file for paradoxus...
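+# editor's note (assumption, based on the comment above): there is no
+# GenBank record for the S. paradoxus reference, so the function below
+# falls back to scanning the CBS432 .fsa flat file for the gene sequence
+# instead of extracting annotated CDS features as get_gene_seqs_gb does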
def get_gene_seqs_fsa(fn, gene, chrm): f = open(fn, 'r') @@ -295,12 +300,13 @@ def get_gene_seqs_fsa(fn, gene, chrm): line = f.readline() f.close() - seqfa = open(gp.ref_dir['CBS432'] + 'CBS432_chr' + chrm + '.fa', 'r').read() + seqfa = open(gp.ref_dir['CBS432'] + + 'CBS432_chr' + chrm + '.fa', 'r').read() seqfa = seqfa.replace('\n', '') if seq in seqfa: - print 'found paradoxus seq' + print('found paradoxus seq') else: - print 'did not find paradoxus seq' + print('did not find paradoxus seq') fg = open('a.txt', 'w') fg.write(seq + '\n') fg.write(seqfa + '\n') @@ -308,4 +314,3 @@ def get_gene_seqs_fsa(fn, gene, chrm): return seq.lower() line = f.readline() - diff --git a/code/analyze/to_update/combine_gene_all_strains_main.py b/code/analyze/to_update/combine_gene_all_strains_main.py index 2816954..b8c6912 100644 --- a/code/analyze/to_update/combine_gene_all_strains_main.py +++ b/code/analyze/to_update/combine_gene_all_strains_main.py @@ -1,4 +1,5 @@ -# TODO - when blasting, take best gene, except if there are multiple hits, prioritize the one that overlaps the region we'd expect based on alignment +# TODO - when blasting, take best gene, except if there are multiple hits, +# prioritize the one that overlaps the region we'd expect based on alignment # input a gene or start/end coordinates @@ -6,34 +7,30 @@ # - for gene, relies on annotations/orfs # - for coordinates, relies on alignments -import re import sys import os import math -import Bio.SeqIO import copy -from combine_all_strains import * +from analyze.to_update.combine_all_strains import (get_gene_seqs, + get_inds_from_alignment, + get_ref_gene_seq) import gene_predictions -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../align/') -import align_helpers -sys.path.insert(0, '../misc/') -import read_table -import read_fasta -import write_fasta -import mystats +from align import align_helpers +from misc import read_table +from misc import write_fasta tag = sys.argv[1] gene = sys.argv[2] chrm = sys.argv[3] -#all_outfiles = [] +# all_outfiles = [] -print 'getting gene sequence from reference strain' +print('getting gene sequence from reference strain') ref = 'S288c' ref_gene_coords_fn = '../../data/S288c_verified_orfs.tsv' -ref_seq_fn = gp.ref_dir[ref] + gp.ref_fn_prefix[ref] + '_chr' + chrm + gp.fasta_suffix +ref_seq_fn = gp.ref_dir[ref] + gp.ref_fn_prefix[ref] \ + + '_chr' + chrm + gp.fasta_suffix ref_gene_seq, ref_start, ref_end, ref_strand = \ get_ref_gene_seq(gene, ref_gene_coords_fn, ref_seq_fn) query_fn = gene + '.txt' @@ -41,19 +38,19 @@ f.write(ref_gene_seq + '\n') f.close() -print 'getting gene sequences from all strains' +print('getting gene sequences from all strains') gp_dir = '../' s = align_helpers.get_strains(align_helpers.flatten(gp.non_ref_dirs.values())) ref_ind_to_strain_ind = {} strain_ind_to_ref_ind = {} for strain, d in s: - print '*', strain + print('*', strain) sys.stdout.flush() - t, labels = read_table.read_table_columns(gp.analysis_out_dir_absolute + \ - tag + '/' + \ - 'site_summaries/predictions_' + \ - strain + \ - '_chr' + chrm + \ + t, labels = read_table.read_table_columns(gp.analysis_out_dir_absolute + + tag + '/' + + 'site_summaries/predictions_' + + strain + + '_chr' + chrm + '_site_summary.txt.gz', '\t') ref_ind_to_strain_ind[strain] = dict(zip(t['ps_ref'], t['ps_strain'])) strain_ind_to_ref_ind[strain] = dict(zip(t['ps_strain'], t['ps_ref'])) @@ -63,15 +60,19 @@ '_mafft' + gp.alignment_suffix t = get_inds_from_alignment(align_fn, True) other_ref_strain = 
gp.ref_fn_prefix[gp.alignment_ref_order[1]] -ref_ind_to_strain_ind[other_ref_strain] = dict(zip(t['ps_ref'], t['ps_strain'])) -strain_ind_to_ref_ind[other_ref_strain] = dict(zip(t['ps_strain'], t['ps_ref'])) +ref_ind_to_strain_ind[other_ref_strain] = dict( + zip(t['ps_ref'], t['ps_strain'])) +strain_ind_to_ref_ind[other_ref_strain] = dict( + zip(t['ps_strain'], t['ps_ref'])) s.append((other_ref_strain, gp.ref_dir[gp.alignment_ref_order[1]])) -strain_gene_seqs = get_gene_seqs(query_fn, s, chrm, ref_seq_fn, ref_start, ref_end, ref_strand, tag, strain_ind_to_ref_ind) +strain_gene_seqs = get_gene_seqs(query_fn, s, chrm, ref_seq_fn, ref_start, + ref_end, ref_strand, + tag, strain_ind_to_ref_ind) os.remove(query_fn) -print 'writing all gene sequences to file' +print('writing all gene sequences to file') keys = sorted(strain_gene_seqs.keys()) -headers = [key + ' ' + strain_gene_seqs[key][0] + ' ' + \ +headers = [key + ' ' + strain_gene_seqs[key][0] + ' ' + strain_gene_seqs[key][-1] for key in keys] seqs = [strain_gene_seqs[key][1] for key in keys] strains = [ref] + keys @@ -86,18 +87,18 @@ suffixes = ['', '_filtered'] for suffix in suffixes: - print ' '.join(['finding', suffix, 'regions that overlap gene']) + print(' '.join(['finding', suffix, 'regions that overlap gene'])) # read in filtered regions fn_regions = gp.analysis_out_dir_absolute + tag + '/' + \ - 'introgressed_blocks' + suffix + '_par_' + tag + '_summary_plus.txt' - regions, l = read_table.read_table_rows(fn_regions, '\t') + 'introgressed_blocks' + suffix + '_par_' + tag + '_summary_plus.txt' + regions, _ = read_table.read_table_rows(fn_regions, '\t') # figure out which strains are introgressed/which regions overlap gene fn_genes_regions = gp.analysis_out_dir_absolute + tag + '/' + \ - 'genes_for_each_region_chr' + chrm + '_' + tag + '.txt' + 'genes_for_each_region_chr' + chrm + '_' + tag + '.txt' region_to_genes = \ gene_predictions.read_genes_for_each_region_summary(fn_genes_regions) - #strains = [x[0] for x in s] + # strains = [x[0] for x in s] regions_overlapping = {} # TODO does this actually ensure that regions are sorted appropriately # in fasta headers below? 
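+    # editor's note (sketch; example ids are made up): region ids have the
+    # form 'r<int>', so the numeric ordering the TODO above asks about can
+    # be enforced with
+    #     sorted(regions.keys(), key=lambda x: int(x[1:]))
+    # which puts 'r2' before 'r10', unlike a plain lexicographic sort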
@@ -106,12 +107,12 @@
         if regions[region]['chromosome'] == chrm and \
           gene in [x[0] for x in region_to_genes[region]['gene_list']]:
             strain = regions[region]['strain']
-            if not regions_overlapping.has_key(strain):
+            if strain not in regions_overlapping:
                 regions_overlapping[strain] = []
             regions_overlapping[strain].append(region)
 
-    print ' '.join(['writing all gene sequences to file, with', \
-                    suffix, 'introgressed bases capitalized'])
+    print(' '.join(['writing all gene sequences to file, with',
+                    suffix, 'introgressed bases capitalized']))
     headers_current = copy.deepcopy(headers)
     seqs_current = copy.deepcopy(seqs)
     for i in range(len(seqs)):
@@ -123,38 +124,35 @@
         if strain not in regions_overlapping:
             continue
         g = strain_gene_seqs[strain]
-        t, labels = read_table.read_table_columns(gp.analysis_out_dir_absolute + \
-                                                  tag + '/' + \
-                                                  'site_summaries/predictions_' + \
-                                                  strain + \
-                                                  '_chr' + chrm + \
-                                                  '_site_summary.txt.gz', '\t')
+        t, labels = read_table.read_table_columns(
+            gp.analysis_out_dir_absolute + tag + '/' +
+            'site_summaries/predictions_' + strain + '_chr' + chrm +
+            '_site_summary.txt.gz', '\t')
         for region in regions_overlapping[strain]:
             header += ' ' + region
             start_strain = \
-                math.ceil(float(\
-                    ref_ind_to_strain_ind[strain][regions[region]['start']]))
+                math.ceil(float(ref_ind_to_strain_ind[
+                    strain][regions[region]['start']]))
             end_strain = \
-                math.floor(float(\
-                    ref_ind_to_strain_ind[strain][regions[region]['end']]))
+                math.floor(float(ref_ind_to_strain_ind[
+                    strain][regions[region]['end']]))
             start_relative = int(max(start_strain - int(g[2]), 0))
             end_relative = int(end_strain - int(g[2]))
             seq = seq[:start_relative] + \
-                  seq[start_relative:end_relative+1].upper() + \
-                  seq[end_relative+1:]
-            seqs_current[i] = seq
+                seq[start_relative:end_relative+1].upper() + \
+                seq[end_relative+1:]
+            seqs_current[i] = seq
         headers_current[i] = header
 
     gene_seqs_fn = gp.analysis_out_dir_absolute + tag + \
-                   '/genes/' + gene + '/' + gene + \
-                   '_introgressed' + suffix + gp.fasta_suffix
+        '/genes/' + gene + '/' + gene + \
+        '_introgressed' + suffix + gp.fasta_suffix
     write_fasta.write_fasta(headers_current, seqs_current, gene_seqs_fn)
-
-    print 'aligning gene sequences'
-    gene_seqs_aligned_fn = gene_seqs_fn.replace(gp.fasta_suffix, gp.alignment_suffix)
+    print('aligning gene sequences')
+    gene_seqs_aligned_fn = gene_seqs_fn.replace(gp.fasta_suffix,
+                                                gp.alignment_suffix)
     cmd_string = gp.mafft_install_path + '/mafft ' + \
-                 ' --quiet --reorder --preservecase ' + \
-                 gene_seqs_fn + ' > ' + gene_seqs_aligned_fn
+        ' --quiet --reorder --preservecase ' + \
+        gene_seqs_fn + ' > ' + gene_seqs_aligned_fn
     os.system(cmd_string)
-
diff --git a/code/analyze/to_update/combine_region_all_strains_main.py b/code/analyze/to_update/combine_region_all_strains_main.py
index cc79a6b..234671a 100644
--- a/code/analyze/to_update/combine_region_all_strains_main.py
+++ b/code/analyze/to_update/combine_region_all_strains_main.py
@@ -3,44 +3,38 @@
 # - for gene, relies on annotations/orfs
 # - for coordinates, relies on alignments
 
-import re
 import sys
 import os
 import math
-import Bio.SeqIO
 import copy
-from combine_all_strains import *
-import gene_predictions
-sys.path.insert(0, '..')
+from analyze.to_update.combine_all_strains import (get_range_seq,
+                                                   get_range_seqs)
 import global_params as gp
-sys.path.insert(0, '../align/')
-import align_helpers
-sys.path.insert(0, '../misc/')
-import read_table
-import read_fasta
-import write_fasta
-import mystats
-
+from align import align_helpers
+from misc import read_table
+from misc import 
write_fasta tag = sys.argv[1] start = int(sys.argv[2]) end = int(sys.argv[3]) chrm = sys.argv[4] -print 'getting range sequence from reference strain' +print('getting range sequence from reference strain') ref = 'S288c' -ref_seq_fn = gp.ref_dir[ref] + gp.ref_fn_prefix[ref] + '_chr' + chrm + gp.fasta_suffix +ref_seq_fn = gp.ref_dir[ref] + gp.ref_fn_prefix[ref] + \ + '_chr' + chrm + gp.fasta_suffix ref_range_seq = get_range_seq(start, end, ref_seq_fn) -print 'getting range sequences from all strains' +print('getting range sequences from all strains') s = align_helpers.get_strains(align_helpers.flatten(gp.non_ref_dirs.values())) -s.append((gp.ref_fn_prefix[gp.alignment_ref_order[1]], gp.ref_dir[gp.alignment_ref_order[1]])) +s.append((gp.ref_fn_prefix[gp.alignment_ref_order[1]], + gp.ref_dir[gp.alignment_ref_order[1]])) # keyed by strain: (seq, start, end) strain_range_seqs = get_range_seqs(s, chrm, start, end, tag) -print 'writing all range sequences to file' +print('writing all range sequences to file') keys = sorted(strain_range_seqs.keys()) -headers = [key + ' ' + str(strain_range_seqs[key][1]) + ':' + \ +headers = [key + ' ' + str(strain_range_seqs[key][1]) + ':' + str(strain_range_seqs[key][2]) for key in keys] seqs = [strain_range_seqs[key][0] for key in keys] strains = [ref] + keys @@ -55,11 +49,11 @@ suffixes = ['', '_filtered'] for suffix in suffixes: - print ' '.join(['finding', suffix, 'regions that overlap range']) + print(' '.join(['finding', suffix, 'regions that overlap range'])) # read in filtered regions fn_regions = gp.analysis_out_dir_absolute + tag + '/' + \ - 'introgressed_blocks' + suffix + '_par_' + tag + '_summary_plus.txt' - regions, l = read_table.read_table_rows(fn_regions, '\t') + 'introgressed_blocks' + suffix + '_par_' + tag + '_summary_plus.txt' + regions, _ = read_table.read_table_rows(fn_regions, '\t') regions_overlapping = {} # TODO does this actually ensure that regions are sorted appropriately @@ -67,17 +61,17 @@ region_keys_ordered = sorted(regions.keys(), key=lambda x: int(x[1:])) for region in region_keys_ordered: if regions[region]['chromosome'] == chrm and \ - ((int(regions[region]['start']) > start and \ - int(regions[region]['start']) < end) or \ - (int(regions[region]['end']) > start and \ + ((int(regions[region]['start']) > start and + int(regions[region]['start']) < end) or + (int(regions[region]['end']) > start and int(regions[region]['end']) < end)): strain = regions[region]['strain'] - if not regions_overlapping.has_key(strain): + if strain not in regions_overlapping: regions_overlapping[strain] = [] regions_overlapping[strain].append(region) - print ' '.join(['writing all range sequences to file, with', \ - suffix, 'introgressed bases capitalized']) + print(' '.join(['writing all range sequences to file, with', + suffix, 'introgressed bases capitalized'])) headers_current = copy.deepcopy(headers) seqs_current = copy.deepcopy(seqs) for i in range(len(seqs)): @@ -89,35 +83,33 @@ if strain not in regions_overlapping: continue r = strain_range_seqs[strain] - t, labels = read_table.read_table_columns(gp.analysis_out_dir_absolute + \ - tag + '/' + \ - 'site_summaries/predictions_' + \ - strain + \ - '_chr' + chrm + \ - '_site_summary.txt.gz', '\t') + t, labels = read_table.read_table_columns( + gp.analysis_out_dir_absolute + tag + '/' + + 'site_summaries/predictions_' + strain + '_chr' + chrm + + '_site_summary.txt.gz', '\t') ref_ind_to_strain_ind = dict(zip(t['ps_ref'], t['ps_strain'])) for region in regions_overlapping[strain]: header += ' ' + 
region
-            start_strain = math.ceil(float(\
-                ref_ind_to_strain_ind[regions[region]['start']]))
-            end_strain = math.floor(float(\
-                ref_ind_to_strain_ind[regions[region]['end']]))
+            start_strain = math.ceil(float(
+                ref_ind_to_strain_ind[regions[region]['start']]))
+            end_strain = math.floor(float(
+                ref_ind_to_strain_ind[regions[region]['end']]))
             start_relative = int(max(start_strain - int(r[1]), 0))
             end_relative = int(end_strain - int(r[1]))
-            seq = seq[:start_relative] + \
-                  seq[start_relative:end_relative+1].upper() + \
-                  seq[end_relative+1:]
+            seq = (seq[:start_relative] +
+                   seq[start_relative:end_relative+1].upper() +
+                   seq[end_relative+1:])
             seqs_current[i] = seq
         headers_current[i] = header
 
     range_seqs_fn = gp.analysis_out_dir_absolute + tag + '/ranges/' + label + \
-                    '/' + label + '_introgressed' + suffix + gp.fasta_suffix
+        '/' + label + '_introgressed' + suffix + gp.fasta_suffix
     write_fasta.write_fasta(headers_current, seqs_current, range_seqs_fn)
-
-    print 'aligning range sequences'
-    range_seqs_aligned_fn = range_seqs_fn.replace(gp.fasta_suffix, gp.alignment_suffix)
+    print('aligning range sequences')
+    range_seqs_aligned_fn = range_seqs_fn.replace(gp.fasta_suffix,
+                                                  gp.alignment_suffix)
     cmd_string = gp.mafft_install_path + '/mafft ' + \
-                 ' --reorder --preservecase ' + \
-                 range_seqs_fn + ' > ' + range_seqs_aligned_fn
+        ' --reorder --preservecase ' + \
+        range_seqs_fn + ' > ' + range_seqs_aligned_fn
     os.system(cmd_string)
diff --git a/code/analyze/to_update/compare.py b/code/analyze/to_update/compare.py
index 460203e..57d9b95 100644
--- a/code/analyze/to_update/compare.py
+++ b/code/analyze/to_update/compare.py
@@ -1,10 +1,9 @@
-import sys
-import os
-sys.path.insert(0, '..')
 import global_params as gp
 
-lines = [x.split(',') for x in open('/tigress/AKEY/akey_vol2/aclark4/nobackup/introgression/data/Table_S5_introgressed_genes.csv', 'r').readlines()]
+lines = [x.split(',') for x in open(
+    '/tigress/AKEY/akey_vol2/aclark4/nobackup/introgression/data/'
+    'Table_S5_introgressed_genes.csv', 'r').readlines()]
 genes = []
 genes_verified = []
 for i in range(2, len(lines)):
@@ -14,12 +13,13 @@
 
 gp.analysis_out_dir_absolute
 
-lines = [x.split(' ') for x in open('../../results/introgressed_id_genes.txt', 'r').readlines()]
+lines = [x.split(' ') for x in open('../../results/introgressed_id_genes.txt',
+                                    'r').readlines()]
 my_genes = [x[0] for x in lines]
 
-print len(genes), 'genes from paper'
-print len(genes_verified), 'verified genes from paper'
-print len(my_genes), '(verified) genes I identify'
+print(len(genes), 'genes from paper')
+print(len(genes_verified), 'verified genes from paper')
+print(len(my_genes), '(verified) genes I identify')
 
 pm = []
 pnm = []
@@ -33,27 +33,28 @@
     if g not in genes:
         npm.append(g)
 
-print 'genes found in paper that I found (', len(pm), '):'
+print('genes found in paper that I found (', len(pm), '):')
 for x in pm:
-    print x
-print 'genes found in paper that I didn\'t find (', len(pnm), '):'
+    print(x)
+print('genes found in paper that I didn\'t find (', len(pnm), '):')
 for x in pnm:
-    print x
-print 'genes that I found not in paper(', len(npm), '):'
+    print(x)
+print('genes that I found not in paper (', len(npm), '):')
 for x in npm:
-    print x
+    print(x)
 
-lines = [x.strip().split(' ') for x in open('../../results/introgressed_id_genes_fns.txt', 'r').readlines()]
+lines = [x.strip().split(' ') for x in open(
+    '../../results/introgressed_id_genes_fns.txt', 'r').readlines()]
 gene_to_fns = {}
 for line in lines:
     gene_to_fns[line[0]] = line[1:]
 
 while True:
-    gene = 
raw_input('=========================================\nwhich gene? ') + gene = input('=========================================\nwhich gene? ') try: gene_to_fns[gene] - except: - print 'that gene wasn\'t one i found' + except KeyError: + print('that gene wasn\'t one i found') continue for fn in gene_to_fns[gene]: f = open(fn) @@ -78,16 +79,17 @@ seq += 'p' else: seq += '-' - print '==========', fn + print('==========', fn) line_length = 10000 for i in range(0, len(seq), line_length): - print seqc[i:i+line_length] - print seqp[i:i+line_length] - print seqx[i:i+line_length] - print seq[i:i+line_length] - print - print - raw_input('') + print(seqc[i:i+line_length]) + print(seqp[i:i+line_length]) + print(seqx[i:i+line_length]) + print(seq[i:i+line_length]) + print() + print() + input('') -# TODO: get alignments for genes found in paper but not by me; print positions in each genome before alignments +# TODO: get alignments for genes found in paper but not by me; +# print positions in each genome before alignments diff --git a/code/analyze/to_update/compare_3strains_main.py b/code/analyze/to_update/compare_3strains_main.py index 7e7e560..ab190d5 100644 --- a/code/analyze/to_update/compare_3strains_main.py +++ b/code/analyze/to_update/compare_3strains_main.py @@ -1,20 +1,7 @@ -import re -import sys -import os -import copy import itertools from collections import defaultdict -import gene_predictions -import predict -from filter_helpers import * -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../sim/') -import sim_analyze_hmm_bw as sim -sys.path.insert(0, '../misc/') -import mystats -import read_table -import read_fasta +from misc import read_table strains = ['yjm1252', 'yjm1078', 'yjm248'] @@ -36,12 +23,13 @@ for base in range(start, end + 1): bases_by_strains[chrm][base].append(strain) -#for base in sorted(bases_by_strains['I'].keys()): +# for base in sorted(bases_by_strains['I'].keys()): # print base, bases_by_strains['I'][base] - + categories = [] -for i in range(1,len(strains) + 1): - categories += [tuple(sorted(x)) for x in itertools.combinations(strains, i)] +for i in range(1, len(strains) + 1): + categories += [tuple(sorted(x)) + for x in itertools.combinations(strains, i)] cat_counts = defaultdict(int) for chrm in bases_by_strains.keys(): @@ -55,4 +43,3 @@ for cat in categories: f.write(','.join(cat) + '\t' + str(cat_counts[cat]) + '\n') f.close() - diff --git a/code/analyze/to_update/compare_predictions_main.py b/code/analyze/to_update/compare_predictions_main.py index c967c92..753c0a1 100644 --- a/code/analyze/to_update/compare_predictions_main.py +++ b/code/analyze/to_update/compare_predictions_main.py @@ -1,20 +1,9 @@ -import re import sys -import os -import copy -import itertools -import gene_predictions import predict from collections import defaultdict -from filter_helpers import * -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../sim/') -import sim_analyze_hmm_bw as sim -sys.path.insert(0, '../misc/') -import mystats -import read_table -import read_fasta +from misc import read_table + # similar to find_pops function in structure_3_main.py def overlap_with_any(start, end, blocks): @@ -32,23 +21,24 @@ def overlap_with_any(start, end, blocks): break return count + args = predict.process_predict_args(sys.argv[1:]) -## comparing to other prediction run; e.g. 
comparing using just one -## introgressed reference state to using multiple; this is a little -## janky because some of the file names and formatting have changed +# comparing to other prediction run; e.g. comparing using just one +# introgressed reference state to using multiple; this is a little +# janky because some of the file names and formatting have changed other_region_fn = gp.analysis_out_dir_absolute + 'u3_i.001_tv_l1000_f.01/' + \ - 'introgressed_blocks_filtered_par_u3_i.001_tv_l1000_f.01_summary_plus.txt' + 'introgressed_blocks_filtered_par_u3_i.001_tv_l1000_f.01_summary_plus.txt' rt_other, fields_other = read_table.read_table_rows(other_region_fn, '\t') regions_other = defaultdict(lambda: defaultdict(list)) for region_id in rt_other: chrm = rt_other[region_id]['chromosome'] strain = rt_other[region_id]['strain'] - regions_other[chrm][strain].append((int(rt_other[region_id]['start']), \ + regions_other[chrm][strain].append((int(rt_other[region_id]['start']), int(rt_other[region_id]['end']))) for chrm in gp.chrms: for strain in regions_other[chrm].keys(): - regions_other[chrm][strain].sort(key = lambda x: x[0]) + regions_other[chrm][strain].sort(key=lambda x: x[0]) regions = defaultdict(lambda: defaultdict(list)) @@ -63,12 +53,12 @@ def overlap_with_any(start, end, blocks): for region_id in rt: chrm = rt[region_id]['chromosome'] strain = rt[region_id]['strain'] - regions[chrm][strain].append((int(rt[region_id]['start']), \ + regions[chrm][strain].append((int(rt[region_id]['start']), int(rt[region_id]['end']), rt[region_id]['alternative_states'])) for chrm in gp.chrms: for strain in regions[chrm].keys(): - regions[chrm][strain].sort(key = lambda x: x[0]) + regions[chrm][strain].sort(key=lambda x: x[0]) # count bases found in every possible combination of species_from + # presence/absence in regions_other @@ -79,14 +69,14 @@ def overlap_with_any(start, end, blocks): # current predictions for strain in regions[chrm].keys(): for region in regions[chrm][strain]: - x = overlap_with_any(region[0], region[1], regions_other[chrm][strain]) + x = overlap_with_any(region[0], + region[1], regions_other[chrm][strain]) length = region[1] - region[0] + 1 alt_states = region[2].split(',') d[strain][tuple(['other'] + alt_states)] += x d[strain][tuple(alt_states)] += length - x assert x <= length - # other predictions for strain in regions_other[chrm].keys(): for region in regions_other[chrm][strain]: @@ -95,13 +85,14 @@ def overlap_with_any(start, end, blocks): d[strain][('other', 'any')] += x d[strain][('other',)] += length - x assert x <= length - -fn = gp.analysis_out_dir_absolute + args['tag'] + '/' + 'state_counts_comparison.txt' +fn = gp.analysis_out_dir_absolute + args['tag'] +\ + '/' + 'state_counts_comparison.txt' f = open(fn, 'w') f.write('strain\tlabel\tcount\n') for strain in d.keys(): for label in d[strain].keys(): - f.write(strain + '\t' + ','.join(label) + '\t' + str(d[strain][label]) + '\n') + f.write(strain + '\t' + ','.join(label) + + '\t' + str(d[strain][label]) + '\n') f.close() diff --git a/code/analyze/to_update/compare_to_strope.py b/code/analyze/to_update/compare_to_strope.py index 297ebd4..7fed803 100644 --- a/code/analyze/to_update/compare_to_strope.py +++ b/code/analyze/to_update/compare_to_strope.py @@ -1,22 +1,10 @@ # compare set of genes I've called to set called in Strope et al (100 # genomes paper) -import re import sys -import os -import math -import Bio.SeqIO -import copy import gene_predictions -sys.path.insert(0, '..') import global_params as gp 
-sys.path.insert(0, '../align/') -import align_helpers -sys.path.insert(0, '../misc/') -import read_table -import read_fasta -import write_fasta -import mystats +from misc import read_table tag = sys.argv[1] @@ -35,14 +23,14 @@ if line[7+i] == 'P': strains_int_par.append(strains[i]) n_int_par = len(strains_int_par) - genes_strope[line[2]] = (n_int_par, n_int_other, n_del, strains_int_par, \ + genes_strope[line[2]] = (n_int_par, n_int_other, n_del, strains_int_par, line[1], line[4]) sys_standard_strope[line[1]] = line[2] - + fn_regions = gp.analysis_out_dir_absolute + tag + '/' + \ 'introgressed_blocks_filtered_par_' + tag + '_summary_plus.txt' # dict keyed by region: {strain:, start:, end:, etc} -regions, l = read_table.read_table_rows(fn_regions, '\t') +regions, _ = read_table.read_table_rows(fn_regions, '\t') region_to_genes = {} for chrm in gp.chrms: fn_genes_regions = gp.analysis_out_dir_absolute + tag + '/' + \ @@ -53,9 +41,9 @@ region_to_genes.update(region_to_genes_current) genes_by_strain = {} for region in regions: - if not genes_by_strain.has_key(regions[region]['strain']): + if regions[region]['strain'] not in genes_by_strain: genes_by_strain[regions[region]['strain']] = set([]) - [genes_by_strain[regions[region]['strain']].add(gene) \ + [genes_by_strain[regions[region]['strain']].add(gene) for gene in [x[0] for x in region_to_genes[region]['gene_list']]] genes = {} @@ -97,7 +85,8 @@ # TODO fix my gene list then get rid of this all_genes = {} for chrm in gp.chrms: - fn_all_genes = gp.analysis_out_dir_absolute + 'S288c_chr' + chrm + '_genes.txt' + fn_all_genes = gp.analysis_out_dir_absolute +\ + 'S288c_chr' + chrm + '_genes.txt' f_all_genes = open(fn_all_genes, 'r') lines = [line.strip().split('\t') for line in f_all_genes.readlines()] f_all_genes.close() @@ -107,7 +96,6 @@ strand = 'NA' all_genes[line[0]] = ('NA', chrm, start, end, strand) - fn_paralogs = '../../data/S288c_paralogs.tsv' f_paralogs = open(fn_paralogs, 'r') lines = [line.strip().split('\t') for line in f_paralogs.readlines()] @@ -117,7 +105,6 @@ if line[0] != "": paralogs[line[0]] = line[3] - f_s = open('compare_to_strope/genes_strope_only.txt', 'w') f_m = open('compare_to_strope/genes_me_only.txt', 'w') f_sm = open('compare_to_strope/genes_both.txt', 'w') @@ -147,7 +134,8 @@ f_sp.write(gene + '\n') if gene in genes or (gene in sys_standard and sys_standard[gene] in genes): continue - elif not (gene in all_genes or (gene in sys_standard and sys_standard[gene] in all_genes)): + elif not (gene in all_genes or + (gene in sys_standard and sys_standard[gene] in all_genes)): continue elif genes_strope[gene][0] == 0: continue @@ -157,19 +145,19 @@ f_s.write(gene + '\n') c_s += 1 if gene in paralogs: - c_s_p +=1 + c_s_p += 1 f_s.close() f_m.close() f_sm.close() f_mp.close() f_sp.close() -print 'number strope only:', c_s -print 'number me only:', c_m -print 'number strope and me:', c_sm -print 'number strope only paralogs', c_s_p -print 'number me only paralogs', c_m_p -print 'number strope and me paralogs', c_sm_p -print 'number paralogs', len(paralogs) +print('number strope only:', c_s) +print('number me only:', c_m) +print('number strope and me:', c_sm) +print('number strope only paralogs', c_s_p) +print('number me only paralogs', c_m_p) +print('number strope and me paralogs', c_sm_p) +print('number paralogs', len(paralogs)) -print paralogs.keys() +print(paralogs.keys()) diff --git a/code/analyze/to_update/count_coding_changes.py b/code/analyze/to_update/count_coding_changes.py index a4af578..8ce9ec6 100644 --- 
a/code/analyze/to_update/count_coding_changes.py +++ b/code/analyze/to_update/count_coding_changes.py @@ -1,10 +1,7 @@ -import sys -import os -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../misc/') -import seq_functions -import read_fasta +from misc import seq_functions +from misc import read_fasta + def get_aligned_genes(fn, strains): headers, seqs = read_fasta.read_fasta(fn) @@ -44,7 +41,6 @@ def ambiguous(gene, ref_start, ref_end, coords, orfs): def count_coding(seq_master, seq_ref, seq_strain, start, end): - if not seq_master.startswith('ATG'): seq_master = seq_functions.reverse_complement(seq_master) assert seq_master.startswith('ATG'), seq_master @@ -87,16 +83,15 @@ def count_coding(seq_master, seq_ref, seq_strain, start, end): def count_coding_with_gaps(seq_master, seq_ref, seq_strain, start, end): - - print seq_master - print seq_ref - print seq_strain - print start, end + print(seq_master) + print(seq_ref) + print(seq_strain) + print(start, end) seq_master = seq_master.upper() seq_ref = seq_ref.upper() seq_strain = seq_strain.upper() - + ind_master = 0 ind_ref = 0 ind_strain = 0 @@ -177,10 +172,9 @@ def count_coding_with_gaps(seq_master, seq_ref, seq_strain, start, end): if codon_strain != codon_master: aa_master = seq_functions.codon_table.get(codon_master) - aa_ref = seq_functions.codon_table.get(codon_ref) aa_strain = seq_functions.codon_table.get(codon_strain) - if aa_master == None or aa_strain == None: + if aa_master is None or aa_strain is None: if gaps_master > gaps_strain: t_insert += gaps_master - gaps_strain else: @@ -212,10 +206,9 @@ def count_coding_with_gaps(seq_master, seq_ref, seq_strain, start, end): else: t_non_ref += 1 - print t_syn, t_non, t_syn_ref, t_non_ref - print t_insert, t_delete, t_insert_ref, t_delete_ref - print frameshift + print(t_syn, t_non, t_syn_ref, t_non_ref) + print(t_insert, t_delete, t_insert_ref, t_delete_ref) + print(frameshift) return t_syn, t_non, t_syn_ref, t_non_ref, \ t_insert/3.0, t_delete/3.0, t_insert_ref/3.0, t_delete_ref/3.0, \ gene_delete, gene_delete_ref, frameshift_count - diff --git a/code/analyze/to_update/count_coding_changes_main.py b/code/analyze/to_update/count_coding_changes_main.py index d0a4220..96c1705 100644 --- a/code/analyze/to_update/count_coding_changes_main.py +++ b/code/analyze/to_update/count_coding_changes_main.py @@ -1,36 +1,34 @@ import sys import os import gzip -from count_coding_changes import * +from count_coding_changes import get_aligned_genes, count_coding_with_gaps import annotate_positions -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../misc/') -import overlap -import read_table -import read_fasta +from misc import overlap +from misc import read_table +from misc import read_fasta -##====== +# ====== # command line arguments -##====== +# ====== tag = sys.argv[1] -##====== +# ====== # read in introgressed regions -##====== +# ====== -# key region ids by chromosome and then strain +# key region ids by chromosome and then strain fn_regions = gp.analysis_out_dir_absolute + tag + '/' + \ 'introgressed_blocks_filtered_par_' + tag + '_summary_plus.txt' -regions, l = read_table.read_table_rows(fn_regions, '\t') +regions, _ = read_table.read_table_rows(fn_regions, '\t') region_ids_by_chrm_strain = {} for r in regions.keys(): strain = regions[r]['strain'] chrm = regions[r]['chromosome'] - if not region_ids_by_chrm_strain.has_key(chrm): + if chrm not in region_ids_by_chrm_strain: region_ids_by_chrm_strain[chrm] = {} - if not 
region_ids_by_chrm_strain[chrm].has_key(strain): + if strain not in region_ids_by_chrm_strain[chrm]: region_ids_by_chrm_strain[chrm][strain] = [] region_ids_by_chrm_strain[chrm][strain].append(r) @@ -48,82 +46,84 @@ f.close() -##====== +# ====== # count sites within all regions that are coding/noncoding, plus some # more details about coding changes -##====== +# ====== other_ref = gp.alignment_ref_order[1] region_totals = {} gene_totals = {} strain_totals = {} -totals = {'syn':0, 'non':0, 'syn_ref':0, 'non_ref':0, \ - 'insert':0, 'delete':0, 'insert_ref':0, 'delete_ref':0, \ - 'gene_delete':0, 'gene_delete_ref':0, \ - 'ref_gene_only':0, 'strain_orf_only':0, \ - 'coding':0, 'noncoding':0, 'frameshift':0} +totals = {'syn': 0, 'non': 0, 'syn_ref': 0, 'non_ref': 0, + 'insert': 0, 'delete': 0, 'insert_ref': 0, 'delete_ref': 0, + 'gene_delete': 0, 'gene_delete_ref': 0, + 'ref_gene_only': 0, 'strain_orf_only': 0, + 'coding': 0, 'noncoding': 0, 'frameshift': 0} for chrm in gp.chrms: - print chrm + print(chrm) # read in cer reference genes fn = gp.analysis_out_dir_absolute + gp.master_ref + '_chr' + chrm + \ - '_genes.txt' - genes, l = read_table.read_table_rows(fn, '\t', header=False, key_ind=0) + '_genes.txt' + genes, _ = read_table.read_table_rows(fn, '\t', header=False, key_ind=0) for gene in genes: genes[gene] = (int(genes[gene][0]), int(genes[gene][1])) # read in cer ref -> par ref position file fn = gp.analysis_out_dir_absolute + 'coordinates/' + gp.master_ref + \ - '_to_' + other_ref + '_chr' + chrm + '.txt.gz' - master_to_other_ref_pos = [float(line[:-1]) \ + '_to_' + other_ref + '_chr' + chrm + '.txt.gz' + master_to_other_ref_pos = [float(line[:-1]) for line in gzip.open(fn, 'rb').readlines()] # read in cer ref chromosome sequence fn = gp.ref_dir[gp.master_ref] + gp.ref_fn_prefix[gp.master_ref] + \ - '_chr' + chrm + gp.fasta_suffix + '_chr' + chrm + gp.fasta_suffix master_seq = read_fasta.read_fasta(fn)[1][0] # read in par ref chromosome sequence fn = gp.ref_dir[other_ref] + gp.ref_fn_prefix[other_ref] + \ - '_chr' + chrm + gp.fasta_suffix + '_chr' + chrm + gp.fasta_suffix other_ref_seq = read_fasta.read_fasta(fn)[1][0] # read in par ref ORFs fn = gp.ref_dir[other_ref] + 'orfs/' + other_ref + \ - '_chr' + chrm + '_orfs' + gp.fasta_suffix + '_chr' + chrm + '_orfs' + gp.fasta_suffix ref_orfs = annotate_positions.get_orfs(fn) for strain in region_ids_by_chrm_strain[chrm].keys(): - print '-', strain - - if not strain_totals.has_key(strain): - strain_totals[strain] = {'syn':0, 'non':0, 'syn_ref':0, 'non_ref':0, \ - 'ref_gene_only':0, 'strain_orf_only':0, \ - 'coding':0, 'noncoding':0} - - # read in cer ref -> strain position file + print('-', strain) + + if strain not in strain_totals: + strain_totals[strain] = { + 'syn': 0, 'non': 0, 'syn_ref': 0, 'non_ref': 0, + 'ref_gene_only': 0, 'strain_orf_only': 0, + 'coding': 0, 'noncoding': 0} + + # read in cer ref -> strain position file fn = gp.analysis_out_dir_absolute + 'coordinates/' + gp.master_ref + \ - '_to_' + strain + '_chr' + chrm + '.txt.gz' - master_to_strain_pos = [float(line[:-1]) \ + '_to_' + strain + '_chr' + chrm + '.txt.gz' + master_to_strain_pos = [float(line[:-1]) for line in gzip.open(fn, 'rb').readlines()] # read in strain chromosome sequence fn = gp.non_ref_dirs[gp.master_ref][0] + strain + \ - '_chr' + chrm + gp.fasta_suffix + '_chr' + chrm + gp.fasta_suffix strain_seq = read_fasta.read_fasta(fn)[1][0] # read in strain ORFs fn = gp.non_ref_dirs[gp.master_ref][0] + 'orfs/' + strain + \ - '_chr' + chrm + '_orfs' + 
gp.fasta_suffix + '_chr' + chrm + '_orfs' + gp.fasta_suffix orfs = annotate_positions.get_orfs(fn) for region in region_ids_by_chrm_strain[chrm][strain]: - region_totals[region] = {'syn':0, 'non':0, 'syn_ref':0, 'non_ref':0, \ - 'ref_gene_only':0, 'strain_orf_only':0, \ - 'coding':0, 'noncoding':0} + region_totals[region] = { + 'syn': 0, 'non': 0, 'syn_ref': 0, 'non_ref': 0, + 'ref_gene_only': 0, 'strain_orf_only': 0, + 'coding': 0, 'noncoding': 0} # is each site in region in a master ref gene and/or # strain ORF? @@ -132,10 +132,11 @@ t_gene_not_orf = 0 t_not_gene_orf = 0 t_not_gene_not_orf = 0 - for site in range(int(regions[region]['start']), \ + for site in range(int(regions[region]['start']), int(regions[region]['end'])): in_gene = overlap.contained_any(site, genes.values()) - in_orf = overlap.contained_any(master_to_strain_pos[site], orfs.keys()) + in_orf = overlap.contained_any( + master_to_strain_pos[site], orfs.keys()) if in_gene: if in_orf: t_gene_orf += 1 @@ -170,58 +171,63 @@ # read multiple alignment for the gene, in which we've # previously selected the best orfs to match the gene - fn = gp.analysis_out_dir_absolute + tag + '/genes/' + gene + '/' + \ - gene + '_introgressed_filtered.maf' + fn = gp.analysis_out_dir_absolute + tag + '/genes/' \ + + gene + '/' + gene + '_introgressed_filtered.maf' if not os.path.isfile(fn): - print 'do not have alignment for', gene + print('do not have alignment for', gene) continue - aligned_genes = get_aligned_genes(fn, \ - [gp.master_ref, other_ref, strain]) + aligned_genes = get_aligned_genes( + fn, [gp.master_ref, other_ref, strain]) - print gene, strain + print(gene, strain) # for now, ignore cerevisiae reference genes that # don't map perfectly to an ORF in the strain and # paradoxus reference - #if ambiguous(gene, gene_start, gene_end, master_to_strain_pos, orfs): + # if ambiguous(gene, gene_start, gene_end, + # master_to_strain_pos, orfs): # continue - #if ambiguous(gene, gene_start, gene_end, \ + # if ambiguous(gene, gene_start, gene_end, \ # master_to_other_ref_pos, ref_orfs): # continue - + # extract gene sequence from references and strain g_master = master_seq[gene_start:gene_end+1] - g_ref = other_ref_seq[int(master_to_other_ref_pos[gene_start]):\ + g_ref = other_ref_seq[int(master_to_other_ref_pos[gene_start]): int(master_to_other_ref_pos[gene_end])+1] - g_strain = strain_seq[int(master_to_strain_pos[gene_start]):\ + g_strain = strain_seq[int(master_to_strain_pos[gene_start]): int(master_to_strain_pos[gene_end])+1] # get overlap between gene and introgressed region - o_start, o_end = overlap.overlap_region(genes[gene][0], \ - genes[gene][1], \ - int(regions[region]['start']), \ - int(regions[region]['end'])) + o_start, o_end = overlap.overlap_region( + genes[gene][0], + genes[gene][1], + int(regions[region]['start']), + int(regions[region]['end'])) # count synonymous and non synonymous changes due to # paradoxus (deal with gene direction correctly) # t_syn, t_non = count_coding(g_master, g_ref, g_strain, \ - # o_start-gene_start, o_end-gene_start) + # o_start-gene_start, + # o_end-gene_start) # alternative method that deals with imperfect matches t_syn, t_non, t_syn_ref, t_non_ref, \ t_insert, t_delete, t_insert_ref, t_delete_ref, \ gene_delete, gene_delete_ref, frameshift = \ - count_coding_with_gaps(aligned_genes[gp.master_ref], \ - aligned_genes[other_ref], \ - aligned_genes[strain], \ - o_start-gene_start, o_end-gene_start) + count_coding_with_gaps(aligned_genes[gp.master_ref], + aligned_genes[other_ref], + 
aligned_genes[strain], + o_start-gene_start, + o_end-gene_start) # add to totals for region, gene, strain, and overall - if not gene_totals.has_key(gene): - gene_totals[gene] = {'syn':0, 'non':0, 'syn_ref':0, 'non_ref':0, \ - 'insert':0, 'delete':0, \ - 'insert_ref':0, 'delete_ref':0, \ - 'gene_delete':0, 'gene_delete_ref':0, \ - 'frameshift':0} + if gene not in gene_totals: + gene_totals[gene] = { + 'syn': 0, 'non': 0, 'syn_ref': 0, 'non_ref': 0, + 'insert': 0, 'delete': 0, + 'insert_ref': 0, 'delete_ref': 0, + 'gene_delete': 0, 'gene_delete_ref': 0, + 'frameshift': 0} gene_totals[gene]['syn'] += t_syn gene_totals[gene]['non'] += t_non gene_totals[gene]['syn_ref'] += t_syn_ref @@ -256,9 +262,9 @@ totals['gene_delete_ref'] += gene_delete_ref totals['frameshift'] += frameshift -##====== +# ====== # write output file -##====== +# ====== fn = gp.analysis_out_dir_absolute + tag + '/' + 'coding_changes_summary_' + \ tag + '.txt' @@ -271,30 +277,26 @@ for strain in strain_totals: for key in strain_totals[strain].keys(): - f.write(strain + sep + 'strain' + sep + \ + f.write(strain + sep + 'strain' + sep + str(strain_totals[strain][key]) + sep + key + '\n') for gene in gene_totals: for key in gene_totals[gene].keys(): - f.write(gene + sep + 'gene' + sep + \ + f.write(gene + sep + 'gene' + sep + str(gene_totals[gene][key]) + sep + key + '\n') for region in region_totals: for key in region_totals[region].keys(): - f.write(region + sep + 'region' + sep + \ + f.write(region + sep + 'region' + sep + str(region_totals[region][key]) + sep + key + '\n') -f.close() - - - +f.close() # new plan # for each region # for each site in region # is it in ref gene and/or strain orf? (keep track of four totals) -# # for each gene # get corresponding orfs in par and strain @@ -305,4 +307,3 @@ # - categories: # multiples of 3 # not multiples of 3 -> stop counting/ignore gene? 
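+#
+# editor's sketch of the four-way site tally described above; 'site_tally'
+# is an illustrative name, the helper calls mirror the ones used earlier:
+#     from collections import Counter
+#     site_tally = Counter()
+#     for site in range(start, end + 1):
+#         in_gene = overlap.contained_any(site, genes.values())
+#         in_orf = overlap.contained_any(master_to_strain_pos[site],
+#                                        orfs.keys())
+#         site_tally[(in_gene, in_orf)] += 1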
- diff --git a/code/analyze/to_update/count_introgressed_main.py b/code/analyze/to_update/count_introgressed_main.py index 2663c12..5cbc0d3 100644 --- a/code/analyze/to_update/count_introgressed_main.py +++ b/code/analyze/to_update/count_introgressed_main.py @@ -1,24 +1,12 @@ # counts total amount of sites introgressed on each chromosome -import re -import sys -import os -import math -import gzip -import itertools -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../align/') -import align_helpers -sys.path.insert(0, '../misc/') -import overlap -import read_table -import read_fasta -import write_fasta -import mystats +from misc import read_table -chrm_sizes = [230218, 813184, 316620, 1531933, 576874, 270161, 1090940, 562643, 439888, 745751, 666816, 1078177, 924431, 784333, 1091291, 948066] +chrm_sizes = [230218, 813184, 316620, 1531933, 576874, + 270161, 1090940, 562643, 439888, 745751, + 666816, 1078177, 924431, 784333, 1091291, 948066] tag = 'u3_i.001_tv_l1000_f.01' @@ -30,13 +18,13 @@ for region in d: chrm = d[region]['chromosome'] strain = d[region]['strain'] - regions_by_chrm[chrm].append((strain, \ - int(d[region]['start']), \ + regions_by_chrm[chrm].append((strain, + int(d[region]['start']), int(d[region]['end']))) hist = {} for chrm in gp.chrms: - print chrm + print(chrm) chrm_size = chrm_sizes[gp.chrms.index(chrm)] x = [0 for i in range(chrm_size)] for ri in range(len(regions_by_chrm[chrm])): @@ -56,7 +44,7 @@ total = 0 for chrm in gp.chrms: f.write(chrm + '\t') - chrm_size = chrm_sizes[gp.chrms.index(chrm)] + chrm_size = chrm_sizes[gp.chrms.index(chrm)] at_least_one = chrm_size - hist[chrm][0] total += at_least_one f.write(str(at_least_one) + '\t') diff --git a/code/analyze/to_update/frequency_of_introgression_main.py b/code/analyze/to_update/frequency_of_introgression_main.py index 8004fdb..89b2da0 100644 --- a/code/analyze/to_update/frequency_of_introgression_main.py +++ b/code/analyze/to_update/frequency_of_introgression_main.py @@ -1,25 +1,11 @@ -import re -import sys -import os -import copy -import itertools from collections import defaultdict -import gene_predictions -import predict -from filter_helpers import * -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../sim/') -import sim_analyze_hmm_bw as sim -sys.path.insert(0, '../misc/') -import mystats -import read_table -import read_fasta +from misc import read_table tag = 'u3_i.001_tv_l1000_f.01' species_from = 'par' -#strains3 = ['yjm1252', 'yjm1078', 'yjm248'] +# strains3 = ['yjm1252', 'yjm1078', 'yjm248'] fn = gp.analysis_out_dir_absolute + tag + '/' + \ 'introgressed_blocks_filtered_' + species_from + \ @@ -30,7 +16,7 @@ strains = set([]) for region_id in regions: strain = regions[region_id]['strain'] - #if strain not in strains3: + # if strain not in strains3: strains.add(strain) chrm = regions[region_id]['chromosome'] start = int(regions[region_id]['start']) @@ -50,4 +36,3 @@ for i in range(len(strains)): f.write(str(i) + '\t' + str(counts[i]) + '\n') f.close() - diff --git a/code/analyze/to_update/gene_overlap_main.py b/code/analyze/to_update/gene_overlap_main.py index 9c274e4..3e431e0 100644 --- a/code/analyze/to_update/gene_overlap_main.py +++ b/code/analyze/to_update/gene_overlap_main.py @@ -10,10 +10,10 @@ # bases within coding sequence are upper case. 
In addition, there is a # corresponding file S288c_CBS432_strain_chrX_start-end.genes.txt # listing the genes that overlap this region, and the indices of -# the bases they overlap, in this format: -# gene_name\t0-149\t25236-25385 +# the bases they overlap, in this format: +# gene_name\t0-149\t25236-25385 # gene_name\t200-600\t.... -# +# # also generate a file in results/tag/gene_alignments/ for each # introgressed gene, which contains one threeway alignment for each # strain in which the gene was called introgressed...followed by all @@ -27,93 +27,86 @@ # versions (gene_introgressed.fasta), and also to all of the versions # (gene_all.fasta). -# TODO: -## _annotated file should be .txt not .maf -## also modify so that 80 characters per line -## and extra row showing summary of which references match +# TODO: +# _annotated file should be .txt not .maf +# also modify so that 80 characters per line +# and extra row showing summary of which references match -import re import sys -import os -import copy -from gene_predictions import * +from gene_predictions import read_gene_file import predict -import pickle from collections import defaultdict -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../misc/') -import read_fasta -import overlap +from misc import overlap -##====== +# ====== # read in analysis parameters -##====== +# ====== args = predict.process_predict_args(sys.argv[1:]) gp_dir = '../' open_mode = 'w' -##====== +# ====== # read in reference gene coordinates -##====== +# ====== genes = {} for chrm in gp.chrms: fn_genes = gp.analysis_out_dir_absolute + '/' + \ gp.master_ref + '_chr' + chrm + '_genes.txt' - # + genes[chrm] = read_gene_file(fn_genes) -##====== +# ====== # do all the stuff -##====== +# ====== for species_from in args['states']: - ##====== + # ====== # read in introgressed regions for current state - ##====== + # ====== # strain chromosome predicted_species start end number_non_gap blocks_fn = gp.analysis_out_dir_absolute + args['tag'] + '/' + \ 'introgressed_blocks_' + species_from + '_' + args['tag'] + \ '_labeled.txt' - # introgressed regions keyed by strain and then chromosome: + # introgressed regions keyed by strain and then chromosome: # (region_id, start, end, number_non_gap) regions = predict.read_blocks(blocks_fn, labeled=True) - ##====== + # ====== # extract alignments and genes for introgressed regions - ##====== + # ====== - fn_genes_regions = gp.analysis_out_dir_absolute + '/' + args['tag'] + '/' + \ - 'genes_for_each_region_' + species_from + '_' + \ - args['tag'] + '.txt' + fn_genes_regions = gp.analysis_out_dir_absolute + '/' + args['tag'] + \ + '/' + 'genes_for_each_region_' + species_from + '_' + \ + args['tag'] + '.txt' f_genes_regions = open(fn_genes_regions, open_mode) f_genes_regions.write('region_id\tnumber_genes\tgenes\tfracs\n') d_regions_to_genes = defaultdict(lambda: defaultdict(float)) - #fn_regions_strains = gp.analysis_out_dir_absolute + '/' + args['tag'] + '/' + \ - # 'regions_for_each_strain_' species_from + '_' + \ - # args['tag'] + '.txt' - #f_regions_strains = open(fn_regions_strains, open_mode) - #f_regions_strains.write('strain\tregions\n') - #d_strains_to_regions = defaultdict(list) + # fn_regions_strains = gp.analysis_out_dir_absolute + '/' + args['tag']\ + # + '/' + 'regions_for_each_strain_' species_from\ + # + '_' + args['tag'] + '.txt' + # f_regions_strains = open(fn_regions_strains, open_mode) + # f_regions_strains.write('strain\tregions\n') + # d_strains_to_regions = defaultdict(list) - fn_genes_strains = 
gp.analysis_out_dir_absolute + '/' + args['tag'] + '/' + \ - 'genes_for_each_strain_' + species_from + '_' + \ - args['tag'] + '.txt' + fn_genes_strains = gp.analysis_out_dir_absolute + '/' + args['tag'] + \ + '/' + 'genes_for_each_strain_' + species_from + '_' + \ + args['tag'] + '.txt' f_genes_strains = open(fn_genes_strains, open_mode) f_genes_strains.write('strain\tnumber_genes\tgenes\tfracs\n') d_strains_to_genes = defaultdict(lambda: defaultdict(float)) - fn_strains_genes = gp.analysis_out_dir_absolute + '/' + args['tag'] + '/' + \ - 'strains_for_each_gene_' + species_from + '_' + \ - args['tag'] + '.txt' + fn_strains_genes = gp.analysis_out_dir_absolute + '/' + args['tag'] + \ + '/' + 'strains_for_each_gene_' + species_from + '_' + \ + args['tag'] + '.txt' f_strains_genes = open(fn_strains_genes, open_mode) f_strains_genes.write('gene\tnum_strains\tstrains\tfracs\n') d_genes_to_strains = defaultdict(lambda: defaultdict(float)) @@ -123,42 +116,42 @@ for entry in regions[strain][chrm]: region_id, start, end, number_non_gap = entry for gene in genes[chrm]: - o = overlap.overlap(start, end, \ - genes[chrm][gene][0], genes[chrm][gene][1]) + o = overlap.overlap(start, end, + genes[chrm][gene][0], + genes[chrm][gene][1]) if o > 0: - gene_length = float(genes[chrm][gene][1] - \ + gene_length = float(genes[chrm][gene][1] - genes[chrm][gene][0] + 1) frac_o = o / gene_length d_regions_to_genes[region_id][gene] += frac_o d_strains_to_genes[strain][gene] += frac_o d_genes_to_strains[gene][strain] += frac_o - for region in sorted(d_regions_to_genes.keys(), key=lambda x: int(x[1:])): g = sorted(d_regions_to_genes[region].keys()) f_genes_regions.write(region + '\t' + str(len(g)) + '\t') f_genes_regions.write(','.join(g) + '\t') - f_genes_regions.write(','.join([str(d_regions_to_genes[region][x]) \ + f_genes_regions.write(','.join([str(d_regions_to_genes[region][x]) for x in g]) + '\n') for strain in sorted(d_strains_to_genes.keys()): g = sorted(d_strains_to_genes[strain].keys()) f_genes_strains.write(strain + '\t' + str(len(g)) + '\t') f_genes_strains.write(','.join(g) + '\t') - f_genes_strains.write(','.join([str(d_strains_to_genes[strain][x]) \ + f_genes_strains.write(','.join([str(d_strains_to_genes[strain][x]) for x in g]) + '\n') for gene in sorted(d_genes_to_strains.keys()): s = sorted(d_genes_to_strains[gene].keys()) f_strains_genes.write(gene + '\t' + str(len(s)) + '\t') f_strains_genes.write(','.join(s) + '\t') - f_strains_genes.write(','.join([str(d_genes_to_strains[gene][x]) \ + f_strains_genes.write(','.join([str(d_genes_to_strains[gene][x]) for x in s]) + '\n') f_genes_regions.close() f_genes_strains.close() f_strains_genes.close() -""" +""" # produce region summmary file with all the same info, but also with # region ids (r1-rn), and with genes overlapping each region @@ -178,7 +171,8 @@ fn_align_prefix += '_'.join([refs[s][0] for s in args['species']]) + '_' # for annotated region files (output) -fn_region_prefix = gp.analysis_out_dir_absolute + '/' + args['tag'] + '/regions/' +fn_region_prefix = gp.analysis_out_dir_absolute + \ + '/' + args['tag'] + '/regions/' if not os.path.isdir(fn_region_prefix): os.makedirs(fn_region_prefix) @@ -193,19 +187,23 @@ write_region_summary_header(refs_ordered, f_region_summary) fn_genes_regions = gp.analysis_out_dir_absolute + '/' + args['tag'] + '/' + \ - 'genes_for_each_region_chr' + chrm + '_' + args['tag'] + '.txt' + 'genes_for_each_region_chr' + chrm + '_' \ + + args['tag'] + '.txt' f_genes_regions = open(fn_genes_regions, open_mode) 
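+# worked example of the gene-fraction bookkeeping used by these summary
+# tables (illustrative coordinates only; assumes overlap.overlap returns
+# the inclusive count of shared positions, as its use earlier in this
+# file implies): a region spanning sites 1000-2000 overlaps a gene at
+# 1500-2499 by overlap.overlap(1000, 2000, 1500, 2499) == 501 sites,
+# contributing 501 / float(2499 - 1500 + 1), about half the gene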
fn_regions_strains = gp.analysis_out_dir_absolute + '/' + args['tag'] + '/' + \ - 'regions_for_each_strain_chr' + chrm + '_' + args['tag'] + '.txt' + 'regions_for_each_strain_chr' + chrm + '_' + \ + args['tag'] + '.txt' f_regions_strains = open(fn_regions_strains, open_mode) fn_genes_strains = gp.analysis_out_dir_absolute + '/' + args['tag'] + '/' + \ - 'genes_for_each_strain_chr' + chrm + '_' + args['tag'] + '.txt' + 'genes_for_each_strain_chr' + chrm + '_' + \ + args['tag'] + '.txt' f_genes_strains = open(fn_genes_strains, open_mode) fn_strains_genes = gp.analysis_out_dir_absolute + '/' + args['tag'] + '/' + \ - 'strains_for_each_gene_chr' + chrm + '_' + args['tag'] + '.txt' + 'strains_for_each_gene_chr' + chrm + '_' + \ + args['tag'] + '.txt' f_strains_genes = open(fn_strains_genes, open_mode) # for keeping track of all genes introgressed in each strain, and the @@ -240,7 +238,7 @@ # loop through all strains that we've called introgression in, and # associate genes with the regions they overlap for strain in regions.keys(): - + print '***', strain, chrm sys.stdout.flush() # skip this strain x chromosome if there are no introgressed @@ -254,7 +252,7 @@ alignment_headers, alignment_seqs = read_fasta.read_fasta(fn_align) labels = ref_labels + [strain] - + # mark each site as matching each reference or not ref_match_by_site = get_ref_match_by_site(alignment_seqs, labels) # mark each site as in a gene or not @@ -281,7 +279,6 @@ # regions are indexed by (unaligned) master ref sequence write_region_alignment(alignment_headers, alignment_seqs, fn_region, \ entry[0], entry[1], 0) - # write region to file in annotated/readable format fn_region_annotated = fn_region_current_prefix + '_annotated' + \ @@ -290,34 +287,35 @@ write_region_alignment_annotated(labels, alignment_seqs, \ fn_region_annotated, \ entry[0], entry[1], 0, \ - genes, ref_match_by_site, + genes, ref_match_by_site, genes_by_site, \ introgressed_by_site, 100) #==== # region summary file with extra info #==== - + # strain chromosome predicted_species start end number_non_gap # number_match_ref1 number_match_ref2 number_match_only_ref1 # number_match_ref2_not_ref1 number_mismatch_all_ref write_region_summary_line(entry, strain, chrm, species_from, \ alignment_seqs, labels, \ - relative_start, relative_end, f_region_summary) + relative_start, relative_end, + f_region_summary) #==== # genes for each region summary file #==== # region_id num_genes gene frac_intd gene frac_intd - + frac_intd = write_genes_for_each_region_summary_line(entry[3], \ genes_by_site, \ genes, \ relative_start, \ relative_end, \ - alignment_seqs[0], \ + alignment_seqs[0], f_genes_regions) for gene in frac_intd: # keep track of all genes for each strain... 
@@ -331,14 +329,13 @@ gene_strains_dic[gene][strain] = 0 gene_strains_dic[gene][strain] += frac_intd[gene] - #==== # strains for each gene summary file #==== -# (could do this for one chromsoome at a time if we wanted) +# (could do this for one chromsoome at a time if we wanted) # gene num_strains strain frac_intd strain frac_intd - + write_strains_for_each_gene_lines(gene_strains_dic, f_strains_genes) #==== @@ -363,7 +360,5 @@ f_regions_strains.close() f_genes_strains.close() f_strains_genes.close() - - """ diff --git a/code/analyze/to_update/gene_predictions.py b/code/analyze/to_update/gene_predictions.py index 0507a44..0d118e3 100644 --- a/code/analyze/to_update/gene_predictions.py +++ b/code/analyze/to_update/gene_predictions.py @@ -1,14 +1,9 @@ import re -import sys import os -import copy import gzip -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../sim/') -import sim_analyze_hmm_bw as sim -sys.path.insert(0, '../misc/') -import write_fasta +from misc import write_fasta + def index_ignoring_gaps(s, i, s_start): '''returns the index of the ith (starting at 0) non-gap character in @@ -31,6 +26,7 @@ def index_ignoring_gaps(s, i, s_start): x += 1 return x + def get_ref_match_by_site(seqs, labels): # for master: matches _only_ that ref @@ -51,7 +47,7 @@ def get_ref_match_by_site(seqs, labels): if seqs[0][i] == seqs[-1][i]: ref_match_by_site[0][i] = labels[0][0] - + for r in range(1, nrefs): if seqs[r][i] == seqs[-1][i]: # matches this ref and master ref -> both blank @@ -69,10 +65,9 @@ def get_ref_match_by_site(seqs, labels): else: ref_match_by_site[r][i] = '.' ref_match_by_site[0][i] = '.' - return [''.join(s) for s in ref_match_by_site] - + def get_ref_match_by_site_2(seqs, labels): @@ -99,6 +94,7 @@ def get_ref_match_by_site_2(seqs, labels): return [''.join(s) for s in ref_match_by_site] + def get_genes_by_site(genes, seq): genes_by_site = [None for site in seq] @@ -110,6 +106,7 @@ def get_genes_by_site(genes, seq): genes_by_site[i] = gene_name return genes_by_site + def get_introgressed_by_site(regions, seq): introgressed_by_site = [' ' for site in seq] @@ -119,19 +116,20 @@ def get_introgressed_by_site(regions, seq): for i in range(start_ind, end_ind+1): introgressed_by_site[i] = 'i' return ''.join(introgressed_by_site) - + def write_region_alignment(headers, seqs, fn, start, end, master_ind): - + relative_start = max(0, index_ignoring_gaps(seqs[master_ind], start, 0)) relative_end = index_ignoring_gaps(seqs[master_ind], end, 0) - + region_seqs = [seq[relative_start:relative_end+1] for seq in seqs] write_fasta.write_fasta(headers, region_seqs, fn, gz=True) + def get_genes_in_region(start, end, genes): - + region_genes = [] for gene_name in genes: gene_start, gene_end = genes[gene_name] @@ -142,24 +140,27 @@ def get_genes_in_region(start, end, genes): region_genes.sort(key=lambda x: x[1]) return region_genes -def write_region_alignment_annotated(labels, seqs, fn, start, end, \ - master_ind, genes, ref_match_by_site, \ - genes_by_site, \ + +def write_region_alignment_annotated(labels, seqs, fn, start, end, + master_ind, genes, ref_match_by_site, + genes_by_site, introgressed_by_site, context): relative_start_with_context = \ max(0, index_ignoring_gaps(seqs[master_ind], start-context, 0)) relative_start = max(0, index_ignoring_gaps(seqs[master_ind], start, 0)) relative_end = index_ignoring_gaps(seqs[master_ind], end, 0) - relative_end_with_context = index_ignoring_gaps(seqs[master_ind], end+context, 0) - - region_seqs = 
[seq[relative_start_with_context:relative_end_with_context+1] \ - for seq in seqs] + relative_end_with_context = index_ignoring_gaps(seqs[master_ind], + end+context, 0) + + region_seqs = [ + seq[relative_start_with_context:relative_end_with_context+1] + for seq in seqs] # for reference matching lines ref_match_strings = [] for r in ref_match_by_site: - ref_match_strings.append(\ + ref_match_strings.append( r[relative_start_with_context:relative_end_with_context+1]) # for gene line @@ -168,10 +169,12 @@ def write_region_alignment_annotated(labels, seqs, fn, start, end, \ region_genes_set = list(set(region_genes)) try: region_genes_set.remove(None) - except: + except ValueError: pass region_genes_set.sort(key=lambda x: genes[x][1]) - gene_string = ''.join([' ' if entry == None else '=' for entry in region_genes]) + gene_string = ''.join([' ' + if entry is None else '=' + for entry in region_genes]) # for introgression line introgressed_string = \ @@ -188,7 +191,7 @@ def write_region_alignment_annotated(labels, seqs, fn, start, end, \ # assume master ref comes first f.write('matches only ' + labels[0] + '\n') # and assume ref seqs come before predict seq - for label in labels[1:-1]: + for label in labels[1:-1]: f.write('matches ' + label + ' and mismatches ' + labels[0] + '\n') f.write('genes: ' + ' '.join(region_genes_set) + '\n') f.write('introgressed\n\n') @@ -212,6 +215,7 @@ def write_region_alignment_annotated(labels, seqs, fn, start, end, \ return relative_start, relative_end + def read_gene_file(fn): f = open(fn, 'r') genes = {} @@ -223,6 +227,7 @@ def read_gene_file(fn): f.close() return genes + def write_gene_file(genes, fn): f = open(fn, 'w') for gene in genes: @@ -230,15 +235,17 @@ def write_gene_file(genes, fn): f.write(gene + '\t' + str(start) + '\t' + str(end) + '\n') f.close() + def write_region_summary_header(refs, f): - f.write('region_id\tstrain\tchromosome\tpredicted_species\tstart\tend\t' + \ + f.write('region_id\tstrain\tchromosome\tpredicted_species\tstart\tend\t' + 'number_non_gap\t') f.write('\t'.join(['number_match_' + ref for ref in refs]) + '\t') f.write('\t'.join(['number_match_only_' + ref for ref in refs]) + '\t') f.write('number_mismatch_all_refs\n') -def write_region_summary_line(region, strain, chrm, predicted_species, seqs, labels, - start, end, f): + +def write_region_summary_line(region, strain, chrm, predicted_species, + seqs, labels, start, end, f): # region_id [strain chromosome predicted_species start end number_non_gap] # number_match_ref1 number_match_ref2 number_match_only_ref1 @@ -246,8 +253,8 @@ def write_region_summary_line(region, strain, chrm, predicted_species, seqs, lab sep = '\t' - f.write(region[3] + sep + strain + sep + chrm + sep + predicted_species + \ - sep + str(region[0]) + sep + str(region[1]) + sep + \ + f.write(region[3] + sep + strain + sep + chrm + sep + predicted_species + + sep + str(region[0]) + sep + str(region[1]) + sep + str(region[2]) + sep) ids = [0] * (len(seqs) - 1) @@ -280,12 +287,13 @@ def write_region_summary_line(region, strain, chrm, predicted_species, seqs, lab continue for r in range(1, len(seqs) - 1): unique_ids[r] += match_refs[r] - + f.write(sep.join([str(x) for x in ids]) + sep) f.write(sep.join([str(x) for x in unique_ids]) + sep) f.write(str(mismatch_all) + '\n') f.flush() + def read_region_summary(fn): # region_id [strain chromosome predicted_species start end number_non_gap] # number_match_ref1 number_match_ref2 number_match_only_ref1 @@ -294,39 +302,41 @@ def read_region_summary(fn): f = open(fn, 'r') 
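+    # coordinates in this summary are unaligned master-reference
+    # positions; alignment columns are recovered with
+    # index_ignoring_gaps above, e.g. (a sketch assuming gp.gap_symbol
+    # is '-'): index_ignoring_gaps('a-bc', 2, 0) == 3, the column of
+    # the 2nd (0-based) non-gap character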
line = f.readline() d = {} - fields = ['strain', 'chromosome', 'predicted_species', 'start', 'end', \ - 'number_non_gap', 'number_match_ref1', 'number_match_ref2', \ - 'number_match_only_ref1', 'number_match_ref2_not_ref1', \ + fields = ['strain', 'chromosome', 'predicted_species', 'start', 'end', + 'number_non_gap', 'number_match_ref1', 'number_match_ref2', + 'number_match_only_ref1', 'number_match_ref2_not_ref1', 'number_mismatch_all_ref'] while line != '': line = line[:-1].split('\t') - #TODO actually fix the multiple header lines scattered throughout + # TODO actually fix the multiple header lines scattered throughout if line[0] != 'region_id': d[line[0]] = dict(zip(fields, line[1:])) line = f.readline() f.close() return d -def write_genes_for_each_region_summary_line(region_id, genes_by_site, gene_summary, \ + +def write_genes_for_each_region_summary_line(region_id, genes_by_site, + gene_summary, start, end, seq, f): - + # region_id num_genes gene frac_intd gene frac_intd genes = genes_by_site[start:end+1] genes_set = list(set(genes)) try: genes_set.remove(None) - except: + except ValueError: pass seq_region = seq[start:end+1] gene_site_counts = dict(zip(genes_set, [0]*len(genes_set))) for i in range(len(seq_region)): - if seq_region[i] != gp.gap_symbol and genes[i] != None: + if seq_region[i] != gp.gap_symbol and genes[i] is not None: gene_site_counts[genes[i]] += 1 frac_intd = {} for gene in genes_set: gene_length = gene_summary[gene][1] - gene_summary[gene][0] + 1 frac_intd[gene] = float(gene_site_counts[gene]) / gene_length - + sep = '\t' f.write(region_id + sep) f.write(str(len(genes_set))) @@ -337,6 +347,7 @@ def write_genes_for_each_region_summary_line(region_id, genes_by_site, gene_summ return frac_intd + def read_genes_for_each_region_summary(fn): # region_id num_genes gene frac_intd gene frac_intd @@ -348,18 +359,19 @@ def read_genes_for_each_region_summary(fn): gene_list = [] for i in range(2, len(line), 2): gene_list.append((line[i], line[i+1])) - d[line[0]] = {'num_genes':line[1], 'gene_list':gene_list} + d[line[0]] = {'num_genes': line[1], 'gene_list': gene_list} line = f.readline() f.close() return d + def write_regions_for_each_strain(regions, f): # strain num_regions region length region length sep = '\t' for strain in regions: f.write(strain + sep) - num_regions = sum([len(regions[strain][chrm]) \ + num_regions = sum([len(regions[strain][chrm]) for chrm in regions[strain].keys()]) f.write(str(num_regions)) for chrm in regions[strain].keys(): @@ -369,6 +381,7 @@ def write_regions_for_each_strain(regions, f): f.write('\n') f.flush() + def read_regions_for_each_strain(fn): # strain num_regions region length region length @@ -380,11 +393,12 @@ def read_regions_for_each_strain(fn): region_list = [] for i in range(2, len(line), 2): region_list.append((line[i], line[i+1])) - d[line[0]] = {'num_regions':line[1], 'region_list':region_list} + d[line[0]] = {'num_regions': line[1], 'region_list': region_list} line = f.readline() f.close() return d - + + def write_genes_for_each_strain(strain_genes_dic, f): # strain num_genes gene frac_intd gene frac_intd @@ -396,6 +410,7 @@ def write_genes_for_each_strain(strain_genes_dic, f): f.write('\n') f.flush() + def read_genes_for_each_strain(fn): # strain num_genes gene frac_intd gene frac_intd @@ -407,11 +422,12 @@ def read_genes_for_each_strain(fn): gene_list = [] for i in range(2, len(line), 2): gene_list.append((line[i], line[i+1])) - d[line[0]] = {'num_genes':line[1], 'gene_list':gene_list} + d[line[0]] = {'num_genes': line[1], 
'gene_list': gene_list}
         line = f.readline()
     f.close()
     return d
 
+
 def write_strains_for_each_gene_lines(gene_strains_dic, f):
 
     # (this is actually the same as above function, but it's confusing
@@ -426,6 +442,7 @@ def write_strains_for_each_gene_lines(gene_strains_dic, f):
         f.write('\n')
         f.flush()
 
+
 def read_strains_for_each_gene(fn):
 
     # gene num_strains strain frac_intd strain frac_intd
@@ -437,11 +454,12 @@ def read_strains_for_each_gene(fn):
         strain_list = []
         for i in range(2, len(line), 2):
             strain_list.append((line[i], line[i+1]))
-        d[line[0]] = {'num_strains':line[1], 'strain_list':strain_list}
+        d[line[0]] = {'num_strains': line[1], 'strain_list': strain_list}
         line = f.readline()
     f.close()
     return d
 
+
 def read_genes(fn, fn_genes):
 
     if os.path.isfile(fn_genes):
@@ -464,11 +482,12 @@ def read_genes(fn, fn_genes):
             break
 
         # starting with new gene
-        #assert line.strip().startswith('gene'), line
+        # assert line.strip().startswith('gene'), line
        skip_this_gene = False
 
         # regex for finding coordinates
-        m = re.search(r'[><]?(?P<start>[0-9]+)[.><,0-9]*\.\.[><]?(?P<end>[0-9]+)', line)
+        m = re.search(r'[><]?(?P<start>[0-9]+)'
+                      r'[.><,0-9]*\.\.[><]?(?P<end>[0-9]+)', line)
 
         # subtract one to index from zero TODO is this correct? end is
         # inclusive
@@ -495,20 +514,25 @@ def read_genes(fn, fn_genes):
         if gene_name != '':
             genes[gene_name] = (start, end)
         else:
-            print 'gene name not found: ' + line
+            print('gene name not found: ' + line)
 
     f.close()
     write_gene_file(genes, fn_genes)
     return genes
 
+
 """
 def summarize_gene_info(fn_all, fn_strains, fn_strains_g, \
                         introgressed_genes, gene_info, tag, threshold=0):
 
     f_all = open(fn_all, 'w')
-    f_all.write('gene\tchromosome\tstart\tend\tnumber_strains\taverage_introgressed_fraction\taverage_number_non_gap\taverage_ref_from_count\n')
+    f_all.write('gene\tchromosome\tstart\tend\tnumber_strains'
+                '\taverage_introgressed_fraction\taverage_number_non_gap'
+                '\taverage_ref_from_count\n')
 
-    f_gene_heading = 'region_id\tstrain\tstart\tend\tintrogressed_fraction\tnumber_non_gap\tref_from_count\n'
+    f_gene_heading = ('region_id\tstrain\tstart\tend\t'
+                      'introgressed_fraction\tnumber_non_gap'
+                      '\tref_from_count\n')
 
     strain_genes = {}
 
@@ -518,7 +542,8 @@ def summarize_gene_info(fn_all, fn_strains, fn_strains_g, \
         sum_introgressed_fraction = {}
         sum_number_non_gap = {}
         sum_ref_from_count = {}
-        fn_gene = gp.analysis_out_dir_absolute + tag + '/genes/' + gene + '.txt'
+        fn_gene = (gp.analysis_out_dir_absolute + tag +
+                   '/genes/' + gene + '.txt')
         if not os.path.exists(os.path.dirname(fn_gene)):
             os.makedirs(os.path.dirname(fn_gene))
         f_gene = open(fn_gene, 'w')
@@ -530,7 +555,8 @@ def summarize_gene_info(fn_all, fn_strains, fn_strains_g, \
             sum_introgressed_fraction[strain] = 0
             sum_number_non_gap[strain] = 0
             sum_ref_from_count[strain] = 0
-        sum_introgressed_fraction[strain] += entry['introgressed_fraction']
+        sum_introgressed_fraction[strain] += entry[
+            'introgressed_fraction']
         sum_number_non_gap[strain] += entry['number_non_gap']
         sum_ref_from_count[strain] += entry['ref_from_count']
         if strain not in strain_genes:
diff --git a/code/analyze/to_update/nucleotide_diversity_from_introgression_main.py b/code/analyze/to_update/nucleotide_diversity_from_introgression_main.py
index 6bda7a6..c53b29a 100644
--- a/code/analyze/to_update/nucleotide_diversity_from_introgression_main.py
+++ b/code/analyze/to_update/nucleotide_diversity_from_introgression_main.py
@@ -1,33 +1,25 @@
-## calculate nucleotide diversity for all sites and for all sites
-## excluding introgression; also calculate the same but only in coding
-## 
regions +# calculate nucleotide diversity for all sites and for all sites +# excluding introgression; also calculate the same but only in coding +# regions -import re import sys -import os import copy -import math import gzip -import itertools -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../align/') -import align_helpers -sys.path.insert(0, '../misc/') -import overlap -import read_table -import read_fasta -import write_fasta -import mystats +from align import align_helpers +from misc import read_table +from misc import read_fasta + def try_int(s, default=-1): try: i = int(s) return i - except: + except ValueError: return default -def count_diffs(s, t, skip_char = 'N'): + +def count_diffs(s, t, skip_char='N'): assert len(s) == len(t) num = 0 den = 0 @@ -39,10 +31,11 @@ def count_diffs(s, t, skip_char = 'N'): den += 1 return num, den -## generate a sequence that has the current strain's base for each -## site in the reference sequence, and skip_char for any site where -## the base is a gap/unknown (this is all based on the alignment) -def referize(strain_seq, ref_ind_to_strain_ind, skip_char = 'N'): + +# generate a sequence that has the current strain's base for each +# site in the reference sequence, and skip_char for any site where +# the base is a gap/unknown (this is all based on the alignment) +def referize(strain_seq, ref_ind_to_strain_ind, skip_char='N'): s = [skip_char for r in ref_ind_to_strain_ind] for i in range(len(ref_ind_to_strain_ind)): si = ref_ind_to_strain_ind[i] @@ -52,6 +45,7 @@ def referize(strain_seq, ref_ind_to_strain_ind, skip_char = 'N'): s[i] = strain_seq[si] return s + def mark_excluded(seq, regions, fill='N'): seqi = copy.deepcopy(seq) for start, end in regions: @@ -59,6 +53,7 @@ def mark_excluded(seq, regions, fill='N'): seqi[i] = fill return seqi + def mark_included(seq, regions, fill='N'): s = [fill for r in seq] for start, end in regions: @@ -66,15 +61,17 @@ def mark_included(seq, regions, fill='N'): s[i] = seq[i] return s + tag = 'u3_i.001_tv_l1000_f.01' -######## -## read in introgressed regions, as well as strains and reference genes -######## +# ###### +# read in introgressed regions, as well as strains and reference genes +# ###### -## dictionary of introgressed regions keyed by chromosome and then -## strain -regions_by_chrm_and_strain = dict(zip(gp.chrms, [{} for i in range(len(gp.chrms))])) +# dictionary of introgressed regions keyed by chromosome and then +# strain +regions_by_chrm_and_strain = dict(zip(gp.chrms, + [{} for i in range(len(gp.chrms))])) fn_regions = gp.analysis_out_dir_absolute + tag + '/' + \ 'introgressed_blocks_filtered_par_' + tag + '_summary_plus.txt' d, labels = read_table.read_table_rows(fn_regions, '\t') @@ -82,20 +79,20 @@ def mark_included(seq, regions, fill='N'): for region in d: chrm = d[region]['chromosome'] strain = d[region]['strain'] - if not regions_by_chrm_and_strain[chrm].has_key(strain): + if strain not in regions_by_chrm_and_strain[chrm]: regions_by_chrm_and_strain[chrm][strain] = [] - regions_by_chrm_and_strain[chrm][strain].append((int(d[region]['start']), \ + regions_by_chrm_and_strain[chrm][strain].append((int(d[region]['start']), int(d[region]['end']))) -## read in all strains +# read in all strains strain_dirs = align_helpers.get_strains(gp.non_ref_dirs[gp.master_ref]) num_strains = len(strain_dirs) -## read in genes in reference sequence into dictionary keyed by -## chromosome +# read in genes in reference sequence into dictionary keyed by +# chromosome ref_genes = {} for chrm in 
gp.chrms: ref_genes[chrm] = [] - f = open(gp.analysis_out_dir_absolute + gp.master_ref + \ + f = open(gp.analysis_out_dir_absolute + gp.master_ref + '_chr' + chrm + '_genes.txt', 'r') line = f.readline() while line != '': @@ -104,9 +101,9 @@ def mark_included(seq, regions, fill='N'): line = f.readline() f.close() -######## -## calculate nucleotide diversity -######## +# ###### +# calculate nucleotide diversity +# ###### # all sites total_frac = 0 @@ -127,7 +124,7 @@ def mark_included(seq, regions, fill='N'): # total number of strain pairs num_comparisons = 0 -## loop through all strains +# loop through all strains for i in range(num_strains): strain_i, d_i = strain_dirs[i] strain_i_seqs = {} @@ -135,49 +132,53 @@ def mark_included(seq, regions, fill='N'): strain_i_seqs_coding = {} strain_i_seqs_coding_nonint = {} - ## for each + # for each for chrm in gp.chrms: - ## coordinate conversion between reference and current strain + # coordinate conversion between reference and current strain coord_fn = gp.analysis_out_dir_absolute + 'coordinates/' + \ gp.master_ref + '_to_' + strain_i + \ '_chr' + chrm + '.txt.gz' f_coord = gzip.open(coord_fn, 'rb') - ref_ind_to_strain_i_ind = [try_int(line[:-1]) for line in f_coord.readlines()] + ref_ind_to_strain_i_ind = [try_int(line[:-1]) + for line in f_coord.readlines()] - ## current strain fasta file for current chromosome + # current strain fasta file for current chromosome strain_fn = d_i + strain_i + '_chr' + chrm + gp.fasta_suffix - print strain_i, chrm - - ## get chromosome sequence for this strain relative to - ## reference strain (the base for this strain at each site in - ## the reference, based on original alignment); - ## gaps/unsequenced sites/etc marked as 'N' - strain_i_seqs[chrm] = referize(read_fasta.read_fasta(strain_fn)[1][0].lower(),\ - ref_ind_to_strain_i_ind) - - ## get version of sequence where everything that doesn't fall - ## within gene is replaced by 'N' - strain_i_seqs_coding[chrm] = mark_included(strain_i_seqs[chrm],\ + print(strain_i, chrm) + + # get chromosome sequence for this strain relative to + # reference strain (the base for this strain at each site in + # the reference, based on original alignment); + # gaps/unsequenced sites/etc marked as 'N' + strain_i_seqs[chrm] = referize( + read_fasta.read_fasta(strain_fn)[1][0].lower(), + ref_ind_to_strain_i_ind) + + # get version of sequence where everything that doesn't fall + # within gene is replaced by 'N' + strain_i_seqs_coding[chrm] = mark_included(strain_i_seqs[chrm], ref_genes[chrm]) - ## also get version of above sequences where introgressed sites are - ## replaced by 'N' + # also get version of above sequences where introgressed sites are + # replaced by 'N' strain_i_seqs_nonint[chrm] = copy.deepcopy(strain_i_seqs[chrm]) - strain_i_seqs_coding_nonint[chrm] = copy.deepcopy(strain_i_seqs_coding[chrm]) - if regions_by_chrm_and_strain[chrm].has_key(strain_i): - strain_i_seqs_nonint[chrm] = mark_excluded(strain_i_seqs[chrm],\ - regions_by_chrm_and_strain[chrm][strain_i]) + strain_i_seqs_coding_nonint[chrm] = copy.deepcopy( + strain_i_seqs_coding[chrm]) + if strain_i in regions_by_chrm_and_strain[chrm]: + strain_i_seqs_nonint[chrm] = mark_excluded( + strain_i_seqs[chrm], + regions_by_chrm_and_strain[chrm][strain_i]) strain_i_seqs_coding_nonint[chrm] = \ - mark_excluded(strain_i_seqs_coding[chrm],\ + mark_excluded(strain_i_seqs_coding[chrm], regions_by_chrm_and_strain[chrm][strain_i]) - ## loop through all strains to get second strain for current pair + # loop through all 
strains to get second strain for current pair for j in range(i+1, num_strains): strain_j, d_j = strain_dirs[j] - print strain_i, strain_j - ## keep track of total number of strain pairs we're looking - ## at, so we can divide total by that later + print(strain_i, strain_j) + # keep track of total number of strain pairs we're looking + # at, so we can divide total by that later num_comparisons += 1 num = 0 @@ -190,46 +191,49 @@ def mark_included(seq, regions, fill='N'): den_coding_nonint = 0 for chrm in gp.chrms: - ## do the same reading in of sequence for this strain, - ## relative to reference, and also excluding introgressed - ## sites + # do the same reading in of sequence for this strain, + # relative to reference, and also excluding introgressed + # sites coord_fn = gp.analysis_out_dir_absolute + 'coordinates/' + \ gp.master_ref + '_to_' + strain_j + \ '_chr' + chrm + '.txt.gz' f_coord = gzip.open(coord_fn, 'rb') - ref_ind_to_strain_ind = [try_int(line[:-1]) for line in f_coord.readlines()] - + ref_ind_to_strain_ind = [try_int(line[:-1]) + for line in f_coord.readlines()] + strain_fn = d_j + strain_j + '_chr' + chrm + gp.fasta_suffix - strain_j_seq = referize(read_fasta.read_fasta(strain_fn)[1][0].lower(),\ - ref_ind_to_strain_ind) + strain_j_seq = referize( + read_fasta.read_fasta(strain_fn)[1][0].lower(), + ref_ind_to_strain_ind) strain_j_seq_coding = mark_included(strain_j_seq, ref_genes[chrm]) strain_j_seq_nonint = copy.deepcopy(strain_j_seq) strain_j_seq_coding_nonint = copy.deepcopy(strain_j_seq_coding) - if regions_by_chrm_and_strain[chrm].has_key(strain_j): - strain_j_seq_nonint = mark_excluded(strain_j_seq,\ - regions_by_chrm_and_strain[chrm][strain_j]) - strain_j_seq_coding_nonint = mark_excluded(strain_j_seq_coding,\ - regions_by_chrm_and_strain[chrm][strain_j]) - - ## count sites that differ between the two strains - ## (ignoring any sites where one of the strains has 'N') - ## and add to appropriate running total - - ## all sites + if strain_j in regions_by_chrm_and_strain[chrm]: + strain_j_seq_nonint = mark_excluded( + strain_j_seq, regions_by_chrm_and_strain[chrm][strain_j]) + strain_j_seq_coding_nonint = mark_excluded( + strain_j_seq_coding, + regions_by_chrm_and_strain[chrm][strain_j]) + + # count sites that differ between the two strains + # (ignoring any sites where one of the strains has 'N') + # and add to appropriate running total + + # all sites num_chrm, den_chrm = count_diffs(strain_i_seqs[chrm], strain_j_seq) num += num_chrm den += den_chrm total_fracs[chrm] += float(num_chrm)/den_chrm # nonintrogressed - num_chrm_nonint, den_chrm_nonint = count_diffs(strain_i_seqs_nonint[chrm],\ - strain_j_seq_nonint) + num_chrm_nonint, den_chrm_nonint = count_diffs( + strain_i_seqs_nonint[chrm], strain_j_seq_nonint) num_nonint += num_chrm_nonint den_nonint += den_chrm_nonint total_fracs_nonint[chrm] += float(num_chrm_nonint)/den_chrm_nonint - ## all coding sites + # all coding sites num_chrm_coding, den_chrm_coding = \ count_diffs(strain_i_seqs_coding[chrm], strain_j_seq_coding) num_coding += num_chrm_coding @@ -238,20 +242,20 @@ def mark_included(seq, regions, fill='N'): # coding, nonintrogressed num_chrm_coding_nonint, den_chrm_coding_nonint = \ - count_diffs(strain_i_seqs_coding_nonint[chrm],\ + count_diffs(strain_i_seqs_coding_nonint[chrm], strain_j_seq_coding_nonint) num_coding_nonint += num_chrm_coding_nonint den_coding_nonint += den_chrm_coding_nonint total_fracs_coding_nonint[chrm] += \ float(num_chrm_coding_nonint)/den_chrm_coding_nonint - print num_comparisons, 
chrm, \ - total_fracs[chrm], \ - total_fracs_nonint[chrm], \ - 1 - total_fracs_nonint[chrm]/total_fracs[chrm], \ - total_fracs_coding[chrm], \ - total_fracs_coding_nonint[chrm], \ - 1 - total_fracs_coding_nonint[chrm]/total_fracs_coding[chrm] + print(num_comparisons, chrm, + total_fracs[chrm], + total_fracs_nonint[chrm], + 1 - total_fracs_nonint[chrm]/total_fracs[chrm], + total_fracs_coding[chrm], + total_fracs_coding_nonint[chrm], + 1 - total_fracs_coding_nonint[chrm]/total_fracs_coding[chrm]) # and keep track across all chromosomes total_frac += float(num)/den @@ -259,9 +263,10 @@ def mark_included(seq, regions, fill='N'): total_frac_coding += float(num_coding)/den_coding total_frac_coding_nonint += float(num_coding_nonint)/den_coding_nonint - print num_comparisons, total_frac, total_frac_nonint, \ - 1 - total_frac_nonint/total_frac, total_frac_coding, \ - total_frac_coding_nonint, 1 - total_frac_coding_nonint/total_frac_coding + print(num_comparisons, total_frac, total_frac_nonint, + 1 - total_frac_nonint/total_frac, total_frac_coding, + total_frac_coding_nonint, + 1 - total_frac_coding_nonint/total_frac_coding) sys.stdout.flush() # nucleotide diversity is the running total of fractions of sites that @@ -272,23 +277,23 @@ def mark_included(seq, regions, fill='N'): nuc_div_coding = total_frac_coding/num_comparisons nuc_div_coding_nonint = total_frac_coding_nonint/num_comparisons -print nuc_div -print nuc_div_nonint -print nuc_div_coding -print nuc_div_coding_nonint +print(nuc_div) +print(nuc_div_nonint) +print(nuc_div_coding) +print(nuc_div_coding_nonint) -######## -## write overall results and results for individual chromosome to file -######## +# ###### +# write overall results and results for individual chromosome to file +# ###### -f = open(gp.analysis_out_dir_absolute + tag + '/polymorphism/' + \ +f = open(gp.analysis_out_dir_absolute + tag + '/polymorphism/' + 'nucleotide_diversity_c.txt', 'w') f.write('chromosome\tpi\tpi_nonint\tpi_coding\tpi_coding_nonint\n') -f.write('all\t' + str(nuc_div) + '\t' + str(nuc_div_nonint) + \ +f.write('all\t' + str(nuc_div) + '\t' + str(nuc_div_nonint) + '\t' + str(nuc_div_coding) + '\t' + str(nuc_div_coding_nonint) + '\n') for chrm in gp.chrms: - f.write(chrm + '\t' + str(total_fracs[chrm]/num_comparisons) + '\t' + \ - str(total_fracs_nonint[chrm]/num_comparisons) + '\t' + \ - str(total_fracs_coding[chrm]/num_comparisons) + '\t' + \ + f.write(chrm + '\t' + str(total_fracs[chrm]/num_comparisons) + '\t' + + str(total_fracs_nonint[chrm]/num_comparisons) + '\t' + + str(total_fracs_coding[chrm]/num_comparisons) + '\t' + str(total_fracs_coding_nonint[chrm]/num_comparisons) + '\n') f.close() diff --git a/code/annotate/fix.py b/code/annotate/fix.py index 4c7ca15..76cf352 100644 --- a/code/annotate/fix.py +++ b/code/annotate/fix.py @@ -1,12 +1,6 @@ -import sys import os -#from orf import * -sys.path.insert(0, '../align') -import align_helpers -sys.path.insert(0, '..') -import global_params as gp -#d = '/tigress/AKEY/akey_vol2/aclark4/nobackup/100_genomes/genomes_gb/orfs/' +# d = '/tigress/AKEY/akey_vol2/aclark4/nobackup/100_genomes/genomes_gb/orfs/' d = '../../data/CBS432/orfs/' fns = os.listdir(d) for fn in fns: diff --git a/code/annotate/makeblastdb.py b/code/annotate/makeblastdb.py index 606969a..4e7e967 100644 --- a/code/annotate/makeblastdb.py +++ b/code/annotate/makeblastdb.py @@ -1,18 +1,13 @@ -import sys import os -#from orf import * -sys.path.insert(0, '../align') -import align_helpers -sys.path.insert(0, '..') import global_params as gp -#d = 
'/tigress/AKEY/akey_vol2/aclark4/nobackup/100_genomes/genomes_gb/orfs/' +# d = '/tigress/AKEY/akey_vol2/aclark4/nobackup/100_genomes/genomes_gb/orfs/' d = '../../data/CBS432/orfs/' -#d = '/tigress/AKEY/akey_vol2/aclark4/nobackup/100_genomes/genomes_gb/orfs/' +# d = '/tigress/AKEY/akey_vol2/aclark4/nobackup/100_genomes/genomes_gb/orfs/' fns = os.listdir(d) for fn in fns: cmd_string = gp.blast_install_path + 'makeblastdb' + \ ' -dbtype nucl' + \ ' -in ' + d + fn - print cmd_string + print(cmd_string) os.system(cmd_string) diff --git a/code/annotate/orfs_main.py b/code/annotate/orfs_main.py index 9c0a906..6ec3a05 100644 --- a/code/annotate/orfs_main.py +++ b/code/annotate/orfs_main.py @@ -3,49 +3,48 @@ import sys import os -#from orf import * -sys.path.insert(0, '../align') -import align_helpers -sys.path.insert(0, '..') +from align import align_helpers import global_params as gp -ref_fns = [gp.ref_dir[r] + gp.ref_fn_prefix[r] + '_chr' + '?' + \ - gp.fasta_suffix \ +ref_fns = [gp.ref_dir[r] + gp.ref_fn_prefix[r] + '_chr' + '?' + + gp.fasta_suffix for r in gp.alignment_ref_order] # get all non-reference strains of cerevisiae and paradoxus s = align_helpers.get_strains(align_helpers.flatten(gp.non_ref_dirs.values())) # and get paradoxus reference as well -s.append((gp.ref_fn_prefix[gp.alignment_ref_order[1]], gp.ref_dir[gp.alignment_ref_order[1]])) +s.append((gp.ref_fn_prefix[gp.alignment_ref_order[1]], + gp.ref_dir[gp.alignment_ref_order[1]])) strain_fn = '*_chr?' + gp.fasta_suffix f = open('orfs.sh', 'w') -for i in range(78,94): +for i in range(78, 94): strain, d = s[i] - print strain + print(strain) current_strain_fn = strain_fn.replace('*', strain) for chrm in gp.chrms: - print chrm + print(chrm) sys.stdout.flush() - + current_strain_chrm_fn = current_strain_fn.replace('?', chrm) - orf_fn = strain + '_chr' + chrm + \ - '_orfs' + gp.fasta_suffix + orf_fn = strain + '_chr' + chrm + '_orfs' + gp.fasta_suffix orf_d = d + '/orfs/' if not os.path.isdir(orf_d): os.makedirs(orf_d) - cmd_string = gp.orffinder_install_path + '/ORFfinder' + \ - ' -in ' + d + current_strain_chrm_fn + \ - ' -s 0' + \ - ' -out ' + orf_d + orf_fn + \ - ' -outfmt 1 -n true; \n' - #print cmd_string + cmd_string = (gp.orffinder_install_path + '/ORFfinder' + + ' -in ' + d + current_strain_chrm_fn + + ' -s 0' + + ' -out ' + orf_d + orf_fn + + ' -outfmt 1 -n true; \n') + # print(cmd_string) os.system(cmd_string) f.write(cmd_string) f.close() -# "../../../../software/ORFfinder -in /tigress/AKEY/akey_vol2/aclark4/nobackup/100_genomes/genomes_gb/yjm248_chrI.fa -out a.txt -outfmt 1 -n true" +# "../../../../software/ORFfinder \ +# -in /tigress/AKEY/akey_vol2/aclark4/nobackup/\ +# 100_genomes/genomes_gb/yjm248_chrI.fa -out a.txt -outfmt 1 -n true" diff --git a/code/beer_strains/fastq_to_fasta.py b/code/beer_strains/fastq_to_fasta.py index 838af93..d7959ac 100644 --- a/code/beer_strains/fastq_to_fasta.py +++ b/code/beer_strains/fastq_to_fasta.py @@ -1,12 +1,15 @@ # take fastq files containing reads and quality information, along -# with reference genome, and convert to fasta file ... or vcf file and then fasta?? +# with reference genome, and convert to fasta file ... +# or vcf file and then fasta?? 
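+#
+# rough shape of this script (the later stages are unfinished):
+#   1. index the reference and align read pairs with bwa mem -> .sam
+#   2. (intended) base-quality recalibration with GATK BaseRecalibrator
+#   3. (intended) variant calling and conversion back to per-strain fasta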
 import os
 import sys
 
 fastq_dir = '/net/dunham/vol2/Giang/DunhamBeer/DunhamBeer'
 
-quality_chars = list('!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~')
+quality_chars = list('!"#$%&\'()*+,-./0123456789:;<=>?@'
+                     'ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`'
+                     'abcdefghijklmnopqrstuvwxyz{|}~')
 char_to_score = dict(zip(quality_chars, range(1, len(quality_chars))))
 
 fastq_dir = '/net/dunham/vol2/Giang/DunhamBeer/DunhamBeer/'
@@ -16,34 +19,41 @@
     if '.1.fastq' in l and 'stats' not in l and l[0] != 'N':
         fns.append(l[:-8])
 
-ref_fasta = '/net/akey/vol2/aclark4/nobackup/100_genomes/genomes/S288c_SGD-R64.fa'
+ref_fasta = ('/net/akey/vol2/aclark4/nobackup/'
+             '100_genomes/genomes/S288c_SGD-R64.fa')
 
 #####
 # align reads with bwa
 #####
 
 samdir = '/net/akey/vol2/aclark4/nobackup/introgression/data/beer/dunham/sam/'
-os.system('module load bwa/latest') # this doesn't actually work because it makes a new shell instance every time - TODO fix this
-cmd = 'bwa index ' + ref_fasta
-#print cmd
-#os.system(cmd)
+# this doesn't actually work because it makes a new shell instance every time
+# TODO fix this
+os.system('module load bwa/latest')
+cmd = 'bwa index ' + ref_fasta
+# print cmd
+# os.system(cmd)
 for fn in fns:
-    cmd = 'bwa mem ' + ref_fasta + ' ' + fastq_dir + fn + '.1.fastq ' + fastq_dir + fn + '.2.fastq' + ' > ' + samdir + fn + '.sam'
-    print cmd
+    cmd = 'bwa mem ' + ref_fasta + ' ' + fastq_dir + fn + \
+          '.1.fastq ' + fastq_dir + fn + '.2.fastq' + ' > ' + \
+          samdir + fn + '.sam'
+    print(cmd)
     os.system(cmd)
 sys.exit()
 
-#####
+# ###
 # run base recalibrator
-#####
+# ###
 
-outdir = '/net/akey/vol2/aclark4/nobackup/introgression/data/beer/dunham/fasta/'
+outdir = ('/net/akey/vol2/aclark4/nobackup/introgression/'
+          'data/beer/dunham/fasta/')
 
 for fn in fns:
     # -knownSites database of previously known polymorphisms
-    os.system('java -jar ~/software/GenomeAnalysisTK.jar -T BaseRecalibrator -R ' + ref_fasta + ' -I ' + fastq_dir + fn + ' -o ' + outdir + fn[:-1] + 'a')
-
-#####
-# run
-#####
-
+    os.system('java -jar ~/software/GenomeAnalysisTK.jar '
+              '-T BaseRecalibrator -R ' + ref_fasta + ' -I ' +
+              fastq_dir + fn + ' -o ' + outdir + fn[:-1] + 'a')
+
+# ###
+# run
+# ###
diff --git a/code/beer_strains/vcf_to_fasta.py b/code/beer_strains/vcf_to_fasta.py
index 01296d1..689ffe4 100644
--- a/code/beer_strains/vcf_to_fasta.py
+++ b/code/beer_strains/vcf_to_fasta.py
@@ -1,5 +1,6 @@
 import sys
 
+
 def read_vcf(fn):
 
     f = open(fn, 'r')
@@ -14,14 +15,15 @@ def read_vcf(fn):
     f.close()
     return v
 
+
 def vcf_to_fasta(v, fn_ref, fn_out):
 
     f_ref = open(fn_ref, 'r')
-    f_out = open(fn_out, 'w')
     line = f_ref.readline()
     while line != '':
-
+
         line = f_ref.readline()
-
+
+
 v = read_vcf(sys.argv[1])
 vcf_to_fasta(v, sys.argv[2])
diff --git a/code/config.yaml b/code/config.yaml
new file mode 100644
index 0000000..5a0fc47
--- /dev/null
+++ b/code/config.yaml
@@ -0,0 +1,138 @@
+---
+# biological parameters
+mu: 1.84e-10
+
+
+# should we leave the alignments already completed in the alignments
+# directory alone?
+resume_alignment: False
+
+HMM_symbols:
+  match: '+'
+  mismatch: '-'
+  unknown: '?'
+  unsequenced: 'n'
+  gap: '-'
+  unaligned: '?'
+  masked: 'x'
+
+output_root: /tigress/tcomi/aclark4_temp/results
+input_root: /tigress/AKEY/akey_vol2/aclark4/nobackup
+
+paths:
+  # optional log file
+  # log_file: introgression.log
+  fasta_suffix: .fa  # suffix for _all_ fasta files
+  # suffix for _all_ alignment files
+  # this needs to match the suffix output by mugsy
+  alignment_suffix: .maf
+
+  masks: /tigress/tcomi/aclark4_temp/par4/masked/
+  alignments: /tigress/tcomi/aclark4_temp/par4/
+
+  test_strains:
+    - "__INPUT_ROOT__/100_genomes/genomes_gb/\
+      {strain}_chr{chrom}.fa"
+
+  simulations:
+    sim_base: __OUTPUT_ROOT__/sim
+    prefix: sim_out_
+    suffix: .txt
+
+  analysis:
+    analysis_base: __OUTPUT_ROOT__/analysis
+    regions: __ANALYSIS_BASE__/regions/{state}.fa.gz
+    region_index: __ANALYSIS_BASE__/regions/{state}.pkl
+    genes: __ANALYSIS_BASE__/genes/
+    blocks: __ANALYSIS_BASE__/blocks_{state}.txt
+    labeled_blocks: __ANALYSIS_BASE__/blocks_{state}_labeled.txt
+    quality_blocks: __ANALYSIS_BASE__/blocks_{state}_quality.txt
+    hmm_initial: __ANALYSIS_BASE__/hmm_initial.txt
+    hmm_trained: __ANALYSIS_BASE__/hmm_trained.txt
+    probabilities: __ANALYSIS_BASE__/probabilities.txt.gz
+    alignment: __ALIGNMENTS__/{prefix}_{strain}_chr{chrom}_mafft.maf
+    positions: __ANALYSIS_BASE__/positions.txt.gz
+    masked_intervals: __MASKS__/{strain}_chr{chrom}_intervals.txt
+    introgressed: __ANALYSIS_BASE__/blocks_{state}_filter1.txt
+    introgressed_intermediate: "__ANALYSIS_BASE__/\
+      blocks_{state}_filter1inter.txt"
+    ambiguous: __ANALYSIS_BASE__/blocks_{state}_filter2.txt
+    ambiguous_intermediate: "__ANALYSIS_BASE__/\
+      blocks_{state}_filter2inter.txt"
+    filter_sweep: __ANALYSIS_BASE__/filter2_thresholds.txt
+    # strain_info is a tsv file with strain, _, _, geographic origin,
+    # environmental origin, population
+    strain_info: "/home/tcomi/projects/aclark4_introgression/\
+      100_genomes_info.txt"
+    state_counts: __ANALYSIS_BASE__/state_counts.txt
+
+  # software install locations
+  software:
+    root_install: /tigress/anneec/software
+    mugsy: __ROOT_INSTALL__/mugsy/
+    tcoffee: "__ROOT_INSTALL__/\
+      T-COFFEE_installer_Version_11.00.8cbe486_linux_x64/bin/"
+    mafft: __ROOT_INSTALL__/mafft/bin/
+    ms: __ROOT_INSTALL__/msdir/
+    # including dustmasker
+    blast: "__ROOT_INSTALL__/ncbi-blast-2.7.1+-src/\
+      c++/ReleaseMT/bin/"
+    orffinder: __ROOT_INSTALL__/
+    ldselect: __ROOT_INSTALL__/ldSelect/
+    structure: __ROOT_INSTALL__/structure/
+
+chromosomes: ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII',
+              'IX', 'X', 'XI', 'XII', 'XIII', 'XIV', 'XV', 'XVI']
+
+# can optionally list all strains to consider
+# if blank will glob with TEST_STRAINS paths
+# strains:
+
+# can provide a prefix for the alignment files
+# if blank will be the reference and known state names joined with '_'
+
+analysis_params:
+  convergence_threshold: 0.001
+  # threshold can be 'viterbi' or a float to threshold HMM probabilities
+  threshold: viterbi
+  input_root: /tigress/AKEY/akey_vol2/aclark4/nobackup
+  filter_threshold: 0.98
+
+  # master known state, prepended to list of known states
+  reference:
+    name: S288c
+    base_dir: __INPUT_ROOT__/100_genomes/genomes/S288c_SGD-R64/
+    gene_bank_dir: __INPUT_ROOT__/S288c/
+    interval_name: S288c_SGD-R64  # if different than name
+
+  known_states:
+    - name: CBS432
+      base_dir: /tigress/anneec/projects/introgression/data/CBS432/
+      gene_bank_dir: __INPUT_ROOT__/CBS432/
+      expected_length: 10000
+      expected_fraction: 0.025
+
+    - name: N_45
+      base_dir: __INPUT_ROOT__/para_sgrp/strains/N_45/
+      gene_bank_dir: null
+      expected_length: 10000
+      expected_fraction: 0.025
+
+    - name: 
DBVPG6304 + base_dir: __INPUT_ROOT__/para_sgrp/strains/DBVPG6304/ + gene_bank_dir: null + expected_length: 10000 + expected_fraction: 0.025 + + - name: UWOPS91_917_1 + base_dir: __INPUT_ROOT__/para_sgrp/strains/UWOPS91_917_1/ + gene_bank_dir: null + expected_length: 10000 + expected_fraction: 0.025 + + unknown_states: + - name: unknown + expected_length: 1000 + expected_fraction: 0.01 + + gene_bank_all: __INPUT_ROOT__/100_genomes/sequence.gb diff --git a/code/environment.yml b/code/environment.yml index 7dca4f3..e3ba0e6 100644 --- a/code/environment.yml +++ b/code/environment.yml @@ -1,20 +1,24 @@ ---- -name: introgression3 +name: introgression channels: + - conda-forge - defaults dependencies: - atomicwrites=1.3.0=py_0 - attrs=18.2.0=py37h28b3542_0 - blas=1.0=mkl - - ca-certificates=2019.1.23=0 - - certifi=2018.11.29=py37_0 + - ca-certificates=2019.5.15=0 + - certifi=2019.3.9=py37_0 - coverage=4.5.2=py37h7b6447c_0 + - entrypoints=0.3=py37_1000 + - flake8=3.7.7=py37_0 - intel-openmp=2019.1=144 - libedit=3.1.20181209=hc058e9b_0 - libffi=3.2.1=hd88cf55_4 - libgcc-ng=8.2.0=hdf63c60_1 - libgfortran-ng=7.3.0=hdf63c60_0 - libstdcxx-ng=8.2.0=hdf63c60_1 + - llvmlite=0.28.0=py37hd408876_0 + - mccabe=0.6.1=py_1 - mkl=2019.1=144 - mkl_fft=1.0.10=py37ha843d7b_0 - mkl_random=1.0.2=py37hd81dba3_0 @@ -22,14 +26,18 @@ dependencies: - ncurses=6.1=he6710b0_1 - numpy=1.15.4=py37h7e9f1db_0 - numpy-base=1.15.4=py37hde5b4d6_0 - - openssl=1.1.1a=h7b6447c_0 + - openssl=1.1.1c=h7b6447c_1 - pip=19.0.1=py37_0 - pluggy=0.8.1=py37_0 - py=1.7.0=py37_0 + - pycodestyle=2.5.0=py_0 + - pyflakes=2.1.1=py_0 - pytest=4.2.1=py37_0 - pytest-cov=2.6.1=py37_0 + - pytest-flake8=1.0.4=py37_0 - pytest-mock=1.10.0=py37_0 - python=3.7.2=h0371630_0 + - pyyaml=5.1=py37h7b6447c_0 - readline=7.0=h7b6447c_5 - setuptools=40.8.0=py37_0 - six=1.12.0=py37_0 @@ -37,4 +45,7 @@ dependencies: - tk=8.6.8=hbc83047_0 - wheel=0.32.3=py37_0 - xz=5.2.4=h14c3975_4 + - yaml=0.1.7=had09818_2 - zlib=1.2.11=h7b6447c_3 + - pip: + - click==7.0 diff --git a/code/global_params.py b/code/global_params.py index f01ec9d..a21ef1e 100644 --- a/code/global_params.py +++ b/code/global_params.py @@ -1,33 +1,34 @@ -#==== +# ==== # biological parameters -#==== +# ==== mu = 1.84 * 10 ** -10 -#==== +# ==== # file extensions -#==== +# ==== # suffix for _all_ fasta files fasta_suffix = '.fa' -# suffix for _all_ alignment files; this needs to match the suffix output by mugsy +# suffix for _all_ alignment files +# this needs to match the suffix output by mugsy alignment_suffix = '.maf' -#==== +# ==== # sequence locations/names -#==== +# ==== -## now specified in setup_args file +# now specified in setup_args file -#==== +# ==== # alignment files -#==== +# ==== -## alignments directory now specified in setup_args file +# alignments directory now specified in setup_args file mask_dir = '../alignments/masked/' -#mask_dir = '/tigress/tcomi/aclark4_temp/par4/masked/' +# mask_dir = '/tigress/tcomi/aclark4_temp/par4/masked/' # should we leave the alignments already completed in the alignments # directory alone? @@ -36,9 +37,9 @@ # master_ref now automatically assumed to be first # reference specified in setup_args file -#==== +# ==== # HMM -#==== +# ==== match_symbol = '+' mismatch_symbol = '-' @@ -49,13 +50,12 @@ unaligned_symbol = '?' 
 masked_symbol = 'x'
 
-#====
+# ====
 # simulations
-#====
+# ====
 
 # output directory for simulations
 sim_out_dir_absolute = '/tigress/tcomi/aclark4_temp/results/sim'
-#sim_out_dir_absolute = '/tigress/AKEY/akey_vol2/aclark4/projects/introgression/results/sim/'
 
 # prefix for simulation output
 sim_out_prefix = 'sim_out_'
@@ -63,9 +63,9 @@
 # suffix for simulation output
 sim_out_suffix = '.txt'
 
-#====
+# ====
 # analysis
-#====
+# ====
 
 analysis_out_dir_absolute = \
     '/tigress/AKEY/akey_vol2/aclark4/projects/introgression/results/analysis/'
@@ -74,20 +74,22 @@
 genes_out_dir_absolute = analysis_out_dir_absolute + '/genes/'
 
-#====
+# ====
 # software install locations
-#====
+# ====
 
 mugsy_install_path = '/tigress/anneec/software/mugsy/'
 
-tcoffee_install_path = '/tigress/anneec/software/T-COFFEE_installer_Version_11.00.8cbe486_linux_x64/bin/'
+tcoffee_install_path = ('/tigress/anneec/software/'
+                        'T-COFFEE_installer_Version_11.00.8cbe486_linux_x64'
+                        '/bin/')
 
 mafft_install_path = '/tigress/anneec/software/mafft/bin/'
 
 ms_install_path = '/tigress/anneec/software/msdir/'
 
 # including dustmasker
-blast_install_path = '/tigress/anneec/software/ncbi-blast-2.7.1+-src/c++/ReleaseMT/bin/'
+blast_install_path = ('/tigress/anneec/software/'
+                      'ncbi-blast-2.7.1+-src/c++/ReleaseMT/bin/')
 
 orffinder_install_path = '/tigress/anneec/software/'
@@ -95,11 +97,12 @@
 structure_install_path = '/tigress/anneec/software/structure/'
 
-#====
+# ====
 # other
-#====
+# ====
 
-chrms = ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X', 'XI', 'XII', 'XIII', 'XIV', 'XV', 'XVI']
-#chrms = ['I']
+chrms = ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII',
+         'IX', 'X', 'XI', 'XII', 'XIII', 'XIV', 'XV', 'XVI']
+# chrms = ['I']
 
 chrms_ara = dict(zip(chrms, range(1, len(chrms)+1)))
diff --git a/code/hmm/hmm_bw.py b/code/hmm/hmm_bw.py
index 26c8e37..e787916 100644
--- a/code/hmm/hmm_bw.py
+++ b/code/hmm/hmm_bw.py
@@ -1,5 +1,6 @@
 import numpy as np
 from typing import List, Dict, Tuple
+import logging as log
 
 
 class HMM:
@@ -22,7 +23,7 @@ def set_hidden_states(self, states: List[str]) -> None:
     def set_observed_states(self, states: List[str]) -> None:
         '''
         Sets the observed states of the HMM to the supplied list of strings
-        If not supplied will set to list of keys provided by emissions 
+        If not supplied will set to list of keys provided by emissions
         '''
         self.observed_states = states
 
@@ -86,46 +87,43 @@ def set_initial_p(self, initial_p: List[float]) -> None:
         '''
         self.initial_p = np.array(initial_p)
 
-        assert np.isclose(np.sum(initial_p), 1), f"{initial_p} {sum(initial_p)}"
+        assert np.isclose(np.sum(initial_p), 1), \
+            f"{initial_p} {sum(initial_p)}"
 
     def print_results(self, iterations: int, LL: float) -> None:
         '''
         Write current state of HMM to stdout
         '''
-        print(
-            f'''Iterations: {iterations}
+        message = f'Iterations: {iterations}\n\nLog Likelihood:\n{LL:.30e}'
 
-Log Likelihood:
-{LL:.30e}
-
-Initial State Probabilities:'''
-        )
+        message += '\n\nInitial State Probabilities:\n'
         for i in range(len(self.hidden_states)):
-            print(f"{self.hidden_states[i]}={self.initial_p[i]:.30e}")
-        print()
-        print("Transition Probabilities:")
+            message += f'{self.hidden_states[i]}={self.initial_p[i]:.30e}\n'
+
+        message += '\nTransition Probabilities:\n'
        for i in range(len(self.hidden_states)):
             for j in range(len(self.hidden_states)):
-                print(f"{self.hidden_states[i]},{self.hidden_states[j]}\
-                    ={self.transitions[i][j]:.30e}")
-        print()
-        print("Emission Probabilities:")
+                message += (f'{self.hidden_states[i]},'
+                            f'{self.hidden_states[j]}'
+                            f'={self.transitions[i][j]:.30e}\n')
+
+        message += '\nEmission Probabilities:\n'
         for i in range(len(self.hidden_states)):
             for k in sorted(self.observed_states):
-                print(f"{self.hidden_states[i]},{k}=\
-                    {self.emissions[i, self.symbol_to_ind[k]]:.30e}")
-        print()
+                message += (f'{self.hidden_states[i]},{k}='
+                            f'{self.emissions[i, self.symbol_to_ind[k]]:.30e}'
+                            '\n')
+        message += '\n'
+        log.debug(message)
 
     def train(self,
-              improvement_frac: float = .01,
-              max_iterations: int = None) -> None:
+              improvement_frac: float = 0.01,
+              max_iterations: int = None) -> None:
         '''
         Train the hmm until either the max iterations is reached or the log
         likelihood fails to improve beyond the improvement factor
         '''
         # calculate current log likelihood
-        print("calculating alpha")
+        log.debug('calculating alpha')
         alpha = self.forward()
         LL = self.log_likelihood(alpha)
@@ -141,32 +139,32 @@ def train(self,
                 and iterations < max_iterations)\
                 or LL - prev_LL > threshold:
 
-            print(f"Iteration {iterations}")
+            log.info(f'Iteration {iterations}')
 
-            print("calculating beta")
+            log.debug('calculating beta')
             beta = self.backward()
 
-            print("calculating gamma")
+            log.debug('calculating gamma')
             gamma = self.state_probs(alpha, beta)
 
-            print("calculating xi")
+            log.debug('calculating xi')
             xi = self.bw(alpha, beta)
 
-            print("updating parameters")
+            log.debug('updating parameters')
             self.initial_p = self.initial_probabilities(gamma)
             self.transitions = self.transition_probabilities(xi, gamma)
             self.emissions = self.emission_probabilities(gamma)
 
             assert np.isclose(np.sum(self.initial_p), 1), \
-                f"{beta}\n{np.sum(self.initial_p)} {self.initial_p}"
+                f'{beta}\n{np.sum(self.initial_p)} {self.initial_p}'
             for t in self.transitions:
                 assert np.isclose(np.sum(t), 1), \
-                    f"{xi} {gamma} {np.sum(t)} {t}"
+                    f'{xi} {gamma} {np.sum(t)} {t}'
             for e in self.emissions:
-                assert np.isclose(np.sum(e), 1), f"{np.sum(e.values())} {e}"
+                assert np.isclose(np.sum(e), 1), f'{np.sum(e)} {e}'
 
             iterations += 1
-            print("calculating alpha")
+            log.debug("calculating alpha")
             alpha = self.forward()
 
             prev_LL = LL
@@ -176,11 +174,11 @@ def train(self,
         self.print_results(iterations, LL)
 
         if LL < prev_LL and not np.isclose(LL, prev_LL):
-            # NOTE does not stop execution
-            print('PROBLEM: log-likelihood stopped increasing; \
-                stopping training now')
+            log.error('PROBLEM: log-likelihood stopped increasing; '
+                      'stopping training now')
+
+            return
 
-        print(f"finished in {iterations} iterations")
+        log.info(f'finished in {iterations} iterations')
 
     def log_likelihood(self, alpha: np.array) -> float:
         '''
@@ -291,14 +289,16 @@ def forward(self) -> np.array:
         # Markov process was at state j at time t
         # returns array of size observations, observations[0], hidden_states
         # determine emission probabilities for each measured value
-        emis = np.transpose(np.log(self.emissions[:, self.observations]))
+        with np.errstate(divide='ignore'):
+            emis = np.transpose(np.log(self.emissions[:, self.observations]))
         trans = np.log(self.transitions)
 
         alpha = np.empty((len(self.observations),
                           len(self.observations[0]),
                           len(self.hidden_states)), float)
         # initialize to initial probability * observed emission
-        alpha[:, 0, :] = np.log(self.initial_p[None, :]) + emis[0, :, :]
+        with np.errstate(divide='ignore'):
+            alpha[:, 0, :] = np.log(self.initial_p[None, :]) + emis[0, :, :]
         # recursively fill array
         for i in range(1, len(self.observations[0])):
             alpha[:, i, :] = np.logaddexp.reduce(alpha[:, i-1, :][:, :, None] +
@@ -312,8 +312,9 @@
         '''
         # probability that the sequence from t+1 to end was observed
         # and Markov process was at state j at time t
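+        # (all work is done in log space; np.errstate silences the
+        # benign log(0) = -inf warnings raised for zero-probability
+        # transition and emission entries)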
-        emis = np.transpose(np.log(self.emissions[:, self.observations]))
-        trans = np.log(self.transitions)
+        with np.errstate(divide='ignore'):
+            emis = np.transpose(np.log(self.emissions[:, self.observations]))
+            trans = np.log(self.transitions)
 
         beta = np.zeros((len(self.observations),
                          len(self.observations[0]),
                          len(self.hidden_states)), float)
@@ -327,7 +328,7 @@
 
     def calculate_max_states(self) -> Tuple[np.array, np.array]:
         '''
-        Find the maximum likelihood hidden states and the corresponding 
+        Find the maximum likelihood hidden states and the corresponding
         log probability for each state.
         Returned tuple is (probability, states)
         '''
@@ -337,12 +338,14 @@
                            len(self.hidden_states)), int)
 
         # build array of emissions based on observations
-        emissions = np.log(np.transpose(self.emissions)[self.observations])
+        with np.errstate(divide='ignore'):
+            emissions = np.log(np.transpose(self.emissions)[self.observations])
+
+            trans_emis = np.log(self.transitions[None, :, :]) +\
+                emissions[:, None, :]
 
-        trans_emis = np.log(self.transitions[None, :, :]) +\
-            emissions[:, None, :]
+            probabilities[0, :] = np.log(self.initial_p) + emissions[0]
 
-        probabilities[0, :] = np.log(self.initial_p) + emissions[0]
         states[0, :] = -1
 
         for i in range(1, len(emissions)):
diff --git a/code/misc/binary_search.py b/code/misc/binary_search.py
index 73c4cee..79594ca 100644
--- a/code/misc/binary_search.py
+++ b/code/misc/binary_search.py
@@ -1,7 +1,9 @@
 import bisect
+from typing import List
 
-def present(a, x):
-    'Locate the leftmost value exactly equal to x'
+
+def present(a: List[int], x: int) -> bool:
+    'Locate the leftmost value exactly equal to x in a'
     i = bisect.bisect_left(a, x)
     if i != len(a) and a[i] == x:
         return True
diff --git a/code/misc/config_utils.py b/code/misc/config_utils.py
new file mode 100644
index 0000000..2efeb5d
--- /dev/null
+++ b/code/misc/config_utils.py
@@ -0,0 +1,226 @@
+import re
+from copy import copy
+from typing import Dict, List
+import logging as log
+
+
+'''
+config_utils.py
+
+Helper functions for working with yaml config files
+'''
+
+
+def clean_config(config: Dict,
+                 valid_replacements: Dict[str, str] = None) -> Dict:
+    '''
+    Performs substitution of variables in string recursively replacing
+    strings of the form __.+__ with the matching key. Nested variables
+    with the same name replace parent values.
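+    For example, with hypothetical values:
+    >>> clean_config({'root': '/data', 'out': '__ROOT__/results'})
+    {'root': '/data', 'out': '/data/results'}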
diff --git a/code/misc/config_utils.py b/code/misc/config_utils.py
new file mode 100644
index 0000000..2efeb5d
--- /dev/null
+++ b/code/misc/config_utils.py
@@ -0,0 +1,226 @@
+import re
+from copy import copy
+from typing import Dict, List
+import logging as log
+
+
+'''
+config_utils.py
+
+Helper functions for working with yaml config files
+'''
+
+
+def clean_config(config: Dict,
+                 valid_replacements: Dict[str, str] = None) -> Dict:
+    '''
+    Performs substitution of variables in strings, recursively replacing
+    strings of the form __.+__ with the matching key. Nested variables
+    with the same name replace parent values.
+
+    config is the possibly nested dict with values to replace
+    valid_replacements are the valid entries for performing replacements
+    '''
+    result = {}
+    if valid_replacements is None:
+        valid_replacements = dict()
+    len_values = len(config)
+    while config:
+        # want to look at valid replacements first,
+        # to possibly replace their values
+        keys = config.keys()
+        keys = list([k for k in keys if k in valid_replacements] +
+                    [k for k in keys if k not in valid_replacements])
+
+        for key in keys:
+            value = config[key]
+            if isinstance(value, str):
+                value = replace_entry(value, valid_replacements)
+                if value is None:
+                    continue  # don't remove
+                result[key] = value
+                valid_replacements[key] = value
+
+            elif isinstance(value, dict):
+                result[key] = clean_config(value,
+                                           copy(valid_replacements))
+
+            elif isinstance(value, list):
+                result[key] = clean_list(value,
+                                         valid_replacements)
+
+            else:
+                result[key] = value
+                valid_replacements[key] = str(value)
+
+            config.pop(key)
+
+        if len_values == len(config):
+            raise Exception('Failed to dereference all keys, remaining '
+                            f'values are:\n {print_dict(config)}')
+
+        len_values = len(config)
+
+    return result
+
+
+def clean_list(config: List,
+               valid_replacements: Dict[str, str] = None) -> List:
+    '''
+    Performs substitution on a list of config objects
+    '''
+    result = []
+    for value in config:
+        if isinstance(value, str):
+            output = replace_entry(value, valid_replacements)
+            if output is None:
+                raise Exception(f'Failed to dereference list entry: "{value}"')
+            result.append(output)
+
+        elif isinstance(value, list):
+            result.append(clean_list(value, valid_replacements))
+
+        elif isinstance(value, dict):
+            result.append(clean_config(value, copy(valid_replacements)))
+
+        else:
+            result.append(value)
+
+    return result
+
+
+def replace_entry(value: str, valid_replacements: Dict[str, str]) -> str:
+    '''
+    Replace instances of __.+__ with the key in valid_replacements.
+    If valid_replacements is None or the key is not found, return None.
+    Else return the (possibly) substituted string with all instances of /+
+    replaced with / (common in path replacements)
+    '''
+    replacements = re.findall('__(.+?)__', value)
+    for replacement in set(replacements):
+        replace = replacement.lower()
+        if valid_replacements is None or replace not in valid_replacements:
+            return None
+        value = re.sub(f'__{replacement}__',
+                       valid_replacements[replace],
+                       value)
+    return re.sub('/+', '/', value)
+
+
+def print_dict(d: Dict, lvl: int = 0) -> str:
+    '''
+    Return a pretty representation of the dictionary d.
+    lvl is the starting amount to indent the line
+    '''
+    result = ''
+    for k, v in d.items():
+        if isinstance(v, dict):
+            result += ' ' * lvl + f'{k} -\n'
+            result += print_dict(v, lvl+1)
+        elif isinstance(v, list):
+            result += ' ' * lvl + f'{k} -\n'
+            result += print_list(v, lvl+1)
+        else:
+            result += ' ' * lvl + f'{k} - {v}\n'
+    return result
+
+
+def print_list(lst: List, lvl: int = 0) -> str:
+    '''
+    Return a pretty representation of the list lst.
+    lvl is the starting amount to indent the line
+    '''
+    result = ''
+    for i, v in enumerate(lst):
+        if isinstance(v, dict):
+            result += ' ' * lvl + f'{i}:\n' + print_dict(v, lvl+1)
+        elif isinstance(v, list):
+            result += ' ' * lvl + f'{i}:\n' + print_list(v, lvl+1)
+        else:
+            result += ' ' * lvl + f'{i}:\t{v},\n'
+    return result
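To make the substitution rules concrete, a small usage sketch (the config keys are hypothetical, and the import assumes `code/misc` is on `sys.path`):

```python
from config_utils import clean_config

config = {
    'output_root': '/data/results/',
    'genes': '__OUTPUT_ROOT__/genes/',
}
# __OUTPUT_ROOT__ is looked up under the lowercased key 'output_root',
# and replace_entry collapses the doubled slash afterwards.
cleaned = clean_config(config)
assert cleaned['genes'] == '/data/results/genes/'
```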
+
+
+def merge_dicts(parent: Dict, new: Dict) -> Dict:
+    '''
+    Merge the new dict into parent. Existing items are overwritten,
+    dicts are merged recursively, lists are combined as sets.
+    '''
+
+    for k, v in new.items():
+        if k in parent:
+            if isinstance(v, dict):
+                parent[k] = merge_dicts(parent[k], v)
+
+            elif isinstance(v, list):
+                parent[k] = merge_lists(parent[k], v)
+
+            else:
+                parent[k] = v
+        else:
+            parent[k] = v
+
+    return parent
+
+
+def merge_lists(parent: List, new: List) -> List:
+    '''
+    Merge the new list into parent. If a new item isn't in the list, add it.
+    Overwriting and nesting are not supported as they seem ill-defined.
+    '''
+    for v in new:
+        if v not in parent:
+            parent.append(v)
+
+    return parent
+
+
+def get_nested(config: Dict, keys: str):
+    '''
+    Return the value of the nested keys, or None if the key is invalid.
+    keys is a period-separated list of keys as a string
+    '''
+    if config is None:
+        return None
+    keys = keys.split('.')
+    value = config
+    try:
+        for k in keys:
+            value = value[k]
+    except KeyError:
+        return None
+    return value
+
+
+def check_wildcards(path: str, wildcards: str) -> bool:
+    '''
+    Check if the supplied path contains all required wildcards.
+    wildcards are provided as a comma-separated string.
+    Returns True if all wildcards are present in path, e.g. {wildcard} in path,
+    else raises a ValueError with the unfound wildcard
+    '''
+    for wildcard in wildcards.split(','):
+        if f'{{{wildcard}}}' not in path:
+            err = f'{{{wildcard}}} not found in {path}'
+            log.exception(err)
+            raise ValueError(err)
+
+    return True
+
+
+def validate(config: Dict,
+             path: str,
+             exception: str,
+             value: str = None):
+    '''
+    Validate the supplied value, raising an exception if no value is found.
+    config: the config dictionary to look up
+    path: the path in the nested config dict
+    exception: string to display if no value is found
+    value: starting value. Values of None or '' will cause lookup into config
+    '''
+
+    if value is None or value == '':
+        value = get_nested(config, path)
+
+    if value is None:
+        log.exception(exception)
+        raise ValueError(exception)
+
+    return value
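A short sketch of how the lookup and merge helpers behave (hypothetical keys; same import assumption as above):

```python
from config_utils import check_wildcards, get_nested, merge_dicts

config = {'paths': {'blocks': 'blocks_{state}.txt'}}

# Dotted-path lookup returns None instead of raising on missing keys.
assert get_nested(config, 'paths.blocks') == 'blocks_{state}.txt'
assert get_nested(config, 'paths.missing') is None

# Later dicts win on conflicts; nested dicts merge recursively.
merged = merge_dicts({'a': 1, 'sub': {'x': 1}}, {'a': 2, 'sub': {'y': 2}})
assert merged == {'a': 2, 'sub': {'x': 1, 'y': 2}}

# check_wildcards returns True, or raises ValueError on a missing wildcard.
assert check_wildcards('blocks_{state}.txt', 'state')
```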
diff --git a/code/misc/mystats.py b/code/misc/mystats.py
index c18d7a9..f71e2fb 100644
--- a/code/misc/mystats.py
+++ b/code/misc/mystats.py
@@ -1,43 +1,48 @@
 import math
 import numpy.random

-def mean(l):
-    l = filter(lambda x: x != 'NA' and not math.isnan(x), l)
-    if len(l) == 0:
-        #TODO float('nan') ?
+
+def mean(values):
+    values = [x for x in values if x != 'NA' and not math.isnan(x)]
+    if len(values) == 0:
+        # TODO float('nan') ?
         return 'NA'
-    return float(sum(l)) / len(l)
+    return float(sum(values)) / len(values)
+

-def std_dev(l):
-    l = filter(lambda x: x != 'NA' and not math.isnan(x), l)
-    if len(l) == 0:
+def std_dev(values):
+    values = [x for x in values if x != 'NA' and not math.isnan(x)]
+    if len(values) == 0:
         return 'NA'
-    if len(l) == 1:
+    if len(values) == 1:
         return 0
-    m = mean(l)
-    return math.sqrt(sum([(x - m)**2 for x in l]) / (len(l) - 1))
+    m = mean(values)
+    return math.sqrt(sum([(x - m)**2 for x in values]) / (len(values) - 1))
+

-def std_err(l):
-    l = filter(lambda x: x != 'NA' and not math.isnan(x), l)
-    if len(l) == 0:
+def std_err(values):
+    values = [x for x in values if x != 'NA' and not math.isnan(x)]
+    if len(values) == 0:
         return 'NA'
-    return std_dev(l) / math.sqrt(len(l))
+    return std_dev(values) / math.sqrt(len(values))

-def bootstrap(l, n = 100, alpha = .05):
-    l = filter(lambda x: x != 'NA' and not math.isnan(x), l)
-    x = len(l)
+
+def bootstrap(values, n=100, alpha=.05):
+    values = [x for x in values if x != 'NA' and not math.isnan(x)]
+    x = len(values)
     if x == 0:
         return 'NA', 'NA'
     a = []
     for i in range(n):
-        a.append(mean(numpy.random.choice(l, size = x, replace = True)))
+        a.append(mean(numpy.random.choice(values, size=x, replace=True)))
     a.sort()
-    #print len(a), a.count(0)
-    #print mean(a)
+    # print len(a), a.count(0)
+    # print mean(a)
     return a[int(alpha * n * .5)], a[int((1 - alpha * .5) * n)]

-def median(l):
-    m = sorted(l)
+
+def median(values):
+    m = sorted(values)
     x = len(m)
     if x % 2 == 0:
-        return mean([m[x/2], m[x/2-1]])
+        return mean([m[x//2], m[x//2-1]])
diff --git a/code/misc/overlap.py b/code/misc/overlap.py
index d19ed2a..06001fb 100644
--- a/code/misc/overlap.py
+++ b/code/misc/overlap.py
@@ -17,10 +17,11 @@ def overlap(start1, end1, start2, end2):
             return end2 - start1 + 1
     else:
         return 0

-    #if start1 < start2:
-    #    return max(end1 - start2 + 1, 0) - max(end1 - end2, 0)
-    #return max(end2 - start1 + 1, 0) - max(end2 - end1, 0)
+    # if start1 < start2:
+    #     return max(end1 - start2 + 1, 0) - max(end1 - end2, 0)
+    # return max(end2 - start1 + 1, 0) - max(end2 - end1, 0)
+

 def overlap_any(start1, end1, coords):
     for start2, end2 in coords:
@@ -28,24 +29,28 @@ def overlap_any(start1, end1, coords):
         return True
     return False

+
 def contained(i, start, end):
     return i >= start and i <= end

+
 def contained_any(i, coords):
     for start2, end2 in coords:
         if contained(i, start2, end2):
             return True
     return False

+
 def contained_any_named(i, coords):
     for start2, end2 in coords.keys():
         if contained(i, start2, end2):
             return coords[(start2, end2)]
     return None

+
 def overlap_region(start1, end1, start2, end2):
     o_start = max(start1, start2)
     o_end = min(end1, end2)
     if o_start > o_end:
-        return -1, -1 # disjoint ranges
+        return -1, -1  # disjoint ranges
     return o_start, o_end
diff --git a/code/misc/read_fasta.py b/code/misc/read_fasta.py
index 4554396..a422f30 100644
--- a/code/misc/read_fasta.py
+++ b/code/misc/read_fasta.py
@@ -1,8 +1,14 @@
 import gzip
 import numpy as np
+from typing import Tuple, List

-def read_fasta(fn, gz=False):
+
+def read_fasta(fn: str, gz: bool = False) -> Tuple[
+        List[str], np.array]:
+    '''
+    Read the provided fasta file, returning the
+    headers (lines starting with >) and sequences
+    '''
     headers = []
     seqs = []
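The interval helpers treat coordinates as inclusive on both ends, which the sentinel in `overlap_region` relies on; a quick illustrative check (import paths assumed as above, with `code/misc` on `sys.path`):

```python
from mystats import mean
from overlap import contained, overlap_region

# [2, 5] and [4, 9] share the inclusive positions 4 and 5.
assert overlap_region(2, 5, 4, 9) == (4, 5)
# Disjoint ranges return the (-1, -1) sentinel.
assert overlap_region(2, 3, 5, 9) == (-1, -1)
assert contained(4, 2, 5)

# mean drops 'NA' and NaN entries before averaging.
assert mean([1.0, float('nan'), 3.0]) == 2.0
```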
diff --git a/code/misc/read_maf.py b/code/misc/read_maf.py
index 69bcac4..7950f08 100644
--- a/code/misc/read_maf.py
+++ b/code/misc/read_maf.py
@@ -1,6 +1,7 @@
 import re

-def read_mugsy(fn, required_mult = 1):
+
+def read_mugsy(fn, required_mult=1):
     f = open(fn, 'r')
     line = f.readline()
     while line[0] == '#':
@@ -9,9 +10,9 @@ def read_mugsy(fn, required_mult=1):
     while line != '':
         assert line[0] == 'a', line
         block = {}
-        m = re.search('a score=(?P<score>[0-9]+) ' +\
-                      'label=(?P