diff --git a/.gitignore b/.gitignore
index cbfb72c..967c89c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -33,3 +33,6 @@ code/*/scratch/*
code/setup/*
.coverage
*.swp
+*egg-info
+tags
+*.log
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..773d6cb
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,25 @@
+language: python
+python:
+  - "3.6"
+before_install:
+  cd code
+install:
+  - sudo apt-get update
+  - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh;
+  - bash miniconda.sh -b -p $HOME/miniconda
+  - export PATH="$HOME/miniconda/bin:$PATH"
+  - hash -r
+  - conda config --set always_yes yes --set changeps1 no
+  - conda update -q conda
+  # Useful for debugging any issues with conda
+  - conda info -a
+
+  - conda env create -q -n test-environment --file environment.yml
+  - source activate test-environment
+  - pip install codecov
+
+script:
+  pytest --cov --cov-config .coveragerc --flake8
+
+after_success:
+  - codecov
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..70ca603
--- /dev/null
+++ b/README.md
@@ -0,0 +1,199 @@
+[![Build Status](https://travis-ci.com/troycomi/introgression.svg?branch=master)](https://travis-ci.com/troycomi/introgression)
+[![codecov](https://codecov.io/gh/troycomi/introgression/branch/master/graph/badge.svg)](https://codecov.io/gh/troycomi/introgression)
+
+# introgression
+> Discovering yeast admixture through sequencing
+
+## Background
+TBA
+
+## Installation
+All required packages are specified in the conda environment located in
+`code/environment.yml`. The introgression environment can be generated with
+```bash
+conda env create -f environment.yml
+```
+To access the command line bindings of the main analyze class,
+install the package in editable mode with pip:
+```bash
+conda activate introgression
+pip install --editable .
+```
+while in the `code` directory.
+
+## Usage
+
+### Configuration
+A set of initial parameters is provided in `code/config.yaml`; these need to
+be set specifically for your system and dataset.
+
+Strings of the form \_\_KEY\_\_
+are substituted during execution and are used as a shortcut. For example,
+with `output_root` set to `/data/results`, the value `__OUTPUT_ROOT__/genes/`
+becomes `/data/results/genes/`.
+
+Strings of the form `{state}` are used as wildcards within the code. Their
+location and surrounding characters can change, but the wildcard must stay the
+same. For example, `blocks_{state}.txt` can be changed to
+`{state}_with-block.txt` but not `blocks_{st}.txt`.
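+
+As an illustration, the two mechanisms combine as follows. This is a
+hypothetical config excerpt; only `output_root` and the substitution
+behavior described above come from this documentation, and the other
+key names are made up:
+```yaml
+output_root: /data/results
+# __OUTPUT_ROOT__ is replaced during execution:
+genes: __OUTPUT_ROOT__/genes/               # becomes /data/results/genes/
+# {state} is kept as a wildcard and filled in by the code for each state:
+blocks: __OUTPUT_ROOT__/blocks_{state}.txt
+```
+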
+### Command Line
+With the package installed and the conda environment activated, main methods
+are accessed with the `introgression` command. Some documentation is provided
+by adding the argument `--help` to `introgression` or any of its subcommands.
+
+#### introgression
+Options include:
+- --config: specify one or more configuration files. Files are evaluated in
+order. Conflicting values are overwritten by the most recently listed file.
+This allows a base configuration for the system, with analysis-specific
+configurations added as needed.
+- verbosity: set by varying the number of v's attached to the option, with
+`-v` indicating a log level of critical and `-vvvvv` indicating debug logging.
+- --log-file: Optional location to store log information. Default is stdout.
+If set and run on an interactive shell, some commands will display progress
+bars.
+
+Most subcommand options will overwrite corresponding values in the config
+file. Leaving an option unset without supplying a value in the config file
+will raise an error. Some values can only be set through the config file,
+including the list of chromosomes and the known states.
+
+Available subcommands are:
+##### predict
+The predict subcommand uses an HMM to predict regions of introgression from
+alignment files. Several outputs are used in subsequent steps, which refine
+the predicted introgressed regions.
+
+Test strains on which to predict introgression can be supplied in the config
+file under the name `strains` or pulled from the directory structure of
+`test_strains`.
+
+Available options are:
+- --alignment: input alignment file location with wildcards for
+{prefix} (optional), {strain} and {chrom}.
+- --prefix: An optional wildcard value for alignment files. If left blank,
+will default to the known states joined with an underscore. Leaving the
+{prefix} wildcard out of the alignment file will prevent its use as well.
+- --blocks: An output file containing the regions predicted to belong to the
+given state. Must contain the {state} wildcard, which will be populated with a
+known state during analysis. Columns are the strain, chromosome, the
+predicted state, start position, end position, and the number of sites
+supporting the assignment.
+- --test-strains: If strains are not provided in the config, this file with
+{strain} and {chrom} wildcards will be used to populate the strains for
+prediction.
+- --hmm-initial: Output file with the initial parameters of the HMM for each
+strain.
+- --hmm-trained: Output file with HMM parameters following Baum-Welch training.
+- --positions: Output file with indices of non-gapped, sequenced sites
+which differ between reference alignments.
+- --probabilities: Output file with the probability of each position belonging
+to the master reference strain.
+- --threshold: The threshold value to apply when filtering the predicted HMM
+path through the test sequence. Either a float, indicating a cutoff
+probability, or 'viterbi' to indicate the Viterbi algorithm should be used to
+find the most likely sequence of states.
+- --only-poly-sites/--all-sites: A switch to indicate if all non-gapped,
+sequenced sites should be considered during HMM training, or only polymorphic
+sites. Default is only polymorphic sites.
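+
+A sample predict invocation combining the global options with the predict
+options above (a sketch only; the file paths are hypothetical, and wildcards
+are quoted to keep the braces literal):
+```bash
+introgression --config config.yaml -vvv \
+    predict \
+    --alignment '{prefix}_{strain}_chr{chrom}.maf' \
+    --blocks 'blocks_{state}.txt' \
+    --hmm-initial hmm_init.txt \
+    --hmm-trained hmm_trained.txt \
+    --positions positions.txt.gz \
+    --probabilities probabilities.txt.gz \
+    --threshold viterbi
+```
+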
+##### id-regions
+id-regions prepends a column to block files with a unique region id of the
+form 'r#'. Regions are sorted by their start position. Changing the states
+to label will affect the region numbers, as a different set of regions will
+be considered.
+
+Available options are:
+- --blocks: The input file to label, with the wildcard {state}. This is the
+file produced by predict in the previous step.
+- --labeled: The output file, also containing the {state} wildcard.
+- --state: May be specified multiple times to indicate which states to add
+labels to. Leaving unset will use the states in the config file (recommended).
+
+##### summarize-regions
+Analyzes the regions predicted to be introgressed. Several columns are added
+to the block file containing information about each region, including the
+number of sites matching each state.
+
+Available options are:
+- --state: May be set multiple times for each state to summarize. Leaving
+unset will default to all states in the config file.
+- --labeled: The labeled block file with {state} wildcard created in the
+previous step.
+- --masks: Sequence mask files with {strain} and {chrom} wildcards.
+- --alignment: The input alignment file, as in the predict option.
+- --positions: The position file created during predict.
+- --quality: The output file with a {state} wildcard.
+- --region: The alignment for each region in the labeled file, with {state}
+wildcard. Each state file contains all regions for that state.
+- --region-index: A pickled python dictionary used for random access into the
+region file. Must have the {state} wildcard.
+
+##### filter-regions
+From the quality files produced in `summarize-regions`, filter regions based
+on several criteria, removing those with weak support for the alternative
+hypothesis and those which can be assigned to multiple alternative states.
+
+Regions passing the 'introgressed filter' satisfy all of the following:
+- fraction of gapped or masked sites in reference <= 0.5
+- fraction of gapped or masked sites in predicted state <= 0.5
+- number of matches to predicted >= 7
+- number of matches to predicted > number of matches to reference
+- sequence identity with predicted state is higher than with reference
+- sequence identity with reference is >= 0.7
+
+Regions passing the 'ambiguous filter' match only the predicted state. No
+other state has both:
+- sequence identity >= sequence identity with predicted state * threshold
+- matching bases >= matching bases with predicted state * threshold
+
+Available options are:
+- --region: The region file from summarize-regions.
+- --region-index: The region index file from summarize-regions.
+- --quality: The quality file produced by summarize-regions, with {state}
+wildcard.
+- --introgress-filter: The output file with only regions passing the
+introgression filter. Must contain {state} wildcard.
+- --introgress-inter: An output file with all regions. Includes the reason
+a region was removed by the introgression filter, or blank if it passed.
+Must contain {state} wildcard.
+- --ambiguous-filter: Output file containing only regions which pass the
+ambiguous filter after passing the introgression filter.
+Must contain {state} wildcard.
+- --ambiguous-inter: Contains all regions from the introgression filter with a
+column for the reason the region failed ambiguous filtering. Must contain
+{state} wildcard.
+- --threshold: The threshold to apply to the ambiguous filter.
+- --filter-sweep: If set and threshold values are supplied as arguments,
+will output summary information for applying the ambiguous filter with various
+threshold values.
+
+`filter-regions` accepts multiple threshold values as arguments to test
+and output to the `filter-sweep` file. Sample usage would be
+```bash
+introgression --config config.yml \
+    filter-regions \
+    --threshold 0.995 \
+    --filter-sweep sweep.txt \
+    0.99 0.98 0.8  # these are the sweep arguments
+```
+where 0.99, 0.98 and 0.8 are used as test threshold values and summarized in
+sweep.txt. Note that the ambiguous filter will only use the threshold 0.995
+in this example.
+
+##### summarize-strains
+summarize-strains produces summary information for each test strain, including
+the number of regions and bases assigned to each hidden state, filtered at
+each stage, and ambiguous between states.
+
+Available options are:
+- --introgress-inter: The introgressed filter file as used in `filter-regions`.
+- --ambiguous-inter: The ambiguous filter file as used in `filter-regions`.
+- --strain-info: Tab-separated table with information on the strain to include
+with the summary output. Columns should be the strain name, alternate name,
+location, environment, and population.
+- --state-counts: The summary output file.
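+
+The subcommands are designed to run in sequence, with each step consuming
+output from the previous one. A minimal end-to-end sketch, assuming every
+input and output location is supplied through the config file:
+```bash
+introgression --config config.yaml -vvv predict
+introgression --config config.yaml id-regions
+introgression --config config.yaml summarize-regions
+introgression --config config.yaml filter-regions --threshold 0.995
+introgression --config config.yaml summarize-strains
+```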
+ +## License +TBD diff --git a/code/align/aggregate_alignment_stats.py b/code/align/aggregate_alignment_stats.py index 290179a..a3e72bd 100644 --- a/code/align/aggregate_alignment_stats.py +++ b/code/align/aggregate_alignment_stats.py @@ -1,14 +1,17 @@ import os -import sys -sys.path.insert(0, '..') import global_params as gp gp_dir = '../' -stats_files = [gp_dir + gp.alignments_dir + x for x in filter(\ - lambda x: 'stats' in x and 'summary' not in x, os.listdir(gp_dir + gp.alignments_dir))] +stats_files = [gp_dir + gp.alignments_dir + x for x in filter( + lambda x: 'stats' in x and 'summary' not in x, + os.listdir(gp_dir + gp.alignments_dir))] # goal is to generate file for R (e.g. for two references and test strain): -# chromosome strain frac_S288c_S288c frac_S288c_CBS432 frac_S288c_x frac_CBS432_S288c frac_CBS432_CBS432 frac_CBS432_x frac_x_S288c frac_x_CBS432 frac_x_x aligned_length_S288c aligned_length_CBS432 aligned_length_x num_align_columns_0 num_align_columns_1 num_align_columns_2 num_align_columns_3 +# chromosome strain frac_S288c_S288c frac_S288c_CBS432 frac_S288c_x +# frac_CBS432_S288c frac_CBS432_CBS432 frac_CBS432_x frac_x_S288c +# frac_x_CBS432 frac_x_x aligned_length_S288c aligned_length_CBS432 +# aligned_length_x num_align_columns_0 num_align_columns_1 +# num_align_columns_2 num_align_columns_3 f = open(gp_dir + gp.alignments_dir + 'mafft_stats_summary.txt', 'w') @@ -25,15 +28,15 @@ for i in range(0, len(gp.alignment_ref_order) + 2): f.write('\t' + 'num_align_columns_' + str(i)) - + f.write('\n') all_strains = gp.alignment_ref_order + ['x'] # one line for each of these files for fn in stats_files: - print fn - + print(fn) + lines = [line.strip() for line in open(fn, 'r').readlines()] # histogram of number of number of strains aligned @@ -43,10 +46,10 @@ c.append(float(lines[i + offset].split(',')[1])) # aligned lengths - l = [] + lengths = [] offset += len(all_strains) + 1 + 2 for i in range(len(all_strains)): - l.append(float(lines[i + offset].split(',')[1])) + lengths.append(float(lines[i + offset].split(',')[1])) sx = lines[offset + len(all_strains) - 1].split(',')[0] @@ -67,7 +70,7 @@ for j in range(len(all_strains)): f.write('\t' + str(fr[i][j])) for i in range(len(all_strains)): - f.write('\t' + str(l[i])) + f.write('\t' + str(lengths[i])) for i in range(len(all_strains) + 1): f.write('\t' + str(c[i])) f.write('\n') diff --git a/code/align/align_helpers.py b/code/align/align_helpers.py index be216de..a1f44ee 100644 --- a/code/align/align_helpers.py +++ b/code/align/align_helpers.py @@ -1,12 +1,24 @@ import os import global_params as gp +from typing import List, Tuple -def flatten(l): +def flatten(l: List[List]) -> List: + ''' + Flatten list of lists into a single list + ''' return [item for sublist in l for item in sublist] -def get_strains(dirs): +def get_strains(dirs: List[str]) -> List[Tuple[str, str]]: + ''' + Find all strains in the provided list of directories + Returns a sorted list of tuples with (strain_name, directory) entries + Checks for files with the fasta_suffix and contain _chr + strain_name is the name of the file up to _chr. + Raises assertion error if the number of files found is < number of strains + * the number of chromosomes + ''' # get all non-reference strains of cerevisiae and paradoxus; could # generalize this someday... 
@@ -15,10 +27,11 @@ def get_strains(dirs): for d in dirs: fns = os.listdir(d) # only look at fasta files in the directory - fns = filter(lambda x: x.endswith(gp.fasta_suffix), fns) # only look at files containing '_chr' which should be chromosome # sequence files - fns = list(filter(lambda x: '_chr' in x, fns)) + fns = list( + filter(lambda x: x.endswith(gp.fasta_suffix) and '_chr' in x, + fns)) num_files = len(fns) if num_files == 0: print(f'found no chromosome sequence files in {d} ' @@ -33,7 +46,13 @@ def get_strains(dirs): return sorted(s) -def concatenate_fasta(input_files, names, output_file): +def concatenate_fasta(input_files: List[str], + names: List[str], + output_file: str) -> None: + ''' + Combines several fasta files together into a single output + Adds header between each input fasta as > name[i] filename + ''' with open(output_file, 'w') as output: for i, file in enumerate(input_files): with open(file, 'r') as input: diff --git a/code/align/alignment_stats.py b/code/align/alignment_stats.py index 9a06939..58f5ac3 100644 --- a/code/align/alignment_stats.py +++ b/code/align/alignment_stats.py @@ -1,8 +1,6 @@ -import sys -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../misc') -import read_fasta +from misc import read_fasta + # count sites where n, ..., 3, 2, 1 genomes aligned, etc. def num_strains_aligned_by_site(seqs): @@ -17,7 +15,8 @@ def num_strains_aligned_by_site(seqs): return num_strains_hist -# fraction of each strain's sequence contained in alignment + +# fraction of each strain's sequence contained in alignment # (should be 1) def fraction_strains_aligned(headers, seqs): nseqs = len(seqs) @@ -34,6 +33,7 @@ def fraction_strains_aligned(headers, seqs): return fracs_aligned, seq_lengths + # using each genome as reference, percentage of other genomes aligned def frac_aligned_to_reference(seqs, seq_lengths): nseqs = len(seqs) @@ -47,7 +47,8 @@ def frac_aligned_to_reference(seqs, seq_lengths): else: total = 0 for i in range(nsites): - if seqs[ref][i] != gp.gap_symbol and seqs[other][i] != gp.gap_symbol: + if seqs[ref][i] != gp.gap_symbol and \ + seqs[other][i] != gp.gap_symbol: total += 1 r.append(float(total) / seq_lengths[other]) fracs_aligned_to_ref.append(r) diff --git a/code/align/average_alignment_stats.py b/code/align/average_alignment_stats.py index 718cd85..b6c04af 100644 --- a/code/align/average_alignment_stats.py +++ b/code/align/average_alignment_stats.py @@ -1,14 +1,12 @@ import os -import sys -sys.path.insert(0, '..') import global_params as gp import numpy gp_dir = '../' -stats_files = [gp_dir + gp.alignments_dir + x for x in filter(\ +stats_files = [gp_dir + gp.alignments_dir + x for x in filter( lambda x: 'stats' in x, os.listdir(gp_dir + gp.alignments_dir))] -#avg_frac_aligned_by_chrm = dict(zip(gp.chrms, [0]*len(gp.chrms))) +# avg_frac_aligned_by_chrm = dict(zip(gp.chrms, [0]*len(gp.chrms))) avg_frac_aligned_p = 0 avg_frac_aligned_x = 0 total_p = 0 @@ -26,18 +24,18 @@ avg_frac_aligned_x += fx * lx total_p += lp total_x += lx - #print fn[fn.find('chr')-8:], fx, lx, lc + # print fn[fn.find('chr')-8:], fx, lx, lc a.append(fx) avg_frac_aligned_p /= total_p avg_frac_aligned_x /= total_x -print len(stats_files) -print avg_frac_aligned_p -print avg_frac_aligned_x +print(len(stats_files)) +print(avg_frac_aligned_p) +print(avg_frac_aligned_x) hist, edges = numpy.histogram(a, bins=30) -print hist -print edges -print sum(hist[:-1]) +print(hist) +print(edges) +print(sum(hist[:-1])) diff --git a/code/align/convert_coordinates.py 
b/code/align/convert_coordinates.py index e38f872..a14ded6 100644 --- a/code/align/convert_coordinates.py +++ b/code/align/convert_coordinates.py @@ -1,13 +1,11 @@ -import sys import gzip -sys.path.insert(0, '..') import global_params as gp def convert(s1, s2): i2 = -1 i2d = 0 - l = [] + result = [] for i in range(len(s1)): if s2[i] == gp.gap_symbol: i2d += 1 @@ -16,14 +14,14 @@ def convert(s1, s2): i2d = 0 if s1[i] != gp.gap_symbol: if i2d == 0: - l.append(str(i2)) + result.append(str(i2)) else: - l.append(str(i2) + '.' + str(i2d)) - return l + result.append(str(i2) + '.' + str(i2d)) + return result -def write_coordinates(l, fn): + +def write_coordinates(coords, fn): f = gzip.open(fn, 'wb') - f.write('\n'.join([str(x) for x in l])) + f.write('\n'.join([str(x) for x in coords])) f.write('\n') f.close() - diff --git a/code/align/convert_coordinates_main.py b/code/align/convert_coordinates_main.py index 22ff28e..08cc90a 100644 --- a/code/align/convert_coordinates_main.py +++ b/code/align/convert_coordinates_main.py @@ -1,33 +1,30 @@ -import sys import os -from convert_coordinates import * -sys.path.insert(0, '..') +from convert_coordinates import (write_coordinates, convert) import global_params as gp -sys.path.insert(0, '../misc/') -import read_fasta +from misc import read_fasta gp_dir = '../' fns = os.listdir(gp_dir + gp.alignments_dir) fns = filter(lambda fn: fn.endswith(gp.alignment_suffix), fns) for fn in fns: - print fn + print(fn) x = fn.split('_') chrm = x[-2] strain_names = x[0:-2] headers, seqs = read_fasta.read_fasta(gp_dir + gp.alignments_dir + fn) - + # for each index in cer reference, get index in other strain # (either par reference for 2-way alignment or cer strain for # 3-way) - coord_fn = gp.analysis_out_dir_absolute + 'coordinates/' + \ - strain_names[0] + '_to_' + strain_names[-1] + \ - '_' + chrm + '.txt.gz' + coord_fn = (gp.analysis_out_dir_absolute + 'coordinates/' + + strain_names[0] + '_to_' + strain_names[-1] + + '_' + chrm + '.txt.gz') write_coordinates(convert(seqs[0], seqs[-1]), coord_fn) # for each index in other strain, get index in cer reference - coord_fn = gp.analysis_out_dir_absolute + 'coordinates/' + \ - strain_names[-1] + '_to_' + strain_names[0] + \ - '_' + chrm + '.txt.gz' + coord_fn = (gp.analysis_out_dir_absolute + 'coordinates/' + + strain_names[-1] + '_to_' + strain_names[0] + + '_' + chrm + '.txt.gz') write_coordinates(convert(seqs[-1], seqs[0]), coord_fn) diff --git a/code/align/mask.py b/code/align/mask.py index df0ffa4..4d5ae74 100644 --- a/code/align/mask.py +++ b/code/align/mask.py @@ -1,7 +1,6 @@ import sys import os -from mask_helpers import * -import align_helpers +from mask_helpers import mask from analyze import read_args import global_params as gp @@ -33,25 +32,22 @@ current_strain_fn = d + strain_fn.replace('*', strain) current_strain_masked_fn = d + strain_masked_fn.replace('*', strain) - current_strain_intervals_fn = intervals_d + intervals_fn.replace('*', strain) + current_strain_intervals_fn = intervals_d + intervals_fn.replace('*', + strain) for chrm in gp.chrms: in_fn = current_strain_fn.replace('?', chrm) out_fn = current_strain_intervals_fn.replace('?', chrm) - + # get dustmasker intervals - cmd_string = gp.blast_install_path + 'dustmasker' + \ - ' -in ' + in_fn + \ - ' -out ' + out_fn + \ - ' -outfmt interval' - + cmd_string = (gp.blast_install_path + 'dustmasker' + + ' -in ' + in_fn + + ' -out ' + out_fn + + ' -outfmt interval') + os.system(cmd_string) # replace those intervals with Ns and write to masked fasta file masked_fn 
= current_strain_masked_fn.replace('?', chrm) mask(in_fn, masked_fn, out_fn) - - - - diff --git a/code/align/mask_helpers.py b/code/align/mask_helpers.py index 8bd7dc6..b71ede7 100644 --- a/code/align/mask_helpers.py +++ b/code/align/mask_helpers.py @@ -1,14 +1,11 @@ -import sys -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../misc') -import read_fasta -import write_fasta +from misc import read_fasta +from misc import write_fasta -def read_intervals(fn): +def read_intervals(fn): f = open(fn, 'r') - f.readline() # header + f.readline() # header line = f.readline() intervals = [] while line != '': @@ -18,8 +15,8 @@ def read_intervals(fn): f.close() return intervals -def mask(fn, masked_fn, intervals_fn): +def mask(fn, masked_fn, intervals_fn): headers, seqs = read_fasta.read_fasta(fn) seq = list(seqs[0]) intervals = read_intervals(intervals_fn) @@ -28,4 +25,3 @@ def mask(fn, masked_fn, intervals_fn): seq[i] = gp.unsequenced_symbol seq = ''.join(seq) write_fasta.write_fasta(headers, [seq], masked_fn) - diff --git a/code/align/master_alignment.py b/code/align/master_alignment.py index 7309012..aff6cc9 100644 --- a/code/align/master_alignment.py +++ b/code/align/master_alignment.py @@ -1,61 +1,60 @@ -# combine all chromosomal alignments into one master, indexed relative to cerevisiae reference - -import sys -import os -import copy -sys.path.insert(0, '../misc') -import read_maf -sys.path.insert(0, '..') +# combine all chromosomal alignments into one master +# indexed relative to cerevisiae reference + +from misc import read_maf import global_params as gp -complement = {'A':'T', 'T':'A', 'G':'C', 'C':'G', \ - 'a':'t', 't':'a', 'g':'c', 'c':'g', \ - 'N':'N', 'n':'n', '-':'-'} +complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', + 'a': 't', 't': 'a', 'g': 'c', 'c': 'g', + 'N': 'N', 'n': 'n', '-': '-'} + +flip = {'-': '+', '+': '-'} -flip = {'-':'+', '+':'-'} def reverse_start(start, length, total_length): return total_length - start - length -def reverse_complement(s): +def reverse_complement(s): r = [] for b in s[::-1]: r.append(complement[b]) return r -def forward_index(blocks): +def forward_index(blocks): # go through all blocks and add a field for start relative to # forward strand, and sequence in forward direction for label in blocks.keys(): - block = blocks[label] for strain in blocks[label]['strains'].keys(): start = blocks[label]['strains'][strain]['start'] seq = blocks[label]['strains'][strain]['sequence'] - + blocks[label]['strains'][strain]['forward_start'] = start blocks[label]['strains'][strain]['forward_sequence'] = seq if blocks[label]['strains'][strain]['strand'] == '-': - blocks[label]['strains'][strain]['forward_sequence'] = seq[::-1] + blocks[label]['strains'][strain]['forward_sequence'] = \ + seq[::-1] blocks[label]['strains'][strain]['forward_start'] = \ - reverse_start(start, blocks[label]['strains'][strain]['length'], \ - blocks[label]['strains'][strain]['aligned_length']) + reverse_start( + start, + blocks[label]['strains'][strain]['length'], + blocks[label]['strains'][strain]['aligned_length']) return blocks -# make all master sequences go in forward direction (+) and flip -# others as necessary -def master_forward(blocks, master): +def master_forward(blocks, master): + # make all master sequences go in forward direction (+) and flip + # others as necessary for label in blocks.keys(): - block = blocks[label] - if blocks[label]['strains'].has_key(master): + if master in blocks[label]['strains']: if blocks[label]['strains'][master]['strand'] == 
'-': for strain in blocks[label]['strains'].keys(): - aligned_length = blocks[label]['strains'][strain]['aligned_length'] + aligned_length = \ + blocks[label]['strains'][strain]['aligned_length'] seq = blocks[label]['strains'][strain]['sequence'] start = blocks[label]['strains'][strain]['start'] length = blocks[label]['strains'][strain]['length'] @@ -70,11 +69,12 @@ def master_forward(blocks, master): return blocks + def make_master(fn, master): # keyed by block label; most of info in each keyed by ['strains'][strain] blocks = read_maf.read_mugsy(fn) - + # flip all blocks so that master sequence is on + strand blocks = master_forward(blocks, master) # add fields giving index and sequence relative to + strand @@ -83,13 +83,13 @@ def make_master(fn, master): # make sequences with alignment columns present in master n = blocks['1']['strains'][master]['aligned_length'] all_strains = blocks['1']['strains'].keys() - a = dict(zip(all_strains, [[gp.unaligned_symbol] * n for s in all_strains])) + a = dict(zip(all_strains, + [[gp.unaligned_symbol] * n for s in all_strains])) # loop through all blocks for label in blocks.keys(): - block = blocks[label] # only care about aligned blocks that include master sequence - if blocks[label]['strains'].has_key(master): + if master in blocks[label]['strains']: absolute_ind = blocks[label]['strains'][master]['start'] master_seq = blocks[label]['strains'][master]['sequence'] block_length = len(master_seq) @@ -104,25 +104,23 @@ def make_master(fn, master): # apparently mugsy sometimes aligns the same part # of one genome to multiple parts of another # genome. this is a problem. - assert a[master][absolute_ind] == gp.unaligned_symbol, absolute_ind + assert a[master][absolute_ind] == gp.unaligned_symbol,\ + absolute_ind # loop through all the strains in this block for strain in strains: a[strain][absolute_ind] = \ - blocks[label]['strains'][strain]\ - ['forward_sequence'][relative_ind] + blocks[label]['strains'][strain][ + 'forward_sequence'][relative_ind] absolute_ind += 1 for strain in all_strains: a[strain] = ''.join(a[strain]) - print strain, a[strain].count(gp.unaligned_symbol) - - #assert total_aligned_master == n - a[master].count(gp.unaligned_symbol), \ - # str(total_aligned_master) + ' ' + str(n - a[master].count(gp.unaligned_symbol)) + print(strain, a[strain].count(gp.unaligned_symbol)) return a -def write_master(fn, a): +def write_master(fn, a): f = open(fn, 'w') for strain in a.keys(): f.write('> ' + strain + '\n') diff --git a/code/align/master_alignment_main.py b/code/align/master_alignment_main.py index 52d2126..f0c5a10 100644 --- a/code/align/master_alignment_main.py +++ b/code/align/master_alignment_main.py @@ -2,9 +2,8 @@ # to one reference import sys -from master_alignment import * -sys.path.insert(0, '..') -from align_helpers import * +from align.master_alignment import (make_master, write_master) +from align.align_helpers import (get_strains, flatten) import global_params as gp strains = get_strains(flatten(gp.non_ref_dirs.values())) @@ -19,9 +18,9 @@ '_'.join(gp.alignment_ref_order) + \ '_' + strain for chrm in gp.chrms: - print chrm + print(chrm) alignment_fn = alignment_prefix + '_chr' + chrm + gp.alignment_suffix - master_alignment_fn = alignment_prefix + '_chr' + chrm + '_master' + gp.fasta_suffix + master_alignment_fn = (alignment_prefix + '_chr' + + chrm + '_master' + gp.fasta_suffix) a = make_master(alignment_fn, gp.master_ref) write_master(master_alignment_fn, a) - diff --git a/code/align/polymorphism.py b/code/align/polymorphism.py 
index 566f2fe..1f191c4 100644 --- a/code/align/polymorphism.py +++ b/code/align/polymorphism.py @@ -1,6 +1,8 @@ # calculate polymorphism rate between reference genomes in 100-bp # windows across each chromosome +import sys +from misc import read_fasta headers, seqs = read_fasta.read_fasta(sys.argv[1]) a = dict(zip(headers, seqs)) diff --git a/code/align/ref_ids_main.py b/code/align/ref_ids_main.py index f6cf7b8..58812fd 100644 --- a/code/align/ref_ids_main.py +++ b/code/align/ref_ids_main.py @@ -1,14 +1,8 @@ -import re -import sys -import os -import copy from collections import defaultdict -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../misc/') -import mystats -import seq_functions -import read_fasta +from misc import mystats +from misc import seq_functions +from misc import read_fasta # get pairwise identities between all aligned references: # - overall average @@ -23,28 +17,28 @@ pair_chrm_ids = defaultdict(lambda: defaultdict(list)) for chrm in gp.chrms: - print chrm - fn = gp_dir + gp.alignments_dir + \ - '_'.join(gp.alignment_ref_order) + \ - '_chr' + chrm + '_mafft' + gp.alignment_suffix + print(chrm) + fn = (gp_dir + gp.alignments_dir + + '_'.join(gp.alignment_ref_order) + + '_chr' + chrm + '_mafft' + gp.alignment_suffix) headers, seqs = read_fasta.read_fasta(fn) for i in range(nrefs): ref1 = gp.alignment_ref_order[i] for j in range(i+1, nrefs): - print i, j + print(i, j) ref2 = gp.alignment_ref_order[j] ids = seq_functions.seq_id_windowed(seqs[i], seqs[j], window) - + pair_chrm_ids[(ref1, ref2)][chrm] = ids -fs = open(gp.analysis_out_dir_absolute + 'ref_ids_summary_' + \ - '_'.join(gp.alignment_ref_order) + '.txt', 'w') +fs = open(gp.analysis_out_dir_absolute + 'ref_ids_summary_' + + '_'.join(gp.alignment_ref_order) + '.txt', 'w') fs.write('pair\tchromosome\tmean\tmedian\n') -f = open(gp.analysis_out_dir_absolute + 'ref_ids_' + \ +f = open(gp.analysis_out_dir_absolute + 'ref_ids_' + '_'.join(gp.alignment_ref_order) + '.txt', 'w') f.write('pair\tid\n') @@ -53,14 +47,14 @@ pair_string = ','.join(pair) for chrm in gp.chrms: ids = pair_chrm_ids[pair][chrm] - fs.write(pair_string + '\t' + \ - chrm + '\t' + \ - str(mystats.mean(ids)) + '\t' + \ + fs.write(pair_string + '\t' + + chrm + '\t' + + str(mystats.mean(ids)) + '\t' + str(mystats.median(ids)) + '\n') all_ids += ids - fs.write(pair_string + '\t' + \ - 'all' + '\t' + \ - str(mystats.mean(all_ids)) + '\t' + \ + fs.write(pair_string + '\t' + + 'all' + '\t' + + str(mystats.mean(all_ids)) + '\t' + str(mystats.median(all_ids)) + '\n') for i in ids: diff --git a/code/align/run_alignment_stats.py b/code/align/run_alignment_stats.py index 0fd7052..7f99b73 100644 --- a/code/align/run_alignment_stats.py +++ b/code/align/run_alignment_stats.py @@ -1,8 +1,11 @@ +import sys import os -from alignment_stats import * -from align_helpers import * -sys.path.insert(0, '..') +from align.alignment_stats import (num_strains_aligned_by_site, + fraction_strains_aligned, + frac_aligned_to_reference) +from align.align_helpers import (flatten, get_strains) import global_params as gp +from misc import read_fasta # gives info related to how good an alignment is: # - number of sites where n, ..., 3, 2, 1, genomes aligned @@ -14,10 +17,11 @@ strain, d = s[int(sys.argv[1])] gp_dir = '../' -fn_start = gp_dir + gp.alignments_dir + '_'.join(gp.alignment_ref_order) + '_' + strain + '_chr' +fn_start = (gp_dir + gp.alignments_dir + '_'.join(gp.alignment_ref_order) + + '_' + strain + '_chr') for chrm in gp.chrms: - print chrm + print(chrm) 
sys.stdout.flush() if not os.path.isfile(fn_start + chrm + '_mafft.maf'): @@ -30,8 +34,9 @@ # number of sites where n,...,3,2,1 genomes aligned num_strains_by_site = num_strains_aligned_by_site(seqs) - f_out.write(\ - '# histogram of number of strains aligned across all alignment columns\n') + f_out.write( + '# histogram of number of strains ' + 'aligned across all alignment columns\n') for n in range(len(num_strains_by_site)): f_out.write(str(n) + ',' + str(num_strains_by_site[n]) + '\n') f_out.write('\n') @@ -44,7 +49,8 @@ # length of chromosomes f_out.write('chromosome aligned lengths\n') for n in range(len(seqs)): - f_out.write(headers[n][1:].strip().split(' ')[0] + ',' + str(seq_lengths[n]) + '\n') + f_out.write(headers[n][1:].strip().split(' ')[0] + + ',' + str(seq_lengths[n]) + '\n') f_out.write('\n') # using each genome as reference, fraction of other genomes aligned @@ -52,7 +58,7 @@ frac_aligned_to_ref = frac_aligned_to_reference(seqs, seq_lengths) for ref in range(len(seqs)): f_out.write(headers[ref][1:].strip().split(' ')[0]) - for other in range(len(seqs)): + for other in range(len(seqs)): f_out.write(',' + str(frac_aligned_to_ref[ref][other])) f_out.write('\n') f_out.write('\n') diff --git a/code/align/run_mafft.py b/code/align/run_mafft.py index 4681f77..e5f6681 100644 --- a/code/align/run_mafft.py +++ b/code/align/run_mafft.py @@ -1,6 +1,6 @@ import sys import os -from align_helpers import * +from align.align_helpers import (concatenate_fasta) from analyze import read_args import global_params as gp @@ -19,8 +19,8 @@ if os.stat(args['alignments_directory'] + fn).st_size != 0: a.append(fn) ref_prefix = '_'.join(args['references']) + '_' -ref_fns = [args['reference_directories'][r] + r + '_chr' + '?' + \ - mask_suffix + gp.fasta_suffix \ +ref_fns = [args['reference_directories'][r] + r + '_chr' + '?' 
+ + mask_suffix + gp.fasta_suffix for r in args['references']] if ref_only: @@ -30,16 +30,16 @@ ref_fns_chrm = [x.replace('?', chrm) for x in ref_fns] combined_fn = 'run_mafft_' + chrm + '.temp' - concatenate_fasta(ref_fns_chrm, \ + concatenate_fasta(ref_fns_chrm, args['references'], combined_fn) - - align_fn = ref_prefix + 'chr' + chrm + \ - '_mafft' + gp.alignment_suffix + + align_fn = (ref_prefix + 'chr' + chrm + + '_mafft' + gp.alignment_suffix) align_fn_abs = args['alignments_directory'] + align_fn - cmd_string = gp.mafft_install_path + '/mafft ' + \ - combined_fn + ' > ' + align_fn_abs + '; ' - + cmd_string = (gp.mafft_install_path + '/mafft ' + + combined_fn + ' > ' + align_fn_abs + '; ') + cmd_string += 'rm ' + combined_fn + ';' print(cmd_string) @@ -60,7 +60,6 @@ # shell instance every time (I think there's a limit on the # command character count or something which is why we're not # making a single string for all strains) -#cmd_string = '' current_strain_fn = d + strain_fn.replace('*', strain) @@ -74,7 +73,7 @@ # if we don't already have an alignment for this strain/chromosome # (or that alignment file is empty), then make one - #if (align_fn not in a) or (os.stat(align_fn_abs).st_size == 0): + # if (align_fn not in a) or (os.stat(align_fn_abs).st_size == 0): if align_fn not in a: cmd_string = '' @@ -82,18 +81,18 @@ ref_fns_chrm = [x.replace('?', chrm) for x in ref_fns] current_strain_fn_chrm = current_strain_fn.replace('?', chrm) combined_fn = 'run_mafft_' + strain + chrm + '.temp' - - concatenate_fasta(ref_fns_chrm + [current_strain_fn_chrm], \ + + concatenate_fasta(ref_fns_chrm + [current_strain_fn_chrm], args['references'] + [strain], combined_fn) - + # add --ep 0.123 to maybe get shorter alignment - #cmd_string += gp.mafft_install_path + '/mafft --ep 0.123 ' + \ + # cmd_string += gp.mafft_install_path + '/mafft --ep 0.123 ' + \ # combined_fn + ' > ' + align_fn_abs + '; ' - #cmd_string += gp.mafft_install_path + '/mafft --retree 1 ' + \ + # cmd_string += gp.mafft_install_path + '/mafft --retree 1 ' + \ # combined_fn + ' > ' + align_fn_abs + '; ' cmd_string += gp.mafft_install_path + '/mafft ' + \ combined_fn + ' > ' + align_fn_abs + '; ' - + cmd_string += 'rm ' + combined_fn + ';' print(cmd_string) @@ -109,7 +108,3 @@ else: print("already did this alignment: " + strain + ' chromosome ' + chrm) sys.stdout.flush() - -#print cmd_string -#sys.stdout.flush() -#os.system(cmd_string) diff --git a/code/align/run_mafft_refs.py b/code/align/run_mafft_refs.py index 21105c1..1242531 100644 --- a/code/align/run_mafft_refs.py +++ b/code/align/run_mafft_refs.py @@ -2,8 +2,7 @@ import sys import os -from align_helpers import * -sys.path.insert(0, '..') +from align.align_helpers import concatenate_fasta import global_params as gp masked = False @@ -16,9 +15,9 @@ if gp.resume_alignment: a = os.listdir(gp_dir + gp.alignments_dir) -ref_prefix = '_'.join(gp.alignment_ref_order) -ref_fns = [gp.ref_dir[r] + gp.ref_fn_prefix[r] + '_chr' + '?' + \ - mask_suffix + gp.fasta_suffix \ +ref_prefix = '_'.join(gp.alignment_ref_order) +ref_fns = [gp.ref_dir[r] + gp.ref_fn_prefix[r] + '_chr' + '?' 
+ + mask_suffix + gp.fasta_suffix for r in gp.alignment_ref_order] @@ -26,11 +25,11 @@ # shell instance every time (I think there's a limit on the # command character count or something which is why we're not # making a single string for all strains) -#cmd_string = '' +# cmd_string = '' chrm = gp.chrms[int(sys.argv[1])] -print chrm +print(chrm) sys.stdout.flush() align_fn = ref_prefix + '_chr' + chrm + \ @@ -44,28 +43,24 @@ # first put all sequences in same (temporary) file ref_fns_chrm = [x.replace('?', chrm) for x in ref_fns] combined_fn = 'run_mafft_' + chrm + '.temp' - - concatenate_fasta(ref_fns_chrm, \ + + concatenate_fasta(ref_fns_chrm, gp.alignment_ref_order, combined_fn) - - cmd_string += gp.mafft_install_path + '/mafft ' + \ - combined_fn + ' > ' + align_fn_abs + '; ' - + + cmd_string += (gp.mafft_install_path + '/mafft ' + + combined_fn + ' > ' + align_fn_abs + '; ') + cmd_string += 'rm ' + combined_fn + ';' - - print cmd_string + + print(cmd_string) sys.stdout.flush() os.system(cmd_string) # want some kind of indication if alignment fails (due to # running out of memory probably) if os.stat(align_fn_abs).st_size == 0: - print 'alignment failed:' + ' chromosome ' + chrm + print('alignment failed: chromosome ' + chrm) sys.stdout.flush() sys.exit() else: - print "already did this alignment:" + ' chromosome ' + chrm - -#print cmd_string -#sys.stdout.flush() -#os.system(cmd_string) + print('already did this alignment: chromosome ' + chrm) diff --git a/code/align/run_mugsy.py b/code/align/run_mugsy.py index e858769..7af436b 100644 --- a/code/align/run_mugsy.py +++ b/code/align/run_mugsy.py @@ -1,7 +1,5 @@ -import sys import os -sys.path.insert(0, '..') -from align_helpers import * +from align.align_helpers import get_strains, flatten import global_params as gp # get all non-reference strains of cerevisiae and paradoxus @@ -22,23 +20,25 @@ ref_dirs = [gp.ref_dir[ref] for ref in gp.alignment_ref_order] for strain, d in s: - print strain + print(strain) cmd_string = cmd_string_start - + for chrm in [gp.chrms[-1]]: align_fn = ref_prefix + strain + '_chr' + chrm + gp.alignment_suffix - # if we don't already have an alignment for this strain/chromosome, then make one + # if we don't already have an alignment for this strain/chromosome, + # then make one if align_fn not in a: cmd_string += gp.mugsy_install_path + '/mugsy ' + \ '--directory ' + gp_dir + gp.alignments_dir + ' ' + \ '--prefix ' + ref_prefix + strain + '_chr' + chrm for ref in gp.alignment_ref_order: - cmd_string += ' ' + gp.ref_dir[ref] + '/' + \ - gp.ref_fn_prefix[ref] + '_chr' + chrm + gp.fasta_suffix - cmd_string += ' ' + d + '/' + strain + '_chr' + chrm + gp.fasta_suffix + '; ' - + cmd_string += (' ' + gp.ref_dir[ref] + '/' + + gp.ref_fn_prefix[ref] + '_chr' + + chrm + gp.fasta_suffix) + cmd_string += (' ' + d + '/' + strain + + '_chr' + chrm + gp.fasta_suffix + '; ') # commands can only be up to a certain length so break it up this way - print cmd_string + print(cmd_string) os.system(cmd_string) diff --git a/code/align/run_tcoffee.py b/code/align/run_tcoffee.py index c94a3af..dcf48bb 100644 --- a/code/align/run_tcoffee.py +++ b/code/align/run_tcoffee.py @@ -1,7 +1,6 @@ import sys import os -sys.path.insert(0, '..') -from align_helpers import * +from align.align_helpers import (get_strains, flatten, concatenate_fasta) import global_params as gp # get all non-reference strains of cerevisiae and paradoxus @@ -14,13 +13,14 @@ ref_prefix = '_'.join(gp.alignment_ref_order) + '_' -ref_fns = [gp.ref_dir[r] + gp.ref_fn_prefix[r] + 
'_chr' + '?' + gp.fasta_suffix \ - for r in gp.alignment_ref_order] +ref_fns = [gp.ref_dir[r] + gp.ref_fn_prefix[r] + + '_chr' + '?' + gp.fasta_suffix + for r in gp.alignment_ref_order] strain_fn = '*_chr?' + gp.fasta_suffix for strain, d in s: - print strain + print(strain) # building up one command string so that we don't create a new # shell instance every time (I think there's a limit on the @@ -29,9 +29,9 @@ cmd_string = '' current_strain_fn = d + strain_fn.replace('*', strain) - + for chrm in gp.chrms[:2]: - print chrm + print(chrm) align_fn = ref_prefix + strain + '_chr' + chrm + \ '_tcoffee' + gp.alignment_suffix # if we don't already have an alignment for this @@ -42,14 +42,15 @@ current_strain_fn_chrm = current_strain_fn.replace('?', chrm) combined_fn = 'run_tcoffee_' + strain + chrm + '.temp' - concatenate_fasta(ref_fns_chrm + [current_strain_fn_chrm], combined_fn) + concatenate_fasta(ref_fns_chrm + [current_strain_fn_chrm], + combined_fn) cmd_string += gp.tcoffee_install_path + '/t_coffee ' + \ combined_fn + '; ' - #cmd_string += 'rm ' + combined_fn + ';' + # cmd_string += 'rm ' + combined_fn + ';' # commands can only be up to a certain length so break it up this way - print cmd_string + print(cmd_string) os.system(cmd_string) sys.exit() diff --git a/code/analyze/combine_chromosome_files_main.py b/code/analyze/combine_chromosome_files_main.py deleted file mode 100644 index 1ec1814..0000000 --- a/code/analyze/combine_chromosome_files_main.py +++ /dev/null @@ -1,31 +0,0 @@ -import sys -import os -import gzip -import predict -import global_params as gp - -args = predict.process_predict_args(sys.argv[1:]) - -header = open(gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'blocks_' + args['known_states'][0] + \ - '_' + args['tag'] + '_chr' + gp.chrms[0] + '_quality.txt', 'r').readline() - -for species_from in args['known_states']: - fn = gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'blocks_' + species_from + \ - '_' + args['tag'] + '_quality.txt' - f = open(fn, 'w') - f.write(header) - for chrm in gp.chrms: - fn_chrm = gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'blocks_' + species_from + \ - '_' + args['tag'] + '_chr' + chrm + '_quality.txt' - try: - fc = open(fn_chrm, 'r') - except: - continue - fc.readline() - for line in fc.readlines(): - f.write(line) - f.close() - diff --git a/code/analyze/extract_region.py b/code/analyze/extract_region.py index 476e4f0..e7649d7 100644 --- a/code/analyze/extract_region.py +++ b/code/analyze/extract_region.py @@ -1,21 +1,23 @@ #!/usr/bin/env python3 import argparse -import os -import pickle -import gzip -import sys +from misc.region_reader import Region_Reader +from typing import List, Tuple def main(): + ''' + Main method to read in arguments from stdin and perform lookup with + Region_Reader + ''' args = parse_args() - args = validate_args(args) - index = pickle.load(open(args['pickle'], 'rb')) - locations = decode_regions(args['regions'], index, args['list_sort']) - with gzip.open(args['filename'], 'rt') as reader: - write_regions(reader, locations, args['suppress_header']) + args, reader = validate_args(args) + with reader: + locations = decode_regions(args['regions'], + reader, args['list_sort']) + write_regions(reader, locations) -def parse_args(args=None): +def parse_args(args: List[str] = None) -> argparse.Namespace: ''' Read in input arguments or the supplied list of strings Returns a dictionary of options @@ -40,44 +42,31 @@ def parse_args(args=None): return vars(parser.parse_args(args)) -def 
validate_args(args): +def validate_args(args: argparse.Namespace) -> Tuple[argparse.Namespace, + Region_Reader]: ''' Performs checks and conversions of input, raises ValueErrors if invalid ''' - if not os.path.exists(args['filename']): - raise ValueError(f'{args["filename"]} not found') - - if args['filename'][-6:] != '.fa.gz': - raise ValueError(f'{args["filename"]} expected to be .fa.gz') + reader = Region_Reader(args['filename'], + as_fa=False, + suppress_header=args['suppress_header'], + num_lines=15) - args['pickle'] = args['filename'][:-6] + '.pkl' - if not os.path.exists(args['pickle']): - raise ValueError(f'{args["pickle"]} not found with region file') + args['regions'] = [reader.convert_region(r) for r in args['regions']] - parsed_regions = [] - for region in args['regions']: - r = region - if r[0] == 'r': - r = r[1:] - if not r.isdigit(): - raise ValueError(f'{region} could not be parsed') - parsed_regions.append(int(r)) - args['regions'] = parsed_regions + return args, reader - return args - -def decode_regions(regions, index, retain_sort): +def decode_regions(regions: List[int], + reader: Region_Reader, + retain_sort: bool) -> List[int]: ''' Converts list of regions to file locations based on index dictionary Retain_sort controls if the output list order is determined by the region order or the disk location (i.e. values of index dict) ''' - try: - result = [index[r] for r in regions] - except KeyError as e: - raise KeyError(f'r{e} not found in index') + result = [reader.decode_region(r) for r in regions] if retain_sort: return result @@ -85,25 +74,13 @@ def decode_regions(regions, index, retain_sort): return sorted(result) -def write_regions(reader, locations, suppress_header, num_lines=15): +def write_regions(reader: Region_Reader, locations: List[int]) -> None: ''' Writes the regions specified by index to stdout If print_header is false, ignore first line after location ''' - if suppress_header is True: - num_lines -= 1 - for location in locations: - reader.seek(location) - if suppress_header is True: - reader.readline() - for i in range(num_lines): - line = reader.readline() - if line == '': - print(f'{location} outside of file', file=sys.stderr) - break - else: - print(line, end='') + reader.read_location(location) if __name__ == '__main__': diff --git a/code/analyze/filter_1_main.py b/code/analyze/filter_1_main.py deleted file mode 100644 index acebb7d..0000000 --- a/code/analyze/filter_1_main.py +++ /dev/null @@ -1,72 +0,0 @@ -# two levels of filtering: -# 1. remove regions that don't look confidently introgressed at all, -# based on fraction gaps/masked, number of matches to S288c and not S288c -# --> _filtered1 -# 2. 
remove regions that we can't confidently pin on a specific reference, -# based on whether it matches similarly to other reference(s) -# --> _filtered2 - -# just do the first level here, then run filter_2_thresholds_main.py -# to choose filtering thresholds for next level - - -import re -import sys -import os -import copy -import read_args -from filter_helpers import * -import summarize_region_quality -import global_params as gp -from misc import read_table -from misc import read_fasta - -args = read_args.process_predict_args(sys.argv[1:]) - -for species_from in args['known_states'][1:]: - - print(species_from) - - fn = gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'blocks_' + species_from + \ - '_' + args['tag'] + '_quality.txt' - region_summary, fields = read_table.read_table_rows(fn, '\t') - - fields1i = fields + ['reason'] - fields1 = fields - - fn_out1i = gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'blocks_' + species_from + \ - '_' + args['tag'] + '_filtered1intermediate.txt' - - fn_out1 = gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'blocks_' + species_from + \ - '_' + args['tag'] + '_filtered1.txt' - - f_out1i = open(fn_out1i, 'w') - f_out1i.write('\t'.join(fields1i) + '\n') - - f_out1 = open(fn_out1, 'w') - f_out1.write('\t'.join(fields1) + '\n') - - regions_fn = gp.analysis_out_dir_absolute + args['tag'] + '/regions/' + \ - species_from + gp.fasta_suffix + '.gz' - region_seqs = summarize_region_quality.read_region_file(regions_fn) - - for region_id in region_summary: - - region = region_summary[region_id] - - info_string = region_seqs[region_id]['info']['seq'] - - # filtering stage 1: things that we're confident in calling not - # S288c - p, reason = passes_filters1(region, info_string, args['known_states'][0]) - region['reason'] = reason - write_filtered_line(f_out1i, region_id, region, fields1i) - - if p: - write_filtered_line(f_out1, region_id, region, fields1) - - f_out1i.close() - f_out1.close() diff --git a/code/analyze/filter_2_main.py b/code/analyze/filter_2_main.py deleted file mode 100644 index d1d290f..0000000 --- a/code/analyze/filter_2_main.py +++ /dev/null @@ -1,83 +0,0 @@ -# two levels of filtering: -# 1. remove regions that don't look confidently introgressed at all, -# based on fraction gaps/masked, number of matches to S288c and not S288c -# --> _filtered1 -# 2. 
remove regions that we can't confidently pin on a specific reference, -# based on whether it matches similarly to other reference(s) -# --> _filtered2 - -# do second level of filtering here, based on previously selected -# thresholds - -import re -import sys -import os -import copy -import numpy as np -import read_args -import summarize_region_quality -from filter_helpers import * -import global_params as gp -from misc import read_table -from misc import read_fasta - -args = read_args.process_predict_args(sys.argv[2:]) -threshold = float(sys.argv[1]) - -for species_from in args['known_states'][1:]: - - print(species_from) - - fn = gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'blocks_' + species_from + \ - '_' + args['tag'] + '_filtered1.txt' - region_summary, fields = read_table.read_table_rows(fn, '\t') - - fields2i = fields + ['predicted_species_original', 'alternative_ids', \ - 'alternative_P_counts'] - fields2 = fields - - fn_out2i = gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'blocks_' + species_from + \ - '_' + args['tag'] + '_filtered2intermediate.txt' - - fn_out2 = gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'blocks_' + species_from + \ - '_' + args['tag'] + '_filtered2.txt' - - f_out2i = open(fn_out2i, 'w') - f_out2i.write('\t'.join(fields2i) + '\n') - - f_out2 = open(fn_out2, 'w') - f_out2.write('\t'.join(fields2) + '\n') - - regions_fn = gp.analysis_out_dir_absolute + args['tag'] + '/regions/' + \ - species_from + gp.fasta_suffix + '.gz' - region_seqs = summarize_region_quality.read_region_file(regions_fn) - - for region_id in region_summary: - - region = region_summary[region_id] - - info_string = region_seqs[region_id]['info']['seq'] - seqs = np.asarray([list(region_seqs[region_id][ref]['seq']) \ - for ref in args['known_states']]) - - # filtering stage 2: things that we're confident in calling - # introgressed from one species specifically - p, alt_states, alt_ids, alt_P_counts = passes_filters2(region, seqs, \ - threshold, \ - args['known_states']) - region['alternative_states'] = '/'.join(alt_states) - region['alternative_ids'] = '/'.join([str(x) for x in alt_ids]) - region['alternative_P_counts'] = '/'.join([str(x) for x in alt_P_counts]) - - region['predicted_species_original'] = region['predicted_species'] - region['predicted_species'] = region['alternative_states'] - write_filtered_line(f_out2i, region_id, region, fields2i) - - if p: - write_filtered_line(f_out2, region_id, region, fields2) - - f_out2i.close() - f_out2.close() diff --git a/code/analyze/filter_2_thresholds_main.py b/code/analyze/filter_2_thresholds_main.py deleted file mode 100644 index 7a62fe8..0000000 --- a/code/analyze/filter_2_thresholds_main.py +++ /dev/null @@ -1,65 +0,0 @@ -# explore different thresholds for calling introgressions for specific -# strains - -# specifically, try a range of thresholds, and for each one, calculate -# fraction of introgressions we've classified as 1 strain or every -# possible combination of strains - -# then we'll make some plots in R to see if there's a sort of obvious -# place to draw the line - -import re -import sys -import os -import copy -from collections import defaultdict -import predict -from filter_helpers import * -sys.path.insert(0, '..') -import global_params as gp -sys.path.insert(0, '../misc/') -import read_table -import read_fasta - -args = predict.process_predict_args(sys.argv[1:]) - -#thresholds = [.99, .98, .97, .96, .95, .94, .93, .92, .91, .9, .88, .85, .82, .8, .75, .7, .6, .5] -#thresholds = [.999, .995, .985, 
.975, .965, .955, .945, .935, .925, .915, .905, .89, .87, .86] -thresholds = [1] - -open_mode = 'a' -f = open(gp.analysis_out_dir_absolute + args['tag'] + \ - '/filter_2_thresholds_' + args['tag'] + '.txt', open_mode) -if open_mode == 'w': - f.write('threshold\tpredicted_state\talternative_states\tcount\n') -for threshold in thresholds: - print threshold - for species_from in args['known_states'][1:]: - - print '*', species_from - - fn = gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'blocks_' + species_from + \ - '_' + args['tag'] + '_filtered1.txt' - region_summary, fields = read_table.read_table_rows(fn, '\t') - - d = defaultdict(int) - for region_id in region_summary: - #print region_id, '****' - region = region_summary[region_id] - headers, seqs = read_fasta.read_fasta(gp.analysis_out_dir_absolute + \ - args['tag'] + \ - '/regions/' + region_id + '.fa.gz', \ - gz = True) - info_string = seqs[-1] - seqs = seqs[:-1] - - p, alt_states, alt_ids, alt_P_counts = \ - passes_filters2(region, seqs, threshold) - - d[','.join(sorted(alt_states))] += 1 - - for key in d: - f.write(str(threshold) + '\t' + species_from + '\t' + \ - key + '\t' + str(d[key]) + '\n') -f.close() diff --git a/code/analyze/filter_helpers.py b/code/analyze/filter_helpers.py deleted file mode 100644 index 6174401..0000000 --- a/code/analyze/filter_helpers.py +++ /dev/null @@ -1,136 +0,0 @@ -import re -import sys -import os -import copy -import global_params as gp -from misc import mystats -from misc import read_table -from misc import seq_functions - -def write_filtered_line(f, region_id, region, fields): - f.write(region_id + '\t' + '\t'.join([str(region[field]) for field in fields[1:]])) - f.write('\n') - -def passes_filters(region): - - # fraction gaps + masked filter - fraction_gaps_masked_threshold = .5 - fraction_gaps_masked = \ - (float(region['number_gaps']) + float(region['number_masked_non_gap'])) / \ - (int(region['end']) - int(region['start']) + 1) - if fraction_gaps_masked > fraction_gaps_masked_threshold: - return False - - # number sites match only par filter - number_match_only_threshold = 7 - number_match_only = int(region['number_match_ref2_not_ref1']) - if number_match_only < number_match_only_threshold: - return False - - # divergence from cer filter (idea is that poor alignments will - # result in much larger divergence than we'd expect) - id_ref1_threshold = .7 - id_ref1 = float(region['number_match_ref1']) / \ - (float(region['aligned_length']) - float(region['number_gaps'])) - if id_ref1 < id_ref1_threshold: - return False - - return True - -def passes_filters1(region, info_string, r): - # filtering out things that we can't call introgressed in general - # with confidence (i.e. 
doesn't seem like a strong case against - # being S288c) - - s = region['predicted_species'] - - aligned_length = (int(region['end']) - int(region['start']) + 1) - - # FILTER: fraction gaps + masked - fraction_gaps_masked_threshold = .5 - # num_sites_nonmask_x is number of sites at which neither - # reference x nor the test sequence is masked or has a gap or - # unsequenced character - fraction_gaps_masked_r = \ - 1 - float(region['num_sites_nonmask_' + r]) / aligned_length - fraction_gaps_masked_s = \ - 1 - float(region['num_sites_nonmask_' + s]) / aligned_length - - if fraction_gaps_masked_r > fraction_gaps_masked_threshold: - return False, 'fraction gaps/masked in master = ' + \ - str(fraction_gaps_masked_r) - if fraction_gaps_masked_s > fraction_gaps_masked_threshold: - return False, 'fraction gaps/masked in predicted = ' + \ - str(fraction_gaps_masked_s) - - # FILTER: number sites analyzed by HMM that match predicted - # reference - count_P = info_string.count('P') - count_C = info_string.count('C') - number_match_only_threshold = 7 - if count_P < number_match_only_threshold: - return False, 'count_P = ' + str(count_P) - if count_P <= count_C: - return False, 'count_P = ' + str(count_P) + ' and count_C = ' + str(count_C) - - # FILTER: divergence with predicted reference and master reference - # (S288c) - id_predicted = float(region['match_nongap_' + s]) / \ - float(region['num_sites_nongap_' + s]) - id_master = float(region['match_nongap_' + r]) / \ - float(region['num_sites_nongap_' + r]) - - if id_master >= id_predicted: - return False, 'id with master = ' + str(id_master) + \ - ' and id with predicted = ' + str(id_predicted) - #if id_predicted < .7: - # return False, 'id with predicted = ' + str(id_predicted) - if id_master < .7: - return False, 'id with master = ' + str(id_master) - - return True, '' - -def passes_filters2(region, seqs, threshold, refs): - # filter out things we can't assign to one species specifically; - # also return the other reasonable alternatives if we're filtering - # it out - - n = len(seqs[0]) - s = region['predicted_species'] - - ids = {} - totals = {} - P_counts = {} - skip = [gp.gap_symbol, gp.unsequenced_symbol] - - for ri in range(1, len(refs)): - r_match, r_total = seq_functions.seq_id(seqs[-1], seqs[ri]) - if r_total != 0: - ids[refs[ri]] = float(r_match) / r_total - totals[refs[ri]] = r_total - P_count = 0 - for i in range(n): - if seqs[ri][i] in skip or seqs[0][i] in skip or seqs[-1][i] in skip: - continue - if seqs[-1][i] == seqs[ri][i] and seqs[-1][i] != seqs[0][i]: - P_count += 1 - P_counts[refs[ri]] = P_count - - alts = {} - for r in ids.keys(): - #if float(totals[r]) / totals[s] > .75 and \ - # ids[r] >= threshold * ids[s] and \ - # P_counts[r] >= threshold * P_counts[s]: - # TODO should threshold be the same for both? 
- if ids[r] >= threshold * ids[s] and \ - P_counts[r] >= threshold * P_counts[s]: - alts[r] = (ids[r], P_counts[r]) - - alt_states = sorted(alts.keys(), key=lambda x: alts[x][0], reverse=True) - alt_ids = [alts[state][0] for state in alt_states] - alt_P_counts = [alts[state][1] for state in alt_states] - - if len(alts) > 1: - return False, alt_states, alt_ids, alt_P_counts - - return True, alt_states, alt_ids, alt_P_counts diff --git a/code/analyze/filter_regions.py b/code/analyze/filter_regions.py new file mode 100644 index 0000000..4d1695f --- /dev/null +++ b/code/analyze/filter_regions.py @@ -0,0 +1,409 @@ +from misc import seq_functions +import numpy as np +from typing import List, Dict, TextIO, Tuple +import click +import logging as log +from contextlib import ExitStack, contextmanager +from misc import read_table +from misc.region_reader import Region_Reader +from analyze.introgression_configuration import Configuration + + +class Filterer(): + def __init__(self, configuration: Configuration): + self.config = configuration + + def filter_introgressed(self, + region: Dict, + info: np.ndarray, + reference_species: str) -> Tuple[bool, str]: + ''' + filtering out things that we can't call introgressed in general + with confidence (i.e. doesn't seem like a strong case against + being S288c) + Return true if the region passes the filter, or false with a string + specifying which filter failed + Fails if: + -fraction of gaps/masked in the reference > 0.5 + -fraction of gaps/masked in the predicted species > 0.5 + -fewer than 7 analyzed sites match only the predicted species + -no more sites match the predicted species than the reference + -identity with the reference >= identity with the predicted species + -identity with the reference < 0.7 + ''' + + predicted_species = region['predicted_species'] + + aligned_length = (int(region['end']) - int(region['start']) + 1) + + # FILTER: fraction gaps + masked + fraction_gaps_masked_threshold = .5 + # num_sites_nonmask_x is number of sites at which neither + # reference x nor the test sequence is masked or has a gap or + # unsequenced character + fraction_gaps_masked_r = \ + 1 - int(region['num_sites_nonmask_' + + reference_species])/aligned_length + fraction_gaps_masked_s = \ + 1 - int(region['num_sites_nonmask_' + + predicted_species])/aligned_length + + if fraction_gaps_masked_r > fraction_gaps_masked_threshold: + return False, f'fraction gaps/masked in master = '\ + f'{fraction_gaps_masked_r}' + if fraction_gaps_masked_s > fraction_gaps_masked_threshold: + return False, f'fraction gaps/masked in predicted = '\ + f'{fraction_gaps_masked_s}' + + # FILTER: number sites analyzed by HMM that match predicted (P) + # reference (C) + count_P = np.sum(info == 'P') + count_C = np.sum(info == 'C') + number_match_only_threshold = 7 + if count_P < number_match_only_threshold: + return False, f'count_P = {count_P}' + if count_P <= count_C: + return False, f'count_P = {count_P} and count_C = {count_C}' + + # FILTER: divergence with predicted reference and master reference + # (S288c) + id_predicted = float(region['match_nongap_' + predicted_species]) / \ + float(region['num_sites_nongap_' + predicted_species]) + id_master = float(region['match_nongap_' + reference_species]) / \ + float(region['num_sites_nongap_' + reference_species]) + + if id_master >= id_predicted: + return False, f'id with master = {id_master} '\ + f'and id with predicted = {id_predicted}' + if id_master < .7: + return False, f'id with master = {id_master}' + + return True, ''
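+ + # Worked example (values invented): for a 200 bp region with + # num_sites_nonmask_ref = 180, the gaps/masked fraction is + # 1 - 180/200 = 0.10, which passes the 0.5 cutoff; with 10 'P' and + # 5 'C' sites among those the HMM analyzed, the count filters also + # pass (10 >= 7 and 10 > 5).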
+ + def filter_ambiguous(self, + region: Dict, + seqs: np.array, + threshold: float, + refs: List[str]) -> Tuple[bool, + List[str], + List[float], + List[int]]: + ''' + filter out things we can't assign to one species specifically; + return the other reasonable alternatives if we're filtering + it out + Returns: + True if the region passes the filter + Fails the filter if the number of matches and fraction matching + are above threshold for more than one state for the region + Region is updated with: + A list of likely species for the region + A list of fraction of matching sequence for each species + A list of total matching sites + ''' + + s = region['predicted_species'] + + ids = {} + P_counts = {} + + seqs = np.asarray(seqs) + # skip any gap or unsequenced in ref or test + # also skip if ref and test equal (later test ri == test but not ref) + symbols = self.config.symbols + skip = np.any( + (seqs[0] == symbols['gap'], + seqs[0] == symbols['unsequenced'], + seqs[-1] == symbols['gap'], + seqs[-1] == symbols['unsequenced'], + seqs[0] == seqs[-1]), + axis=0) + + for ri, ref in enumerate(refs): + if ri == 0: + continue + r_match, r_total = seq_functions.seq_id(seqs[-1], seqs[ri]) + if r_total != 0: + ids[ref] = r_match / r_total + P_counts[ref] = np.sum( + np.logical_and( + np.logical_not(skip), + seqs[ri] == seqs[-1])) + + alts = {} + for r in ids.keys(): + # TODO should threshold be the same for both? + if ids[r] >= threshold * ids[s] and \ + P_counts[r] >= threshold * P_counts[s]: + alts[r] = (ids[r], P_counts[r]) + + alt_states = sorted(alts.keys(), + key=lambda x: alts[x][0], + reverse=True) + region['alternative_states'] = ','.join(alt_states) + + alt_ids = [alts[state][0] for state in alt_states] + region['alternative_ids'] = ','.join( + [str(x) for x in alt_ids]) + + alt_P_counts = [alts[state][1] for state in alt_states] + region['alternative_P_counts'] = ','.join( + [str(x) for x in alt_P_counts]) + + return len(alts) <= 1, alt_states + + def validate_arguments(self): + args = [ + 'introgressed', + 'introgressed_intermediate', + 'ambiguous', + 'ambiguous_intermediate', + 'filter_threshold', + 'known_states', + 'regions', + 'region_index', + 'symbols', + 'quality_blocks' + ] + variables = self.config.__dict__ + for arg in args: + if arg not in variables or variables[arg] is None: + err = ('Failed to validate Filterer, required argument ' + f"'{arg}' was unset") + log.exception(err) + raise ValueError(err) + + if 'filter_sweep' not in variables or \ + variables['filter_sweep'] is None: + log.warning("'filter_sweep' was unset and will not be run") + + def run(self, thresholds=[]): + ''' + Filter region files based on the threshold in the config and sweep + with the supplied threshold list + ''' + self.validate_arguments() + known_states = self.config.known_states + log.debug(f'Known states: {known_states}') + + with Filter_Sweep(self.config.filter_sweep, thresholds) as sweeper,\ + ExitStack() as stack: + + progress_bar = None + if self.config.log_file: + progress_bar = stack.enter_context( + click.progressbar( + length=len(known_states[1:]), + label='Filtering')) + + sweeper.write_header() + writers = Filter_Writers(self.config) + + for species_from in known_states[1:]: + + log.info(species_from) + + region_summary, fields = read_table.read_table_rows( + self.config.quality_blocks.format(state=species_from), + '\t') + + with writers.open_state(species_from, fields) as writers,\ + Region_Reader(self.config.regions.format( + state=species_from), as_fa=True) as region_reader: + + writers.write_headers() + + for region_id, _, seqs in region_reader.yield_fa(): + region = region_summary[region_id] + seqs, info_string = seqs[:-1], 
seqs[-1] + + # filtering stage 1: things that we're confident in + # calling not S288c + passes, reason = self.filter_introgressed( + region, + info_string, + known_states[0]) + region['reason'] = reason + + writers.write_introgressed(region_id, region, passes) + + if passes: + sweeper.record( + species_from, + lambda thresh: self.filter_ambiguous( + region, seqs, thresh, known_states)) + + passes, _ = self.filter_ambiguous( + region, seqs, + self.config.filter_threshold, known_states) + writers.write_ambiguous(region_id, region, passes) + + if progress_bar: + progress_bar.update(1) + + sweeper.write_results(known_states[1:]) + + +class Filter_Sweep(): + def __init__(self, + sweep_file: str, + thresholds: List[float]): + self.sweep_file = sweep_file + self.sweep_writer = None + self.thresholds = thresholds + self.data_table = {} + + def __enter__(self): + if self.sweep_file is not None and self.thresholds != []: + self.sweep_writer = open(self.sweep_file, 'w') + + return self + + def __exit__(self, type, value, traceback): + if self.sweep_writer: + self.sweep_writer.close() + + return traceback is None + + def write_header(self): + ''' + Write the header for the sweep filter file + ''' + if self.sweep_writer: + self.sweep_writer.write( + 'threshold\tpredicted_state\talternative_states\tcount\n') + + def record(self, species_from, thresh_lambda): + ''' + Record the thresholds for this filter sweep object. + The thresh lambda is an anonymous function that takes a threshold + and returns a tuple with the value at index 1 being the alternative + states. Filter_ambiguous is what this is meant for. + ''' + if self.sweep_writer is None: + return + + for thresh in self.thresholds: + _, states = thresh_lambda(thresh) + self.record_data_hit(thresh, species_from, states) + + def record_data_hit(self, threshold: float, species: str, states: List): + ''' + adds an entry to the data table or increments if exists + ''' + key = ','.join(sorted(states)) + if threshold not in self.data_table: + self.data_table[threshold] = {} + + if species not in self.data_table[threshold]: + self.data_table[threshold][species] = {} + + if key not in self.data_table[threshold][species]: + self.data_table[threshold][species][key] = 0 + + self.data_table[threshold][species][key] += 1 + + def write_results(self, states): + if self.sweep_writer is None: + return + + for thresh in self.thresholds: + for species in states: + if thresh in self.data_table and \ + species in self.data_table[thresh]: + d = self.data_table[thresh][species] + for key, value in d.items(): + self.sweep_writer.write( + f'{thresh}\t{species}\t{key}\t{value}\n') + + +class Filter_Writers(): + ''' + Writes the filter and intermediate files + ''' + def __init__(self, config): + self.files = { + 'introgressed': config.introgressed, + 'introgressed_int': config.introgressed_intermediate, + 'ambiguous': config.ambiguous, + 'ambiguous_int': config.ambiguous_intermediate + } + self.headers = None + self.writers = None + + @contextmanager + def open_state(self, state: str, fields: List): + ''' + Open output files for the particular state + ''' + self.headers = { + 'introgressed': fields, + 'introgressed_int': fields + ['reason'], + 'ambiguous': fields, + 'ambiguous_int': fields + ['alternative_states', + 'alternative_ids', + 'alternative_P_counts'] + } + + self.writers = {k: open(v.format(state=state), 'w') + for k, v in self.files.items()} + + yield self + + for writer in self.writers.values(): + writer.close() + + self.headers = None + self.writers = None + + 
def write_headers(self): + if self.headers is None or self.writers is None: + return + + for key, writer in self.writers.items(): + writer.write('\t'.join(self.headers[key]) + '\n') + + def write_filtered_line(self, + writer: TextIO, + region_id: str, + region: Dict, + fields: List) -> None: + ''' + Write the region id and values in "region" dict to open file writer + ''' + writer.write(f'{region_id}\t') + writer.write('\t'.join([str(region[field]) for field in fields[1:]])) + writer.write('\n') + + def write_introgressed(self, + region_id: str, + region: Dict, + passes: bool): + self.write_filtered_line( + self.writers['introgressed_int'], + region_id, + region, + self.headers['introgressed_int']) + + if passes: + self.write_filtered_line( + self.writers['introgressed'], + region_id, + region, + self.headers['introgressed']) + + def write_ambiguous(self, + region_id: str, + region: Dict, + passes: bool): + self.write_filtered_line( + self.writers['ambiguous_int'], + region_id, + region, + self.headers['ambiguous_int']) + + if passes: + self.write_filtered_line( + self.writers['ambiguous'], + region_id, + region, + self.headers['ambiguous']) diff --git a/code/analyze/id_regions.py b/code/analyze/id_regions.py new file mode 100644 index 0000000..f964881 --- /dev/null +++ b/code/analyze/id_regions.py @@ -0,0 +1,92 @@ +from contextlib import ExitStack +from operator import itemgetter +from analyze.introgression_configuration import Configuration +from analyze.predict import read_blocks +import click +import logging as log + + +class ID_producer(): + ''' + ID_producer + Adds unique region id to block files + ''' + def __init__(self, configuration: Configuration): + self.config = configuration + + def add_ids(self): + ''' + Adds a unique region id to block files, producing labeled text files + ''' + self.validate_arguments() + regions = dict(zip(self.config.chromosomes, + [[] for _ in self.config.chromosomes])) + with ExitStack() as stack: + writers = {} + + # Progress bars don't seem to show since these complete too fast + progress_bar = None + if self.config.log_file: + progress_bar = stack.enter_context( + click.progressbar( + length=len(self.config.states), + label='Reading in states')) + + for state in self.config.states: + # read in region as dict keyed by strain, chromosome: + # (start, end, number non gapped) + region = read_blocks(self.config.blocks.format(state=state)) + for strain, d_strain in region.items(): + for chrm, d_chrm in d_strain.items(): + for start, end, num in d_chrm: + regions[chrm].append( + (start, end, num, strain, state)) + + # open writer + writers[state] = stack.enter_context( + open(self.config.labeled_blocks.format(state=state), 'w')) + writers[state].write( + 'region_id\tstrain\tchromosome\tpredicted_species\t' + 'start\tend\tnum_sites_hmm\n') + + if progress_bar: + progress_bar.update(1) + id_counter = 1 + + if progress_bar: + progress_bar = stack.enter_context( + click.progressbar( + length=len(regions.keys()), + label='Adding regions')) + + for chrm, entries in regions.items(): + # sort by start, then strain + for start, end, num, strain, state in \ + sorted(entries, key=itemgetter(0, 3)): + writers[state].write( + f'r{id_counter}\t{strain}\t{chrm}\t{state}\t' + f'{start}\t{end}\t{num}\n') + id_counter += 1 + if progress_bar: + progress_bar.update(1) + + def validate_arguments(self): + ''' + Check that all required instance variables are set to perform a + id producer run. 
Returns true if valid, raises value error otherwise + ''' + args = [ + 'chromosomes', + 'blocks', + 'labeled_blocks', + 'states', + ] + variables = self.config.__dict__ + for arg in args: + if arg not in variables or variables[arg] is None: + err = ('Failed to validate ID Producer, required argument ' + f"'{arg}' was unset") + log.exception(err) + raise ValueError(err) + + return True diff --git a/code/analyze/id_regions_main.py b/code/analyze/id_regions_main.py deleted file mode 100644 index 951f124..0000000 --- a/code/analyze/id_regions_main.py +++ /dev/null @@ -1,54 +0,0 @@ -import sys -from analyze import predict -from analyze import read_args -from operator import itemgetter -import global_params as gp - - -def main(): - args = read_args.process_predict_args(sys.argv[1:]) - - # order regions by chromosome, start (break ties alphabetically by strain) - all_regions_by_chrm = dict(zip(gp.chrms, [[] for chrm in gp.chrms])) - output_files = {} - base_dir = gp.analysis_out_dir_absolute + args['tag'] - for species_from in args['states']: - - # strain chromosome predicted_species start end number_non_gap - fn = f'{base_dir}/blocks_{species_from}_{args["tag"]}.txt' - - # introgressed regions keyed by strain and then chromosome: - # (start, end, number_non_gap) - regions = predict.read_blocks(fn) - - for strain in regions: - for chrm in regions[strain]: - for entry in regions[strain][chrm]: - start, end, number_non_gap = entry - all_regions_by_chrm[chrm].append( - (start, end, number_non_gap, strain, species_from)) - - output_files[species_from] = f'{fn[:-4]}_labeled.txt' - - writers = {} - for species_from in args['states']: - writers[species_from] = open(output_files[species_from], 'w') - writers[species_from].write( - 'region_id\tstrain\tchromosome\tpredicted_species\t' - 'start\tend\tnum_sites_hmm\n') - - idc = 1 - for chrm in gp.chrms: - for entry in sorted(all_regions_by_chrm[chrm], key=itemgetter(0, 3)): - (start, end, number_non_gap, strain, species_from) = entry - writers[species_from].write( - f'r{idc}\t{strain}\t{chrm}\t{species_from}\t' - f'{start}\t{end}\t{number_non_gap}\n') - idc += 1 - - for species_from in args['states']: - writers[species_from].close() - - -if __name__ == "__main__": - main() diff --git a/code/analyze/introgression_configuration.py b/code/analyze/introgression_configuration.py new file mode 100644 index 0000000..61a50ff --- /dev/null +++ b/code/analyze/introgression_configuration.py @@ -0,0 +1,399 @@ +import glob +import re +from typing import Tuple, Dict, List +import logging as log +from misc.config_utils import (get_nested, clean_config, merge_dicts, + print_dict, validate, check_wildcards) + + +class Configuration(): + def __init__(self): + self.config = {} + self.log_file = None + + # these are very regular variables with state as a wildcard + state_files = [ + 'blocks', + 'labeled_blocks', + 'quality_blocks', + 'introgressed', + 'introgressed_intermediate', + 'ambiguous', + 'ambiguous_intermediate', + 'regions', + 'region_index', + ] + # no wildcards, non nullable + nonwild_files = [ + 'hmm_initial', + 'hmm_trained', + 'positions', + 'probabilities', + 'strain_info', + 'state_counts', + ] + var_list = [ + Variable('chromosomes'), + Threshold_Variable(), + Convergence_Variable(), + Symbols_Variable(), + Filter_Threshold_Variable(), + Variable('log_file', 'paths.log_file', nullable=True), + Variable('filter_sweep', 'paths.analysis.filter_sweep', + nullable=True), + Variable('masks', 'paths.analysis.masked_intervals', + wildcards='strain,chrom'), + ] + [ + 
Variable(n, f'paths.analysis.{n}', wildcards='state') + for n in state_files + ] + [ + Variable(n, f'paths.analysis.{n}') + for n in nonwild_files + ] + + self.variables = {v.name: v for v in var_list} + # these require too much state from configuration to split out + self.other_parsers = { + 'states': self._set_states, + 'prefix': self._set_prefix, + 'strains': self._set_strains, + 'alignment': self._set_alignment + } + + def add_config(self, configuration: Dict): + ''' + merge the provided configuration dictionary with this object. + Cleans configuration + ''' + self.config = clean_config( + merge_dicts(self.config, configuration)) + + def set(self, *args, **kwargs): + ''' + Set the supplied variable to the value provided. + If just a name is provided, set the value with a value of None + ''' + kwargs.update({a: None for a in args}) + for key, value in kwargs.items(): + if key in self.variables: + variable = self.variables[key] + self.__dict__[key] = variable.parse(value, self.config) + + elif key in self.other_parsers: + self.other_parsers[key](value) + + else: + err = f'Unknown variable to set: {key}' + log.exception(err) + raise ValueError(err) + + def get_states(self) -> Tuple[List, List]: + ''' + Build lists of known and unknown states from the analysis params + ''' + + ref = get_nested(self.config, 'analysis_params.reference.name') + if ref is None: + ref = [] + else: + ref = [ref] + + known = get_nested(self.config, 'analysis_params.known_states') + if known is None: + known = [] + + known_states = ref + [s['name'] for s in known] + + unknown = get_nested(self.config, 'analysis_params.unknown_states') + if unknown is None: + unknown = [] + + unknown_states = [s['name'] for s in unknown] + + return known_states, unknown_states + + def get_interval_states(self) -> List: + ''' + Build list of interval states, typically just known names + but if the state has an interval name, use that + ''' + ref = get_nested(self.config, 'analysis_params.reference') + + # set with name or empty list + if ref is None: + ref = [] + else: + ref = [ref] + + known = get_nested(self.config, 'analysis_params.known_states') + if known is None: + known = [] + + return [s['interval_name'] + if 'interval_name' in s + else s['name'] + for s in ref + known] + + def _set_states(self, states: List[str] = None): + ''' + Set the states for which to perform region naming + ''' + if states is None or states == []: + self.known_states, self.unknown_states = self.get_states() + self.states = self.known_states + self.unknown_states + else: + self.states = states + + self.interval_states = self.get_interval_states() + + if self.states == []: + err = 'No states specified' + log.exception(err) + raise ValueError(err) + + def _set_prefix(self, prefix: str = ''): + ''' + Set prefix string of the predictor to the supplied value or + build it from the known states + ''' + if not prefix: + if self.known_states == []: + err = 'Unable to build prefix, no known states provided' + log.exception(err) + raise ValueError(err) + + self.prefix = '_'.join(self.known_states) + else: + self.prefix = prefix + + def _set_strains(self, test_strains: str = ''): + ''' + build the strains to perform prediction on + ''' + if not test_strains: + test_strains = get_nested(self.config, 'paths.test_strains') + else: + # need to support list for test strains + test_strains = [test_strains] + + if test_strains is not None: + for test_strain in test_strains: + check_wildcards(test_strain, 'strain,chrom') + + self.find_strains(test_strains) + + def 
find_strains(self, test_strains: List[str] = None): + ''' + Helper method to get strains supplied in config, or from test_strains + ''' + strains = get_nested(self.config, 'strains') + self.test_strains = test_strains + + if strains is None: + if test_strains is None: + err = ('Unable to find strains in config and ' + 'no test_strains provided') + log.exception(err) + raise ValueError(err) + + # try to build strains from wildcards in test_strains + strains = {} + for test_strain in test_strains: + # find matching files + strain_glob = test_strain.format( + strain='*', + chrom='*') + log.info(f'searching for {strain_glob}') + for fname in glob.iglob(strain_glob): + # extract wildcard matches + match = re.match( + test_strain.format( + strain='(?P<strain>.*?)', + chrom='(?P<chrom>[^_]*?)' + ), + fname) + if match: + log.debug( + f'matched with {match.group("strain", "chrom")}') + strain, chrom = match.group('strain', 'chrom') + if strain not in strains: + strains[strain] = set() + strains[strain].add(chrom) + + if len(strains) == 0: + err = ('Found no chromosome sequence files ' + f'in {test_strains}') + log.exception(err) + raise ValueError(err) + + # check if requested chromosomes are within the list of chroms + chrom_set = set(self.chromosomes) + for strain, chroms in strains.items(): + if not chrom_set.issubset(chroms): + not_found = chrom_set.difference(chroms).pop() + err = (f'Strain {strain} is missing chromosomes. ' + f'Unable to find chromosome \'{not_found}\'') + log.exception(err) + raise ValueError(err) + + self.strains = list(sorted(strains.keys())) + + else: # strains set in config + self.strains = list(sorted(set(strains))) + + def _set_alignment(self, alignment: str): + ''' + Set the alignment file, checking wildcards prefix, strain and chrom. + If prefix is present, it is substituted, otherwise checks just + strain and chrom + ''' + alignment = validate(self.config, + 'paths.analysis.alignment', + 'No alignment provided', + alignment) + + check_wildcards(alignment, 'strain,chrom') + if '{prefix}' in alignment: + self.alignment = alignment.replace('{prefix}', self.prefix) + else: + self.alignment = alignment + + def get(self, key: str): + ''' + Get nested key from underlying dictionary. 
Returning none if any + key is not in dict + ''' + return get_nested(self.config, key) + + def __repr__(self): + return ('Config file:\n' + + print_dict(self.config) + + '\nSettings:\n' + + print_dict({k: v for k, v in self.__dict__.items() + if k != 'config' and k != 'variables' + and k != 'other_parsers'}) + ) + + +class Variable(): + def __init__(self, name, config_path=None, nullable=False, wildcards=None): + self.name = name + if config_path: + self.config_path = config_path + else: + self.config_path = name + + self.nullable = nullable + self.wildcards = wildcards + + def parse(self, value, config={}): + if self.nullable: + if not value: + value = get_nested(config, self.config_path) + + else: + value = validate(config, self.config_path, + f'No {self.name} provided', value) + + if self.wildcards: + check_wildcards(value, self.wildcards) + + return value + + +class Threshold_Variable(Variable): + def __init__(self): + super().__init__('threshold', 'analysis_params.threshold') + + def parse(self, value, config={}): + value = super().parse(value, config) + + try: + value = float(value) + + except ValueError: + if value != 'viterbi': + err = f'Unsupported threshold value: {value}' + log.exception(err) + raise ValueError(err) + + return value + + +class Filter_Threshold_Variable(Variable): + def __init__(self): + super().__init__('filter_threshold', + 'analysis_params.filter_threshold') + + def parse(self, value, config={}): + value = super().parse(value, config) + + try: + value = float(value) + + except (ValueError, TypeError): + err = 'Filter threshold is not a valid number' + log.exception(err) + raise ValueError(err) + + return value + + +class Convergence_Variable(Variable): + def __init__(self): + super().__init__('convergence', + 'analysis_params.convergence_threshold', + nullable=True) + + def parse(self, value, config={}): + value = super().parse(value, config) + + try: + value = float(value) + + except (ValueError, TypeError): + log.warning('No value set for convergence_threshold, using ' + 'default of 0.001') + value = 0.001 + + return value + + +class Symbols_Variable(Variable): + def __init__(self): + super().__init__('symbols', '') + + def parse(self, value, config): + ''' + Set symbols based on config values, using defaults if unset + ''' + symbols = { + 'match': '+', + 'mismatch': '-', + 'unknown': '?', + 'unsequenced': 'n', + 'gap': '-', + 'unaligned': '?', + 'masked': 'x' + } + config_symbols = get_nested(config, 'HMM_symbols') + if config_symbols is not None: + for k, v in config_symbols.items(): + if k not in symbols: + log.warning("Unused symbol in configuration: " + f"{k} -> '{v}'") + else: + symbols[k] = v + log.debug(f"Overwriting default symbol for {k} with '{v}'") + + for k, v in symbols.items(): + if k not in config_symbols: + log.warning(f'Symbol for {k} unset in config, ' + f"using default '{v}'") + + else: + for k, v in symbols.items(): + log.warning(f'Symbol for {k} unset in config, ' + f"using default '{v}'") + + return symbols diff --git a/code/analyze/main.py b/code/analyze/main.py new file mode 100644 index 0000000..bff61bd --- /dev/null +++ b/code/analyze/main.py @@ -0,0 +1,337 @@ +import click +import yaml +import logging as log +import analyze.predict +from analyze.introgression_configuration import Configuration +from analyze.id_regions import ID_producer +from analyze.summarize_region_quality import Summarizer +from analyze.filter_regions import Filterer +from analyze.summarize_strain_states import Strain_Summarizer + + +# TODO also check for 
snakemake object? +@click.group(invoke_without_command=True) +@click.option('--config', '-c', + multiple=True, + type=click.File('r'), + help='Base configuration yaml.') +@click.option('-v', '--verbosity', count=True, default=3) +@click.option('--log-file', + default='', + help='Optional log file. If unset print to stdout.') +@click.pass_context +def cli(ctx, config, verbosity, log_file): + ''' + Main entry script to run analyze methods + ''' + + verbosity -= 1 + verbosity = 4 if verbosity > 4 else verbosity + levelstr, level = [ + ('CRITICAL', log.CRITICAL), + ('ERROR', log.ERROR), + ('WARNING', log.WARNING), + ('INFO', log.INFO), + ('DEBUG', log.DEBUG), + ][verbosity] + + ctx.ensure_object(Configuration) + + confs = len(config) + for path in config: + conf = yaml.safe_load(path) + ctx.obj.add_config(conf) + + ctx.obj.set(log_file=log_file) + if ctx.obj.log_file is not None: + log.basicConfig(level=level, filename=ctx.obj.log_file, filemode='w') + else: + log.basicConfig(level=level) + log.info(f'Verbosity set to {levelstr}') + + log.info(f'Read in {confs} config file{"" if confs == 1 else "s"}') + log.debug('Cleaned config:\n' + repr(ctx.obj)) + + if ctx.invoked_subcommand is None: + click.echo_via_pager( + click.style( + 'No command supplied. Read in the following config:\n', + fg='yellow') + repr(ctx.obj)) + + +@cli.command() +@click.option('--blocks', default='', help='Block file location with {state}') +@click.option('--prefix', default='', help='Prefix of test-strain files ' + 'default to list of states joined with _.') +@click.option('--test-strains', default='', + help='Test files location with {strain} and {chrom}') +@click.option('--hmm-initial', default='', + help='Initial hmm parameter text file') +@click.option('--hmm-trained', default='', + help='Trained hmm parameter text file') +@click.option('--positions', default='', + help='Positions file, gzipped') +@click.option('--probabilities', default='', + help='Probabilities file, gzipped') +@click.option('--threshold', default='', + help='Threshold to apply to estimated path. Valid values are ' + 'floats or `viterbi\'') +@click.option('--alignment', default='', + help='Alignment file location with ' + '{prefix}, {strain}, and {chrom}') +@click.option('--only-poly-sites/--all-sites', default=True, + help='Consider only polymorphic sites or all sites. 
' 'Default is only polymorphic.') +@click.pass_context +def predict(ctx, + blocks, + prefix, + test_strains, + hmm_initial, + hmm_trained, + positions, + probabilities, + threshold, + alignment, + only_poly_sites): + config = ctx.obj + + config.set('chromosomes') + log.info(f'Found {len(config.chromosomes)} chromosomes in config') + + config.set(threshold=threshold) + log.info(f'Threshold value is \'{config.threshold}\'') + + config.set(blocks=blocks) + log.info(f'Output blocks file is \'{config.blocks}\'') + + config.set('states') + config.set(prefix=prefix) + log.info(f'Prefix is \'{config.prefix}\'') + + config.set(strains=test_strains) + if config.test_strains is None: + log.info('No test_strains provided') + else: + str_len = len(config.test_strains) + log.info(f'Found {str_len} test strain' + f'{"" if str_len == 1 else "s"}') + str_len = len(config.strains) + log.info(f'Found {str_len} unique strain' + f'{"" if str_len == 1 else "s"}') + + config.set(hmm_initial=hmm_initial, + hmm_trained=hmm_trained, + positions=positions, + probabilities=probabilities, + alignment=alignment) + log.info(f'Hmm_initial file is \'{config.hmm_initial}\'') + log.info(f'Hmm_trained file is \'{config.hmm_trained}\'') + log.info(f'Positions file is \'{config.positions}\'') + log.info(f'Probabilities file is \'{config.probabilities}\'') + log.info(f'Alignment file is \'{config.alignment}\'') + + predictor = analyze.predict.Predictor(config) + if only_poly_sites: + log.info('Only considering polymorphic sites') + else: + log.info('Considering all sites') + predictor.run_prediction(only_poly_sites) + + +@cli.command() +@click.option('--blocks', default='', help='Input block file location with {state}') +@click.option('--labeled', default='', help='Output labeled block file location with {state}') +@click.option('--state', multiple=True, help='States to add ids to') +@click.pass_context +def id_regions(ctx, blocks, labeled, state): + config = ctx.obj + config.set('chromosomes') + log.info(f'Found {len(config.chromosomes)} chromosomes in config') + + state = list(state) + config.set(states=state) + log.info(f'Found {len(config.states)} states to process') + + config.set(blocks=blocks) + log.info(f'Input blocks file is \'{config.blocks}\'') + + config.set(labeled_blocks=labeled) + log.info(f'Output blocks file is \'{config.labeled_blocks}\'') + + id_producer = ID_producer(config) + id_producer.add_ids() + + +@cli.command() +@click.option('--state', multiple=True, help='States to summarize') +@click.option('--labeled', default='', + help='Labeled block file with {state}, ' + 'created during id_regions') +@click.option('--masks', default='', + help='Mask file with {strain} and {chrom}') +@click.option('--alignment', default='', + help='Alignment file with {prefix} [optional], ' + '{strain} and {chrom}') +@click.option('--positions', default='', + help='Position file created during prediction') +@click.option('--quality', default='', + help='Output quality file with {state}') +@click.option('--region', default='', + help='Output region file with {state}, gzipped') +@click.option('--region-index', default='', + help='Output region index file with {state}, pickled') +@click.pass_context +def summarize_regions(ctx, + state, + labeled, + quality, + masks, + alignment, + positions, + region, + region_index): + config = ctx.obj + + config.set('states', + 'chromosomes') + log.info(f'Found {len(config.chromosomes)} chromosomes in config') + + config.set(labeled_blocks=labeled) + log.info(f'Labeled blocks file is \'{config.labeled_blocks}\'')
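+ + # Each config.set call in this command resolves its CLI option against + # the loaded config files: a blank option falls back to the matching + # paths.analysis entry, and a value missing from both raises ValueError + # (see Variable.parse in introgression_configuration.py).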
+ + config.set(quality_blocks=quality) + log.info(f'Quality file is \'{config.quality_blocks}\'') + + config.set(masks=masks) + log.info(f'Mask file is \'{config.masks}\'') + + config.set('prefix') + config.set(alignment=alignment) + log.info(f'Alignment file is \'{config.alignment}\'') + + config.set(positions=positions) + log.info(f'Positions file is \'{config.positions}\'') + + config.set(regions=region, region_index=region_index) + log.info(f'Region file is \'{config.regions}\'') + log.info(f'Region index file is \'{config.region_index}\'') + + config.set('symbols') + + summarizer = Summarizer(config) + summarizer.run(list(state)) + + +@cli.command() +@click.option('--thresh', help='Threshold to apply to ambiguous filter', + default=None, type=float) +@click.option('--introgress-filter', default='', + help='Filtered block file location with {state}.' + ' Contains only regions passing introgression filter') +@click.option('--introgress-inter', default='', + help='Filtered block file location with {state}.' + ' Contains all regions with reasons they failed filtering') +@click.option('--ambiguous-filter', default='', + help='Filtered block file location with {state}.' + ' Contains only regions passing ambiguous filter') +@click.option('--ambiguous-inter', default='', + help='Filtered block file location with {state}.' + ' Contains all regions passing introgression filtering, ' + 'with reasons they failed ambiguous filtering') +@click.option('--filter-sweep', default='', + help='Contains summary results for applying ambiguous filter ' + 'with various threshold values supplied as arguments.') +@click.option('--region', default='', + help='Region file with {state}, gzipped') +@click.option('--region-index', default='', + help='Region index file with {state}, pickled') +@click.option('--quality', default='', + help='Quality file with {state}') +@click.argument('thresholds', nargs=-1, type=float) +@click.pass_context +def filter_regions(ctx, + thresh, + introgress_filter, + introgress_inter, + ambiguous_filter, + ambiguous_inter, + filter_sweep, + region, + region_index, + quality, + thresholds): + config = ctx.obj # type: Configuration + config.set('states') + + config.set(filter_threshold=thresh) + log.info(f"Filter threshold set to '{config.filter_threshold}'") + + config.set(introgressed=introgress_filter, + introgressed_intermediate=introgress_inter, + ambiguous=ambiguous_filter, + ambiguous_intermediate=ambiguous_inter, + filter_sweep=filter_sweep) + log.info(f"Introgressed filtered file is '{config.introgressed}'") + log.info('Introgressed intermediate file is ' + f"'{config.introgressed_intermediate}'") + log.info(f"Ambiguous filtered file is '{config.ambiguous}'") + log.info('Ambiguous intermediate file is ' + f"'{config.ambiguous_intermediate}'") + if config.filter_sweep is not None: + log.info(f"Filter sweep file is '{config.filter_sweep}'") + + config.set(regions=region, + region_index=region_index) + log.info(f'Region file is \'{config.regions}\'') + log.info(f'Region index file is \'{config.region_index}\'') + + config.set(quality_blocks=quality) + log.info(f'Quality file is \'{config.quality_blocks}\'') + + config.set('symbols') + + thresholds = list(thresholds) + log.info(f'Threshold sweep with: {thresholds}') + + filterer = Filterer(config) + filterer.run(thresholds)
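+ + # Hypothetical invocation (names and values invented): apply the + # configured cutoff and sweep three alternatives through the positional + # 'thresholds' argument, e.g. + # introgression -c base.yaml filter-regions --thresh 0.98 0.90 0.95 0.99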
+ + +@cli.command() +@click.option('--introgress-inter', default='', + help='Filtered block file location with {state}.' + ' Contains all regions with reasons they failed filtering') +@click.option('--ambiguous-inter', default='', + help='Filtered block file location with {state}.' + ' Contains all regions passing introgression filtering, ' + 'with reasons they failed ambiguous filtering') +@click.option('--strain-info', default='', + help='Tab separated table with strain name, alternate name, ' + 'location, environment, and population') +@click.option('--state-counts', default='', + help='Output state summary file') +@click.pass_context +def summarize_strains(ctx, + introgress_inter, + ambiguous_inter, + strain_info, + state_counts): + config = ctx.obj # type: Configuration + config.set('states') + config.set(introgressed_intermediate=introgress_inter, + ambiguous_intermediate=ambiguous_inter, + strain_info=strain_info, + state_counts=state_counts) + log.info('Introgressed intermediate file is ' + f"'{config.introgressed_intermediate}'") + log.info('Ambiguous intermediate file is ' + f"'{config.ambiguous_intermediate}'") + log.info(f"Strain information from '{config.strain_info}'") + log.info(f"State counts saved to '{config.state_counts}'") + strain_summarizer = Strain_Summarizer(config) + strain_summarizer.run() + + +if __name__ == '__main__': + cli()
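The new entry points also compose outside of click. Below is a minimal sketch of the filter step, assuming a `config.yaml` that supplies the `paths.analysis` and `analysis_params` sections; the file name and threshold values are invented, and the set calls mirror the `filter_regions` command above rather than documenting a separate API:
```python
import yaml

from analyze.filter_regions import Filterer
from analyze.introgression_configuration import Configuration

# Build a Configuration the way the cli group does, from one or more
# yaml files ('config.yaml' here is a placeholder).
config = Configuration()
with open('config.yaml') as reader:
    config.add_config(yaml.safe_load(reader))

# Resolve the variables the filter step needs; blank or None values
# fall back to the entries in the config file, as in the click command.
config.set('states', 'symbols')
config.set(filter_threshold=None,
           introgressed='',
           introgressed_intermediate='',
           ambiguous='',
           ambiguous_intermediate='',
           filter_sweep='',
           regions='',
           region_index='',
           quality_blocks='')

# Apply the configured threshold and sweep two alternative cutoffs.
Filterer(config).run(thresholds=[0.95, 0.99])
```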
diff --git a/code/analyze/plotting/format_for_plot_gene_region.py b/code/analyze/plotting/format_for_plot_gene_region.py index 306576e..176912f 100644 --- a/code/analyze/plotting/format_for_plot_gene_region.py +++ b/code/analyze/plotting/format_for_plot_gene_region.py @@ -7,29 +7,23 @@ # - for gap -import re -import sys -import os -import copy import gzip -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../align/') -import align_helpers -sys.path.insert(0, '../misc/') -import read_fasta -import read_table +from align import align_helpers +from misc import read_fasta # copy pasta + def try_int(s, default=-1): try: i = int(s) return i - except: + except ValueError: return default -def referize(strain_seq, ref_ind_to_strain_ind, skip_char = 'N'): + +def referize(strain_seq, ref_ind_to_strain_ind, skip_char='N'): s = [skip_char for r in ref_ind_to_strain_ind] for i in range(len(ref_ind_to_strain_ind)): si = ref_ind_to_strain_ind[i] @@ -39,31 +33,34 @@ def referize(strain_seq, ref_ind_to_strain_ind, skip_char = 'N'): s[i] = strain_seq[si] return s -#region_start = 787000 -#region_end = 794000 -#chrm = 'II' + +# region_start = 787000 +# region_end = 794000 +# chrm = 'II' region_start = 917571 - 100 region_end = 921647 + 100 chrm = 'IV' region_length = region_end - region_start + 1 -##====== +# ====== # get strains -##====== +# ====== -strain_dirs = align_helpers.get_strains(align_helpers.flatten(gp.non_ref_dirs.values())) +strain_dirs = align_helpers.get_strains( + align_helpers.flatten(gp.non_ref_dirs.values())) num_strains = len(strain_dirs) -##====== +# ====== # loop through all strains, getting appropriate sequence -##====== +# ====== # master reference and other reference seqs master_ref = gp.alignment_ref_order[0] master_fn = gp.ref_dir[master_ref] + gp.ref_fn_prefix[master_ref] + '_chr' + \ chrm + gp.fasta_suffix -master_seq = read_fasta.read_fasta(master_fn)[1][0][region_start:region_end+1].lower() +master_seq = read_fasta.read_fasta(master_fn)[1][0][ + region_start:region_end+1].lower() other_ref = gp.alignment_ref_order[1] @@ -74,30 +71,30 @@ def referize(strain_seq, ref_ind_to_strain_ind, skip_char = 'N'): ref_ind_to_strain_ind = [try_int(line[:-1]) for line in f_coord.readlines()] other_ref_fn = gp.ref_dir[other_ref] + gp.ref_fn_prefix[other_ref] + \ '_chr' + chrm + gp.fasta_suffix -other_ref_seq = referize(read_fasta.read_fasta(other_ref_fn)[1][0].lower(), \ +other_ref_seq = referize(read_fasta.read_fasta(other_ref_fn)[1][0].lower(), ref_ind_to_strain_ind)[region_start:region_end+1] # other strains seqs = {} for i in range(num_strains): strain, d = strain_dirs[i] - print strain - coord_fn = gp.analysis_out_dir_absolute + 'coordinates/' + \ - gp.master_ref + '_to_' + strain + \ - '_chr' + chrm + '.txt.gz' + print(strain) + coord_fn = (gp.analysis_out_dir_absolute + 'coordinates/' + + gp.master_ref + '_to_' + strain + + '_chr' + chrm + '.txt.gz') f_coord = gzip.open(coord_fn, 'rb') - ref_ind_to_strain_ind = [try_int(line[:-1]) for line in f_coord.readlines()] + ref_ind_to_strain_ind = [try_int(line[:-1]) + for line in f_coord.readlines()] strain_fn = d + strain + '_chr' + chrm + gp.fasta_suffix - seqs[strain] = referize(read_fasta.read_fasta(strain_fn)[1][0].lower(), \ + seqs[strain] = referize(read_fasta.read_fasta(strain_fn)[1][0].lower(), ref_ind_to_strain_ind)[region_start:region_end+1] # write file fn = 'gene_region_variants.txt' f = open(fn, 'w') -f.write('ps\t' + '\t'.join([x[0] for x in strain_dirs]) + '\n') +f.write('ps\t' + '\t'.join([x[0] for x in strain_dirs]) + '\n') for i in range(region_length): - f.write(str(region_start + i)) for strain, d in strain_dirs: x = seqs[strain][i] @@ -119,4 +116,3 @@ def referize(strain_seq, ref_ind_to_strain_ind, skip_char = 'N'): f.write('n') f.write('\n') f.close() - diff --git a/code/analyze/plotting/format_for_plotting.py b/code/analyze/plotting/format_for_plotting.py index fca2521..d0aca70 100644 --- a/code/analyze/plotting/format_for_plotting.py +++ b/code/analyze/plotting/format_for_plotting.py @@ -1,35 +1,29 @@ # format output files to be read easily and plotted in R -import re import sys -import os -import copy import gene_predictions -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../sim/') -import sim_analyze_hmm_bw as sim -sys.path.insert(0, '../misc/') -import mystats +from misc import mystats -##====== +# ====== # read in analysis parameters -##====== +# ====== suffix = '' if len(sys.argv == 3): suffix = sys.argv[1] -all_predict_args = [x.strip().split() for x in open(sys.argv[2], 'r').readlines()] -all_predict_args = [{'tag':x[0], 'improvement_frac':x[1], 'threshold':x[2], \ - 'expected_length':x[-2], 'expected_frac':x[-1]} \ +all_predict_args = [x.strip().split() + for x in open(sys.argv[2], 'r').readlines()] +all_predict_args = [{'tag': x[0], 'improvement_frac': x[1], 'threshold': x[2], + 'expected_length':x[-2], 'expected_frac':x[-1]} for x in all_predict_args] -l = range(0,36) -l.remove(19) -l.remove(25) -l = [0] -all_predict_args = [all_predict_args[i] for i in l] +arg_inds = list(range(0, 36)) +arg_inds.remove(19) +arg_inds.remove(25) +arg_inds = [0] +all_predict_args = [all_predict_args[i] for i in arg_inds] ''' finished = range(1,36) @@ -49,39 +43,42 @@ sep = '\t' -##====== +# ====== # for plot: lengths of all introgressed regions -##====== +# ====== # one table for each tag # strain chrm region_length # one table for all tags -# tag improvement_frac threshold expected_length expected_frac avg_length lower upper median min max total_num_regions +# tag improvement_frac threshold expected_length expected_frac +# avg_length lower upper median min max total_num_regions -print 'working on region lengths' +print('working on region lengths') f = open(gp.analysis_out_dir_absolute + 'plot_region_lengths.txt', 'w') for i in 
range(len(all_predict_args)): - print '-', i + print('-', i) args = all_predict_args[i] - f_tag = open(gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'plot_region_lengths' + suffix + '_' + args['tag'] + '.txt', 'w') - fn = gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'introgressed_blocks_par' + suffix + '_' + args['tag'] + '_summary_plus.txt' + f_tag = open(gp.analysis_out_dir_absolute + args['tag'] + '/' + + 'plot_region_lengths' + suffix + '_' + args['tag'] + + '.txt', 'w') + fn = (gp.analysis_out_dir_absolute + args['tag'] + '/' + + 'introgressed_blocks_par' + suffix + '_' + + args['tag'] + '_summary_plus.txt') region_summary = gene_predictions.read_region_summary(fn) lengths_all = [] for region in region_summary: length = int(region_summary[region]['end']) - \ int(region_summary[region]['start']) + 1 if int(region_summary[region]['number_match_ref2_not_ref1']) >= 5: - f_tag.write(region + sep + region_summary[region]['strain'] + sep + \ - region_summary[region]['chromosome'] + sep + \ + f_tag.write(region + sep + region_summary[region]['strain'] + sep + + region_summary[region]['chromosome'] + sep + str(length) + '\n') lengths_all.append(length) f_tag.close() - f.write(args['tag'] + sep + args['improvement_frac'] + sep + \ - args['threshold'] + sep + args['expected_length'] + sep + \ + f.write(args['tag'] + sep + args['improvement_frac'] + sep + + args['threshold'] + sep + args['expected_length'] + sep + args['expected_frac'] + sep) f.write(str(mystats.mean(lengths_all)) + sep) bs_lower, bs_upper = mystats.bootstrap(lengths_all) @@ -92,43 +89,44 @@ f.write(str(len(lengths_all)) + '\n') f.close() -print 'done' +print('done') sys.exit() -##====== +# ====== # for plot: number of genes per introgressed region -##====== +# ====== # one table for each tag # strain chrm region number_genes # one table for all tags -# tag improvement_frac threshold expected_length expected_frac avg_genes_per_region lower upper median min max +# tag improvement_frac threshold expected_length expected_frac +# avg_genes_per_region lower upper median min max -print 'working on number of genes for each region' +print('working on number of genes for each region') f = open(gp.analysis_out_dir_absolute + 'plot_number_genes_by_region.txt', 'w') for i in range(len(all_predict_args)): - print '-', i + print('-', i) args = all_predict_args[i] - f_tag = open(gp.analysis_out_dir_absolute + args['tag'] + '/' + \ + f_tag = open(gp.analysis_out_dir_absolute + args['tag'] + '/' + 'plot_number_genes_by_region_' + args['tag'] + '.txt', 'w') - fn = gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'genes_for_each_region_' + args['tag'] + '.txt' + fn = (gp.analysis_out_dir_absolute + args['tag'] + '/' + + 'genes_for_each_region_' + args['tag'] + '.txt') genes = gene_predictions.read_genes_for_each_region_summary(fn) - fn = gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'introgressed_blocks_par_' + args['tag'] + '_summary.txt' + fn = (gp.analysis_out_dir_absolute + args['tag'] + '/' + + 'introgressed_blocks_par_' + args['tag'] + '_summary.txt') region_summary = gene_predictions.read_region_summary(fn) num_genes_all = [] for region in genes: - f_tag.write(region + sep + region_summary[region]['strain'] + sep + \ - region_summary[region]['chromosome'] + sep + \ + f_tag.write(region + sep + region_summary[region]['strain'] + sep + + region_summary[region]['chromosome'] + sep + genes[region]['num_genes'] + '\n') num_genes_all.append(int(genes[region]['num_genes'])) f_tag.close() - f.write(args['tag'] + sep 
+ args['improvement_frac'] + sep + \ - args['threshold'] + sep + args['expected_length'] + sep + \ + f.write(args['tag'] + sep + args['improvement_frac'] + sep + + args['threshold'] + sep + args['expected_length'] + sep + args['expected_frac'] + sep) f.write(str(mystats.mean(num_genes_all)) + sep) bs_lower, bs_upper = mystats.bootstrap(num_genes_all) @@ -138,91 +136,96 @@ f.write(str(max(num_genes_all)) + '\n') f.close() -print 'done' +print('done') -##====== +# ====== # for plot: number of introgressed bases for each strain -##====== +# ====== # one table for all tags -# tag improvement_frac threshold expected_length expected_frac strain number_bases +# tag improvement_frac threshold expected_length +# expected_frac strain number_bases -print 'working on number of bases for each strain' +print('working on number of bases for each strain') -f = open(gp.analysis_out_dir_absolute + \ +f = open(gp.analysis_out_dir_absolute + 'plot_number_introgressed_bases_by_strain.txt', 'w') for i in range(len(all_predict_args)): - print '-', i + print('-', i) args = all_predict_args[i] - fn = gp.analysis_out_dir_absolute + '/' + args['tag'] + '/' + \ - 'regions_for_each_strain_' + args['tag'] + '.txt' + fn = (gp.analysis_out_dir_absolute + '/' + args['tag'] + '/' + + 'regions_for_each_strain_' + args['tag'] + '.txt') regions = gene_predictions.read_regions_for_each_strain(fn) for strain in regions: total = 0 for r in regions[strain]['region_list']: total += int(r[1]) - f.write(args['tag'] + sep + args['improvement_frac'] + sep + \ - args['threshold'] + sep + args['expected_length'] + sep + \ + f.write(args['tag'] + sep + args['improvement_frac'] + sep + + args['threshold'] + sep + args['expected_length'] + sep + args['expected_frac'] + sep + strain + sep + str(total) + '\n') f.close() -print 'done' +print('done') -##====== +# ====== # for plot: number of introgressed genes for each strain -##====== +# ====== # one table for all tags -# tag improvement_frac threshold expected_length expected_frac strain number_genes +# tag improvement_frac threshold expected_length +# expected_frac strain number_genes -print 'working on number of genes for each strain' +print('working on number of genes for each strain') -f = open(gp.analysis_out_dir_absolute + \ +f = open(gp.analysis_out_dir_absolute + 'plot_number_introgressed_genes_by_strain.txt', 'w') for i in range(len(all_predict_args)): - print '-', i + print('-', i) args = all_predict_args[i] - fn = gp.analysis_out_dir_absolute + '/' + args['tag'] + '/' + \ - 'genes_for_each_strain_' + args['tag'] + '.txt' + fn = (gp.analysis_out_dir_absolute + '/' + args['tag'] + '/' + + 'genes_for_each_strain_' + args['tag'] + '.txt') genes = gene_predictions.read_genes_for_each_strain(fn) for strain in genes: - f.write(args['tag'] + sep + args['improvement_frac'] + sep + \ - args['threshold'] + sep + args['expected_length'] + sep + \ - args['expected_frac'] + sep + strain + sep + \ + f.write(args['tag'] + sep + args['improvement_frac'] + sep + + args['threshold'] + sep + args['expected_length'] + sep + + args['expected_frac'] + sep + strain + sep + genes[strain]['num_genes'] + sep + '\n') f.close() -print 'done' +print('done') -##====== -# for plot: number of strains each gene introgressed in -##====== +# ====== +# for plot: number of strains each gene introgressed in +# ====== # one table for each tag # gene num_strains # one table for all tags -# tag improvement_frac threshold expected_length expected_frac avg_strains_per_gene lower upper median min max total_num_genes 
total_num_genes_1 total_num_genes_>1 +# tag improvement_frac threshold expected_length expected_frac +# avg_strains_per_gene lower upper median min max total_num_genes +# total_num_genes_1 total_num_genes_>1 -print 'working on number of strains for each gene' +print('working on number of strains for each gene') -f = open(gp.analysis_out_dir_absolute + 'plot_number_strains_by_genes.txt', 'w') +f = open(gp.analysis_out_dir_absolute + 'plot_number_strains_by_genes.txt', + 'w') for i in range(len(all_predict_args)): - print '-', i + print('-', i) args = all_predict_args[i] - f_tag = open(gp.analysis_out_dir_absolute + args['tag'] + '/' + \ + f_tag = open(gp.analysis_out_dir_absolute + args['tag'] + '/' + 'plot_number_strains_by_genes_' + args['tag'] + '.txt', 'w') - fn = gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'strains_for_each_gene_' + args['tag'] + '.txt' + fn = (gp.analysis_out_dir_absolute + args['tag'] + '/' + + 'strains_for_each_gene_' + args['tag'] + '.txt') strains = gene_predictions.read_strains_for_each_gene(fn) num_strains_all = [] for gene in strains: f_tag.write(gene + sep + strains[gene]['num_strains'] + '\n') num_strains_all.append(int(strains[gene]['num_strains'])) f_tag.close() - f.write(args['tag'] + sep + args['improvement_frac'] + sep + \ - args['threshold'] + sep + args['expected_length'] + sep + \ + f.write(args['tag'] + sep + args['improvement_frac'] + sep + + args['threshold'] + sep + args['expected_length'] + sep + args['expected_frac'] + sep) f.write(str(mystats.mean(num_strains_all)) + sep) bs_lower, bs_upper = mystats.bootstrap(num_strains_all) @@ -235,24 +238,25 @@ f.write(str(len(filter(lambda x: x > 1, num_strains_all))) + '\n') f.close() -print 'done' +print('done') -##====== -# for plot: average fraction of each (introgressed) gene that's introgressed -##====== +# ====== +# for plot: average fraction of each (introgressed) gene that's introgressed +# ====== # one table for each tag # gene avg_frac_introgressed lower upper median min max -print 'working on fraction of gene introgressed' +print('working on fraction of gene introgressed') for i in range(len(all_predict_args)): - print '-', i + print('-', i) args = all_predict_args[i] - f_tag = open(gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'plot_frac_introgressed_by_genes_' + args['tag'] + '.txt', 'w') - fn = gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'strains_for_each_gene_' + args['tag'] + '.txt' + f_tag = open(gp.analysis_out_dir_absolute + args['tag'] + '/' + + 'plot_frac_introgressed_by_genes_' + args['tag'] + '.txt', + 'w') + fn = (gp.analysis_out_dir_absolute + args['tag'] + '/' + + 'strains_for_each_gene_' + args['tag'] + '.txt') strains = gene_predictions.read_strains_for_each_gene(fn) for gene in strains: fracs = [float(x[1]) for x in strains[gene]['strain_list']] @@ -265,4 +269,4 @@ f_tag.write(str(max(fracs)) + '\n') f_tag.close() -print 'done' +print('done') diff --git a/code/analyze/plotting/format_for_plotting2.py b/code/analyze/plotting/format_for_plotting2.py index 3362e63..4d9eaca 100644 --- a/code/analyze/plotting/format_for_plotting2.py +++ b/code/analyze/plotting/format_for_plotting2.py @@ -1,14 +1,6 @@ -import re import sys -import os -import copy -import gene_predictions -sys.path.insert(0, '..') +from analyze.to_update import gene_predictions import global_params as gp -sys.path.insert(0, '../sim/') -import sim_analyze_hmm_bw as sim -sys.path.insert(0, '../misc/') -import mystats tag = sys.argv[1] @@ -16,15 +8,15 @@ if len(sys.argv == 3): suffix = 
sys.argv[2] -fn = gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'introgressed_blocks_par' + suffix + '_' + args['tag'] + '_summary_plus.txt' +fn = gp.analysis_out_dir_absolute + tag + '/' + \ + 'introgressed_blocks_par' + suffix + '_' + tag + '_summary_plus.txt' region_summary = gene_predictions.read_region_summary(fn) sep = '\t' -##====== +# ====== # for plot: lengths of all introgressed regions -##====== +# ====== # one table for each tag # strain chrm region_length diff --git a/code/analyze/plotting/format_for_plotting_region.py b/code/analyze/plotting/format_for_plotting_region.py index b6454dc..15cebad 100644 --- a/code/analyze/plotting/format_for_plotting_region.py +++ b/code/analyze/plotting/format_for_plotting_region.py @@ -1,24 +1,20 @@ -import gene_predictions +from analyze.to_update import gene_predictions import sys import os import gzip -sys.path.insert(0, '../misc/') -import read_fasta import global_params as gp -sys.path.insert(0, '../sim/') + def read_annotated_alignment(fn, nstrains): f = gzip.open(fn, 'rb') lines = f.readlines() f.close() - strains = [l[:-1] for l in lines[:nstrains]] genes = lines[nstrains + 2][len('genes:'):-1].split() - + x = 11 match_cer = '' match_par = '' gene = '' - gene_ind = -1 intd = '' while x < len(lines): @@ -38,13 +34,12 @@ def read_annotated_alignment(fn, nstrains): return match_cer, match_par, gene, genes, intd -def write_ps_annotated(match_cer, match_par, gene, glist, intd, region, fn): +def write_ps_annotated(match_cer, match_par, gene, glist, intd, region, fn): f = open(fn, 'w') f.write('ps\tmatch\tintd\tgene\n') - block_start = int(region['start']) - intd.index('I') - block_end = len(intd) - intd.rindex('I') + int(region['end']) + block_start = int(region['start']) - intd.index('I') out_of_gene = True gene_ind = -1 @@ -64,11 +59,13 @@ def write_ps_annotated(match_cer, match_par, gene, glist, intd, region, fn): f.write('\n') f.close() + tag = sys.argv[1] region = sys.argv[2] -blocks_fn = gp.analysis_out_dir_absolute + tag + '/' + \ - 'introgressed_blocks_filtered_' + 'par' + '_' + tag + '_summary.txt' +blocks_fn = (gp.analysis_out_dir_absolute + tag + '/' + + 'introgressed_blocks_filtered_' + 'par' + + '_' + tag + '_summary.txt') r = gene_predictions.read_region_summary(blocks_fn) strain = r[region]['strain'] chrm = r[region]['chromosome'] @@ -86,6 +83,5 @@ def write_ps_annotated(match_cer, match_par, gene, glist, intd, region, fn): write_ps_annotated(match_cer, match_par, gene, glist, intd, r[region], fn_out) -#probs_f = gzip.open(gp.analysis_out_dir_absolute + tag + '/' + \ +# probs_f = gzip.open(gp.analysis_out_dir_absolute + tag + '/' + \ # 'probs_' + tag + '.txt.gz', 'rb') - diff --git a/code/analyze/plotting/format_polymorphism_for_r.py b/code/analyze/plotting/format_polymorphism_for_r.py index 58d10ce..3665d0d 100644 --- a/code/analyze/plotting/format_polymorphism_for_r.py +++ b/code/analyze/plotting/format_polymorphism_for_r.py @@ -1,7 +1,5 @@ # lol because i'm so bad at R -import sys -sys.path.insert(0, '..') import global_params as gp tag = 'u3_i.001_tv_l1000_f.01' @@ -16,7 +14,7 @@ d2_sums = {} for line in lines[1:]: chrm = line[0] - if not d_sums.has_key(chrm): + if chrm not in d_sums: d_sums[chrm] = 0 d2_sums[chrm] = 0 d[chrm] = {} @@ -53,24 +51,24 @@ fab += c try: fo = str(float(fo)/d_sums[chrm]) - except: + except ValueError: fo = 'NaN' try: fob = str(float(fob)/d2_sums[chrm]) - except: + except ValueError: fob = 'NaN' try: fa = str(float(fa)/d_sums[chrm]) - except: + except ValueError: fa = 'NaN' try: fab = 
str(float(fab)/d2_sums[chrm]) - except: + except ValueError: fab = 'NaN' - f.write(chrm + '\tone\tpolymorphic\t' + fo + '\t' + str(d_sums[chrm]) + '\n') - f.write(chrm + '\tone\tbiallelic\t' + fob + '\t' + str(d2_sums[chrm]) + '\n') - f.write(chrm + '\tall\tpolymorphic\t' + fa + '\t' + str(d_sums[chrm]) + '\n') - f.write(chrm + '\tall\tbiallelic\t' + fab + '\t' + str(d2_sums[chrm]) + '\n') + f.write(f'{chrm}\tone\tpolymorphic\t{fo}\t{d_sums[chrm]}\n') + f.write(f'{chrm}\tone\tbiallelic\t{fob}\t{d2_sums[chrm]}\n') + f.write(f'{chrm}\tall\tpolymorphic\t{fa}\t{d_sums[chrm]}\n') + f.write(f'{chrm}\tall\tbiallelic\t{fab}\t{d2_sums[chrm]}\n') f.close() diff --git a/code/analyze/predict.py b/code/analyze/predict.py index f6eef19..1e48ef4 100644 --- a/code/analyze/predict.py +++ b/code/analyze/predict.py @@ -1,366 +1,700 @@ import copy import gzip +import os import itertools +import click from collections import defaultdict, Counter from hmm import hmm_bw from sim import sim_predict from sim import sim_process -import global_params as gp -from misc import read_fasta import numpy as np +from typing import List, Dict, Tuple, TextIO +from contextlib import ExitStack +import logging as log +from misc.read_fasta import read_fasta +from analyze.introgression_configuration import Configuration + + +class Predictor(): + ''' + Predictor class + Stores all variables needed to run an HMM prediction + ''' + def __init__(self, configuration: Configuration): + self.config = configuration + + def run_prediction(self, only_poly_sites=True): + ''' + Run prediction with this predictor object + ''' + self.validate_arguments() + + hmm_builder = HMM_Builder(self.config) + hmm_builder.set_expected_values() + self.emission_symbols = \ + hmm_builder.update_emission_symbols(len(self.config.known_states)) + + with open(self.config.hmm_initial, 'w') as initial, \ + open(self.config.hmm_trained, 'w') as trained, \ + gzip.open(self.config.probabilities, 'wt') as probabilities, \ + gzip.open(self.config.positions, 'wt') as positions, \ + ExitStack() as stack: + + self.write_hmm_header(initial) + self.write_hmm_header(trained) + + block_writers = {state: + stack.enter_context( + open(self.config.blocks.format( + state=state), 'w')) + for state in + self.config.states} + for writer in block_writers.values(): + self.write_blocks_header(writer) + + counter = 0 + total = len(self.config.chromosomes) * len(self.config.strains) + progress_bar = None + if self.config.log_file: # logging to file + progress_bar = stack.enter_context( + click.progressbar( + length=total, + label='Running prediction')) + + for chrom in self.config.chromosomes: + for strain in self.config.strains: + counter += 1 + log.info(f'working on: {strain} {chrom} ' + f'({counter} of {total})') + + # get sequences and encode + alignment_file = self.config.alignment.format( + strain=strain, chrom=chrom) + + if not os.path.exists(alignment_file): + log.info(f'skipping, file {alignment_file} not found') + else: + hmm_initial, hmm_trained, pos = hmm_builder.run_hmm( + alignment_file, only_poly_sites) + + self.write_hmm(hmm_initial, initial, strain, chrom) + self.write_hmm(hmm_trained, trained, strain, chrom) + + # process and threshold hmm result + predicted_states, probs = self.process_path( + hmm_trained) + state_blocks = self.convert_to_blocks(predicted_states) + + self.write_positions(pos, positions, strain, chrom) + + for state, block in state_blocks.items(): + self.write_blocks(block, + pos, + block_writers[state], + strain, + chrom, + state) + + 
self.write_state_probs(probs, probabilities, + strain, chrom) + + if progress_bar: + progress_bar.update(1) + + def validate_arguments(self): + ''' + Check that all required instance variables are set to perform a + prediction run. Returns true if valid, raises value error otherwise + ''' + args = [ + 'chromosomes', + 'blocks', + 'prefix', + 'strains', + 'hmm_initial', + 'hmm_trained', + 'probabilities', + 'positions', + 'alignment', + 'known_states', + 'unknown_states', + 'threshold', + ] + variables = self.config.__dict__ + for arg in args: + if arg not in variables or variables[arg] is None: + err = ('Failed to validate Predictor, required argument ' + f"'{arg}' was unset") + log.exception(err) + raise ValueError(err) + + # check the parameters for each state are present + known_states = self.config.get('analysis_params.known_states') + if known_states is None: + err = 'Configuration did not provide any known_states' + log.exception(err) + raise ValueError(err) + + for s in known_states: + if 'expected_length' not in s: + err = f'{s["name"]} did not provide an expected_length' + log.exception(err) + raise ValueError(err) + if 'expected_fraction' not in s: + err = f'{s["name"]} did not provide an expected_fraction' + log.exception(err) + raise ValueError(err) + + unknown_states = self.config.get('analysis_params.unknown_states') + if unknown_states is not None: + for s in unknown_states: + if 'expected_length' not in s: + err = f'{s["name"]} did not provide an expected_length' + log.exception(err) + raise ValueError(err) + if 'expected_fraction' not in s: + err = f'{s["name"]} did not provide an expected_fraction' + log.exception(err) + raise ValueError(err) + + reference = self.config.get('analysis_params.reference') + if reference is None: + err = f'Configuration did not specify a reference strain' + log.exception(err) + raise ValueError(err) + + return True + + def write_hmm_header(self, writer: TextIO) -> None: + ''' + Write the header line for an hmm file to the provided textIO object + Output is tab delimited with: + strain chromosome initial_probs emissions transitions + ''' + + writer.write('strain\tchromosome\t') + + states = self.config.states + + writer.write('\t'.join( + [f'init_{s}' for s in states] + # initial + [f'emis_{s}_{symbol}' + for s in states + for symbol in self.emission_symbols] + # emissions + [f'trans_{s1}_{s2}' + for s1 in states + for s2 in states])) # transitions -def read_aligned_seqs(fn, strain): - headers, seqs = read_fasta.read_fasta(fn) - d = {} - for i in range(len(seqs)): - name = headers[i][1:].split(' ')[0] - d[name] = seqs[i] - - ref_seqs = [] - for ref in gp.alignment_ref_order: - ref_seqs.append(d[ref]) - predict_seq = d[strain] - - return ref_seqs, predict_seq - - -def set_expectations(args, n): - - species_to = args['known_states'][0] - species_from = args['known_states'][1:] - - args['expected_num_tracts'] = {} - args['expected_bases'] = {} - for s in species_from: - args['expected_num_tracts'][s] = \ - args['expected_frac'][s] * n / args['expected_length'][s] - args['expected_bases'][s] = args['expected_num_tracts'][s] * \ - args['expected_length'][s] - - args['expected_bases'][species_to] = \ - n - sum([args['expected_bases'][s] for s in species_from]) - - args['expected_num_tracts'][species_to] = \ - sum([args['expected_num_tracts'][s] for s in species_from]) + 1 - - args['expected_length'][species_to] = \ - args['expected_bases'][species_to] / args['expected_num_tracts'][species_to] - - -def ungap_and_code(predict_seq, ref_seqs, 
index_ref=0): - # index_ref is index of reference strain to index relative to - # build character array - sequences = np.array([list(predict_seq)] + - [list(r) for r in ref_seqs]) - - isbase = sequences != gp.gap_symbol - - # make boolean for valid characters - isvalid = np.logical_and(sequences != gp.gap_symbol, - sequences != gp.unsequenced_symbol) - - # positions are where everything is valid, index where the reference is - # valid. The +1 removes the predict sequence at index 0 - positions = np.where( - np.all(isvalid[:, isbase[index_ref+1, :]], axis=0))[0] - - matches = np.where(sequences[0] == sequences[1:], - gp.match_symbol, - gp.mismatch_symbol) - - # 1: indexing removes currently examined sequence - matches = [''.join(row) - for row in np.transpose(matches[:, np.all(isvalid, axis=0)])] - - # NOTE list is for unit test comparisons - return matches, positions - - -def poly_sites(sequences, positions): - seq_len = len(sequences[0]) - # check if seq only contains match_symbol - retain = np.vectorize( - lambda x: x.count(gp.match_symbol) != seq_len)(sequences) - indices = np.where(retain)[0] - ps_poly = [positions[i] for i in indices] - seq_poly = [sequences[i] for i in indices] - - return seq_poly, ps_poly - - -def get_symbol_freqs(sequence): - - individual = [] - weighted = [] - - symbols = defaultdict(int, Counter(sequence)) - total = len(sequence) - for k in symbols: - symbols[k] /= total - - sequence = np.array([list(s) for s in sequence]) - - # look along species - for s in np.transpose(sequence): - s = ''.join(s) - counts = Counter(s) - weighted.append(counts[gp.match_symbol]) - total = sum(counts.values()) - for k in counts: - counts[k] /= total - individual.append(defaultdict(int, counts)) - - total = sum(weighted) - weighted = [w / total for w in weighted] - return individual, symbols, weighted - - -def initial_probabilities(known_states, unknown_states, - expected_frac, weighted_match_freqs): - - init = [] - expectation_weight = .9 - for s, state in enumerate(known_states): - expected = expected_frac[state] - estimated = weighted_match_freqs[s] - init.append(expected * expectation_weight + - estimated * (1 - expectation_weight)) - - for state in unknown_states: - expected_frac = expected_frac[state] - init.append(expected_frac) - - return init / np.sum(init) - - -def emission_probabilities(known_states, unknown_states, symbols): - - probabilities = { - gp.mismatch_symbol + gp.match_symbol: 0.9, - gp.match_symbol + gp.match_symbol: 0.09, - gp.mismatch_symbol + gp.mismatch_symbol: 0.009, - gp.match_symbol + gp.mismatch_symbol: 0.001, - } - - mismatch_bias = .99 - - num_per_category = 2 ** (len(known_states) - 2) - for key in probabilities: - probabilities[key] *= num_per_category - - # for known states - symbol_array = np.array([list(s) for s in symbols], dtype=' None: + ''' + Write header line to tab delimited block file: + strain chromosome predicted_species start end num_sites_hmm + ''' + # NOTE: num_sites_hmm represents the sites considered by the HMM, + # so it might exclude non-polymorphic sites in addition to gaps + writer.write('\t'.join(['strain', + 'chromosome', + 'predicted_species', + 'start', + 'end', + 'num_sites_hmm']) + + '\n') + + def write_blocks(self, + state_seq_blocks: List[Tuple[int, int]], + positions: np.array, + writer: TextIO, + strain: str, + chrm: str, + species_pred: str) -> None: + ''' + Write entry into tab delimited block file, with columns: + strain chromosome predicted_species start end num_sites_hmm + ''' + writer.write('\n'.join( + 
['\t'.join([strain,
+                        chrm,
+                        species_pred,
+                        str(positions[start]),
+                        str(positions[end]),
+                        str(end - start + 1)])
+             for start, end in state_seq_blocks]))
+        if state_seq_blocks:  # ensure ends with \n
+            writer.write('\n')
+
+    def write_positions(self,
+                        positions: np.array,
+                        writer: TextIO,
+                        strain: str,
+                        chrm: str) -> None:
+        '''
+        Write the positions of the specific strain, chromosome as a line to
+        the provided textIO object
+        '''
+        writer.write(f'{strain}\t{chrm}\t' +
+                     '\t'.join([str(x) for x in positions]) + '\n')
+
+    def write_state_probs(self,
+                          probs: Dict[str, List[float]],
+                          writer: TextIO,
+                          strain: str,
+                          chrm: str) -> None:
+        '''
+        Write the probability of each state to the supplied textIO object
+        Output is tab delimited with:
+        strain chrom state1:prob1,prob2,...,probn state2...
+        '''
+        writer.write(f'{strain}\t{chrm}\t')
+
+        writer.write('\t'.join(
+            [f'{state}:' +
+             ','.join([f'{site[i]:.5f}' for site in probs])
+             for i, state in enumerate(self.config.states)]))
-    # also should we care about number of tracts rather than fraction
-    # of genome? maybe theoretically, but that number is a lot more
-    # suspect
+        writer.write('\n')
-    states = known_states + unknown_states
+    def process_path(self, hmm: hmm_bw.HMM) -> Tuple[List[str], np.array]:
+        '''
+        Process the hmm path based on the predictor threshold value.
+        Return the predicted states and the probabilities of the master
+        reference sequence
+        '''
+        probabilities = hmm.posterior_decoding()[0]
+
+        # posterior
+        if type(self.config.threshold) is float:
+            path, path_probs = sim_process.get_max_path(probabilities,
+                                                        hmm.hidden_states)
+            path_t = sim_process.threshold_predicted(
+                path,
+                path_probs,
+                self.config.threshold,
+                self.config.known_states[0])
+
+            return path_t, probabilities
-    fractions = np.array([expected_frac[s] for s in states])
-    lengths = 1/np.array([expected_length[s] for s in states])
-
-    # general case,
-    # trans[i,j] = 1/ length[i] * expected[j] * 1 /(1 - fraction[i])
-    transitions = np.outer(
-        np.multiply(lengths, 1/(1-fractions)),
-        fractions)
-    # when i == j, trans[i,j] = 1 - 1/length[i]
-    np.fill_diagonal(transitions, 1-lengths)
-
-    # normalize
-    return transitions / transitions.sum(axis=1)[:, None]
-
-
-def initial_hmm_parameters(seq, known_states, unknown_states,
-                           expected_frac, expected_length):
-
-    # get frequencies of individual symbols (e.g. '+') and all full
-    # combinations of symbols (e.g.
'+++-') - individual_symbol_freqs, symbol_freqs, weighted_match_freqs = get_symbol_freqs(seq) - - init = initial_probabilities(known_states, unknown_states, - expected_frac, weighted_match_freqs) - emis = emission_probabilities(known_states, unknown_states, symbol_freqs.keys()) - trans = transition_probabilities(known_states, unknown_states, - expected_frac, expected_length) - - # new Hidden Markov Model - hmm = hmm_bw.HMM() - - hmm.set_initial_p(init) - hmm.set_emissions(emis) - hmm.set_transitions(trans) - return hmm - - -def predict_introgressed(ref_seqs, predict_seq, predict_args, - train=True, only_poly_sites=True, - return_positions=False): - - # code sequence by which reference it matches at each site; - # positions are relative to master (first) reference sequence - seq_coded, positions = ungap_and_code(predict_seq, ref_seqs) - if only_poly_sites: - seq_coded, positions = poly_sites(seq_coded, positions) - if return_positions: - return positions - - # sets expected number of tracts and bases for each reference - # based on expected length of introgressed tracts and expected - # total fraction of genome - set_expectations(predict_args, len(predict_seq)) - - # set initial hmm parameters based on combination of (1) initial - # expectations (length of introgressed tract and fraction of - # genome/total number tracts and bases) and (2) number of sites at - # which predict seq matches each reference - hmm = initial_hmm_parameters(seq_coded, - predict_args['known_states'], - predict_args['unknown_states'], - predict_args['expected_frac'], - predict_args['expected_length']) - - # make predictions - - # set states and initial probabilties - hmm.set_hidden_states(predict_args['states']) - - # copy before setting observations to save memory - hmm_init = copy.deepcopy(hmm) - - # set obs - hmm.set_observations([seq_coded]) - - # optional Baum-Welch parameter estimation - if train: - hmm.train(predict_args['improvement_frac']) - - p = hmm.posterior_decoding() - path, path_probs = sim_process.get_max_path(p[0], hmm.hidden_states) - - # posterior - if type(predict_args['threshold']) is float: - path_t = sim_process.threshold_predicted(path, path_probs, - predict_args['threshold'], - predict_args['states'][0]) - return path_t, p[0], hmm, hmm_init, positions - - else: - hmm.set_observations([seq_coded]) - predicted = sim_predict.convert_predictions(hmm.viterbi(), - predict_args['states']) - return predicted, p[0], hmm, hmm_init, positions - - -def convert_to_blocks(state_seq, states): - # single individual state sequence - blocks = {} - for state in states: - blocks[state] = [] - prev_species = state_seq[0] - block_start = 0 - block_end = 0 - for i in range(len(state_seq)): - if state_seq[i] == prev_species: - block_end = i else: - blocks[prev_species].append((block_start, block_end)) - block_start = i - block_end = i - prev_species = state_seq[i] - # add last block - if prev_species not in blocks: - blocks[prev_species] = [] - blocks[prev_species].append((block_start, block_end)) - - return blocks - - -def write_positions(ps, writer, strain, chrm): - writer.write(f'{strain}\t{chrm}\t' + - '\t'.join([str(x) for x in ps]) + '\n') - - -def read_positions(fn): - # dictionary keyed by strain and then chromosome - with gzip.open(fn, 'rb') as reader: + predicted = sim_predict.convert_predictions(hmm.viterbi(), + self.config.states) + return predicted, probabilities + + def convert_to_blocks(self, + state_seq: List[str]) -> Dict[ + str, List[Tuple[int, int]]]: + ''' + Convert a list of sequences into a 
structure of start and end positions.
+        The returned structure is a dict keyed on species, whose values are
+        Lists of blocks, each a tuple of start and end positions
+        '''
+        # single individual state sequence
+        blocks = {}
+        for state in self.config.states:
+            blocks[state] = []
+        prev_species = state_seq[0]
+        block_start = 0
+        block_end = 0
+        for i in range(len(state_seq)):
+            if state_seq[i] == prev_species:
+                block_end = i
+            else:
+                blocks[prev_species].append((block_start, block_end))
+                block_start = i
+                block_end = i
+                prev_species = state_seq[i]
+        # add last block
+        if prev_species not in blocks:
+            blocks[prev_species] = []
+        blocks[prev_species].append((block_start, block_end))
+
+        return blocks
+
+
+class HMM_Builder():
+    def __init__(self, configuration: Configuration):
+        self.config = configuration
+        self.config.set('symbols')
+        self.symbols = self.config.symbols
+        self.config.set('convergence')
+
+    def update_emission_symbols(self, repeats: int):
+        '''
+        Generate all permutations of match and mismatch symbols with
+        repeats number of characters, in lexicographical order.
+        Sets internal state and returns the emission symbols
+        '''
+        syms = [self.symbols['match'], self.symbols['mismatch']]
+        emis_symbols = [''.join(x) for x in
+                        itertools.product(syms,
+                                          repeat=repeats)]
+        emis_symbols.sort()
+        self.emission_symbols = emis_symbols
+        return emis_symbols
+
+    def get_symbol_freqs(self, sequence: np.array) -> Tuple[Dict, List]:
+        '''
+        Calculate metrics from the provided, coded sequence
+        Returns:
+        the fraction of each matching pattern (e.g. +--++)
+        the weighted fraction of matches for each species
+        '''
+
+        weighted = []
+
+        symbols = defaultdict(int, Counter(sequence))
+        total = len(sequence)
+        for k in symbols:
+            symbols[k] /= total
+
+        sequence = np.array([list(s) for s in sequence])
+
+        # look along species
+        for s in np.transpose(sequence):
+            s = ''.join(s)
+            counts = Counter(s)
+            weighted.append(counts[self.symbols['match']])
+
+        total = sum(weighted)
+        weighted = [w / total for w in weighted]
+        return symbols, weighted
+
+    def set_expected_values(self):
+        '''
+        Get expected lengths and fractions for each state.
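+        As an illustrative example (hypothetical values, not taken from any
+        shipped config): a known state entry
+        {'name': 'par', 'expected_length': 10000, 'expected_fraction': 0.025}
+        sets expected_lengths['par'] = 10000 and
+        expected_fractions['par'] = 0.025, and the reference state receives
+        the leftover fraction, 1 - sum of the other states' fractions.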
+ Assumes config has been validated by Predictor prior to running + ''' + self.expected_lengths = {} + self.expected_fractions = {} + known_states = self.config.get('analysis_params.known_states') + for state in known_states: + self.expected_lengths[state['name']] = state['expected_length'] + self.expected_fractions[state['name']] = state['expected_fraction'] + + unknown_states = self.config.get('analysis_params.unknown_states') + for state in unknown_states: + self.expected_lengths[state['name']] = state['expected_length'] + self.expected_fractions[state['name']] = state['expected_fraction'] + + reference = self.config.get('analysis_params.reference') + # expected fraction of reference is the remainder after other states + # are specified + self.expected_fractions[reference['name']] =\ + 1 - sum(self.expected_fractions.values()) + + self.ref_state = self.config.get('analysis_params.reference.name') + self.known_states = self.config.known_states + self.unknown_states = self.config.unknown_states + + # have to remove effect of unknown of these values for later + self.ref_fraction = self.expected_fractions[self.ref_state] + \ + sum([self.expected_fractions[s] for s in self.unknown_states]) + # sum of fraction / length, or 1 / tract length + self.other_sum = sum([self.expected_fractions[s['name']] / + self.expected_lengths[s['name']] + for s in known_states]) + + def update_expected_length(self, total_length: int): + ''' + Updates the expected length for the reference state + based on the provided total_length of the sequence. + This is the expected length of a single tract, determined as the sum + of the total length (sequence length * fraction) divided by the number + of tracts (sequence length * 1 / other's tracts). The + 1 assumes that + the sequence will start and end with the reference. + ''' + self.expected_lengths[self.ref_state] = ( + total_length * self.ref_fraction / + (total_length * self.other_sum + 1)) + + def initial_probabilities(self, + weighted_match_freqs: List[float]) -> np.array: + ''' + Estimate the initial probability of being in each state + based on the number of states and their expected fractions + Returns the initial probability of each state + ''' + + init = [] + expectation_weight = .9 + for s, state in enumerate(self.known_states): + expected = self.expected_fractions[state] + estimated = weighted_match_freqs[s] + init.append(expected * expectation_weight + + estimated * (1 - expectation_weight)) + + for state in self.unknown_states: + expected_frac = self.expected_fractions[state] + init.append(expected_frac) + + return init / np.sum(init) + + def emission_probabilities(self, + symbols: List[str]) -> List[Dict]: + ''' + Estimate initial emission probabilities + Return estimates as list of default dict of probabilities + ''' + + match = self.symbols['match'] + mismatch = self.symbols['mismatch'] + probabilities = { + mismatch + match: 0.9, + match + match: 0.09, + mismatch + mismatch: 0.009, + match + mismatch: 0.001, + } + + mismatch_bias = .99 + + num_per_category = 2 ** (len(self.known_states) - 2) + for key in probabilities: + probabilities[key] *= num_per_category + + # for known states + symbol_array = np.array([list(s) for s in symbols], dtype=' np.array: + ''' + Estimate initial transition probabilities + ''' + + # doesn't depend on sequence observations but maybe it should? + + # also should we care about number of tracts rather than fraction + # of genome? 
maybe theoretically, but that number is a lot more
+        # suspect
+
+        states = self.config.states
+
+        fractions = np.array([self.expected_fractions[s] for s in states])
+        lengths = 1/np.array([self.expected_lengths[s] for s in states])
+
+        # general case,
+        # trans[i,j] = 1/ length[i] * expected[j] * 1 /(1 - fraction[i])
+        transitions = np.outer(
+            np.multiply(lengths, 1/(1-fractions)),
+            fractions)
+        # when i == j, trans[i,j] = 1 - 1/length[i]
+        np.fill_diagonal(transitions, 1-lengths)
+
+        # normalize
+        return transitions / transitions.sum(axis=1)[:, None]
+
+    def build_initial_hmm(self, seq: np.array) -> hmm_bw.HMM:
+        '''
+        Build a HMM object initialized based on expected values and sequence
+        '''
+
+        # get frequencies of individual symbols (e.g. '+') and all full
+        # combinations of symbols (e.g. '+++-')
+        (symbol_freqs,
+         weighted_match_freqs) = self.get_symbol_freqs(seq)
+
+        # new Hidden Markov Model
+        hmm = hmm_bw.HMM()
+
+        hmm.set_initial_p(self.initial_probabilities(weighted_match_freqs))
+        hmm.set_emissions(self.emission_probabilities(symbol_freqs.keys()))
+        hmm.set_transitions(self.transition_probabilities())
+        return hmm
+
+    def run_hmm(self,
+                alignment_file: str,
+                only_poly_sites: bool = True) -> Tuple[hmm_bw.HMM,
+                                                       hmm_bw.HMM,
+                                                       np.array]:
+        '''
+        Runs the hmm training, returning the initial and trained HMM along
+        with the positions used by the HMM
+        '''
+        coded_sequence, positions, len_seq = \
+            self.encode_sequence(alignment_file, only_poly_sites)
+
+        self.update_expected_length(len_seq)
+        # set initial hmm parameters based on combination of (1) initial
+        # expectations (length of introgressed tract and fraction of
+        # genome/total number tracts and bases) and (2) number of sites at
+        # which predict seq matches each reference
+        hmm = self.build_initial_hmm(coded_sequence)
+
+        # set states and initial probabilities
+        hmm.set_hidden_states(self.known_states + self.unknown_states)
+
+        # copy before setting observations to save memory
+        hmm_init = copy.deepcopy(hmm)
+
+        # set obs
+        hmm.set_observations([coded_sequence])
+
+        # Baum-Welch parameter estimation
+        hmm.train(self.config.convergence)
+
+        return hmm_init, hmm, positions
+
+    def encode_sequence(self,
+                        alignment_file: str,
+                        only_poly_sites: bool = True) -> Tuple[
+                            np.array,
+                            np.array,
+                            int]:
+        '''
+        Open the supplied alignment file, encode, and return the coded
+        sequence along with the positions. If only_poly_sites is True,
+        also filter out non-polymorphic sites.
+        Returns the encoded sequence, positions, and length of original seq
+        '''
+        _, sequences = read_fasta(alignment_file)
+
+        references = sequences[:-1]
+        predicted = sequences[-1]
+
+        seq_coded, positions = self.ungap_and_code(predicted, references)
+        if only_poly_sites:
+            seq_coded, positions = self.poly_sites(seq_coded, positions)
+
+        return seq_coded, positions, len(predicted)
+
+    def ungap_and_code(self,
+                       predict_seq: str,
+                       ref_seqs: List[str],
+                       index_ref: int = 0) -> Tuple[np.array, np.array]:
+        '''
+        Remove any sequence locations where a gap is present and code
+        into matching or mismatching sequence.
+        Returns the coded sequences, by default an array of + where matching,
+        - where mismatching. Also return the positions where the sequences are
+        not gapped.
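+        Illustrative example, assuming '+' match, '-' mismatch, and '-' gap
+        symbols: ungap_and_code('at-g', ['atcg', 'aacg']) returns (as arrays)
+        (['++', '+-', '++'], [0, 1, 3]); the gapped column is dropped and
+        positions index into the ungapped master reference.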
+        '''
+        # index_ref is index of reference strain to index relative to
+        # build character array
+        sequences = np.array([list(predict_seq)] +
+                             [list(r) for r in ref_seqs])
+
+        isbase = sequences != self.symbols['gap']
+
+        # make boolean for valid characters
+        isvalid = np.logical_and(isbase,
+                                 sequences != self.symbols['unsequenced'])
+
+        # positions are where everything is valid, index where the reference is
+        # valid. The +1 removes the predict sequence at index 0
+        positions = np.where(
+            np.all(isvalid[:, isbase[index_ref+1, :]], axis=0))[0]
+
+        matches = np.where(sequences[0] == sequences[1:],
+                           self.symbols['match'],
+                           self.symbols['mismatch'])
+
+        matches = np.fromiter((''.join(row)
+                               for row in np.transpose(
+                                   matches[:, np.all(isvalid, axis=0)])),
+                              dtype=f'U{len(sequences) - 1}')
+
+        return matches, positions
+
+    def poly_sites(self,
+                   sequences: np.array,
+                   positions: np.array) -> Tuple[np.array, np.array]:
+        '''
+        Remove all sites where the coded sequence is entirely match symbols
+        Returns the filtered sequence and position
+        '''
+        seq_len = len(sequences[0])
+        # check if seq only contains match_symbol
+        retain = np.vectorize(
+            lambda x: x.count(self.symbols['match']) != seq_len)(sequences)
+        indices = np.where(retain)[0]
+
+        ps_poly = positions[indices]
+        seq_poly = sequences[indices]
+
+        return seq_poly, ps_poly
+
+
+def read_positions(filename: str) -> Dict[str, Dict[str, List[int]]]:
+    '''
+    Read in positions from the provided filename, returning a dictionary
+    keyed first by the strain, then chromosome. Returned positions are
+    lists of ints
+    '''
+    with gzip.open(filename, 'rt') as reader:
-        result = defaultdict({})
+        result = defaultdict(dict)
         for line in reader:
             line = line.split()
             strain, chrm = line[0:2]
-            ps = [int(x) for x in line[2:]]
-            result[strain][chrm] = ps
+            positions = [int(x) for x in line[2:]]
+            result[strain][chrm] = positions
     return result


-def write_blocks_header(writer):
-    # NOTE: num_sites_hmm represents the sites considered by the HMM,
-    # so it might exclude non-polymorphic sites in addition to gaps
-    writer.write('\t'.join(['strain',
-                            'chromosome',
-                            'predicted_species',
-                            'start',
-                            'end',
-                            'num_sites_hmm'])
-                 + '\n')
-
-
-# TODO: find source of all the newlines in output!!
-def write_blocks(state_seq_blocks, ps, writer, strain, chrm, species_pred):
-    # file format is:
-    # strain chrm predicted_species start end number_non_gap
-    writer.write('\n'.join(
-        ['\t'.join([strain,
-                    chrm,
-                    species_pred,
-                    str(ps[start]),
-                    str(ps[end]),
-                    str(end - start + 1)])
-         for start, end in state_seq_blocks]))
-    if state_seq_blocks:
-        writer.write('\n')
-
-
-def read_blocks(fn, labeled=False):
-    # return dictionary of (start, end, number_non_gap, [region_id]),
-    # keyed by strain and then chromosome
-    with open(fn, 'r') as reader:
+def read_blocks(filename: str,
+                labeled: bool = False) -> Dict[
+                    str, Dict[str, Tuple[int, int, int, str]]]:
+    '''
+    Read in the supplied block file, returning a dict keyed on strain,
+    then chromosome. Values are tuples of start, end, and number of positions
+    for the block.
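+    As a hypothetical example, the line 'strain1\tI\tpar\t100\t200\t42'
+    becomes result['strain1']['I'] == [(100, 200, 42)].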
+ If labeled is true, values contain the region_id as last element + ''' + with open(filename, 'r') as reader: reader.readline() # header result = defaultdict(lambda: defaultdict(list)) for line in reader: @@ -375,58 +709,3 @@ def read_blocks(fn, labeled=False): item = (int(start), int(end), int(number_non_gap)) result[strain][chrm].append(item) return result - - -def get_emis_symbols(known_states): - - symbols = [gp.match_symbol, gp.mismatch_symbol] - emis_symbols = [''.join(x) for x in - itertools.product(symbols, repeat=len(known_states))] - emis_symbols.sort() - return emis_symbols - - -def write_hmm_header(known_states, unknown_states, symbols, writer): - - writer.write('strain\tchromosome\t') - - states = known_states + unknown_states - - writer.write('\t'.join( - [f'init_{s}' for s in states] + # initial - [f'emis_{s}_{symbol}' - for s in states - for symbol in symbols] + # emissions - [f'trans_{s1}_{s2}' - for s1 in states - for s2 in states])) # transitions - - writer.write('\n') - - -def write_hmm(hmm, writer, strain, chrm, emis_symbols): - writer.write(f'{strain}\t{chrm}\t') - - states = len(hmm.hidden_states) - writer.write('\t'.join( - [f'{p}' for p in hmm.initial_p] + # initial - [f'{hmm.emissions[i, hmm.symbol_to_ind[symbol]]}' - if symbol in hmm.symbol_to_ind else '0.0' - for i in range(states) - for symbol in emis_symbols] + # emission - [f'{hmm.transitions[i, j]}' - for i in range(states) - for j in range(states)] # transition - )) - writer.write('\n') - - -def write_state_probs(probs, writer, strain, chrm, states): - writer.write(f'{strain}\t{chrm}\t') - - writer.write('\t'.join( - [f'{state}:' + - ','.join([f'{site[i]:.5f}' for site in probs]) - for i, state in enumerate(states)])) - - writer.write('\n') diff --git a/code/analyze/predict_main.py b/code/analyze/predict_main.py deleted file mode 100644 index f9566e9..0000000 --- a/code/analyze/predict_main.py +++ /dev/null @@ -1,110 +0,0 @@ -import sys -import os -import predict -import read_args -import gzip -import predict -import global_params as gp -from misc import read_fasta - -# read in analysis parameters - -args = read_args.process_predict_args(sys.argv[1:]) - -##====== -# output files and if and where to resume -##====== - -if not os.path.isdir(gp.analysis_out_dir_absolute + args['tag']): - os.makedirs(gp.analysis_out_dir_absolute + args['tag']) - -# positions -# TODO move this to more general location and make separate files for -# each strain x chrm -base_dir = f'{gp.analysis_out_dir_absolute}{args["tag"]}' - -# introgressed blocks -blocks_f = {} -for s in args['states']: - blocks_f[s] = open(f'{base_dir}/blocks_{s}_{args["tag"]}.txt', 'w') - predict.write_blocks_header(blocks_f[s]) - -# HMM parameters -emis_symbols = predict.get_emis_symbols(args['known_states']) - -hmm_init_f = open(f'{base_dir}/hmm_init_{args["tag"]}.txt', 'w') -predict.write_hmm_header(args['known_states'], args['unknown_states'], - emis_symbols, hmm_init_f) - -hmm_f = open(f'{base_dir}/hmm_{args["tag"]}.txt', 'w') -predict.write_hmm_header(args['known_states'], args['unknown_states'], - emis_symbols, hmm_f) - -# posterior probabilities - -write_ps = True -if write_ps: - ps_f = gzip.open(f'{base_dir}/positions_{args["tag"]}.txt.gz', 'wt') - -probs_f = gzip.open(f'{base_dir}/probs_{args["tag"]}.txt.gz', 'wt') - -# loop through all sequences and predict introgression - - -for chrm in gp.chrms: - - for strain, strain_dir in args['setup_args']['strain_dirs']: - - print(f'working on: {strain} {chrm}') - - ref_prefix = 
'_'.join(args['known_states']) - fn = (f'{args["setup_args"]["alignments_directory"]}{ref_prefix}_{strain}' - f'_chr{chrm}_mafft{gp.alignment_suffix}') - - if not os.path.exists(fn): - print(fn) - print(f'no alignment for {strain} {chrm}') - continue - - headers, seqs = read_fasta.read_fasta(fn) - - ref_seqs = seqs[:-1] - predict_seq = seqs[-1] - - # predict introgressed/non-introgressed tracts - - state_seq, probs, hmm, hmm_init, ps = \ - predict.predict_introgressed(ref_seqs, predict_seq, - args, train=True) - - state_seq_blocks = predict.convert_to_blocks(state_seq, args['states']) - - # output - - # the positions actually used in predictions - # (alignment columns with no gaps) - if write_ps: - predict.write_positions(ps, ps_f, strain, chrm) - - # blocks predicted to be introgressed, separate files for each species - for s in state_seq_blocks: - predict.write_blocks(state_seq_blocks[s], ps, blocks_f[s], - strain, chrm, s) - - # summary info about HMM (before training) - predict.write_hmm(hmm_init, hmm_init_f, strain, chrm, emis_symbols) - - # summary info about HMM (after training) - predict.write_hmm(hmm, hmm_f, strain, chrm, emis_symbols) - - # probabilities at each site - predict.write_state_probs(probs, probs_f, strain, - chrm, hmm.hidden_states) - -for k in blocks_f: - blocks_f[k].close() - -ps_f.close() -hmm_init_f.close() -hmm_f.close() -probs_f.close() diff --git a/code/analyze/read_args.py b/code/analyze/read_args.py index 26c6698..b78c022 100644 --- a/code/analyze/read_args.py +++ b/code/analyze/read_args.py @@ -1,10 +1,11 @@ import sys from align import align_helpers + def process_predict_args(args): - + d = {} - + i = 0 d['tag'] = args[i] @@ -14,7 +15,7 @@ def process_predict_args(args): i += 1 d['improvement_frac'] = float(args[i]) - + i += 1 d['threshold'] = args[i] if d['threshold'] != 'viterbi': @@ -34,8 +35,9 @@ def process_predict_args(args): d['expected_length'][state] = float(args[i]) i += 1 d['expected_frac'][state] = float(args[i]) - d['expected_frac'][d['known_states'][0]] = 1 - sum(d['expected_frac'].values()) - d['expected_length'][d['known_states'][0]] = 0 # calculate later + d['expected_frac'][d['known_states'][0]] = \ + 1 - sum(d['expected_frac'].values()) + d['expected_length'][d['known_states'][0]] = 0 # calculate later i += 1 while i < len(args): @@ -57,7 +59,8 @@ def process_predict_args(args): d['setup_args'] = setup_args return d - + + def read_setup_args(fn): x = {} @@ -72,7 +75,8 @@ def read_setup_args(fn): d = {} d['references'] = x['references'] - d['reference_directories'] = dict(zip(x['references'], x['reference_directories'])) + d['reference_directories'] = \ + dict(zip(x['references'], x['reference_directories'])) d['alignments_directory'] = x['alignments_directory'][0] d['strain_dirs'] = \ @@ -80,6 +84,7 @@ def read_setup_args(fn): return d + def get_predict_args_by_tag(fn, tag): f = open(fn, 'r') line = f.readline() @@ -90,4 +95,3 @@ def get_predict_args_by_tag(fn, tag): line = f.readline() print(f'tag not found: {tag}') return None - diff --git a/code/analyze/structure/structure_1_main.py b/code/analyze/structure/structure_1_main.py index 27e060b..bb6063d 100644 --- a/code/analyze/structure/structure_1_main.py +++ b/code/analyze/structure/structure_1_main.py @@ -3,24 +3,18 @@ import sys import os -import gzip import predict from collections import defaultdict -import gene_predictions -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../misc/') -import read_fasta -import read_table -import seq_functions +from misc 
import read_fasta args = predict.process_predict_args(sys.argv[2:]) chrm = gp.chrms[int(sys.argv[1])] # maybe getting strains should be simpler -strains = [line.split('\t')[0] for line in \ - open(gp.analysis_out_dir_absolute + args['tag'] + \ +strains = [line.split('\t')[0] for line in + open(gp.analysis_out_dir_absolute + args['tag'] + '/state_counts_by_strain.txt', 'r').readlines()[1:]] nucs = set(['a', 't', 'g', 'c']) @@ -32,10 +26,10 @@ gp_dir = '../' -##====== +# ====== # use program ldselect to find set of tag snps all in low LD for # specified chromosome -##====== +# ====== # input file for ldselect is formatted so that each row is a snp and # each column is the genotype for a strain, e.g. @@ -45,13 +39,13 @@ snps = defaultdict(list) # loop through all the strains for strain in strains: - print '-', strain + print('-', strain) # read multiple alignment file for this strain with the master # reference (and other references which we don't care about # here) - headers, seqs = read_fasta.read_fasta(gp_dir + gp.alignments_dir + \ - '_'.join(gp.alignment_ref_order) + \ - '_' + strain + '_chr' + chrm + \ + headers, seqs = read_fasta.read_fasta(gp_dir + gp.alignments_dir + + '_'.join(gp.alignment_ref_order) + + '_' + strain + '_chr' + chrm + '_mafft.maf') # look at all alignment columns, keeping track of the index in # the master reference @@ -66,11 +60,11 @@ # get reference sequence (unaligned, without gaps) # TODO correct alignment file location -ref_seq = read_fasta.read_fasta(gp_dir + gp.alignments_dir + \ - '_'.join(gp.alignment_ref_order) + \ - '_' + strains[0] + '_chr' + chrm + \ +ref_seq = read_fasta.read_fasta(gp_dir + gp.alignments_dir + + '_'.join(gp.alignment_ref_order) + + '_' + strains[0] + '_chr' + chrm + '_mafft.maf')[1][0].replace(gp.gap_symbol, '') -open(out_dir + 'chromosome_lengths.txt', 'a').write(chrm + '\t' + \ +open(out_dir + 'chromosome_lengths.txt', 'a').write(chrm + '\t' + str(len(ref_seq)) + '\n') # loop through all the sites we collected above @@ -83,13 +77,13 @@ # TODO do names have to be integers and/or equal in length? 
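+    # for reference, each ldSelect input row is snp_id, strain, genotype,
+    # e.g. (illustrative values) '1042\tyjm123\ta' -- one row for the master
+    # reference and then one per strain, written below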
snp_id = str(snp) # write row for master reference - f.write(snp_id + '\t' + \ - gp.alignment_ref_order[0] + '\t' + \ + f.write(snp_id + '\t' + + gp.alignment_ref_order[0] + '\t' + ref_seq[snp] + '\n') # and one row for each of the other strains for si in range(len(strains)): - f.write(snp_id + '\t' + \ - strains[si] + '\t' + \ + f.write(snp_id + '\t' + + strains[si] + '\t' + snps[snp][si] + '\n') f.close() @@ -97,7 +91,8 @@ """ # run ldselect on this input file fn_out = fn.replace('input', 'output') -os.system('perl ' + gp.ldselect_install_path + 'ldSelect.pl -pb ' + fn + ' > ' + fn_out) +os.system('perl ' + gp.ldselect_install_path + + 'ldSelect.pl -pb ' + fn + ' > ' + fn_out) # extract one tag snp from each set of equivalent tag snps from # ldselect output file diff --git a/code/analyze/structure/structure_2_main.py b/code/analyze/structure/structure_2_main.py index 9b22ed9..3298a44 100644 --- a/code/analyze/structure/structure_2_main.py +++ b/code/analyze/structure/structure_2_main.py @@ -1,15 +1,8 @@ import sys import os -import gzip import predict from collections import defaultdict -import gene_predictions -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../misc/') -import read_fasta -import read_table -import seq_functions args = predict.process_predict_args(sys.argv[2:]) @@ -23,18 +16,18 @@ os.makedirs(out_dir_run + '/population_ranges') # maybe getting strains should be simpler -strains = [line.split('\t')[0] for line in \ - open(gp.analysis_out_dir_absolute + args['tag'] + \ +strains = [line.split('\t')[0] for line in + open(gp.analysis_out_dir_absolute + args['tag'] + '/state_counts_by_strain.txt', 'r').readlines()[1:]] gp_dir = '../' -nuc_to_int = {'a':1, 't':2, 'g':3, 'c':4} +nuc_to_int = {'a': 1, 't': 2, 'g': 3, 'c': 4} -##====== +# ====== # use program structure to find population proportion using either # unlinked tagsnps from ldselect, or just all snps -##====== +# ====== use_all_snps = True @@ -93,12 +86,13 @@ f = open(out_dir_run + 'structure_input_run' + run_id + '.txt', 'w') for chrm in gp.chrms: - f.write('\t\t\t' + '\t'.join([chrm + '_' + str(x) \ - for x in sorted(all_snps[chrm].keys())])) + f.write('\t\t\t' + '\t'.join([chrm + '_' + str(x) + for x in sorted(all_snps[chrm].keys())])) f.write('\n') for chrm in gp.chrms: - f.write('\t\t\t' + '\t'.join([str(map_distances[chrm][x]) \ - for x in sorted(map_distances[chrm].keys())])) + f.write('\t\t\t' + '\t'.join( + [str(map_distances[chrm][x]) + for x in sorted(map_distances[chrm].keys())])) f.write('\n') for strain in strains: @@ -118,7 +112,8 @@ """ os.system(gp.structure_install_path + 'structure -L ' + str(num_snps) + \ ' -K 6 -i ' + out_dir_run + 'structure_input_run' + run_id + \ - '.txt -o ' + out_dir_run + 'structure_output_k6_run' + run_id + '.txt') + '.txt -o ' + out_dir_run + 'structure_output_k6_run' + + run_id + '.txt') os.system('mv ' + out_dir_run + 'structure_output_k6_run' + \ run_id + '.txt_ss ' + out_dir_run + \ @@ -136,7 +131,7 @@ line = f.readline() while line != "Inferred ancestry of individuals:\n": line = f.readline() -f.readline() # column headings +f.readline() # column headings line = f.readline() f_out.write('strain\tpopulation\tfraction\tindex\n') while line != "\n": @@ -149,7 +144,7 @@ ind = i break for i in range(len(fracs)): - f_out.write(strain + '\t' + str(i + 1) + '\t' + \ + f_out.write(strain + '\t' + str(i + 1) + '\t' + str(fracs[i]) + '\t' + str(ind + 1) + '\n') line = f.readline() f.close() @@ -161,7 +156,8 @@ f = open(out_dir_run + 
'structure_output_ss_k6_run' + run_id + '.txt', 'r') k = 6 -# read in posterior probabilities for each strain locus being in each population +# read in posterior probabilities for +# each strain locus being in each population line = f.readline() while line.strip() == '\n': line = f.readline() @@ -187,8 +183,6 @@ line = f.readline() f.close() - - # TODO at some point associate numbered populations with logical names # (i.e. ones from strope et al) @@ -198,17 +192,17 @@ # population_ranges_strain_chrX.txt # start end popx # start end popx/popy -# start end +# start end -chrm_lengths = [line[:-1].split('\t') for line in \ - open(out_dir + 'chromosome_lengths.txt', 'r').readlines()] -chrm_lengths = dict(zip([x[0] for x in chrm_lengths], \ +chrm_lengths = [line[:-1].split('\t') for line in + open(out_dir + 'chromosome_lengths.txt', 'r').readlines()] +chrm_lengths = dict(zip([x[0] for x in chrm_lengths], [int(x[1]) for x in chrm_lengths])) for strain in strains: for chrm in gp.chrms: ranges = [] - snps = sorted(strain_snp_pop[strain][chrm].keys()) + snps = sorted(strain_snp_pop[strain][chrm].keys()) start = snps[0] end = start previous_pop = strain_snp_pop[strain][chrm][start] @@ -221,7 +215,8 @@ else: ranges.append((start, end, previous_pop)) - ranges.append((end + 1, snp - 1, previous_pop + '/' + current_pop)) + ranges.append((end + 1, snp - 1, + previous_pop + '/' + current_pop)) start = snp end = snp previous_pop = current_pop @@ -231,7 +226,8 @@ ranges.append((end + 1, chrm_lengths[chrm], 'end')) # TODO file location - f = open(out_dir_run + 'population_ranges/population_ranges_' + strain + '_chr' + chrm + '_run' + run_id + '.txt', 'w') + f = open(out_dir_run + 'population_ranges/population_ranges_' + + strain + '_chr' + chrm + '_run' + run_id + '.txt', 'w') for r in ranges: f.write('\t'.join([str(x) for x in r]) + '\n') f.close() diff --git a/code/analyze/structure/structure_3_main.py b/code/analyze/structure/structure_3_main.py index 4446cdd..fc1403c 100644 --- a/code/analyze/structure/structure_3_main.py +++ b/code/analyze/structure/structure_3_main.py @@ -1,26 +1,19 @@ -## generate three files: +# generate three files: -## 1. introgressed regions annotated by which population background(s) -## they overlap +# 1. introgressed regions annotated by which population background(s) +# they overlap -## 2. population backgrounds annotated by how much introgression they -## have from each reference strain (or ambiguous strains) +# 2. population backgrounds annotated by how much introgression they +# have from each reference strain (or ambiguous strains) -## 3. counts of bases in for each strain x population background x -## introgresssing reference [or lack of introgression] +# 3. 
counts of bases for each strain x population background x
+# introgressing reference [or lack of introgression]

 import sys
-import os
-import gzip
 import predict
 from collections import defaultdict
-import gene_predictions
-sys.path.insert(0, '..')
 import global_params as gp
-sys.path.insert(0, '../misc/')
-import read_fasta
-import read_table
-import seq_functions
+from misc import read_table

 args = predict.process_predict_args(sys.argv[3:])
@@ -33,8 +26,8 @@
 # TODO maybe getting strains should be simpler...at least make this
 # not copy pasta
-strains = [line.split('\t')[0] for line in \
-           open(gp.analysis_out_dir_absolute + args['tag'] + \
+strains = [line.split('\t')[0] for line in
+           open(gp.analysis_out_dir_absolute + args['tag'] +
                 '/state_counts_by_strain.txt', 'r').readlines()[1:]]
@@ -58,8 +51,10 @@ def find_pops(start, end, pop_ranges):
             bases.append(r[1] - r[0] + 1)
     return pops, bases

+
 population_int_counts = defaultdict(lambda: defaultdict(int))
-strain_population_int_counts = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
+strain_population_int_counts = defaultdict(
+    lambda: defaultdict(lambda: defaultdict(int)))
 population_totals = defaultdict(int)
 strain_population_totals = defaultdict(lambda: defaultdict(int))
 all_alternative_states = set([])
@@ -74,24 +69,24 @@ def find_pops(start, end, pop_ranges):
         strain = regions[region_id]['strain']
         regions_strain_chrm[strain][chrm][region_id] = regions[region_id]
     new_regions_fn = gp.analysis_out_dir_absolute + args['tag'] + '/' + \
-                     'blocks_' + ref + \
-                     '_' + args['tag'] + '_populations.txt'
+        'blocks_' + ref + \
+        '_' + args['tag'] + '_populations.txt'
     f = open(new_regions_fn, 'w')
     labels = labels[1:] + ['population']
     f.write('region_id' + '\t' + '\t'.join(labels) + '\n')
-    #for chrm in regions_strain_chrm[strain]:
+    # for chrm in regions_strain_chrm[strain]:
 for strain in strains:
     for chrm in gp.chrms:
         # TODO get rid of run_id in filenames?
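+        # each population_ranges file (written by structure_2_main.py) has
+        # one tab-delimited line per range: start, end, population, e.g.
+        # (illustrative) '14524\t20000\tpop3/pop5' for a span falling between
+        # two population assignments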
pop_ranges_fn = out_dir_run + 'population_ranges/' + \ 'population_ranges_' + strain + \ '_chr' + chrm + '_run' + run_id + '.txt' - pop_ranges = [line[:-1].split('\t') for line in \ + pop_ranges = [line[:-1].split('\t') for line in open(pop_ranges_fn, 'r').readlines()] pop_ranges = [(int(x[0]), int(x[1]), x[2]) for x in pop_ranges] for pr in pop_ranges: - population_totals[pr[2]] += pr[1] - pr[0] + 1 + population_totals[pr[2]] += pr[1] - pr[0] + 1 strain_population_totals[strain][pr[2]] += pr[1] - pr[0] + 1 for region_id in regions_strain_chrm[strain][chrm]: @@ -101,40 +96,47 @@ def find_pops(start, end, pop_ranges): # find the population ranges that the region start and end # coordinates fall within - pops, overlaps = find_pops(int(r['start']), int(r['end']), pop_ranges) + pops, overlaps = find_pops(int(r['start']), int(r['end']), + pop_ranges) regions_strain_chrm[strain][chrm][region_id]['population'] = \ ','.join(pops) - f.write(region_id + '\t' + \ - '\t'.join([str(regions_strain_chrm[strain][chrm][region_id][x])\ - for x in labels]) + '\n') + f.write(region_id + '\t' + + '\t'.join( + [str(regions_strain_chrm[strain][ + chrm][region_id][x]) + for x in labels]) + '\n') for i in range(len(pops)): - population_int_counts[pops[i]][r['alternative_states']] += \ + population_int_counts[pops[i]][r['alternative_states']] +=\ overlaps[i] - strain_population_int_counts[strain][pops[i]]\ - [r['alternative_states']] += overlaps[i] + strain_population_int_counts[strain][pops[i]][ + r['alternative_states']] += overlaps[i] -f = open(out_dir_run + 'population_introgression_counts_run' + run_id + '.txt', 'w') -f.write('population\treference\tnum_bases_introgressed\tfrac_bases_introgressed\n') +f = open(out_dir_run + 'population_introgression_counts_run' + run_id + '.txt', + 'w') +f.write('population\treference\t' + 'num_bases_introgressed\tfrac_bases_introgressed\n') for i in population_int_counts.keys(): for ref in population_int_counts[i].keys(): - f.write(str(i) + '\t' + ref + '\t' + str(population_int_counts[i][ref]) + '\t' + \ - str(float(population_int_counts[i][ref])/population_totals[i]) + '\n') + f.write(str(i) + '\t' + ref + '\t' + + str(population_int_counts[i][ref]) + '\t' + + str(float(population_int_counts[i][ref])/population_totals[i]) + + '\n') f.close() -f = open(out_dir_run + 'strain_population_introgression_counts_run' + \ +f = open(out_dir_run + 'strain_population_introgression_counts_run' + run_id + '.txt', 'w') -f.write('strain\tpopulation\treference\tnum_bases_introgressed' + \ +f.write('strain\tpopulation\treference\tnum_bases_introgressed' + '\tfrac_bases_introgressed\n') for strain in strains: for i in strain_population_int_counts[strain].keys(): for ref in all_alternative_states: count = strain_population_int_counts[strain][i][ref] total = strain_population_totals[strain][i] - #frac = 0 - #if total > 0: + # frac = 0 + # if total > 0: frac = float(count)/total - f.write(strain + '\t' + str(i) + '\t' + ref + '\t' + + f.write(strain + '\t' + str(i) + '\t' + ref + '\t' + str(count) + '\t' + str(frac) + '\n') f.close() diff --git a/code/analyze/summarize_region_quality.py b/code/analyze/summarize_region_quality.py index d9754a2..c2a4160 100644 --- a/code/analyze/summarize_region_quality.py +++ b/code/analyze/summarize_region_quality.py @@ -1,8 +1,18 @@ +from __future__ import annotations import bisect import gzip -import global_params as gp -from misc import binary_search import numpy as np +import pickle +from contextlib import ExitStack +import click +import logging as log +from 
collections import Counter +from misc import read_fasta +from misc import read_table +from misc import seq_functions +from typing import List, Tuple, Dict +from analyze.introgression_configuration import Configuration + cen_starts = [151465, 238207, 114385, 449711, 151987, 148510, 496920, 105586, 355629, 436307, 440129, 150828, @@ -37,12 +47,16 @@ tel_right_starts = [tel_coords[i] for i in range(2, len(tel_coords), 4)] tel_right_ends = [tel_coords[i] for i in range(3, len(tel_coords), 4)] +chromosomes = ('I II III IV V ' + 'VI VII VIII IX X ' + 'XI XII XIII XIV XV XVI').split() + def distance_from_telomere(start, end, chrm): assert start <= end, str(start) + ' ' + str(end) - i = gp.chrms.index(chrm) + i = chromosomes.index(chrm) # region entirely on left arm if end <= cen_starts[i]: return start - tel_left_ends[i] @@ -52,11 +66,12 @@ def distance_from_telomere(start, end, chrm): # region overlaps centromere: return minimum distance from either telomere return min(start - tel_left_ends[i], tel_right_starts[i] - end) + def distance_from_centromere(start, end, chrm): assert start <= end, str(start) + ' ' + str(end) - i = gp.chrms.index(chrm) + i = chromosomes.index(chrm) # region entirely on left arm if end <= cen_starts[i]: return cen_starts[i] - end @@ -66,13 +81,15 @@ def distance_from_centromere(start, end, chrm): # region overlaps centromere: return 0 return 0 + def write_region_summary_plus(fn, regions, fields): f = open(fn, 'w') f.write('region_id\t' + '\t'.join(fields) + '\n') keys = sorted(regions.keys(), key=lambda x: int(x[1:])) for region_id in keys: f.write(region_id + '\t') - f.write('\t'.join([str(regions[region_id][field]) for field in fields])) + f.write('\t'.join([str(regions[region_id][field]) + for field in fields])) f.write('\n') f.close() @@ -81,11 +98,12 @@ def gap_columns(seqs): g = 0 for i in range(len(seqs[0])): for seq in seqs: - if seq[i] == gp.gap_symbol: + if seq[i] == '-': # gp.gap_symbol: g += 1 break return g + def longest_consecutive(s, c): max_consecutive = 0 current_consecutive = 0 @@ -114,9 +132,9 @@ def masked_columns(seqs): mask = False gap = False for s in range(num_seqs): - if seqs[s][ps] == gp.gap_symbol: + if seqs[s][ps] == '-': # gp.gap_symbol: gap = True - elif seqs[s][ps] == gp.masked_symbol: + elif seqs[s][ps] == 'x': # gp.masked_symbol: mask = True if mask: mask_total += 1 @@ -124,6 +142,7 @@ def masked_columns(seqs): mask_non_gap_total += 1 return mask_total, mask_non_gap_total + def index_by_reference(ref_seq, seq): # return dictionary keyed by reference index, with value the # corresponding index in non-reference sequence @@ -132,20 +151,14 @@ def index_by_reference(ref_seq, seq): ri = 0 si = 0 for i in range(len(ref_seq)): - if ref_seq[i] != gp.gap_symbol: + if ref_seq[i] != '-': # gp.gap_symbol: d[ri] = si ri += 1 - if seq[i] != gp.gap_symbol: + if seq[i] != '-': # gp.gap_symbol: si += 1 return d -def index_alignment_by_reference(ref_seq): - # want a way to go from reference sequence coordinate to index in - # alignment - return np.where(ref_seq != gp.gap_symbol)[0] - - def num_sites_between(sites, start, end): # sites are sorted i = bisect.bisect_left(sites, start) @@ -153,172 +166,850 @@ def num_sites_between(sites, start, end): return j - i, sites[i:j] -def read_masked_intervals(fn): - with open(fn, 'r') as reader: - reader.readline() # header - ints = [] - for line in reader: - line = line.split() - ints.append((int(line[0]), int(line[2]))) - - return ints - - -def convert_intervals_to_sites(ints): - sites = [] - for start, end in ints: 
- sites += range(start, end + 1) - return np.array(sites) - - -def seq_id_hmm(seq1, seq2, offset, include_sites): - sites = np.array(include_sites) - offset - - info_gap = np.logical_or(seq1 == gp.gap_symbol, - seq2 == gp.gap_symbol) - info_unseq = np.logical_or(seq1 == gp.unsequenced_symbol, - seq2 == gp.unsequenced_symbol) - info_match = seq1 == seq2 - info_hmm = np.zeros(info_match.shape, bool) - sites = sites[np.logical_and(sites < len(info_match), sites >= 0)] - info_hmm[sites] = True - - total_sites = np.sum(info_hmm) - total_moatch = np.sum(np.logical_and(info_hmm, info_match)) - - # check all included are not gapped or skipped - include_in_skip = np.logical_and( - info_hmm, np.logical_or( - info_unseq, info_gap)) - if np.any(include_in_skip): - ind = np.where(include_in_skip)[0][0] - raise AssertionError(f'{seq1[ind]} {seq2[ind]} {ind}') - - return total_match, total_sites, \ - {'gap_flag': info_gap, 'unseq_flag': info_unseq, - 'hmm_flag': info_hmm, 'match': info_match} - - -def seq_id_unmasked(seq1, seq2, offset, exclude_sites1, exclude_sites2): - # total_sites is number of sites at which neither sequence is - # masked or has a gap or unsequenced character; total_match is the - # number of those sites at which the two sequences match - # gapped and unsequenced locations - info_gap = np.logical_or(seq1 == gp.gap_symbol, - seq2 == gp.gap_symbol) - info_unseq = np.logical_or(seq1 == gp.unsequenced_symbol, - seq2 == gp.unsequenced_symbol) - - # convert offset excluded sites to boolean array - info_mask = np.zeros(seq1.shape, bool) - if exclude_sites1 != []: - sites1 = np.array(exclude_sites1) - offset - sites1 = sites1[np.logical_and(sites1 < len(info_gap), - sites1 >= 0)] - info_mask[sites1] = True - if exclude_sites2 != []: - sites2 = np.array(exclude_sites2) - offset - sites2 = sites2[np.logical_and(sites2 < len(info_gap), - sites2 >= 0)] - info_mask[sites2] = True - - # find sites that are not masked, gapped, or unsequenced - sites = np.logical_not( - np.logical_or( - info_mask, - np.logical_or( - info_gap, info_unseq))) - - # determine totals - total_sites = np.sum(sites) - total_match = np.sum( - np.logical_and( - seq1 == seq2, - sites)) - - return total_match, total_sites, {'mask_flag': info_mask} - - n = len(seq1) - total_sites = 0 - total_match = 0 - - skip = [gp.gap_symbol, gp.unsequenced_symbol] - info_mask = [False for i in range(n)] - for i in range(n): +class Summarizer(): + ''' + Summarize region quality of each region + ''' + def __init__(self, configuration: Configuration): + self.config = configuration + + def validate_arguments(self): + ''' + Check that all required instance variables are set to perform a + summarize run. 
Returns true if valid, raises value error otherwise + ''' + args = [ + 'chromosomes', + 'labeled_blocks', + 'quality_blocks', + 'masks', + 'alignment', + 'positions', + 'regions', + 'region_index', + 'known_states', + 'unknown_states', + 'states', + 'symbols' + ] + variables = self.config.__dict__ + for arg in args: + if arg not in variables or variables[arg] is None: + err = ('Failed to validate Summarizer, required argument ' + f"'{arg}' was unset") + log.exception(err) + raise ValueError(err) + + reference = self.config.get('analysis_params.reference') + if reference is None: + err = f'Configuration did not specify a reference strain' + log.exception(err) + raise ValueError(err) + + return True + + def run(self, states: List[str] = None): + ''' + Summarize region quality of each region for the states specified + ''' + ref_ind, states = self.states_to_process(states) + + log.debug(f'reference index: {ref_ind}') + log.debug(f'states to analyze: {states}') + + known_states = self.config.known_states + log.debug(f'known_states {known_states}') + + analyzer = Sequence_Analyzer( + self.config.masks, + self.config.alignment, + self.config.known_states, + self.config.interval_states, + self.config.chromosomes, + self.config.symbols) + + log.debug(f'Sequence_Analyzer init with:') + log.debug(f'masks: {self.config.masks}') + log.debug(f'alignment: {self.config.alignment}') + + analyzer.build_masked_sites() + + for ind, state in enumerate(states): + log.info(f'Working on state {state}') + state_ind = self.config.states.index(state) + + with Position_Reader( + self.config.positions + ) as positions,\ + Region_Writer( + self.config.regions.format(state=state), + self.config.region_index.format(state=state), + known_states + ) as region_writer,\ + Quality_Writer( + self.config.quality_blocks.format(state=state) + ) as quality_writer,\ + ExitStack() as stack: + + progress_bar = None + if self.config.log_file: + progress_bar = stack.enter_context( + click.progressbar( + length=len(self.config.chromosomes), + label=f'State {ind+1} of {len(states)}')) + + for chrm in self.config.chromosomes: + log.info(f'Working on chromosome {chrm}') + region = Region_Database( + self.config.labeled_blocks.format(state=state), + chrm, + known_states) + + for strain, ps in positions.get_positions(region, chrm): + log.debug(f'{strain} {chrm}') + + analyzer.process_alignment(ref_ind, + state_ind, + chrm, + strain, + ps, + region, + region_writer) + + quality_writer.write_quality(region) + + if progress_bar: + progress_bar.update(1) + + def states_to_process(self, + states: List[str] = None) -> Tuple[int, + List[str]]: + ''' + Set the states to summarize to the values passed in. + If no values are specified, run all states in config + Checks if states are in config, warning if a state is not + found and raising an error if none of the states are in config. 
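+        As an illustrative example, with configured states
+        ['cer', 'par', 'bay'] and reference 'cer' (hypothetical names),
+        states_to_process(['par', 'oops']) returns (0, ['par']) and logs a
+        warning for 'oops'.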
+        '''
+        reference = self.config.get('analysis_params.reference.name')
+        ref_ind = self.config.states.index(reference)
+
+        if states is None or states == []:
+            to_process = self.config.states
-        if binary_search.present(exclude_sites1, i + offset) or \
-                binary_search.present(exclude_sites2, i + offset):
-            info_mask[i] = True
-            continue
-        if seq1[i] not in skip and seq2[i] not in skip:
-            total_sites += 1
-            if seq1[i] == seq2[i]:
-                total_match += 1
-
-    # TODO: keep track of gapped/masked sites for master/predicted to
-    # incorporate into info string later
-    return total_match, total_sites, {'mask_flag': info_mask}
-
-
-def make_info_string_unknown(info, master_ind):
-
-    # used with indices to decode result
-    decoder = np.array(list('Xx._-'))
-    indices = np.zeros(info['gap_any_flag'].shape, int)
+        else:
+            to_process = []
+            for s in states:
+                if s in self.config.states:
+                    to_process.append(s)
+                else:
+                    log.warning(f"state '{s}' was not found in the "
+                                'configured states')
+
+        if to_process == []:
+            err = 'No valid states were found to process'
+            log.exception(err)
+            raise ValueError(err)
+
+        return ref_ind, to_process
+
+
+class Flag_Info():
+    '''
+    Collection of boolean flags for sequence summary
+    '''
+    def __init__(self):
+        self.gap_any = None
+        self.mask_any = None
+        self.unseq_any = None
+        self.hmm = None
+        self.gap = None
+        self.mask = None
+        self.unseq = None
+        self.match = None
+
+    def initialize_flags(self, number_sequences: int, number_states: int):
+        '''
+        Initialize internal flags to np arrays of false
+        '''
+        self.gap_any = np.zeros((number_sequences), bool)
+        self.mask_any = np.zeros((number_sequences), bool)
+        self.unseq_any = np.zeros((number_sequences), bool)
+        self.gap = np.zeros((number_sequences, number_states), bool)
+        self.mask = np.zeros((number_sequences, number_states), bool)
+        self.unseq = np.zeros((number_sequences, number_states), bool)
+        self.match = np.zeros((number_sequences, number_states), bool)
+
+    def add_sequence_flags(self, other: Flag_Info, state: int):
+        '''
+        Join the other flag info with this info by replacing values
+        in the gap, unseq, and match arrays and performing OR with anys
+        '''
+        # only write the first time
+        if state == 0:
+            self.hmm = other.hmm
+
+        self.gap_any = np.logical_or(self.gap_any, other.gap)
+        self.unseq_any = np.logical_or(self.unseq_any, other.unseq)
+
+        self.gap[:, state] = other.gap
+        self.unseq[:, state] = other.unseq
+        self.match[:, state] = other.match
+
+    def add_mask_flags(self, other: Flag_Info, state: int):
+        '''
+        Join the other flag info with this by replacing values in mask and
+        performing an OR with mask_any
+        '''
+        self.mask_any = np.logical_or(self.mask_any, other.mask)
+        self.mask[:, state] = other.mask
+
+    def encode_info(self,
+                    master_ind: int,
+                    predict_ind: int) -> str:
+        '''
+        Summarize info flags into a string. master_ind is the index of
+        the master reference state. predict_ind is the index of the predicted
+        state. The return string is encoded for each position as:
+        '-': either master or predict has a gap
+        '_': either master or predict is masked
+        '.': all states match
+        'b': both predict and master match
+        'c': master matches but not predict
+        'p': predict matches but not master
+        'x': no other condition applies
+        If the position is an HMM site, x, p, c, or b is capitalized.
+        Symbols are listed in order of precedence, e.g. if a position
+        satisfies both '-' and '.', it will be '-'.
+        '''
+
+        if predict_ind >= self.match.shape[1]:
+            return self.encode_unknown_info(master_ind)
+
+        decoder = np.array(list('xXpPcCbB._-'))
+        indices = np.zeros(self.match.shape[0], int)
+
+        indices[self.match[:, predict_ind]] += 2  # x to p if true
+        indices[self.match[:, master_ind]] += 4  # x to c, p to b
+        indices[self.hmm] += 1  # to upper
+
+        matches = np.all(self.match, axis=1)
+        indices[matches] = 8  # .
+        indices[np.any(
+            self.mask[:, [master_ind, predict_ind]],
+            axis=1)] = 9  # _
+        indices[np.any(
+            self.gap[:, [master_ind, predict_ind]],
+            axis=1)] = 10  # -
+
+        return ''.join(decoder[indices])
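The precedence arithmetic in `encode_info` is easy to misread, so here is a small self-contained sketch, separate from the patch itself, that replays the same decoding on an invented three-state flag matrix (state 0 is the master reference, state 1 the predicted state):

```python
import numpy as np

# Invented flags for six alignment columns over three known states.
decoder = np.array(list('xXpPcCbB._-'))

match = np.array([[True,  True,  False],   # master & predict match -> 'b'
                  [True,  False, False],   # master only            -> 'c'
                  [False, True,  False],   # predict only           -> 'p'
                  [False, False, False],   # neither                -> 'x'
                  [True,  True,  True],    # every state matches    -> '.'
                  [False, True,  False]])  # predict match, HMM site -> 'P'
hmm = np.array([False, False, False, False, False, True])
gap = np.zeros(match.shape, bool)
mask = np.zeros(match.shape, bool)
gap[3, 0] = True  # a gap in master outranks everything else -> '-'

indices = np.zeros(match.shape[0], int)
indices[match[:, 1]] += 2            # x -> p where predict matches
indices[match[:, 0]] += 4            # x -> c, p -> b where master matches
indices[hmm] += 1                    # uppercase positions used by the HMM
indices[np.all(match, axis=1)] = 8   # '.', all states match
indices[np.any(mask[:, [0, 1]], axis=1)] = 9   # '_', masked
indices[np.any(gap[:, [0, 1]], axis=1)] = 10   # '-', gapped

print(''.join(decoder[indices]))  # -> 'bcp-.P'
```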
+
+    def encode_unknown_info(self,
+                            master_ind: int) -> str:
+        '''
+        Summarize info flags into a string for the unknown state.
+        master_ind is the index of the master reference state.
+        The return string is encoded for each position as:
+        '-': any state has a gap
+        '_': any state is masked
+        '.': all states match
+        'x': master matches
+        'X': no other condition applies
+        Symbols are listed in order of precedence, e.g. if a position
+        satisfies both '-' and '.', it will be '-'.
+        '''
+
+        # used with indices to decode result
+        decoder = np.array(list('Xx._-'))
+        indices = np.zeros(self.gap_any.shape, int)
+
+        indices[self.match[:, master_ind]] = 1  # x
+        matches = np.all(self.match, axis=1)
+        indices[matches] = 2  # .
+        indices[self.mask_any] = 3  # _
+        indices[self.gap_any] = 4  # -
+
+        return ''.join(decoder[indices])
+
+
+class Sequence_Analyzer():
+    '''
+    Performs handling of masking, reading, and analyzing sequence data for
+    summarizing the sequences
+    '''
+    def __init__(self,
+                 mask_file: str,
+                 alignment_file: str,
+                 known_states: List,
+                 interval_states: List,
+                 chromosomes: List,
+                 symbols: Dict):
+        self.masks = mask_file
+        self.alignments = alignment_file
+        self.known_states = known_states
+        self.interval_states = interval_states
+        self.chromosomes = chromosomes
+        self.symbols = symbols
+
+    def build_masked_sites(self):
+        '''
+        Read in all interval files and store a dictionary of masked sites
+        on self.masked_sites, keyed first by chromosome, then state
+        '''
+        result = {}
+        for chrom in self.chromosomes:
+            result[chrom] = {}
+            for state, name in zip(self.known_states, self.interval_states):
+                result[chrom][state] = self.read_masked_sites(chrom, name)
+
+        self.masked_sites = result
+
+    def read_masked_sites(self, chrom: str, strain: str) -> np.array:
+        filename = self.masks.format(chrom=chrom, strain=strain)
+        intervals = self.read_masked_intervals(filename)
+        sites = self.convert_intervals_to_sites(intervals)
+        return sites
+
+    def convert_intervals_to_sites(self,
+                                   intervals: List[Tuple]) -> np.array:
+        '''
+        Given a list of (start, end) positions, returns a 1D np.array of
+        the sites contained in the intervals list, endpoints inclusive:
+        convert_intervals_to_sites([(1, 2), (4, 6)]) -> [1, 2, 4, 5, 6]
+        '''
+        sites = []
+        for start, end in intervals:
+            sites += range(start, end + 1)
+        return np.array(sites, dtype=int)
+
+    def read_masked_intervals(self,
+                              filename: str) -> List[Tuple[int, int]]:
+        '''
+        Read the interval file provided and return the start and end
+        positions as a list of tuples of 2 ints
+        '''
+        with open(filename, 'r') as reader:
+            reader.readline()  # header
+            intervals = []
+            for line in reader:
+                line = line.split()
+                intervals.append((int(line[0]), int(line[2])))
+
+        return intervals
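For reference, the interval round trip performed by the methods above can be sketched as follows; the file contents and header are hypothetical, but follow the column layout `read_masked_intervals` parses (start in column 0, inclusive end in column 2):

```python
import io
import numpy as np

# Hypothetical interval file: one header row, whitespace-delimited,
# start in column 0 and inclusive end in column 2 (middle column invented).
mask_file = io.StringIO('start stat end\n'
                        '10 0.9 12\n'
                        '40 0.8 41\n')

mask_file.readline()  # skip header, as read_masked_intervals does
intervals = [(int(f[0]), int(f[2]))
             for f in (line.split() for line in mask_file)]
print(intervals)  # [(10, 12), (40, 41)]

# Expand inclusive intervals to flat site indices,
# as convert_intervals_to_sites does.
sites = np.concatenate([np.arange(start, end + 1)
                        for start, end in intervals])
print(sites)  # [10 11 12 40 41]
```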
+
+    def get_stats(self,
+                  current_sequence,
+                  other_sequence,
+                  slice_start,
+                  aligned_index_positions,
+                  masked_site):
+        '''
+        Helper function to perform analyses on the sequences, returning
+        the results of seq_id_hmm, seq_id, and seq_id_unmasked
+        '''
+
+        # only alignment columns used by HMM (polymorphic, no
+        # gaps in any strain)
+        hmm_stats = self.seq_id_hmm(other_sequence,
+                                    current_sequence,
+                                    slice_start,
+                                    aligned_index_positions)
+
+        # all alignment columns, excluding ones with gaps in
+        # these two sequences
+        nongap_stats = seq_functions.seq_id(other_sequence,
+                                            current_sequence)
+
+        # all alignment columns, excluding ones with gaps or
+        # masked bases or unsequenced in *these two sequences*
+        nonmask_stats = self.seq_id_unmasked(other_sequence,
+                                             current_sequence,
+                                             slice_start,
+                                             masked_site[0],
+                                             masked_site[1])
+
+        return hmm_stats, nongap_stats, nonmask_stats
+
+    def seq_id_hmm(self,
+                   seq1: np.array,
+                   seq2: np.array,
+                   offset: int,
+                   include_sites: List[int]) -> Tuple[
+                       int, int, Flag_Info]:
+        '''
+        Compare two sequences and provide statistics of their overlap,
+        considering only the included sites.
+        Takes the two sequences to consider, an offset of the included sites,
+        and a list of the included sites.
+        Returns:
+        -the total number of matching sites, where seq1[i] == seq2[i] and
+        i is an element in include_sites - offset
+        -the total number of sites considered, i.e. where
+        include_sites - offset >= 0 and < len(seq)
+        -a Flag_Info object with:
+        -gap: true where seq1 or seq2 == gap_symbol
+        -unseq: true where seq1 or seq2 == unsequenced_symbol
+        -hmm: true at index i where i is an element of include_sites - offset
+        -match: true where seq1 == seq2, regardless of symbol
+        '''
+        sites = np.array(include_sites) - offset
+
+        info = Flag_Info()
+        info.gap = np.logical_or(seq1 == self.symbols['gap'],
+                                 seq2 == self.symbols['gap'])
+        info.unseq = np.logical_or(seq1 == self.symbols['unsequenced'],
+                                   seq2 == self.symbols['unsequenced'])
+        info.match = seq1 == seq2
+        info.hmm = np.zeros(info.match.shape, bool)
+        sites = sites[np.logical_and(sites < len(info.match), sites >= 0)]
+        info.hmm[sites] = True
+
+        total_sites = np.sum(info.hmm)
+        total_match = np.sum(np.logical_and(info.hmm, info.match))
+
+        # check all included are not gapped or skipped
+        include_in_skip = np.logical_and(
+            info.hmm, np.logical_or(
+                info.unseq, info.gap))
+        if np.any(include_in_skip):
+            ind = np.where(include_in_skip)[0][0]
+            err = ('Need to skip site specified as included '
+                   f'seq1: {seq1[ind]}, seq2: {seq2[ind]}, index: {ind}')
+            log.exception(err)
+            raise ValueError(err)
+
+        return total_match, total_sites, info
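A toy run of the `seq_id_hmm` bookkeeping, with invented sequences and coordinates; the HMM sites arrive in reference coordinates and are shifted into the alignment slice by the offset:

```python
import numpy as np

seq1 = np.array(list('ACGTAC'))
seq2 = np.array(list('ACCTAC'))
include_sites = [102, 104]  # HMM columns, in reference coordinates
offset = 100                # this slice starts at reference position 100

sites = np.array(include_sites) - offset          # -> [2, 4]
match = seq1 == seq2
hmm = np.zeros(match.shape, bool)
hmm[sites[(sites >= 0) & (sites < len(match))]] = True

total_sites = np.sum(hmm)          # 2 HMM columns fall in this slice
total_match = np.sum(hmm & match)  # 1: column 2 is a G/C mismatch
print(total_match, total_sites)    # 1 2
```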
+
+    def seq_id_unmasked(self,
+                        seq1: np.array,
+                        seq2: np.array,
+                        offset: int,
+                        exclude_sites1: List[int],
+                        exclude_sites2: List[int]) -> Tuple[
+                            int, int, Flag_Info]:
+        '''
+        Compare two sequences and provide statistics of their overlap,
+        excluding the specified sites.
+        Takes the two sequences and an offset applied to each excluded
+        site list.
+        Returns:
+        -total number of matching sites in non-excluded sites. A position is
+        excluded if it is an element of either excluded site list - offset,
+        or it is a gap or unsequenced symbol in either sequence.
+        -total number of non-excluded sites
+        -a Flag_Info object with:
+        -mask: a boolean array that is true if the position is in
+        either excluded list - offset
+        '''
+        info = Flag_Info()
+        info.gap = np.logical_or(seq1 == self.symbols['gap'],
+                                 seq2 == self.symbols['gap'])
+        info.unseq = np.logical_or(seq1 == self.symbols['unsequenced'],
+                                   seq2 == self.symbols['unsequenced'])
+        exclude_sites1 = np.array(exclude_sites1)
+        exclude_sites2 = np.array(exclude_sites2)
+
+        # convert offset excluded sites to boolean array
+        info.mask = np.zeros(seq1.shape, bool)
+        if exclude_sites1.size != 0:
+            sites1 = exclude_sites1 - offset
+            sites1 = sites1[np.logical_and(sites1 < len(info.gap),
+                                           sites1 >= 0)]
+            info.mask[sites1] = True
+
+        if exclude_sites2.size != 0:
+            sites2 = exclude_sites2 - offset
+            sites2 = sites2[np.logical_and(sites2 < len(info.gap),
+                                           sites2 >= 0)]
+            info.mask[sites2] = True
+
+        # find sites that are not masked, gapped, or unsequenced
+        sites = np.logical_not(
+            np.logical_or(
+                info.mask,
+                np.logical_or(
+                    info.gap, info.unseq)))
+
+        # determine totals
+        total_sites = np.sum(sites)
+        total_match = np.sum(
+            np.logical_and(
+                seq1 == seq2,
+                sites))
+
+        return total_match, total_sites, info
+
+    def process_alignment(self,
+                          reference_index: int,
+                          state_index: int,
+                          chromosome: str,
+                          strain: str,
+                          positions: np.array,
+                          region: Region_Database,
+                          region_writer: Region_Writer):
+        '''
+        Analyze the alignment of a given strain and chromosome at the
+        provided positions.
+        Results are stored in the provided region database
+        '''
+        sequences, alignments, masked_sites = self.get_indices(chromosome,
+                                                               strain)
+
+        # convert position indices from indices in master reference to
+        # indices in alignment
+        ps_align = alignments[reference_index][positions]
+
+        for i, (r_id, start, end) in enumerate(region.get_entries(strain)):
+            start, end = self.get_slice(start, end,
+                                        alignments[reference_index],
+                                        ps_align)
+
+            info = Flag_Info()
+            info.initialize_flags(
+                end - start + 1,
+                len(self.known_states))
+
+            for ind, state in enumerate(self.known_states):
+                hmm, nongap, nonmask = self.get_stats(
+                    sequences[-1][start:end + 1],
+                    sequences[ind][start:end + 1],
+                    start,
+                    ps_align,
+                    (masked_sites[ind],
+                     masked_sites[-1]))
+
+                region.set_region(strain, i, state,
+                                  hmm,
+                                  nongap,
+                                  nonmask)
+
+                info.add_sequence_flags(hmm[2], ind)
+                info.add_mask_flags(nonmask[2], ind)
+
+            info_string = info.encode_info(reference_index, state_index)
+
+            region_writer.write_header(r_id)
+            region_writer.write_sequences(
+                strain,
+                alignments,
+                sequences,
+                (start, end))
+            region_writer.write_info_string(info_string)
+
+            # and keep track of each symbol count
+            region.update_counts(strain, i, info_string)
+
+    def get_indices(self, chromosome: str, strain: str) -> Tuple:
+        '''
+        Get the sequences and different indices for the provided
+        chromosome and strain
+        Returned tuple contains:
+        -sequences as np.array
+        -alignment index arrays, one per sequence, mapping sequence
+        position to alignment column
+        -masked_sites, index aligned for each sequence
+        '''
+        _, sequences = read_fasta.read_fasta(
+            self.alignments.format(chrom=chromosome, strain=strain))
+
+        # to go from index in reference seq to index in alignment
+        alignments = [
+            self.index_alignment_by_reference(seq)
+            for seq in sequences
+        ]
+
+        masked = self.read_masked_sites(chromosome, strain)
+
+        masked_sites = [
+            alignments[ind][self.masked_sites[chromosome][state]]
+            for ind, state in enumerate(self.known_states)
+        ] + [alignments[-1][masked]]  # for strain
+
+        return sequences, alignments, masked_sites
+
+    def index_alignment_by_reference(self, sequence: np.array) -> np.array:
+        '''
+        Find the locations of non-gapped sites in the sequence, providing
+        a way to go from a reference sequence coordinate to an index in
+        the alignment
+        '''
+        return np.where(sequence != self.symbols['gap'])[0]
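The mapping `index_alignment_by_reference` builds is what lets `process_alignment` translate reference positions into alignment columns; a minimal sketch with an invented gapped sequence:

```python
import numpy as np

gap = '-'
aligned = np.array(list('AC--GT-A'))

# Column index of every ungapped base: alignment_index[k] answers
# "which alignment column holds reference base k?"
alignment_index = np.where(aligned != gap)[0]  # [0 1 4 5 7]

print(alignment_index[2])  # reference base 2 (the G) is alignment column 4

# Vectorized lookup, as in ps_align = alignments[reference_index][positions]
positions = np.array([0, 2, 3])
print(alignment_index[positions])  # [0 4 5]
```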
+
+    def get_slice(self,
+                  start: int,
+                  end: int,
+                  alignment: np.array,
+                  ps_align: np.array) -> Tuple[int, int]:
+        '''
+        Get the start and end positions of the index aligned sequence.
+        Checks that the positions are valid (in ps_align), and raises
+        a ValueError otherwise
+        '''
+        # index of start and end of region in aligned sequences
+        slice_start, slice_end = alignment[[start, end]]
+
+        if not np.in1d([slice_start, slice_end], ps_align).all():
+            err = 'Slice not found in position alignment'
+            log.exception(err)
+            raise ValueError(err)
+
+        return slice_start, slice_end
+
+
+class Region_Database():
+    '''
+    Contains the data and logic for region records during summarizing
+    '''
+    def __init__(self,
+                 labeled_file: str,
+                 chromosome: str,
+                 known_states: List[str]):
+        '''
+        Read in labeled file and store resulting table and labels
+        '''
+        self.info_string_symbols = list('.-_npbcxNPBCX')
+
+        self.label_prefixes = ['match_nongap',
+                               'num_sites_nongap',
+                               'match_hmm',
+                               'match_nonmask',
+                               'num_sites_nonmask']
+
+        self.data, self.labels = read_table.read_table_columns(
+            labeled_file,
+            sep='\t',
+            group_by='strain',
+            chromosome=chromosome)
+
+        if self.labels[0] != 'region_id':
+            err = 'Unexpected labeled format'
+            log.exception(err)
+            raise ValueError(err)
+
+        for strain, data in self.data.items():
+            n = len(data['region_id'])
+
+            for s in known_states:
+                for lbl in self.label_prefixes:
+                    data[f'{lbl}_{s}'] = [0] * n
+
+            for s in self.info_string_symbols:
+                data['count_' + s] = [0] * n
+
+        self.labels += [f'{lbl}_{st}' for lbl in self.label_prefixes
+                        for st in known_states]
+        self.labels += ['count_' + x for x in self.info_string_symbols]
+
+    def has_strain(self, strain: str) -> bool:
+        '''
+        Checks if the strain is in this database
+        '''
+        return strain in self.data
+
+    def get_entries(self, strain: str) -> Tuple[str, int, int]:
+        '''
+        Returns an iterator over the region entries of the strain,
+        yielding region id (string), start (int) and end (int) positions
+        '''
+        if not self.has_strain(strain):
+            err = f'Region Database does not contain strain {strain}'
+            log.exception(err)
+            raise ValueError(err)
+
+        r_ids = self.data[strain]['region_id']
+        starts = self.data[strain]['start']
+        ends = self.data[strain]['end']
+        for i in range(len(r_ids)):
+            yield (r_ids[i], int(starts[i]), int(ends[i]))
+
+    def set_region(self,
+                   strain: str,
+                   index: int,
+                   state: str,
+                   hmm, nongap, nonmask):
+        '''
+        Set the region state with the provided values.
+ hmm, nongap and nonmask are tuples of the (match, total) values + ''' + ds = self.data[strain] + MATCH, TOTAL = 0, 1 + if hmm[TOTAL] is not None: + ds['num_sites_hmm'][index] = hmm[TOTAL] + + ds[f'match_hmm_{state}'][index] = hmm[MATCH] + + ds[f'match_nongap_{state}'][index] = nongap[MATCH] + ds[f'num_sites_nongap_{state}'][index] = nongap[TOTAL] + + ds[f'match_nonmask_{state}'][index] = nonmask[MATCH] + ds[f'num_sites_nonmask_{state}'][index] = nonmask[TOTAL] + + def update_counts(self, + strain: str, + index: int, + info_string: str): + ''' + Update the counts variables based on the provided info string + ''' + counts = Counter(info_string) + for sym in self.info_string_symbols: + self.data[strain]['count_' + sym][index] = counts[sym] + + def generate_output(self): + ''' + Yield lines for writing to the quality output file. + To save memory, this effectively deletes the data structure! + Outputs are tab delimited, sorted by region_id + ''' + # reorganize output as list of tuples ordered by label + output = [] + # have to store this as dict changes during iterations + strains = list(self.data.keys()) + for strain in strains: + # pop to limit memory usage + d = self.data.pop(strain) + output += list(zip(*[d[l] for l in self.labels])) + + # sort by region id (index 0, remove r #[1:]) + for entry in sorted(output, key=lambda e: int(e[0][1:])): + yield '\t'.join([str(e) for e in entry]) + '\n' + + def generate_header(self): + ''' + Generate a header line for the region database + ''' + return '\t'.join(self.labels) + '\n' + + +class Region_Writer(): + ''' + Controls the writing of region files and indices + ''' + def __init__(self, + region_file: str, + index_file: str, + known_states: List[str]): + self.region_file = region_file + self.index_file = index_file + self.index = {} + self.known_states = known_states + + def __enter__(self): + self.region_writer = gzip.open(self.region_file, 'wt') + + return self + + def __exit__(self, type, value, traceback): + self.region_writer.close() + + if traceback is None: + # write index + with open(self.index_file, 'wb') as index_writer: + pickle.dump(self.index, index_writer) + return True - indices[info['match_flag'][:, master_ind]] = 1 # x - matches = np.all(info['match_flag'], axis=1) - indices[matches] = 2 # . 
- indices[info['mask_any_flag']] = 3 # _ - indices[info['gap_any_flag']] = 4 # - + else: + return False + + def write_header(self, region_id: str): + ''' + Add a header line with the region id + ''' + self.index[int(region_id[1:])] = self.region_writer.tell() + self.region_writer.write(f'#{region_id}\n') + + def write_sequences(self, + strain: str, + alignments: List, + sequences: np.array, + indices: Tuple): + ''' + Write sequences to region file + ''' + start, end = indices + names = self.known_states + [strain] + for sj, name in enumerate(names): + startj = bisect.bisect_left(alignments[sj], start) + endj = bisect.bisect_left(alignments[sj], end) + + self.region_writer.write(f'> {name} {startj} {endj}\n') + + self.region_writer.write(''.join( + sequences[sj][start:end+1]) + '\n') + + def write_info_string(self, info_string: str): + ''' + Write info string with header to region file + ''' + # write info string + self.region_writer.write('> info\n') + self.region_writer.write(info_string + '\n') + + +class Position_Reader(): + ''' + Read in position file, yielding positions until no longer on current + chromosome + ''' + + def __init__(self, position_file): + self.position_file = position_file + self.last_position = 0 + + def __enter__(self): + self.reader = gzip.open(self.position_file, 'rt') + return self + + def __exit__(self, type, value, traceback): + self.reader.close() + return traceback is None + + def get_positions(self, + region: Region_Database, + chromosome: str) -> Tuple[str, np.array]: + self.reader.seek(self.last_position) + line = self.next_line() + while line != '': + line = line.split('\t') + + chrm = line[1] + if chrm != chromosome: + break - return ''.join(decoder[indices]) + strain = line[0] + if not region.has_strain(strain): + line = self.next_line() + continue + yield strain, np.array(line[2:], dtype=int) -def make_info_string(info, master_ind, predict_ind): + line = self.next_line() - if predict_ind >= info['match_flag'].shape[1]: - return make_info_string_unknown(info, master_ind) + def next_line(self) -> str: + self.last_position = self.reader.tell() + line = self.reader.readline() + return line - decoder = np.array(list('xXpPcCbB._-')) - indices = np.zeros(info['match_flag'].shape[0], int) - indices[info['match_flag'][:, predict_ind]] += 2 # x to p if true - indices[info['match_flag'][:, master_ind]] += 4 # x to c, p to b - indices[info['hmm_flag']] += 1 # to upper +class Quality_Writer(): + ''' + Control writing of quality file from region database + ''' + def __init__(self, quality_filename): + self.filename = quality_filename + self.first_write = True - matches = np.all(info['match_flag'], axis=1) - indices[matches] = 8 # . 
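`write_header` records the byte offset of each region header with `tell()`, and `__exit__` pickles the resulting index; a minimal sketch of the write-then-seek pattern this enables, with hypothetical file names:

```python
import gzip
import pickle

# Record the offset of each '#r…' header while writing, as Region_Writer
# does, so regions can be read back individually later.
with gzip.open('regions.fa.gz', 'wt') as writer:
    index = {}
    for region_id, body in [('r1', 'ACGT'), ('r2', 'GGTA')]:
        index[int(region_id[1:])] = writer.tell()
        writer.write(f'#{region_id}\n{body}\n')

with open('regions.pkl', 'wb') as index_writer:
    pickle.dump(index, index_writer)

# Random access: jump straight to region 2 without scanning region 1.
with open('regions.pkl', 'rb') as f:
    index = pickle.load(f)
with gzip.open('regions.fa.gz', 'rt') as reader:
    reader.seek(index[2])
    print(reader.readline().strip())  # '#r2'
```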
- indices[np.any( - info['mask_flag'][:, [master_ind, predict_ind]], - axis=1)] = 9 # _ - indices[np.any( - info['gap_flag'][:, [master_ind, predict_ind]], - axis=1)] = 10 # - + def __enter__(self): + self.writer = open(self.filename, 'w') + return self - return ''.join(decoder[indices]) + def __exit__(self, type, value, traceback): + self.writer.close() + return traceback is None -def read_region_file(fn): - f = gzip.open(fn, 'rb') - d = {} - line = f.readline().decode() - while line != '': - region_id = line[1:-1] - line = f.readline().decode() - seqs = {} - while line[0] != '#': - line = line[:-1].split(' ') - strain = line[1] - seqs[strain] = {} - if len(line) > 2: - seqs[strain]['start'] = int(line[2]) - seqs[strain]['end'] = int(line[3]) - seqs[strain]['seq'] = f.readline().decode()[:-1] - line = f.readline().decode() - if line == '': - break - d[region_id] = seqs + def write_quality(self, region: Region_Database): + ''' + Writes header if needed and region database values + ''' + if self.first_write is True: + self.writer.write(region.generate_header()) + self.first_write = False - f.close() - return d - + for line in region.generate_output(): + self.writer.write(line) diff --git a/code/analyze/summarize_region_quality_main.py b/code/analyze/summarize_region_quality_main.py deleted file mode 100644 index 7e6c2a9..0000000 --- a/code/analyze/summarize_region_quality_main.py +++ /dev/null @@ -1,281 +0,0 @@ -import sys -import os -import gzip -from analyze import read_args -from analyze.summarize_region_quality import (convert_intervals_to_sites, - read_masked_intervals, - index_alignment_by_reference, - seq_id_hmm, - seq_id_unmasked, - make_info_string) -import global_params as gp -from misc import read_fasta -from misc import read_table -from misc import seq_functions -import numpy as np -import bisect -import pickle - - -def main(): - - args = read_args.process_predict_args(sys.argv[2:]) - - task_ind = int(sys.argv[1]) - species_ind = task_ind - - species_from = args['states'][species_ind] - - base_dir = gp.analysis_out_dir_absolute + args['tag'] - - regions_dir = f'{base_dir}/regions/' - if not os.path.isdir(regions_dir): - os.mkdir(regions_dir) - - quality_writer = None - positions = gzip.open(f'{base_dir}/positions_{args["tag"]}.txt.gz', 'rt') - line_number = 0 - - region_writer = gzip.open( - f'{regions_dir}{species_from}{gp.fasta_suffix}.gz', 'wt') - region_index = {} - - for chrm in gp.chrms: - # region_id strain chromosome predicted_species start end num_non_gap - regions_chrm, labels = read_table.read_table_columns( - f'{base_dir}/blocks_{species_from}_{args["tag"]}_labeled.txt', - '\t', - group_by='strain', - chromosome=chrm - ) - - for strain in regions_chrm: - n = len(regions_chrm[strain]['region_id']) - - for s in args['known_states']: - regions_chrm[strain]['match_nongap_' + s] = [0] * n - regions_chrm[strain]['num_sites_nongap_' + s] = [0] * n - regions_chrm[strain]['match_hmm_' + s] = [0] * n - regions_chrm[strain]['match_nonmask_' + s] = [0] * n - regions_chrm[strain]['num_sites_nonmask_' + s] = [0] * n - - info_string_symbols = list('.-_npbcxNPBCX') - for s in info_string_symbols: - regions_chrm[strain]['count_' + s] = [0] * n - - # get masked sites for all references, not just the current - # species_from we're considering regions from - masked_sites_refs = {} - for s, state in enumerate(args['known_states']): - masked_sites_refs[s] = \ - convert_intervals_to_sites( - read_masked_intervals( - f'{gp.mask_dir}{state}' - f'_chr{chrm}_intervals.txt')) - - # loop through 
chromosomes and strains, followed by species of - # introgression so that we only have to read each alignment in once - # move to last read chromosome - positions.seek(line_number) - line = positions.readline() - while line != '': - line = line.split('\t') - - current_chrm = line[1] - if current_chrm != chrm: - break - - strain = line[0] - if strain not in regions_chrm: - # record current position in case need to re read line - line_number = positions.tell() - line = positions.readline() - continue - - print(strain, chrm) - - # indices of alignment columns used by HMM - ps = np.array([int(x) for x in line[2:]]) - - headers, seqs = read_fasta.read_fasta( - args['setup_args']['alignments_directory'] + \ - '_'.join(args['known_states']) - + f'_{strain}_chr{chrm}_mafft{gp.alignment_suffix}') - - # to go from index in reference seq to index in alignment - ind_align = [] - for seq in seqs: - ind_align.append(index_alignment_by_reference(seq)) - - masked_sites = convert_intervals_to_sites( - read_masked_intervals( - f'{gp.mask_dir}{strain}_chr{chrm}_intervals.txt')) - - masked_sites_ind_align = [] - for s in range(len(args['known_states'])): - masked_sites_ind_align.append( - ind_align[s][masked_sites_refs[s]]) - - # add in sequence of query strain - masked_sites_ind_align.append( - ind_align[-1][masked_sites]) - - # convert position indices from indices in master reference to - # indices in alignment - ps_ind_align = ind_align[0][ps] - - # loop through all regions for the specified chromosome and the - # current strain - for i in range(len(regions_chrm[strain]['region_id'])): - r_id = regions_chrm[strain]['region_id'][i] - start = regions_chrm[strain]['start'][i] - end = regions_chrm[strain]['end'][i] - - # calculate: - # - identity with each reference - # - fraction of region that is gapped/masked - - # index of start and end of region in aligned sequences - slice_start = ind_align[0][int(start)] - slice_end = ind_align[0][int(end)] - assert slice_start in ps_ind_align, \ - f'{slice_start} {start} {r_id}' - assert slice_end in ps_ind_align, \ - f'{slice_end} {end} {r_id}' - - seqx = seqs[-1][slice_start:slice_end + 1] - len_seqx = slice_end - slice_start + 1 - len_states = len(args['known_states']) - - # . 
= all match - # - = gap in one or more sequences - # p = matches predicted reference - - info = {'gap_any_flag': np.zeros((len_seqx), bool), - 'mask_any_flag': np.zeros((len_seqx), bool), - 'unseq_any_flag': np.zeros((len_seqx), bool), - 'hmm_flag': np.zeros((len_seqx), bool), - 'gap_flag': np.zeros((len_seqx, len_states), bool), - 'mask_flag': np.zeros((len_seqx, len_states), bool), - 'unseq_flag': np.zeros((len_seqx, len_states), bool), - 'match_flag': np.zeros((len_seqx, len_states), bool)} - - for sj, statej in enumerate(args['known_states']): - seqj = seqs[sj][slice_start:slice_end+1] - - # only alignment columns used by HMM (polymorphic, no - # gaps in any strain) - total_match_hmm, total_sites_hmm, infoj = \ - seq_id_hmm(seqj, seqx, slice_start, ps_ind_align) - - if statej == species_from \ - or species_ind >= len(args['known_states']): - regions_chrm[strain]['num_sites_hmm'][i] = \ - total_sites_hmm - - # only write once, the first index - if sj == 0: - info['hmm_flag'] = infoj['hmm_flag'] - - info['gap_any_flag'] = np.logical_or( - info['gap_any_flag'], infoj['gap_flag']) - info['unseq_any_flag'] = np.logical_or( - info['unseq_any_flag'], infoj['unseq_flag']) - info['gap_flag'][:, sj] = infoj['gap_flag'] - info['unseq_flag'][:, sj] = infoj['unseq_flag'] - info['match_flag'][:, sj] = infoj['match'] - - regions_chrm[strain][f'match_hmm_{statej}'][i] = \ - total_match_hmm - - # all alignment columns, excluding ones with gaps in - # these two sequences - total_match_nongap, total_sites_nongap = \ - seq_functions.seq_id(seqj, seqx) - - regions_chrm[strain][f'match_nongap_{statej}'][i] =\ - total_match_nongap - regions_chrm[strain][f'num_sites_nongap_{statej}'][i] =\ - total_sites_nongap - - # all alignment columns, excluding ones with gaps or - # masked bases or unsequenced in *these two sequences* - total_match_nonmask, total_sites_nonmask, infoj = \ - seq_id_unmasked(seqj, seqx, slice_start, - masked_sites_ind_align[sj], - masked_sites_ind_align[-1]) - - info['mask_any_flag'] = np.logical_or( - info['mask_any_flag'], infoj['mask_flag']) - info['mask_flag'][:, sj] = infoj['mask_flag'] - - regions_chrm[strain][f'match_nonmask_{statej}'][i] = \ - total_match_nonmask - regions_chrm[strain][f'num_sites_nonmask_{statej}'][i] = \ - total_sites_nonmask - - region_index[int(r_id[1:])] = region_writer.tell() - region_writer.write(f'#{r_id}\n') - names = args['known_states'] + [strain] - for sj in range(len(names)): - # write sequence to region alignment file, along with - # start and end coordinates - startj = bisect.bisect_left(ind_align[sj], slice_start) - endj = bisect.bisect_left(ind_align[sj], slice_end) - - region_writer.write(f'> {names[sj]} {startj} {endj}\n') - region_writer.write( - ''.join(seqs[sj][slice_start:slice_end+1]) + '\n') - - # also write string with info about each site - info_string = make_info_string(info, 0, species_ind) - region_writer.write('> info\n') - region_writer.write(info_string + '\n') - - # TODO this can be made faster with numpy - # and keep track of each symbol count - for sym in info_string_symbols: - regions_chrm[strain]['count_' + sym][i] = \ - info_string.count(sym) - - # record current position in case need to re read line - line_number = positions.tell() - line = positions.readline() - sys.stdout.flush() - - labels += ['match_nongap_' + x for x in args['known_states']] - labels += ['num_sites_nongap_' + x for x in args['known_states']] - labels += ['match_hmm_' + x for x in args['known_states']] - labels += ['match_nonmask_' + x for x in 
args['known_states']] - labels += ['num_sites_nonmask_' + x for x in args['known_states']] - labels += ['count_' + x for x in info_string_symbols] - - assert labels[0] == 'region_id', 'Unexpected labeled format' - - # write on first execution - if quality_writer is None: - quality_writer = open(f'{base_dir}/blocks_{species_from}' - f'_{args["tag"]}_quality.txt', 'w') - - quality_writer.write('\t'.join(labels) + '\n') - - # reorganize output as list of tuples ordered by label - output = [] - strains = list(regions_chrm.keys()) - for strain in strains: - # pop to limit memory usage - d = regions_chrm.pop(strain) - output += list(zip(*[d[l] for l in labels])) - - # sort by region id (index 0, remove r) - for entry in sorted(output, key=lambda e: int(e[0][1:])): - quality_writer.write('\t'.join([str(e) for e in entry]) + '\n') - - quality_writer.close() - region_writer.close() - with open(f'{regions_dir}{species_from}.pkl', 'wb') as index: - pickle.dump(region_index, index) - - -if __name__ == '__main__': - main() diff --git a/code/analyze/summarize_strain_states.py b/code/analyze/summarize_strain_states.py new file mode 100644 index 0000000..24a1b44 --- /dev/null +++ b/code/analyze/summarize_strain_states.py @@ -0,0 +1,208 @@ +from analyze.introgression_configuration import Configuration +import logging as log +import itertools +from misc import read_table +from typing import List +from contextlib import ExitStack +import click + + +class Strain_Summarizer(): + def __init__(self, configuration: Configuration): + self.config = configuration + + def validate_arguments(self): + ''' + Check that all required instance variables are set to perform a + strain summary run. Returns true if valid, raises value error otherwise + ''' + args = [ + 'known_states', + 'introgressed_intermediate', + 'ambiguous_intermediate', + 'strain_info', + 'state_counts', + ] + variables = self.config.__dict__ + for arg in args: + if arg not in variables or variables[arg] is None: + err = ('Failed to validate strain summarizer,' + f" required argument '{arg}' was unset") + log.exception(err) + raise ValueError(err) + + return True + + def run(self): + ''' + Generate summary information for the state of + each position in the sequence + ''' + self.validate_arguments() + + summary = Summary_Table() + + states = self.config.known_states[1:] + with ExitStack() as stack: + progress_bar = None + if self.config.log_file: + progress_bar = stack.enter_context( + click.progressbar( + length=len(states), + label='State')) + for species_from in states: + + log.info(species_from) + + regions1, _ = read_table.read_table_rows( + self.config.introgressed_intermediate.format( + state=species_from), '\t') + regions2, _ = read_table.read_table_rows( + self.config.ambiguous_intermediate.format( + state=species_from), '\t') + + for region_id in regions1: + region1 = regions1[region_id] + + strain = region1['strain'] + length = int(region1['end']) - int(region1['start']) + 1 + + summary.set_region(strain, species_from, length) + summary.region_found() + + if region1['reason'] != '': # failed filter + continue + + summary.region_passes_filter1() + + region2 = regions2[region_id] + summary.record_alt_species( + region2['alternative_states'].split(',')) + + if progress_bar: + progress_bar.update(1) + + with open(self.config.strain_info, 'r') as reader: + summary.add_strain_info(reader) + + with open(self.config.state_counts, 'w') as writer: + summary.write_summary(states, writer) + + +class Summary_Table(): + def __init__(self): + self.table = {} 
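`Summary_Table` re-implements, with explicit dictionaries, the counting that the deleted script below did with a nested `defaultdict`; a sketch of the equivalence, with invented strain and field names:

```python
from collections import defaultdict

# The deleted script's approach: a nested defaultdict of counters.
table = defaultdict(lambda: defaultdict(int))
table['yjm978']['num_regions_par'] += 1
table['yjm978']['num_bases_par'] += 1500
table['yjm978']['num_bases_total'] += 1500

# Plain dicts let absent fields be emitted explicitly as 0 when the
# summary is written, as write_summary does.
row = table['yjm978']
fields = ['num_regions_par', 'num_regions_bay', 'num_bases_total']
print([row[f] if f in row else 0 for f in fields])  # [1, 0, 1500]
```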
+ + def set_region(self, strain, species, length): + self.strain = strain + self.species = species + self.length = length + + def record_element(self, + strain: str, + key: str, + count: int = 1): + ''' + Increment the count of table[strain][key], adding new values as needed + ''' + + if strain not in self.table: + self.table[strain] = {} + + t = self.table[strain] + if key not in t: + t[key] = 0 + + t[key] += count + + def record_region(self, + strain: str, + species: str, + length: int, + suffix: str = "", + update_total: bool = True): + ''' + Record a region of provided length. + ''' + if suffix and suffix[0] != '_': + suffix = '_' + suffix + + self.record_element(strain, f'num_regions_{species}{suffix}', 1) + self.record_element(strain, f'num_bases_{species}{suffix}', length) + if update_total: + self.record_element(strain, f'num_bases_total{suffix}', length) + self.record_element(strain, f'num_regions_total{suffix}', 1) + + def record_alt_species(self, alt_states: List): + for species in alt_states: + self.record_alt(species) + + if len(alt_states) == 1: + self.record_region(self.strain, self.species, + self.length, '_filtered2') + else: + self.record_element(self.strain, + ('num_bases_' + + '_or_'.join(sorted(alt_states)) + + '_filtered2i'), + self.length) + + self.record_element(self.strain, + f'num_bases_{len(alt_states)}_filtered2i', + self.length) + + def region_found(self): + self.record_region(self.strain, self.species, self.length) + + def region_passes_filter1(self): + self.record_region(self.strain, self.species, + self.length, '_filtered1') + + def record_alt(self, alt_species): + self.record_region(self.strain, alt_species, + self.length, '_filtered2_inclusive', + self.species == alt_species) + + def add_strain_info(self, reader): + for line in reader: + strain, _, _, geo, env, pop = line[:-1].split('\t') + strain = strain.lower() + if strain in self.table: + d = self.table[strain] + d['population'] = pop + d['geographic_origin'] = geo + d['environmental_origin'] = env + + def write_summary(self, states, writer): + fields = self.get_fields(states) + + # write header + writer.write('strain\t' + '\t'.join(fields) + '\n') + + for strain in sorted(self.table.keys()): + row = self.table[strain] + entries = [row[field] + if field in row + else 0 + for field in fields] + + entries = [str(s) for s in [strain] + entries] + + writer.write('\t'.join(entries) + '\n') + + def get_fields(self, states): + fields = ['population', 'geographic_origin', 'environmental_origin'] +\ + [f'num_{thing}_{state}{value}' + for thing in ('regions', 'bases') + for value in ('', '_filtered1', + '_filtered2', '_filtered2_inclusive') + for state in states + ['total'] + ] + + r = sorted(states) + for n in range(2, len(r)+1): + fields += [f'num_bases_{"_or_".join(combo)}_filtered2i' + for combo in itertools.combinations(r, n)] + fields += [f'num_bases_{n}_filtered2i'] + + return fields diff --git a/code/analyze/summarize_strain_states_main.py b/code/analyze/summarize_strain_states_main.py deleted file mode 100644 index d161570..0000000 --- a/code/analyze/summarize_strain_states_main.py +++ /dev/null @@ -1,117 +0,0 @@ -import re -import sys -import os -import copy -import itertools -import gene_predictions -import predict -from collections import defaultdict -from filter_helpers import * -sys.path.insert(0, '..') -import global_params as gp -sys.path.insert(0, '../misc/') -import read_table -import read_fasta - -args = predict.process_predict_args(sys.argv[1:]) - -d = defaultdict(lambda: defaultdict(int)) 
-for species_from in args['known_states'][1:]: - - print species_from - - fn_filtered1i = gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'blocks_' + species_from + \ - '_' + args['tag'] + '_filtered1intermediate.txt' - fn_filtered2i = gp.analysis_out_dir_absolute + args['tag'] + '/' + \ - 'blocks_' + species_from + \ - '_' + args['tag'] + '_filtered2intermediate.txt' - - regions1, fields1 = read_table.read_table_rows(fn_filtered1i, '\t') - regions2, fields2 = read_table.read_table_rows(fn_filtered2i, '\t') - - for region_id in regions1: - - strain = regions1[region_id]['strain'] - length = int(regions1[region_id]['end']) - int(regions1[region_id]['start']) + 1 - d[strain]['num_regions_' + species_from] += 1 - d[strain]['num_regions_total'] += 1 - d[strain]['num_bases_' + species_from] += length - d[strain]['num_bases_total'] += length - if regions1[region_id]['reason'] == '': - d[strain]['num_regions_' + species_from + '_filtered1'] += 1 - d[strain]['num_regions_total_filtered1'] += 1 - d[strain]['num_bases_' + species_from + '_filtered1'] += length - d[strain]['num_bases_total_filtered1'] += length - - alt_states = regions2[region_id]['alternative_states'].split(',') - for species_from_alt in alt_states: - d[strain]['num_regions_' + species_from_alt + \ - '_filtered2_inclusive'] += 1 - d[strain]['num_bases_' + species_from_alt + \ - '_filtered2_inclusive'] += length - if species_from_alt == species_from: - d[strain]['num_regions_total_filtered2_inclusive'] += 1 - d[strain]['num_bases_total_filtered2_inclusive'] += length - - if len(alt_states) == 1: - d[strain]['num_regions_' + species_from + \ - '_filtered2'] += 1 - d[strain]['num_regions_total_filtered2'] += 1 - d[strain]['num_bases_' + species_from + \ - '_filtered2'] += length - d[strain]['num_bases_total_filtered2'] += length - - - else: - d[strain]['num_bases_' + '_or_'.join(sorted(alt_states)) + '_filtered2i'] += length - - d[strain]['num_bases_' + str(len(alt_states)) + '_filtered2i'] += length - - -strain_info = [line[:-1].split('\t') for line in open('../../100_genomes_info.txt', 'r')] -strain_origins = dict(zip([x[0].lower() for x in strain_info], \ - [(x[5], x[3], x[4]) for x in strain_info])) -for strain in d.keys(): - d[strain]['population'] = strain_origins[strain][0] - d[strain]['geographic_origin'] = strain_origins[strain][1] - d[strain]['environmental_origin'] = strain_origins[strain][2] - -fn = gp.analysis_out_dir_absolute + args['tag'] + '/' + 'state_counts_by_strain.txt' -f = open(fn, 'w') -fields = [] - -fields += ['population', 'geographic_origin', 'environmental_origin'] - -fields += ['num_regions_' + x for x in args['known_states'][1:]] -fields += ['num_regions_total'] -fields += ['num_regions_' + x + '_filtered1' for x in args['known_states'][1:]] -fields += ['num_regions_total_filtered1'] -fields += ['num_regions_' + x + '_filtered2' for x in args['known_states'][1:]] -fields += ['num_regions_total_filtered2'] -fields += ['num_regions_' + x + '_filtered2_inclusive' for x in args['known_states'][1:]] -fields += ['num_regions_total_filtered2_inclusive'] - -fields += ['num_bases_' + x for x in args['known_states'][1:]] -fields += ['num_bases_total'] -fields += ['num_bases_' + x + '_filtered1' for x in args['known_states'][1:]] -fields += ['num_bases_total_filtered1'] -fields += ['num_bases_' + x + '_filtered2' for x in args['known_states'][1:]] -fields += ['num_bases_total_filtered2'] -fields += ['num_bases_' + x + '_filtered2_inclusive' for x in args['known_states'][1:]] -fields += 
['num_bases_total_filtered2_inclusive']
-
-r = sorted(gp.alignment_ref_order[1:])
-for n in range(2, len(r)+1):
-    x = itertools.combinations(r, n)
-    for combo in x:
-        fields += ['num_bases_' + '_or_'.join(combo) + '_filtered2i']
-    fields += ['num_bases_' + str(n) + '_filtered2i']
-
-f.write('strain' + '\t' + '\t'.join(fields) + '\n')
-
-for strain in sorted(d.keys()):
-    f.write(strain + '\t')
-    f.write('\t'.join([str(d[strain][x]) for x in fields]))
-    f.write('\n')
-f.close()
diff --git a/code/analyze/to_update/aggregate_genes_by_strains_main.py b/code/analyze/to_update/aggregate_genes_by_strains_main.py
index e7ac91e..f7e008f 100644
--- a/code/analyze/to_update/aggregate_genes_by_strains_main.py
+++ b/code/analyze/to_update/aggregate_genes_by_strains_main.py
@@ -1,28 +1,19 @@
 import sys
-import os
-import gzip
-import predict
 from collections import defaultdict
-from summarize_region_quality import *
-import gene_predictions
-sys.path.insert(0, '..')
 import global_params as gp
-sys.path.insert(0, '../misc/')
-import read_fasta
-import read_table
-import seq_functions
+from misc import read_table
 
 tag = sys.argv[1]
 
 fn = gp.analysis_out_dir_absolute + tag + \
     '/introgressed_blocks_filtered_par_' + tag + '_summary_plus.txt'
-regions_filtered, l = read_table.read_table_rows(fn, "\t")
+regions_filtered, _ = read_table.read_table_rows(fn, "\t")
 
 gene_strains = defaultdict(set)
 strain_genes = defaultdict(lambda: defaultdict(set))
 
 for chrm in gp.chrms:
-    
+
     fn = gp.analysis_out_dir_absolute + tag + \
         '/genes_for_each_region_chr' + chrm + '_' + \
         tag + '.txt'
@@ -43,18 +34,18 @@
 for gene in gene_strains:
     gene_counts[gene] = len(gene_strains[gene])
 
-f_out = open(gp.analysis_out_dir_absolute + tag + \
-             '/genes_for_each_strain_filtered_' + \
+f_out = open(gp.analysis_out_dir_absolute + tag +
+             '/genes_for_each_strain_filtered_' +
             tag + '.txt', 'w')
 f_out.write('strain\tchromosome\tnum_genes\n')
 for chrm in gp.chrms:
     for strain in strain_genes[chrm]:
-        f_out.write(strain + '\t' + chrm + '\t' + \
+        f_out.write(strain + '\t' + chrm + '\t' +
                     str(len(strain_genes[chrm][strain])) + '\n')
 f_out.close()
 
-f_out = open(gp.analysis_out_dir_absolute + tag + \
-             '/genes_strain_hist_' + \
+f_out = open(gp.analysis_out_dir_absolute + tag +
+             '/genes_strain_hist_' +
             tag + '.txt', 'w')
 f_out.write('gene\tnum_strains\n')
 for gene in sorted(gene_counts.keys()):
diff --git a/code/analyze/to_update/annotate_positions.py b/code/analyze/to_update/annotate_positions.py
index 16b6378..0393c58 100644
--- a/code/analyze/to_update/annotate_positions.py
+++ b/code/analyze/to_update/annotate_positions.py
@@ -1,9 +1,8 @@
-import sys
 import re
 import gzip
-sys.path.insert(0, '../misc/')
-import overlap
-import read_fasta
+from misc import overlap
+from misc import read_fasta
+
 
 def get_genes(fn):
@@ -15,14 +14,17 @@ def get_genes(fn):
     f.close()
     return genes
 
+
 def get_orfs(fn):
    headers, seqs = read_fasta.read_fasta(fn)
    orfs = {}
    for h in headers:
-        m = re.search(' (?P<name>[a-zA-Z0-9]+)_(?P<chrm>[a-zA-Z0-9\.]+):(?P<start>[0-9]+):(?P<end>[0-9]+)', h)
+        m = re.search(r' (?P<name>[a-zA-Z0-9]+)_(?P<chrm>[a-zA-Z0-9\.]+)'
+                      ':(?P<start>[0-9]+):(?P<end>[0-9]+)', h)
        orfs[(int(m.group('start')), int(m.group('end')))] = m.group('name')
    return orfs
 
+
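The named groups in the pattern above had to be reconstructed: `name`, `start`, and `end` are confirmed by the `m.group` calls that follow, while `chrm` is only an assumption. A quick check of the reconstructed pattern against a hypothetical ORF header:

```python
import re

# Hypothetical ORF fasta header in the shape get_orfs appears to expect:
# a leading space, then <name>_<chrm-like token>:<start>:<end>.
header = '> orf1 YGR295C_chrVII.42:1000:1250'

m = re.search(r' (?P<name>[a-zA-Z0-9]+)_(?P<chrm>[a-zA-Z0-9\.]+)'
              ':(?P<start>[0-9]+):(?P<end>[0-9]+)', header)
print(m.group('name'), m.group('start'), m.group('end'))
# YGR295C 1000 1250
```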
 def write_annotated_file(coords, genes, orfs, fn):
     # could definitely do this all way more efficiently
     sep = '\t'
@@ -33,14 +35,13 @@ def write_annotated_file(coords, genes, orfs, fn):
         if int(coords[i]) == coords[i]:
             f.write(str(int(coords[i])) + sep)
             gene = overlap.contained_any_named(coords[i], genes)
-            if gene != None:
+            if gene is not None:
                 f.write(gene)
             f.write(sep)
         else:
             f.write(str(coords[i]) + sep + sep)
         orf = overlap.contained_any_named(i, orfs)
-        if orf != None:
+        if orf is not None:
             f.write(orf)
         f.write('\n')
     f.close()
-
diff --git a/code/analyze/to_update/annotate_positions_main.py b/code/analyze/to_update/annotate_positions_main.py
index c5647b0..f721acb 100644
--- a/code/analyze/to_update/annotate_positions_main.py
+++ b/code/analyze/to_update/annotate_positions_main.py
@@ -4,28 +4,24 @@
 # gene
 # in ORF?
 
-import re
 import sys
 import os
-import copy
 import gzip
-from annotate_positions import *
-sys.path.insert(0, '..')
+from annotate_positions import (get_genes, get_orfs, write_annotated_file)
 import global_params as gp
-sys.path.insert(0, '../align/')
-import align_helpers
+from align import align_helpers
 
-##======
+# ======
 # get strains
-##======
+# ======
 
 i = int(sys.argv[1])
 s = align_helpers.get_strains(align_helpers.flatten(gp.non_ref_dirs.values()))
 strain, d = s[i]
 
-##======
+# ======
 # get genes on each chromosome
-##======
+# ======
 
 genes_by_chrm = {}
 for chrm in gp.chrms:
@@ -33,10 +29,10 @@
         '_genes.txt'
     genes_by_chrm[chrm] = get_genes(fn)
 
-##======
+# ======
 # loop through all strains and chromosomes, generating annotated
 # position file for each
-##======
+# ======
 
 coord_dir = gp.analysis_out_dir_absolute + 'coordinates/'
 if not os.path.exists(coord_dir + 'annotated'):
@@ -44,37 +40,15 @@
 
 for chrm in gp.chrms:
 
-    print strain, chrm
+    print(strain, chrm)
 
     fn = strain + '_to_' + gp.master_ref + '_chr' + chrm + '.txt.gz'
     fn_orfs = d + 'orfs/' + strain + '_chr' + chrm + \
-              '_orfs' + gp.fasta_suffix
+        '_orfs' + gp.fasta_suffix
     orfs = get_orfs(fn_orfs)
 
     fn_out = coord_dir + 'annotated/' + fn
-    coords = [float(line) for line in gzip.open(coord_dir + fn, 'rb').readlines()]
+    coords = [float(line)
+              for line in gzip.open(coord_dir + fn, 'rb').readlines()]
 
     write_annotated_file(coords, genes_by_chrm[chrm], orfs, fn_out)
-
-
-
-
-
-#for strain, d in s:
-
-    #m = re.search('(?P<strain1>[a-zA-Z0-9]+)_to_(?P<strain2>[a-zA-Z0-9]+)_chr(?P<chrm>[IVXM]+)', fn)
-    #if m == None:
-    #    continue
-    #strain1 = m.group('strain1')
-    #strain2 = m.group('strain2')
-    #chrm = m.group('chrm')
-
-    #if strain1 == gp.master_ref:
-    #    continue
-
-    # don't deal with paradoxus just for now
-    #if strain1 in gp.alignment_ref_order or strain2 != gp.master_ref:
-    #    continue
-
-    #print fn
-
diff --git a/code/analyze/to_update/annotate_regions.py b/code/analyze/to_update/annotate_regions.py
index fd032a0..8758484 100644
--- a/code/analyze/to_update/annotate_regions.py
+++ b/code/analyze/to_update/annotate_regions.py
@@ -1,9 +1,6 @@
 import gzip
 import gene_predictions
-import sys
 import global_params as gp
-sys.path.insert(0, '../misc/')
-
 
 def get_block_by_site(all_regions, seq):
@@ -19,8 +16,8 @@ def get_block_by_site(all_regions, seq):
     return introgressed_by_site
 
-def write_predictions_annotated(alignment_headers, alignment_seqs, master, \
-                                strain_labels, match_by_site, \
+def write_predictions_annotated(alignment_headers, alignment_seqs, master,
+                                strain_labels, match_by_site,
                                 gene_by_site, block_by_site, masked, fn):
 
     f = gzip.open(fn, 'wb')
@@ -36,10 +33,10 @@
     individual_indices = [0] * num_seqs
 
     # header
-    f.write('ps_ref' + sep + 'ps_strain' + sep + \
-            sep.join(strain_labels) + sep + \
-            'match' + sep + \
-            'gene' + sep + 'block' + sep + \
+    f.write('ps_ref' + sep + 'ps_strain' + sep +
+            sep.join(strain_labels) + sep +
+            'match' + sep +
+            'gene' + sep + 'block' + sep +
             sep.join([lab +
'_masked' for lab in strain_labels]) + '\n') lines = [] @@ -57,7 +54,7 @@ def write_predictions_annotated(alignment_headers, alignment_seqs, master, \ ind_ref += 1 ps_ref = str(ind_ref) line += ps_ref + sep - + # index in strain ps_strain = None if alignment_seqs[-1][i] == gp.gap_symbol: @@ -76,7 +73,7 @@ def write_predictions_annotated(alignment_headers, alignment_seqs, master, \ line += match_by_site[r][i] line += sep - if gene_by_site[i] != None: + if gene_by_site[i] is not None: line += gene_by_site[i] line += sep @@ -87,17 +84,18 @@ def write_predictions_annotated(alignment_headers, alignment_seqs, master, \ line += sep if alignment_seqs[si][i] != gp.gap_symbol: # TODO update n to x - if masked[si][individual_indices[si]] == 'n': #gp.masked_symbol: + if masked[si][individual_indices[si]] == 'n': # masked line += gp.masked_symbol individual_indices[si] += 1 - + line += '\n' - + lines.append(line) f.writelines(lines) f.close() + # TODO give this a more general name/place def read_predictions_annotated(fn): sep = '\t' @@ -109,10 +107,7 @@ def read_predictions_annotated(fn): line = line[:-1].split(sep) for i in range(len(labels)): d[labels[i]].append(line[i]) - #d[line[0]] = dict(zip(labels[1:], line[1:])) + # d[line[0]] = dict(zip(labels[1:], line[1:])) line = f.readline() f.close() return d - - - diff --git a/code/analyze/to_update/annotate_regions_main.py b/code/analyze/to_update/annotate_regions_main.py index a06bb2c..3a60544 100644 --- a/code/analyze/to_update/annotate_regions_main.py +++ b/code/analyze/to_update/annotate_regions_main.py @@ -1,28 +1,25 @@ -# ps_cer ps_strain cer_ref par_ref strain gene introgressed_region cer_masked par_masked strain_masked +# ps_cer ps_strain cer_ref par_ref strain gene introgressed_region +# cer_masked par_masked strain_masked -import re import sys import os -import copy -import gene_predictions -from annotate_regions import * +import gene_predictions +from annotate_regions import (write_predictions_annotated, + get_block_by_site) import predict -import pickle -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../misc/') -import read_fasta +from misc import read_fasta -##====== +# ====== # read in analysis parameters -##====== +# ====== refs, strains, args = predict.process_args(sys.argv[1:]) chrm = sys.argv[1] -##====== +# ====== # read in introgressed/unknown regions and alignments -##====== +# ====== gp_dir = '../' @@ -40,9 +37,9 @@ fn_align_prefix = gp_dir + gp.alignments_dir fn_align_prefix += '_'.join([refs[s][0] for s in args['species']]) + '_' -##====== +# ====== # produce annotated files -##====== +# ====== # for keeping track of all genes introgressed in each strain, and the # fraction introgressed @@ -68,20 +65,20 @@ fn_genes = gp.analysis_out_dir_absolute + '/' + \ master_ref + '_chr' + chrm + '_genes.txt' -print 'reading genes on chromosome', chrm +print('reading genes on chromosome', chrm) # dictionary keyed by name: (start, end) genes = gene_predictions.read_genes(fn, fn_genes) -print 'done reading genes' +print('done reading genes') # loop through all strains that we've called introgression in, and # associate genes with the regions they overlap for strain in regions.keys(): - - print '***', strain, chrm + + print('***', strain, chrm) sys.stdout.flush() - fn_out = gp.analysis_out_dir_absolute + args['tag'] + '/site_summaries/' + \ - 'predictions_' + strain + '_chr' + chrm + '_site_summary.txt.gz' + fn_out = gp.analysis_out_dir_absolute + args['tag'] + '/site_summaries/' +\ + 'predictions_' + strain + '_chr' + 
chrm + '_site_summary.txt.gz'
 
     if not os.path.exists(os.path.dirname(fn_out)):
         os.makedirs(os.path.dirname(fn_out))
@@ -92,29 +89,30 @@
 
     # read alignment blocks for this strain and chromosome
     fn_align = fn_align_prefix + \
-               strain + '_chr' + chrm + '_mafft' + gp.alignment_suffix
+        strain + '_chr' + chrm + '_mafft' + gp.alignment_suffix
     alignment_headers, alignment_seqs = read_fasta.read_fasta(fn_align)
 
     # read masked (unaligned) sequences
     seq_masked_fns = [header.split()[-1] for header in alignment_headers]
-    seq_masked_fns = [mfn[:-len(gp.fasta_suffix)] + '_masked' + gp.fasta_suffix \
+    seq_masked_fns = [mfn[:-len(gp.fasta_suffix)] + '_masked' + gp.fasta_suffix
                       for mfn in seq_masked_fns]
     seqs_masked = [read_fasta.read_fasta(mfn)[1][0] for mfn in seq_masked_fns]
 
     labels = ref_labels + [strain]
-    
+
     # mark each site as matching each reference or not
-    ref_match_by_site = gene_predictions.get_ref_match_by_site(alignment_seqs, labels)
+    ref_match_by_site = gene_predictions.get_ref_match_by_site(alignment_seqs,
+                                                               labels)
 
     # mark each site as in a gene or not
-    genes_by_site = gene_predictions.get_genes_by_site(genes, alignment_seqs[0])
+    genes_by_site = gene_predictions.get_genes_by_site(genes,
+                                                       alignment_seqs[0])
 
     # mark each site as introgressed or not
     all_regions = [regions[strain][chrm]]
-    if regions_unk.has_key(strain) and regions_unk[strain].has_key(chrm):
+    if strain in regions_unk and chrm in regions_unk[strain]:
        all_regions.append(regions_unk[strain][chrm])
     block_by_site = get_block_by_site(all_regions, alignment_seqs[0])
 
-    write_predictions_annotated(alignment_headers, alignment_seqs, 0, \
-                                ref_labels + [strain], ref_match_by_site, \
-                                genes_by_site, block_by_site, seqs_masked, fn_out)
-
-
+    write_predictions_annotated(alignment_headers, alignment_seqs, 0,
+                                ref_labels + [strain], ref_match_by_site,
+                                genes_by_site, block_by_site,
+                                seqs_masked, fn_out)
diff --git a/code/analyze/to_update/check_paralogs_main.py b/code/analyze/to_update/check_paralogs_main.py
index 1bc39bd..bde7994 100644
--- a/code/analyze/to_update/check_paralogs_main.py
+++ b/code/analyze/to_update/check_paralogs_main.py
@@ -1,6 +1,6 @@
 # Loop through all introgressed genes (might be just a small part)
 # that have paralogs
-# Extract introgressed portion of gene 
+# Extract introgressed portion of gene
 # Blast that portion against:
 # - Cerevisiae gene
 # - Paradoxus gene (region aligned to cerevisiae gene)
@@ -13,23 +13,14 @@
 # - Paradoxus paralog -> interesting...
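A hedged sketch of the BLAST round trip the comments above describe, using `subprocess` in place of the `os.system` calls in the script below; the file names are placeholders, and `-outfmt 6` is BLAST's standard tabular format with the bit score in the final column:

```python
import subprocess

# Placeholder file names; the fasta inputs would come from the gene and
# region extraction steps described above.
db_fn = 'paralog_candidates.fa'
query_fn = 'introgressed_portion.fa'
out_fn = 'hits.tsv'

# Build a nucleotide database, then query it, as the script does with
# makeblastdb and blastn.
subprocess.run(['makeblastdb', '-in', db_fn, '-dbtype', 'nucl'], check=True)
subprocess.run(['blastn', '-db', db_fn, '-query', query_fn,
                '-out', out_fn, '-outfmt', '6'], check=True)

# outfmt 6 is tab-delimited with the bit score in the last column;
# keep the best-scoring subject, mirroring the best-hit logic below.
with open(out_fn) as f:
    hits = [line.rstrip('\n').split('\t') for line in f]
best = max(hits, key=lambda h: float(h[-1])) if hits else None
print(best[1] if best else 'none')  # subject id of the best hit
```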
-import re -import sys import os import math -import Bio.SeqIO -import copy import gzip -import gene_predictions -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../align/') -import align_helpers -sys.path.insert(0, '../misc/') -import read_table -import read_fasta -import write_fasta -import mystats +from align import align_helpers +from misc import read_table +from misc import read_fasta +from misc import write_fasta postprocess = False @@ -48,7 +39,7 @@ # dict of dicts keyed by region id and column names; includes filtered # and unfiltered regions region_to_genes = {} -f = open(gp.analysis_out_dir_absolute + tag + \ +f = open(gp.analysis_out_dir_absolute + tag + '/genes_for_each_region_' + tag + '.txt', 'r') line = f.readline() while line != '': @@ -60,10 +51,10 @@ f.close() # dict of lists keyed by region id -t_regions_filtered, l = \ - read_table.read_table_rows(gp.analysis_out_dir_absolute + tag + \ - '/introgressed_blocks_filtered_par_' + tag + \ - '_summary_plus.txt', \ +t_regions_filtered, _ = \ + read_table.read_table_rows(gp.analysis_out_dir_absolute + tag + + '/introgressed_blocks_filtered_par_' + tag + + '_summary_plus.txt', '\t', header=True) @@ -73,12 +64,12 @@ for region_id in region_to_genes: genes = region_to_genes[region_id] for gene in genes: - if not gene_to_regions.has_key(gene): + if gene not in gene_to_regions: gene_to_regions[gene] = [] gene_to_regions[gene].append(region_id) if region_id in t_regions_filtered: for gene in genes: - if not gene_to_regions_filtered.has_key(gene): + if gene not in gene_to_regions_filtered: gene_to_regions_filtered[gene] = [] gene_to_regions_filtered[gene].append(region_id) @@ -95,7 +86,7 @@ # read in all gene coordinates gene_coords = {} for chrm in gp.chrms: - f = open(gp.analysis_out_dir_absolute + \ + f = open(gp.analysis_out_dir_absolute + 'S288c_chr' + chrm + '_genes.txt', 'r') lines = [line.strip().split('\t') for line in f.readlines()] f.close() @@ -108,8 +99,9 @@ genes_to_analyze = gene_to_regions_filtered.keys() if postprocess: - genes_to_analyze = [line.split('\t')[0] for line in \ - open('check_paralogs_out_cer_paralog.tsv', 'r').readlines()] + genes_to_analyze = [line.split('\t')[0] for line in + open('check_paralogs_out_cer_paralog.tsv', + 'r').readlines()] genes_to_analyze = list(set(genes_to_analyze)) ip = 0 @@ -117,13 +109,13 @@ if gene not in paralogs: continue - print ip + print(ip) ip += 1 chrm, ref_gene_start, ref_gene_end = gene_coords[gene] gene_headers, gene_seqs = \ - read_fasta.read_fasta(gp.analysis_out_dir_absolute + tag + '/genes/' + \ + read_fasta.read_fasta(gp.analysis_out_dir_absolute + tag + '/genes/' + gene + '/' + gene + '_from_alignment.fa') gene_headers = [x[1:].strip() for x in gene_headers] strain_seqs = dict(zip(gene_headers, gene_seqs)) @@ -133,7 +125,7 @@ paralog = paralogs[gene] gene_headers, gene_seqs = \ - read_fasta.read_fasta(gp.analysis_out_dir_absolute + tag + '/genes/' + \ + read_fasta.read_fasta(gp.analysis_out_dir_absolute + tag + '/genes/' + paralog + '/' + paralog + '_from_alignment.fa') gene_headers = [x[1:].strip() for x in gene_headers] strain_paralog_seqs = dict(zip(gene_headers, gene_seqs)) @@ -153,8 +145,8 @@ f.close() cmd_string = gp.blast_install_path + 'makeblastdb' + \ - ' -in ' + db_fn + \ - ' -dbtype nucl' + ' -in ' + db_fn + \ + ' -dbtype nucl' os.system(cmd_string) strain_intd_seqs = {} @@ -164,25 +156,29 @@ ref_region_start = int(t_regions_filtered[region]['start']) ref_region_end = int(t_regions_filtered[region]['end']) - ref_to_strain_coords 
= [float(x[:-1]) for x in \ - gzip.open(gp.analysis_out_dir_absolute + \ - 'coordinates/S288c_to_' + strain + \ - '_chr' + chrm + '.txt.gz').readlines()] + ref_to_strain_coords = [float(x[:-1]) for x in + gzip.open(gp.analysis_out_dir_absolute + + 'coordinates/S288c_to_' + strain + + '_chr' + chrm + + '.txt.gz').readlines()] - gene_start = int(max(0, math.ceil(ref_to_strain_coords[ref_gene_start]))) + gene_start = int(max(0, math.ceil( + ref_to_strain_coords[ref_gene_start]))) gene_end = int(math.floor(ref_to_strain_coords[ref_gene_end])) - - region_start = int(max(0, math.ceil(ref_to_strain_coords[ref_region_start]))) + + region_start = int(max(0, math.ceil( + ref_to_strain_coords[ref_region_start]))) region_end = int(math.floor(ref_to_strain_coords[ref_region_end])) start = max(gene_start, region_start) end = min(gene_end, region_end) - chrom_seq = read_fasta.read_fasta(strain_dirs[strain] + strain + '_chr' + \ + chrom_seq = read_fasta.read_fasta(strain_dirs[strain] + + strain + '_chr' + chrm + gp.fasta_suffix)[1][0] seq = chrom_seq[start:end+1] - if not strain_intd_seqs.has_key(strain): + if strain not in strain_intd_seqs: strain_intd_seqs[strain] = chrom_seq[gene_start:gene_end+1].lower() relative_start = start - gene_start relative_end = end - gene_start @@ -196,11 +192,11 @@ f.close() cmd_string = gp.blast_install_path + 'blastn' + \ - ' -db ' + db_fn + \ - ' -query ' + query_fn + \ - ' -out ' + out_fn + \ - ' -outfmt ' + outfmt - print cmd_string + ' -db ' + db_fn + \ + ' -query ' + query_fn + \ + ' -out ' + out_fn + \ + ' -outfmt ' + outfmt + print(cmd_string) os.system(cmd_string) if os.stat(out_fn).st_size == 0: @@ -210,9 +206,9 @@ ' -out ' + out_fn + \ ' -task "blastn-short"' + \ ' -outfmt ' + outfmt - print cmd_string + print(cmd_string) os.system(cmd_string) - + lines = open(out_fn, 'r').readlines() best_key = 'none' if len(lines) != 0: @@ -236,10 +232,10 @@ # write reference genes and paralogs and all introgressed # genes to file and then align fn = gp.analysis_out_dir_absolute + tag + '/paralogs/' + \ - gene + gp.fasta_suffix - headers = ['S288c ' + gene, 'CBS432 ' + gene, \ + gene + gp.fasta_suffix + headers = ['S288c ' + gene, 'CBS432 ' + gene, 'S288c ' + paralog, 'CBS432 ' + paralog] - seqs = [cer_seq.lower(), par_seq.lower(), \ + seqs = [cer_seq.lower(), par_seq.lower(), cer_paralog_seq.lower(), par_paralog_seq.lower()] for strain in strain_intd_seqs: headers.append(strain + ' ' + gene) @@ -248,10 +244,10 @@ aligned_fn = fn.replace(gp.fasta_suffix, gp.alignment_suffix) cmd_string = gp.mafft_install_path + '/mafft ' + \ - ' --quiet --reorder --preservecase ' + \ - fn + ' > ' + aligned_fn + ' --quiet --reorder --preservecase ' + \ + fn + ' > ' + aligned_fn os.system(cmd_string) - + f = open('check_paralogs_out.tsv', 'w') f.write('category\tnum_total_genes\tnum_unique_genes\n') for key in keys: @@ -265,7 +261,5 @@ for item in all_rankings[key]: fk.write('\t'.join(item) + '\n') fk.close() - -f.close() - +f.close() diff --git a/code/analyze/to_update/combine_all_strains.py b/code/analyze/to_update/combine_all_strains.py index 947764b..fa331ca 100644 --- a/code/analyze/to_update/combine_all_strains.py +++ b/code/analyze/to_update/combine_all_strains.py @@ -3,19 +3,12 @@ import os import math import Bio.SeqIO -import copy -import gene_predictions -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../sim/') -import sim_analyze_hmm_bw as sim -sys.path.insert(0, '../misc/') -import seq_functions -import read_table -import read_fasta -import write_fasta 
-import mystats -import overlap +from misc import seq_functions +from misc import read_table +from misc import read_fasta +from misc import overlap + def get_range_seq(start, end, seq_fn): @@ -23,16 +16,17 @@ def get_range_seq(start, end, seq_fn): range_seq = chrm_seq[start:end+1] return range_seq + def get_ref_gene_seq(gene, gene_coords_fn, seq_fn): - d1, labels = read_table.read_table_rows(gene_coords_fn, '\t', \ + d1, labels = read_table.read_table_rows(gene_coords_fn, '\t', header=False, key_ind=0) d = {} for g in d1: if d1[g][0] == '""': d[g] = d1[g][1:] else: - d[d1[g][0]] = d1[g][1:] + d[d1[g][0]] = d1[g][1:] gene_start = int(d[gene][2]) - 1 gene_end = int(d[gene][3]) - 1 @@ -45,6 +39,7 @@ def get_ref_gene_seq(gene, gene_coords_fn, seq_fn): assert gene_start < gene_end return gene_seq, gene_start, gene_end, strand + def get_inds_from_alignment(fn, flip_ref, rind=0, sind=1): headers, seqs = read_fasta.read_fasta(fn) n = len(seqs[0]) @@ -63,29 +58,30 @@ def get_inds_from_alignment(fn, flip_ref, rind=0, sind=1): pr.append(str(ri)) ps.append(str(si)) if flip_ref: - return {'ps_ref':ps, 'ps_strain':pr} - return {'ps_ref':pr, 'ps_strain':ps} + return {'ps_ref': ps, 'ps_strain': pr} + return {'ps_ref': pr, 'ps_strain': ps} # by taking part of sequence aligned with reference coordinates -def get_range_seqs(strains, chrm, start, end, tag, gp_dir = '../'): +def get_range_seqs(strains, chrm, start, end, tag, gp_dir='../'): # TODO this shouldn't actually be dependent on tag strain_range_seqs = {} for strain, d in strains: - print strain + print(strain) fn = d + strain + '_chr' + chrm + gp.fasta_suffix chrm_seq = read_fasta.read_fasta(fn)[1][0] t = None try: - t, labels = read_table.read_table_columns(gp.analysis_out_dir_absolute + \ - tag + '/' + \ - 'site_summaries/predictions_' + \ - strain + \ - '_chr' + chrm + \ - '_site_summary.txt.gz', '\t') - except: + t, labels = read_table.read_table_columns( + gp.analysis_out_dir_absolute + + tag + '/' + + 'site_summaries/predictions_' + + strain + + '_chr' + chrm + + '_site_summary.txt.gz', '\t') + except FileNotFoundError: # for par reference which doesn't have site summary file align_fn = gp_dir + gp.alignments_dir + \ '_'.join(gp.alignment_ref_order) + '_chr' + chrm + \ @@ -97,13 +93,13 @@ def get_range_seqs(strains, chrm, start, end, tag, gp_dir = '../'): start_strain = int(math.ceil(float(ref_ind_to_strain_ind[str(start)]))) end_strain = int(math.floor(float(ref_ind_to_strain_ind[str(end)]))) - - strain_range_seqs[strain] = (chrm_seq[start_strain:end_strain+1], \ - start_strain, end_strain) + strain_range_seqs[strain] = (chrm_seq[start_strain:end_strain+1], + start_strain, end_strain) return strain_range_seqs -def choose_best_hit_rev(hits, query_fn, ref_chrm_fn, orf_headers, orf_seqs, start, end): +def choose_best_hit_rev(hits, query_fn, ref_chrm_fn, + orf_headers, orf_seqs, start, end): # choosing best hit by reciprocal blast -> not reliable tho if len(hits) == 1: return hits[0][0] @@ -124,10 +120,10 @@ def choose_best_hit_rev(hits, query_fn, ref_chrm_fn, orf_headers, orf_seqs, star f.write(seq + '\n') f.close() cmd_string = gp.blast_install_path + 'blastn' + \ - ' -db ' + ref_chrm_fn + \ - ' -query ' + orf_query_fn + \ - ' -out ' + out_fn + \ - ' -outfmt ' + outfmt + ' -db ' + ref_chrm_fn + \ + ' -query ' + orf_query_fn + \ + ' -out ' + out_fn + \ + ' -outfmt ' + outfmt os.system(cmd_string) f = open(out_fn, 'r') nhits = [line[:-1].split('\t') for line in f.readlines()] @@ -135,7 +131,7 @@ def choose_best_hit_rev(hits, query_fn, ref_chrm_fn, 
orf_headers, orf_seqs, star nstart = int(nhits[0][-2]) nend = int(nhits[0][-1]) # this division is hacky and unprincipled - o = overlap.overlap(start, end, nstart, nend) / float(hit[1]) + o = overlap.overlap(start, end, nstart, nend) / float(hit[1]) if o > greatest_overlap: greatest_overlap = o best_hit = hit[0] @@ -143,10 +139,11 @@ def choose_best_hit_rev(hits, query_fn, ref_chrm_fn, orf_headers, orf_seqs, star os.remove(out_fn) return best_hit -def choose_best_hit(hits, start, end, tag, strain, chrm, headers, seqs,\ + +def choose_best_hit(hits, start, end, tag, strain, chrm, headers, seqs, strain_ind_to_ref_ind, gp_dir='../'): - greatest_overlap = 0 # don't want to take overlaps of 0 + greatest_overlap = 0 # don't want to take overlaps of 0 best_hit = None x = None seq = None @@ -171,7 +168,7 @@ def choose_best_hit(hits, start, end, tag, strain, chrm, headers, seqs,\ c2 = chunk2.find(':', c1+1) seq = seqs[i] orf_start = int(chunk2[c1+1:c2]) - orf_end = int(chunk2[c2+1:]) + orf_end = int(chunk2[c2+1:]) strand = '1' if orf_start > orf_end: temp = orf_end @@ -179,8 +176,10 @@ def choose_best_hit(hits, start, end, tag, strain, chrm, headers, seqs,\ orf_start = temp strand = '-1' break - current_start = int(math.ceil(float(strain_ind_to_ref_ind[str(orf_start)]))) - current_end = int(math.floor(float(strain_ind_to_ref_ind[str(orf_end)]))) + current_start = int(math.ceil( + float(strain_ind_to_ref_ind[str(orf_start)]))) + current_end = int(math.floor( + float(strain_ind_to_ref_ind[str(orf_end)]))) o = overlap.overlap(start, end, current_start, current_end) if o > greatest_overlap: greatest_overlap = o @@ -190,16 +189,19 @@ def choose_best_hit(hits, start, end, tag, strain, chrm, headers, seqs,\ orf_start_max = orf_start orf_end_max = orf_end strand_max = strand - seq_max = seq # don't need to reverse complement (blast does this) + seq_max = seq # don't need to reverse complement (blast does this) - print greatest_overlap + print(greatest_overlap) return best_hit, x_max, seq_max, orf_start_max, orf_end_max, strand_max + # by blasting ORFs -def get_gene_seqs(query_fn, strains, chrm, ref_chrm_fn, start, end, strand, tag, +def get_gene_seqs(query_fn, strains, chrm, ref_chrm_fn, + start, end, strand, tag, strain_ind_to_ref_ind): - - #outfmt = '"6 qseqid sseqid slen qstart qend length mismatch gapopen gaps sseq"' + + # outfmt = '"6 qseqid sseqid slen qstart qend \ + # length mismatch gapopen gaps sseq"' outfmt = '"6 sseqid slen evalue bitscore"' strain_gene_seqs = {} @@ -208,38 +210,39 @@ def get_gene_seqs(query_fn, strains, chrm, ref_chrm_fn, start, end, strand, tag, if strain != 'yjm1332': continue - print '-', strain + print('-', strain) sys.stdout.flush() - fn = d + 'orfs/' + strain + '_chr' + chrm + '_orfs' + gp.fasta_suffix + fn = d + 'orfs/' + strain + '_chr' + chrm + '_orfs' + gp.fasta_suffix cmd_string = gp.blast_install_path + 'blastn' + \ - ' -db ' + fn + \ - ' -query ' + query_fn + \ - ' -out ' + out_fn + \ - ' -outfmt ' + outfmt - #print cmd_string + ' -db ' + fn + \ + ' -query ' + query_fn + \ + ' -out ' + out_fn + \ + ' -outfmt ' + outfmt + # print(cmd_string) os.system(cmd_string) - hits = [line[:-1].split('\t') for line in open(out_fn, 'r').readlines()] - num_hits = len(hits) + hits = [line[:-1].split('\t') + for line in open(out_fn, 'r').readlines()] if len(hits) == 0: strain_gene_seqs[strain] = ('nohit', '', -1, -1, '') continue - #best_orf_id = hits[0][0] + # best_orf_id = hits[0][0] headers, seqs = read_fasta.read_fasta(fn) best_orf_id, x, seq, orf_start, orf_end, orf_strand = \ 
-            choose_best_hit(hits, start, end, tag, strain, chrm, headers, seqs, \
+            choose_best_hit(hits, start, end, tag, strain,
+                            chrm, headers, seqs,
                             strain_ind_to_ref_ind[strain])
-        print hits
-        print best_orf_id
-        print orf_strand, strand
+        print(hits)
+        print(best_orf_id)
+        print(orf_strand, strand)
         sys.exit()
-        if best_orf_id == None or orf_strand != strand:
+        if best_orf_id is None or orf_strand != strand:
             strain_gene_seqs[strain] = ('nohit', '', -1, -1, '')
             continue
         strain_gene_seqs[strain] = (x, seq, orf_start, orf_end, orf_strand)
         os.remove(out_fn)
     return strain_gene_seqs
-    
+
 # can't actually count on annotations
 def get_gene_seqs_gb(fn, gene, chrm):
@@ -249,38 +252,40 @@
     strains = set([])
     for strain_chrm_record in gb_records:
         desc = strain_chrm_record.description
-        m = re.search(' (?P<strain>[a-zA-Z0-9]+) chromosome (?P<chrm>[IVXM]+)', \
+        m = re.search(' (?P<strain>[a-zA-Z0-9]+) chromosome (?P<chrm>[IVXM]+)',
                       desc)
         chrm_current = m.group('chrm')
         strain = m.group('strain').lower()
         strains.add(strain)
-        #if len(strain_gene_seqs) > 82:
-        #    break
-        print strain, chrm_current
+        # if len(strain_gene_seqs) > 82:
+        #     break
+        print(strain, chrm_current)
         if chrm_current != chrm:
             continue
         for feature in strain_chrm_record.features:
-            if feature.type == 'CDS' and feature.qualifiers.has_key('gene') and \
+            if feature.type == 'CDS' and 'gene' in feature.qualifiers and \
               feature.qualifiers['gene'][0] == gene:
                 desc = strain_chrm_record.description
-                m = re.search(\
-                    ' (?P<strain>[a-zA-Z0-9]+) chromosome (?P<chrm>[IVXM]+)', \
+                m = re.search(
+                    ' (?P<strain>[a-zA-Z0-9]+) '
+                    'chromosome (?P<chrm>[IVXM]+)',
                     desc)
                 seq = str(feature.extract(strain_chrm_record.seq).lower())
                 start = str(feature.location.start)
                 end = str(feature.location.end)
                 strand = str(feature.location.strand)
                 locus_tag = feature.qualifiers['locus_tag'][0]
-                strain_gene_seqs[strain] = {'seq':seq, \
-                                            'chrm':chrm, \
-                                            'start':start, \
-                                            'end':end, \
-                                            'strand':strand,\
-                                            'locus_tag':locus_tag}
-
-                print '- found gene in', strain
+                strain_gene_seqs[strain] = {'seq': seq,
+                                            'chrm': chrm,
+                                            'start': start,
+                                            'end': end,
+                                            'strand': strand,
+                                            'locus_tag': locus_tag}
+
+                print('- found gene in', strain)
     return strain_gene_seqs, list(strains)
 
+
 # because don't have gb file for paradoxus...
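+# editor's note (assumption, based on the comment above): there is no
+# GenBank record for the S. paradoxus reference, so the function below
+# falls back to scanning the CBS432 .fsa flat file for the gene sequence
+# instead of extracting annotated CDS features as get_gene_seqs_gb does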
def get_gene_seqs_fsa(fn, gene, chrm): f = open(fn, 'r') @@ -295,12 +300,13 @@ def get_gene_seqs_fsa(fn, gene, chrm): line = f.readline() f.close() - seqfa = open(gp.ref_dir['CBS432'] + 'CBS432_chr' + chrm + '.fa', 'r').read() + seqfa = open(gp.ref_dir['CBS432'] + + 'CBS432_chr' + chrm + '.fa', 'r').read() seqfa = seqfa.replace('\n', '') if seq in seqfa: - print 'found paradoxus seq' + print('found paradoxus seq') else: - print 'did not find paradoxus seq' + print('did not find paradoxus seq') fg = open('a.txt', 'w') fg.write(seq + '\n') fg.write(seqfa + '\n') @@ -308,4 +314,3 @@ def get_gene_seqs_fsa(fn, gene, chrm): return seq.lower() line = f.readline() - diff --git a/code/analyze/to_update/combine_gene_all_strains_main.py b/code/analyze/to_update/combine_gene_all_strains_main.py index 2816954..b8c6912 100644 --- a/code/analyze/to_update/combine_gene_all_strains_main.py +++ b/code/analyze/to_update/combine_gene_all_strains_main.py @@ -1,4 +1,5 @@ -# TODO - when blasting, take best gene, except if there are multiple hits, prioritize the one that overlaps the region we'd expect based on alignment +# TODO - when blasting, take best gene, except if there are multiple hits, +# prioritize the one that overlaps the region we'd expect based on alignment # input a gene or start/end coordinates @@ -6,34 +7,30 @@ # - for gene, relies on annotations/orfs # - for coordinates, relies on alignments -import re import sys import os import math -import Bio.SeqIO import copy -from combine_all_strains import * +from analyze.to_update.combine_all_strains import (get_gene_seqs, + get_inds_from_alignment, + get_ref_gene_seq) import gene_predictions -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../align/') -import align_helpers -sys.path.insert(0, '../misc/') -import read_table -import read_fasta -import write_fasta -import mystats +from align import align_helpers +from misc import read_table +from misc import write_fasta tag = sys.argv[1] gene = sys.argv[2] chrm = sys.argv[3] -#all_outfiles = [] +# all_outfiles = [] -print 'getting gene sequence from reference strain' +print('getting gene sequence from reference strain') ref = 'S288c' ref_gene_coords_fn = '../../data/S288c_verified_orfs.tsv' -ref_seq_fn = gp.ref_dir[ref] + gp.ref_fn_prefix[ref] + '_chr' + chrm + gp.fasta_suffix +ref_seq_fn = gp.ref_dir[ref] + gp.ref_fn_prefix[ref] \ + + '_chr' + chrm + gp.fasta_suffix ref_gene_seq, ref_start, ref_end, ref_strand = \ get_ref_gene_seq(gene, ref_gene_coords_fn, ref_seq_fn) query_fn = gene + '.txt' @@ -41,19 +38,19 @@ f.write(ref_gene_seq + '\n') f.close() -print 'getting gene sequences from all strains' +print('getting gene sequences from all strains') gp_dir = '../' s = align_helpers.get_strains(align_helpers.flatten(gp.non_ref_dirs.values())) ref_ind_to_strain_ind = {} strain_ind_to_ref_ind = {} for strain, d in s: - print '*', strain + print('*', strain) sys.stdout.flush() - t, labels = read_table.read_table_columns(gp.analysis_out_dir_absolute + \ - tag + '/' + \ - 'site_summaries/predictions_' + \ - strain + \ - '_chr' + chrm + \ + t, labels = read_table.read_table_columns(gp.analysis_out_dir_absolute + + tag + '/' + + 'site_summaries/predictions_' + + strain + + '_chr' + chrm + '_site_summary.txt.gz', '\t') ref_ind_to_strain_ind[strain] = dict(zip(t['ps_ref'], t['ps_strain'])) strain_ind_to_ref_ind[strain] = dict(zip(t['ps_strain'], t['ps_ref'])) @@ -63,15 +60,19 @@ '_mafft' + gp.alignment_suffix t = get_inds_from_alignment(align_fn, True) other_ref_strain = 
gp.ref_fn_prefix[gp.alignment_ref_order[1]] -ref_ind_to_strain_ind[other_ref_strain] = dict(zip(t['ps_ref'], t['ps_strain'])) -strain_ind_to_ref_ind[other_ref_strain] = dict(zip(t['ps_strain'], t['ps_ref'])) +ref_ind_to_strain_ind[other_ref_strain] = dict( + zip(t['ps_ref'], t['ps_strain'])) +strain_ind_to_ref_ind[other_ref_strain] = dict( + zip(t['ps_strain'], t['ps_ref'])) s.append((other_ref_strain, gp.ref_dir[gp.alignment_ref_order[1]])) -strain_gene_seqs = get_gene_seqs(query_fn, s, chrm, ref_seq_fn, ref_start, ref_end, ref_strand, tag, strain_ind_to_ref_ind) +strain_gene_seqs = get_gene_seqs(query_fn, s, chrm, ref_seq_fn, ref_start, + ref_end, ref_strand, + tag, strain_ind_to_ref_ind) os.remove(query_fn) -print 'writing all gene sequences to file' +print('writing all gene sequences to file') keys = sorted(strain_gene_seqs.keys()) -headers = [key + ' ' + strain_gene_seqs[key][0] + ' ' + \ +headers = [key + ' ' + strain_gene_seqs[key][0] + ' ' + strain_gene_seqs[key][-1] for key in keys] seqs = [strain_gene_seqs[key][1] for key in keys] strains = [ref] + keys @@ -86,18 +87,18 @@ suffixes = ['', '_filtered'] for suffix in suffixes: - print ' '.join(['finding', suffix, 'regions that overlap gene']) + print(' '.join(['finding', suffix, 'regions that overlap gene'])) # read in filtered regions fn_regions = gp.analysis_out_dir_absolute + tag + '/' + \ - 'introgressed_blocks' + suffix + '_par_' + tag + '_summary_plus.txt' - regions, l = read_table.read_table_rows(fn_regions, '\t') + 'introgressed_blocks' + suffix + '_par_' + tag + '_summary_plus.txt' + regions, _ = read_table.read_table_rows(fn_regions, '\t') # figure out which strains are introgressed/which regions overlap gene fn_genes_regions = gp.analysis_out_dir_absolute + tag + '/' + \ - 'genes_for_each_region_chr' + chrm + '_' + tag + '.txt' + 'genes_for_each_region_chr' + chrm + '_' + tag + '.txt' region_to_genes = \ gene_predictions.read_genes_for_each_region_summary(fn_genes_regions) - #strains = [x[0] for x in s] + # strains = [x[0] for x in s] regions_overlapping = {} # TODO does this actually ensure that regions are sorted appropriately # in fasta headers below? 
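+    # editor's note (sketch; example ids are made up): region ids have the
+    # form 'r<int>', so the numeric ordering the TODO above asks about can
+    # be enforced with
+    #     sorted(regions.keys(), key=lambda x: int(x[1:]))
+    # which puts 'r2' before 'r10', unlike a plain lexicographic sort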
@@ -106,12 +107,12 @@
         if regions[region]['chromosome'] == chrm and \
           gene in [x[0] for x in region_to_genes[region]['gene_list']]:
             strain = regions[region]['strain']
-            if not regions_overlapping.has_key(strain):
+            if strain not in regions_overlapping:
                 regions_overlapping[strain] = []
             regions_overlapping[strain].append(region)
 
-    print ' '.join(['writing all gene sequences to file, with', \
-                    suffix, 'introgressed bases capitalized'])
+    print(' '.join(['writing all gene sequences to file, with',
+                    suffix, 'introgressed bases capitalized']))
     headers_current = copy.deepcopy(headers)
     seqs_current = copy.deepcopy(seqs)
     for i in range(len(seqs)):
@@ -123,38 +124,35 @@
         if strain not in regions_overlapping:
             continue
         g = strain_gene_seqs[strain]
-        t, labels = read_table.read_table_columns(gp.analysis_out_dir_absolute + \
-                                                  tag + '/' + \
-                                                  'site_summaries/predictions_' + \
-                                                  strain + \
-                                                  '_chr' + chrm + \
-                                                  '_site_summary.txt.gz', '\t')
+        t, labels = read_table.read_table_columns(
+            gp.analysis_out_dir_absolute + tag + '/' +
+            'site_summaries/predictions_' + strain + '_chr' + chrm +
+            '_site_summary.txt.gz', '\t')
         for region in regions_overlapping[strain]:
             header += ' ' + region
             start_strain = \
-                math.ceil(float(\
-                    ref_ind_to_strain_ind[strain][regions[region]['start']]))
+                math.ceil(float(ref_ind_to_strain_ind[
+                    strain][regions[region]['start']]))
             end_strain = \
-                math.floor(float(\
-                    ref_ind_to_strain_ind[strain][regions[region]['end']]))
+                math.floor(float(ref_ind_to_strain_ind[
+                    strain][regions[region]['end']]))
             start_relative = int(max(start_strain - int(g[2]), 0))
             end_relative = int(end_strain - int(g[2]))
             seq = seq[:start_relative] + \
-                  seq[start_relative:end_relative+1].upper() + \
-                  seq[end_relative+1:]
-            seqs_current[i] = seq
+                seq[start_relative:end_relative+1].upper() + \
+                seq[end_relative+1:]
+            seqs_current[i] = seq
         headers_current[i] = header
 
     gene_seqs_fn = gp.analysis_out_dir_absolute + tag + \
-                   '/genes/' + gene + '/' + gene + \
-                   '_introgressed' + suffix + gp.fasta_suffix
+        '/genes/' + gene + '/' + gene + \
+        '_introgressed' + suffix + gp.fasta_suffix
     write_fasta.write_fasta(headers_current, seqs_current, gene_seqs_fn)
-
-    print 'aligning gene sequences'
-    gene_seqs_aligned_fn = gene_seqs_fn.replace(gp.fasta_suffix, gp.alignment_suffix)
+    print('aligning gene sequences')
+    gene_seqs_aligned_fn = gene_seqs_fn.replace(gp.fasta_suffix,
+                                                gp.alignment_suffix)
     cmd_string = gp.mafft_install_path + '/mafft ' + \
-                 ' --quiet --reorder --preservecase ' + \
-                 gene_seqs_fn + ' > ' + gene_seqs_aligned_fn
+        ' --quiet --reorder --preservecase ' + \
+        gene_seqs_fn + ' > ' + gene_seqs_aligned_fn
     os.system(cmd_string)
-
diff --git a/code/analyze/to_update/combine_region_all_strains_main.py b/code/analyze/to_update/combine_region_all_strains_main.py
index cc79a6b..234671a 100644
--- a/code/analyze/to_update/combine_region_all_strains_main.py
+++ b/code/analyze/to_update/combine_region_all_strains_main.py
@@ -3,44 +3,38 @@
 # - for gene, relies on annotations/orfs
 # - for coordinates, relies on alignments
 
-import re
 import sys
 import os
 import math
-import Bio.SeqIO
 import copy
-from combine_all_strains import *
-import gene_predictions
-sys.path.insert(0, '..')
+from analyze.to_update.combine_all_strains import (get_range_seq,
+                                                   get_range_seqs)
 import global_params as gp
-sys.path.insert(0, '../align/')
-import align_helpers
-sys.path.insert(0, '../misc/')
-import read_table
-import read_fasta
-import write_fasta
-import mystats
-
+from align import align_helpers
+from misc import read_table
+from misc import 
write_fasta tag = sys.argv[1] start = int(sys.argv[2]) end = int(sys.argv[3]) chrm = sys.argv[4] -print 'getting range sequence from reference strain' +print('getting range sequence from reference strain') ref = 'S288c' -ref_seq_fn = gp.ref_dir[ref] + gp.ref_fn_prefix[ref] + '_chr' + chrm + gp.fasta_suffix +ref_seq_fn = gp.ref_dir[ref] + gp.ref_fn_prefix[ref] + \ + '_chr' + chrm + gp.fasta_suffix ref_range_seq = get_range_seq(start, end, ref_seq_fn) -print 'getting range sequences from all strains' +print('getting range sequences from all strains') s = align_helpers.get_strains(align_helpers.flatten(gp.non_ref_dirs.values())) -s.append((gp.ref_fn_prefix[gp.alignment_ref_order[1]], gp.ref_dir[gp.alignment_ref_order[1]])) +s.append((gp.ref_fn_prefix[gp.alignment_ref_order[1]], + gp.ref_dir[gp.alignment_ref_order[1]])) # keyed by strain: (seq, start, end) strain_range_seqs = get_range_seqs(s, chrm, start, end, tag) -print 'writing all range sequences to file' +print('writing all range sequences to file') keys = sorted(strain_range_seqs.keys()) -headers = [key + ' ' + str(strain_range_seqs[key][1]) + ':' + \ +headers = [key + ' ' + str(strain_range_seqs[key][1]) + ':' + str(strain_range_seqs[key][2]) for key in keys] seqs = [strain_range_seqs[key][0] for key in keys] strains = [ref] + keys @@ -55,11 +49,11 @@ suffixes = ['', '_filtered'] for suffix in suffixes: - print ' '.join(['finding', suffix, 'regions that overlap range']) + print(' '.join(['finding', suffix, 'regions that overlap range'])) # read in filtered regions fn_regions = gp.analysis_out_dir_absolute + tag + '/' + \ - 'introgressed_blocks' + suffix + '_par_' + tag + '_summary_plus.txt' - regions, l = read_table.read_table_rows(fn_regions, '\t') + 'introgressed_blocks' + suffix + '_par_' + tag + '_summary_plus.txt' + regions, _ = read_table.read_table_rows(fn_regions, '\t') regions_overlapping = {} # TODO does this actually ensure that regions are sorted appropriately @@ -67,17 +61,17 @@ region_keys_ordered = sorted(regions.keys(), key=lambda x: int(x[1:])) for region in region_keys_ordered: if regions[region]['chromosome'] == chrm and \ - ((int(regions[region]['start']) > start and \ - int(regions[region]['start']) < end) or \ - (int(regions[region]['end']) > start and \ + ((int(regions[region]['start']) > start and + int(regions[region]['start']) < end) or + (int(regions[region]['end']) > start and int(regions[region]['end']) < end)): strain = regions[region]['strain'] - if not regions_overlapping.has_key(strain): + if strain not in regions_overlapping: regions_overlapping[strain] = [] regions_overlapping[strain].append(region) - print ' '.join(['writing all range sequences to file, with', \ - suffix, 'introgressed bases capitalized']) + print(' '.join(['writing all range sequences to file, with', + suffix, 'introgressed bases capitalized'])) headers_current = copy.deepcopy(headers) seqs_current = copy.deepcopy(seqs) for i in range(len(seqs)): @@ -89,35 +83,33 @@ if strain not in regions_overlapping: continue r = strain_range_seqs[strain] - t, labels = read_table.read_table_columns(gp.analysis_out_dir_absolute + \ - tag + '/' + \ - 'site_summaries/predictions_' + \ - strain + \ - '_chr' + chrm + \ - '_site_summary.txt.gz', '\t') + t, labels = read_table.read_table_columns( + gp.analysis_out_dir_absolute + tag + '/' + + 'site_summaries/predictions_' + strain + '_chr' + chrm + + '_site_summary.txt.gz', '\t') ref_ind_to_strain_ind = dict(zip(t['ps_ref'], t['ps_strain'])) for region in regions_overlapping[strain]: header += ' ' + 
region
-            start_strain = math.ceil(float(\
-                ref_ind_to_strain_ind[regions[region]['start']]))
-            end_strain = math.floor(float(\
-                ref_ind_to_strain_ind[regions[region]['end']]))
+            start_strain = math.ceil(float(
+                ref_ind_to_strain_ind[regions[region]['start']]))
+            end_strain = math.floor(float(
+                ref_ind_to_strain_ind[regions[region]['end']]))
             start_relative = int(max(start_strain - int(r[1]), 0))
             end_relative = int(end_strain - int(r[1]))
-            seq = seq[:start_relative] + \
-                  seq[start_relative:end_relative+1].upper() + \
-                  seq[end_relative+1:]
+            seq = (seq[:start_relative] +
+                   seq[start_relative:end_relative+1].upper() +
+                   seq[end_relative+1:])
             seqs_current[i] = seq
         headers_current[i] = header
 
     range_seqs_fn = gp.analysis_out_dir_absolute + tag + '/ranges/' + label + \
-                    '/' + label + '_introgressed' + suffix + gp.fasta_suffix
+        '/' + label + '_introgressed' + suffix + gp.fasta_suffix
     write_fasta.write_fasta(headers_current, seqs_current, range_seqs_fn)
-
-    print 'aligning range sequences'
-    range_seqs_aligned_fn = range_seqs_fn.replace(gp.fasta_suffix, gp.alignment_suffix)
+    print('aligning range sequences')
+    range_seqs_aligned_fn = range_seqs_fn.replace(gp.fasta_suffix,
+                                                  gp.alignment_suffix)
     cmd_string = gp.mafft_install_path + '/mafft ' + \
-                 ' --reorder --preservecase ' + \
-                 range_seqs_fn + ' > ' + range_seqs_aligned_fn
+        ' --reorder --preservecase ' + \
+        range_seqs_fn + ' > ' + range_seqs_aligned_fn
     os.system(cmd_string)
diff --git a/code/analyze/to_update/compare.py b/code/analyze/to_update/compare.py
index 460203e..57d9b95 100644
--- a/code/analyze/to_update/compare.py
+++ b/code/analyze/to_update/compare.py
@@ -1,10 +1,9 @@
-import sys
-import os
-sys.path.insert(0, '..')
 import global_params as gp
 
-lines = [x.split(',') for x in open('/tigress/AKEY/akey_vol2/aclark4/nobackup/introgression/data/Table_S5_introgressed_genes.csv', 'r').readlines()]
+lines = [x.split(',') for x in open(
+    '/tigress/AKEY/akey_vol2/aclark4/nobackup/introgression/data/'
+    'Table_S5_introgressed_genes.csv', 'r').readlines()]
 genes = []
 genes_verified = []
 for i in range(2, len(lines)):
@@ -14,12 +13,13 @@
 
 gp.analysis_out_dir_absolute
 
-lines = [x.split(' ') for x in open('../../results/introgressed_id_genes.txt', 'r').readlines()]
+lines = [x.split(' ') for x in open('../../results/introgressed_id_genes.txt',
+                                    'r').readlines()]
 my_genes = [x[0] for x in lines]
 
-print len(genes), 'genes from paper'
-print len(genes_verified), 'verified genes from paper'
-print len(my_genes), '(verified) genes I identify'
+print(len(genes), 'genes from paper')
+print(len(genes_verified), 'verified genes from paper')
+print(len(my_genes), '(verified) genes I identify')
 
 pm = []
 pnm = []
@@ -33,27 +33,28 @@
     if g not in genes:
         npm.append(g)
 
-print 'genes found in paper that I found (', len(pm), '):'
+print('genes found in paper that I found (', len(pm), '):')
 for x in pm:
-    print x
-print 'genes found in paper that I didn\'t find (', len(pnm), '):'
+    print(x)
+print('genes found in paper that I didn\'t find (', len(pnm), '):')
 for x in pnm:
-    print x
-print 'genes that I found not in paper(', len(npm), '):'
+    print(x)
+print('genes that I found not in paper (', len(npm), '):')
 for x in npm:
-    print x
+    print(x)
 
-lines = [x.strip().split(' ') for x in open('../../results/introgressed_id_genes_fns.txt', 'r').readlines()]
+lines = [x.strip().split(' ') for x in open(
+    '../../results/introgressed_id_genes_fns.txt', 'r').readlines()]
 gene_to_fns = {}
 for line in lines:
     gene_to_fns[line[0]] = line[1:]
 
 while True:
-    gene = 
raw_input('=========================================\nwhich gene? ') + gene = input('=========================================\nwhich gene? ') try: gene_to_fns[gene] - except: - print 'that gene wasn\'t one i found' + except KeyError: + print('that gene wasn\'t one i found') continue for fn in gene_to_fns[gene]: f = open(fn) @@ -78,16 +79,17 @@ seq += 'p' else: seq += '-' - print '==========', fn + print('==========', fn) line_length = 10000 for i in range(0, len(seq), line_length): - print seqc[i:i+line_length] - print seqp[i:i+line_length] - print seqx[i:i+line_length] - print seq[i:i+line_length] - print - print - raw_input('') + print(seqc[i:i+line_length]) + print(seqp[i:i+line_length]) + print(seqx[i:i+line_length]) + print(seq[i:i+line_length]) + print() + print() + input('') -# TODO: get alignments for genes found in paper but not by me; print positions in each genome before alignments +# TODO: get alignments for genes found in paper but not by me; +# print positions in each genome before alignments diff --git a/code/analyze/to_update/compare_3strains_main.py b/code/analyze/to_update/compare_3strains_main.py index 7e7e560..ab190d5 100644 --- a/code/analyze/to_update/compare_3strains_main.py +++ b/code/analyze/to_update/compare_3strains_main.py @@ -1,20 +1,7 @@ -import re -import sys -import os -import copy import itertools from collections import defaultdict -import gene_predictions -import predict -from filter_helpers import * -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../sim/') -import sim_analyze_hmm_bw as sim -sys.path.insert(0, '../misc/') -import mystats -import read_table -import read_fasta +from misc import read_table strains = ['yjm1252', 'yjm1078', 'yjm248'] @@ -36,12 +23,13 @@ for base in range(start, end + 1): bases_by_strains[chrm][base].append(strain) -#for base in sorted(bases_by_strains['I'].keys()): +# for base in sorted(bases_by_strains['I'].keys()): # print base, bases_by_strains['I'][base] - + categories = [] -for i in range(1,len(strains) + 1): - categories += [tuple(sorted(x)) for x in itertools.combinations(strains, i)] +for i in range(1, len(strains) + 1): + categories += [tuple(sorted(x)) + for x in itertools.combinations(strains, i)] cat_counts = defaultdict(int) for chrm in bases_by_strains.keys(): @@ -55,4 +43,3 @@ for cat in categories: f.write(','.join(cat) + '\t' + str(cat_counts[cat]) + '\n') f.close() - diff --git a/code/analyze/to_update/compare_predictions_main.py b/code/analyze/to_update/compare_predictions_main.py index c967c92..753c0a1 100644 --- a/code/analyze/to_update/compare_predictions_main.py +++ b/code/analyze/to_update/compare_predictions_main.py @@ -1,20 +1,9 @@ -import re import sys -import os -import copy -import itertools -import gene_predictions import predict from collections import defaultdict -from filter_helpers import * -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../sim/') -import sim_analyze_hmm_bw as sim -sys.path.insert(0, '../misc/') -import mystats -import read_table -import read_fasta +from misc import read_table + # similar to find_pops function in structure_3_main.py def overlap_with_any(start, end, blocks): @@ -32,23 +21,24 @@ def overlap_with_any(start, end, blocks): break return count + args = predict.process_predict_args(sys.argv[1:]) -## comparing to other prediction run; e.g. 
comparing using just one -## introgressed reference state to using multiple; this is a little -## janky because some of the file names and formatting have changed +# comparing to other prediction run; e.g. comparing using just one +# introgressed reference state to using multiple; this is a little +# janky because some of the file names and formatting have changed other_region_fn = gp.analysis_out_dir_absolute + 'u3_i.001_tv_l1000_f.01/' + \ - 'introgressed_blocks_filtered_par_u3_i.001_tv_l1000_f.01_summary_plus.txt' + 'introgressed_blocks_filtered_par_u3_i.001_tv_l1000_f.01_summary_plus.txt' rt_other, fields_other = read_table.read_table_rows(other_region_fn, '\t') regions_other = defaultdict(lambda: defaultdict(list)) for region_id in rt_other: chrm = rt_other[region_id]['chromosome'] strain = rt_other[region_id]['strain'] - regions_other[chrm][strain].append((int(rt_other[region_id]['start']), \ + regions_other[chrm][strain].append((int(rt_other[region_id]['start']), int(rt_other[region_id]['end']))) for chrm in gp.chrms: for strain in regions_other[chrm].keys(): - regions_other[chrm][strain].sort(key = lambda x: x[0]) + regions_other[chrm][strain].sort(key=lambda x: x[0]) regions = defaultdict(lambda: defaultdict(list)) @@ -63,12 +53,12 @@ def overlap_with_any(start, end, blocks): for region_id in rt: chrm = rt[region_id]['chromosome'] strain = rt[region_id]['strain'] - regions[chrm][strain].append((int(rt[region_id]['start']), \ + regions[chrm][strain].append((int(rt[region_id]['start']), int(rt[region_id]['end']), rt[region_id]['alternative_states'])) for chrm in gp.chrms: for strain in regions[chrm].keys(): - regions[chrm][strain].sort(key = lambda x: x[0]) + regions[chrm][strain].sort(key=lambda x: x[0]) # count bases found in every possible combination of species_from + # presence/absence in regions_other @@ -79,14 +69,14 @@ def overlap_with_any(start, end, blocks): # current predictions for strain in regions[chrm].keys(): for region in regions[chrm][strain]: - x = overlap_with_any(region[0], region[1], regions_other[chrm][strain]) + x = overlap_with_any(region[0], + region[1], regions_other[chrm][strain]) length = region[1] - region[0] + 1 alt_states = region[2].split(',') d[strain][tuple(['other'] + alt_states)] += x d[strain][tuple(alt_states)] += length - x assert x <= length - # other predictions for strain in regions_other[chrm].keys(): for region in regions_other[chrm][strain]: @@ -95,13 +85,14 @@ def overlap_with_any(start, end, blocks): d[strain][('other', 'any')] += x d[strain][('other',)] += length - x assert x <= length - -fn = gp.analysis_out_dir_absolute + args['tag'] + '/' + 'state_counts_comparison.txt' +fn = gp.analysis_out_dir_absolute + args['tag'] +\ + '/' + 'state_counts_comparison.txt' f = open(fn, 'w') f.write('strain\tlabel\tcount\n') for strain in d.keys(): for label in d[strain].keys(): - f.write(strain + '\t' + ','.join(label) + '\t' + str(d[strain][label]) + '\n') + f.write(strain + '\t' + ','.join(label) + + '\t' + str(d[strain][label]) + '\n') f.close() diff --git a/code/analyze/to_update/compare_to_strope.py b/code/analyze/to_update/compare_to_strope.py index 297ebd4..7fed803 100644 --- a/code/analyze/to_update/compare_to_strope.py +++ b/code/analyze/to_update/compare_to_strope.py @@ -1,22 +1,10 @@ # compare set of genes I've called to set called in Strope et al (100 # genomes paper) -import re import sys -import os -import math -import Bio.SeqIO -import copy import gene_predictions -sys.path.insert(0, '..') import global_params as gp 
-sys.path.insert(0, '../align/') -import align_helpers -sys.path.insert(0, '../misc/') -import read_table -import read_fasta -import write_fasta -import mystats +from misc import read_table tag = sys.argv[1] @@ -35,14 +23,14 @@ if line[7+i] == 'P': strains_int_par.append(strains[i]) n_int_par = len(strains_int_par) - genes_strope[line[2]] = (n_int_par, n_int_other, n_del, strains_int_par, \ + genes_strope[line[2]] = (n_int_par, n_int_other, n_del, strains_int_par, line[1], line[4]) sys_standard_strope[line[1]] = line[2] - + fn_regions = gp.analysis_out_dir_absolute + tag + '/' + \ 'introgressed_blocks_filtered_par_' + tag + '_summary_plus.txt' # dict keyed by region: {strain:, start:, end:, etc} -regions, l = read_table.read_table_rows(fn_regions, '\t') +regions, _ = read_table.read_table_rows(fn_regions, '\t') region_to_genes = {} for chrm in gp.chrms: fn_genes_regions = gp.analysis_out_dir_absolute + tag + '/' + \ @@ -53,9 +41,9 @@ region_to_genes.update(region_to_genes_current) genes_by_strain = {} for region in regions: - if not genes_by_strain.has_key(regions[region]['strain']): + if regions[region]['strain'] not in genes_by_strain: genes_by_strain[regions[region]['strain']] = set([]) - [genes_by_strain[regions[region]['strain']].add(gene) \ + [genes_by_strain[regions[region]['strain']].add(gene) for gene in [x[0] for x in region_to_genes[region]['gene_list']]] genes = {} @@ -97,7 +85,8 @@ # TODO fix my gene list then get rid of this all_genes = {} for chrm in gp.chrms: - fn_all_genes = gp.analysis_out_dir_absolute + 'S288c_chr' + chrm + '_genes.txt' + fn_all_genes = gp.analysis_out_dir_absolute +\ + 'S288c_chr' + chrm + '_genes.txt' f_all_genes = open(fn_all_genes, 'r') lines = [line.strip().split('\t') for line in f_all_genes.readlines()] f_all_genes.close() @@ -107,7 +96,6 @@ strand = 'NA' all_genes[line[0]] = ('NA', chrm, start, end, strand) - fn_paralogs = '../../data/S288c_paralogs.tsv' f_paralogs = open(fn_paralogs, 'r') lines = [line.strip().split('\t') for line in f_paralogs.readlines()] @@ -117,7 +105,6 @@ if line[0] != "": paralogs[line[0]] = line[3] - f_s = open('compare_to_strope/genes_strope_only.txt', 'w') f_m = open('compare_to_strope/genes_me_only.txt', 'w') f_sm = open('compare_to_strope/genes_both.txt', 'w') @@ -147,7 +134,8 @@ f_sp.write(gene + '\n') if gene in genes or (gene in sys_standard and sys_standard[gene] in genes): continue - elif not (gene in all_genes or (gene in sys_standard and sys_standard[gene] in all_genes)): + elif not (gene in all_genes or + (gene in sys_standard and sys_standard[gene] in all_genes)): continue elif genes_strope[gene][0] == 0: continue @@ -157,19 +145,19 @@ f_s.write(gene + '\n') c_s += 1 if gene in paralogs: - c_s_p +=1 + c_s_p += 1 f_s.close() f_m.close() f_sm.close() f_mp.close() f_sp.close() -print 'number strope only:', c_s -print 'number me only:', c_m -print 'number strope and me:', c_sm -print 'number strope only paralogs', c_s_p -print 'number me only paralogs', c_m_p -print 'number strope and me paralogs', c_sm_p -print 'number paralogs', len(paralogs) +print('number strope only:', c_s) +print('number me only:', c_m) +print('number strope and me:', c_sm) +print('number strope only paralogs', c_s_p) +print('number me only paralogs', c_m_p) +print('number strope and me paralogs', c_sm_p) +print('number paralogs', len(paralogs)) -print paralogs.keys() +print(paralogs.keys()) diff --git a/code/analyze/to_update/count_coding_changes.py b/code/analyze/to_update/count_coding_changes.py index a4af578..8ce9ec6 100644 --- 
a/code/analyze/to_update/count_coding_changes.py +++ b/code/analyze/to_update/count_coding_changes.py @@ -1,10 +1,7 @@ -import sys -import os -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../misc/') -import seq_functions -import read_fasta +from misc import seq_functions +from misc import read_fasta + def get_aligned_genes(fn, strains): headers, seqs = read_fasta.read_fasta(fn) @@ -44,7 +41,6 @@ def ambiguous(gene, ref_start, ref_end, coords, orfs): def count_coding(seq_master, seq_ref, seq_strain, start, end): - if not seq_master.startswith('ATG'): seq_master = seq_functions.reverse_complement(seq_master) assert seq_master.startswith('ATG'), seq_master @@ -87,16 +83,15 @@ def count_coding(seq_master, seq_ref, seq_strain, start, end): def count_coding_with_gaps(seq_master, seq_ref, seq_strain, start, end): - - print seq_master - print seq_ref - print seq_strain - print start, end + print(seq_master) + print(seq_ref) + print(seq_strain) + print(start, end) seq_master = seq_master.upper() seq_ref = seq_ref.upper() seq_strain = seq_strain.upper() - + ind_master = 0 ind_ref = 0 ind_strain = 0 @@ -177,10 +172,9 @@ def count_coding_with_gaps(seq_master, seq_ref, seq_strain, start, end): if codon_strain != codon_master: aa_master = seq_functions.codon_table.get(codon_master) - aa_ref = seq_functions.codon_table.get(codon_ref) aa_strain = seq_functions.codon_table.get(codon_strain) - if aa_master == None or aa_strain == None: + if aa_master is None or aa_strain is None: if gaps_master > gaps_strain: t_insert += gaps_master - gaps_strain else: @@ -212,10 +206,9 @@ def count_coding_with_gaps(seq_master, seq_ref, seq_strain, start, end): else: t_non_ref += 1 - print t_syn, t_non, t_syn_ref, t_non_ref - print t_insert, t_delete, t_insert_ref, t_delete_ref - print frameshift + print(t_syn, t_non, t_syn_ref, t_non_ref) + print(t_insert, t_delete, t_insert_ref, t_delete_ref) + print(frameshift) return t_syn, t_non, t_syn_ref, t_non_ref, \ t_insert/3.0, t_delete/3.0, t_insert_ref/3.0, t_delete_ref/3.0, \ gene_delete, gene_delete_ref, frameshift_count - diff --git a/code/analyze/to_update/count_coding_changes_main.py b/code/analyze/to_update/count_coding_changes_main.py index d0a4220..96c1705 100644 --- a/code/analyze/to_update/count_coding_changes_main.py +++ b/code/analyze/to_update/count_coding_changes_main.py @@ -1,36 +1,34 @@ import sys import os import gzip -from count_coding_changes import * +from count_coding_changes import get_aligned_genes, count_coding_with_gaps import annotate_positions -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../misc/') -import overlap -import read_table -import read_fasta +from misc import overlap +from misc import read_table +from misc import read_fasta -##====== +# ====== # command line arguments -##====== +# ====== tag = sys.argv[1] -##====== +# ====== # read in introgressed regions -##====== +# ====== -# key region ids by chromosome and then strain +# key region ids by chromosome and then strain fn_regions = gp.analysis_out_dir_absolute + tag + '/' + \ 'introgressed_blocks_filtered_par_' + tag + '_summary_plus.txt' -regions, l = read_table.read_table_rows(fn_regions, '\t') +regions, _ = read_table.read_table_rows(fn_regions, '\t') region_ids_by_chrm_strain = {} for r in regions.keys(): strain = regions[r]['strain'] chrm = regions[r]['chromosome'] - if not region_ids_by_chrm_strain.has_key(chrm): + if chrm not in region_ids_by_chrm_strain: region_ids_by_chrm_strain[chrm] = {} - if not 
region_ids_by_chrm_strain[chrm].has_key(strain): + if strain not in region_ids_by_chrm_strain[chrm]: region_ids_by_chrm_strain[chrm][strain] = [] region_ids_by_chrm_strain[chrm][strain].append(r) @@ -48,82 +46,84 @@ f.close() -##====== +# ====== # count sites within all regions that are coding/noncoding, plus some # more details about coding changes -##====== +# ====== other_ref = gp.alignment_ref_order[1] region_totals = {} gene_totals = {} strain_totals = {} -totals = {'syn':0, 'non':0, 'syn_ref':0, 'non_ref':0, \ - 'insert':0, 'delete':0, 'insert_ref':0, 'delete_ref':0, \ - 'gene_delete':0, 'gene_delete_ref':0, \ - 'ref_gene_only':0, 'strain_orf_only':0, \ - 'coding':0, 'noncoding':0, 'frameshift':0} +totals = {'syn': 0, 'non': 0, 'syn_ref': 0, 'non_ref': 0, + 'insert': 0, 'delete': 0, 'insert_ref': 0, 'delete_ref': 0, + 'gene_delete': 0, 'gene_delete_ref': 0, + 'ref_gene_only': 0, 'strain_orf_only': 0, + 'coding': 0, 'noncoding': 0, 'frameshift': 0} for chrm in gp.chrms: - print chrm + print(chrm) # read in cer reference genes fn = gp.analysis_out_dir_absolute + gp.master_ref + '_chr' + chrm + \ - '_genes.txt' - genes, l = read_table.read_table_rows(fn, '\t', header=False, key_ind=0) + '_genes.txt' + genes, _ = read_table.read_table_rows(fn, '\t', header=False, key_ind=0) for gene in genes: genes[gene] = (int(genes[gene][0]), int(genes[gene][1])) # read in cer ref -> par ref position file fn = gp.analysis_out_dir_absolute + 'coordinates/' + gp.master_ref + \ - '_to_' + other_ref + '_chr' + chrm + '.txt.gz' - master_to_other_ref_pos = [float(line[:-1]) \ + '_to_' + other_ref + '_chr' + chrm + '.txt.gz' + master_to_other_ref_pos = [float(line[:-1]) for line in gzip.open(fn, 'rb').readlines()] # read in cer ref chromosome sequence fn = gp.ref_dir[gp.master_ref] + gp.ref_fn_prefix[gp.master_ref] + \ - '_chr' + chrm + gp.fasta_suffix + '_chr' + chrm + gp.fasta_suffix master_seq = read_fasta.read_fasta(fn)[1][0] # read in par ref chromosome sequence fn = gp.ref_dir[other_ref] + gp.ref_fn_prefix[other_ref] + \ - '_chr' + chrm + gp.fasta_suffix + '_chr' + chrm + gp.fasta_suffix other_ref_seq = read_fasta.read_fasta(fn)[1][0] # read in par ref ORFs fn = gp.ref_dir[other_ref] + 'orfs/' + other_ref + \ - '_chr' + chrm + '_orfs' + gp.fasta_suffix + '_chr' + chrm + '_orfs' + gp.fasta_suffix ref_orfs = annotate_positions.get_orfs(fn) for strain in region_ids_by_chrm_strain[chrm].keys(): - print '-', strain - - if not strain_totals.has_key(strain): - strain_totals[strain] = {'syn':0, 'non':0, 'syn_ref':0, 'non_ref':0, \ - 'ref_gene_only':0, 'strain_orf_only':0, \ - 'coding':0, 'noncoding':0} - - # read in cer ref -> strain position file + print('-', strain) + + if strain not in strain_totals: + strain_totals[strain] = { + 'syn': 0, 'non': 0, 'syn_ref': 0, 'non_ref': 0, + 'ref_gene_only': 0, 'strain_orf_only': 0, + 'coding': 0, 'noncoding': 0} + + # read in cer ref -> strain position file fn = gp.analysis_out_dir_absolute + 'coordinates/' + gp.master_ref + \ - '_to_' + strain + '_chr' + chrm + '.txt.gz' - master_to_strain_pos = [float(line[:-1]) \ + '_to_' + strain + '_chr' + chrm + '.txt.gz' + master_to_strain_pos = [float(line[:-1]) for line in gzip.open(fn, 'rb').readlines()] # read in strain chromosome sequence fn = gp.non_ref_dirs[gp.master_ref][0] + strain + \ - '_chr' + chrm + gp.fasta_suffix + '_chr' + chrm + gp.fasta_suffix strain_seq = read_fasta.read_fasta(fn)[1][0] # read in strain ORFs fn = gp.non_ref_dirs[gp.master_ref][0] + 'orfs/' + strain + \ - '_chr' + chrm + '_orfs' + 
gp.fasta_suffix + '_chr' + chrm + '_orfs' + gp.fasta_suffix orfs = annotate_positions.get_orfs(fn) for region in region_ids_by_chrm_strain[chrm][strain]: - region_totals[region] = {'syn':0, 'non':0, 'syn_ref':0, 'non_ref':0, \ - 'ref_gene_only':0, 'strain_orf_only':0, \ - 'coding':0, 'noncoding':0} + region_totals[region] = { + 'syn': 0, 'non': 0, 'syn_ref': 0, 'non_ref': 0, + 'ref_gene_only': 0, 'strain_orf_only': 0, + 'coding': 0, 'noncoding': 0} # is each site in region in a master ref gene and/or # strain ORF? @@ -132,10 +132,11 @@ t_gene_not_orf = 0 t_not_gene_orf = 0 t_not_gene_not_orf = 0 - for site in range(int(regions[region]['start']), \ + for site in range(int(regions[region]['start']), int(regions[region]['end'])): in_gene = overlap.contained_any(site, genes.values()) - in_orf = overlap.contained_any(master_to_strain_pos[site], orfs.keys()) + in_orf = overlap.contained_any( + master_to_strain_pos[site], orfs.keys()) if in_gene: if in_orf: t_gene_orf += 1 @@ -170,58 +171,63 @@ # read multiple alignment for the gene, in which we've # previously selected the best orfs to match the gene - fn = gp.analysis_out_dir_absolute + tag + '/genes/' + gene + '/' + \ - gene + '_introgressed_filtered.maf' + fn = gp.analysis_out_dir_absolute + tag + '/genes/' \ + + gene + '/' + gene + '_introgressed_filtered.maf' if not os.path.isfile(fn): - print 'do not have alignment for', gene + print('do not have alignment for', gene) continue - aligned_genes = get_aligned_genes(fn, \ - [gp.master_ref, other_ref, strain]) + aligned_genes = get_aligned_genes( + fn, [gp.master_ref, other_ref, strain]) - print gene, strain + print(gene, strain) # for now, ignore cerevisiae reference genes that # don't map perfectly to an ORF in the strain and # paradoxus reference - #if ambiguous(gene, gene_start, gene_end, master_to_strain_pos, orfs): + # if ambiguous(gene, gene_start, gene_end, + # master_to_strain_pos, orfs): # continue - #if ambiguous(gene, gene_start, gene_end, \ + # if ambiguous(gene, gene_start, gene_end, \ # master_to_other_ref_pos, ref_orfs): # continue - + # extract gene sequence from references and strain g_master = master_seq[gene_start:gene_end+1] - g_ref = other_ref_seq[int(master_to_other_ref_pos[gene_start]):\ + g_ref = other_ref_seq[int(master_to_other_ref_pos[gene_start]): int(master_to_other_ref_pos[gene_end])+1] - g_strain = strain_seq[int(master_to_strain_pos[gene_start]):\ + g_strain = strain_seq[int(master_to_strain_pos[gene_start]): int(master_to_strain_pos[gene_end])+1] # get overlap between gene and introgressed region - o_start, o_end = overlap.overlap_region(genes[gene][0], \ - genes[gene][1], \ - int(regions[region]['start']), \ - int(regions[region]['end'])) + o_start, o_end = overlap.overlap_region( + genes[gene][0], + genes[gene][1], + int(regions[region]['start']), + int(regions[region]['end'])) # count synonymous and non synonymous changes due to # paradoxus (deal with gene direction correctly) # t_syn, t_non = count_coding(g_master, g_ref, g_strain, \ - # o_start-gene_start, o_end-gene_start) + # o_start-gene_start, + # o_end-gene_start) # alternative method that deals with imperfect matches t_syn, t_non, t_syn_ref, t_non_ref, \ t_insert, t_delete, t_insert_ref, t_delete_ref, \ gene_delete, gene_delete_ref, frameshift = \ - count_coding_with_gaps(aligned_genes[gp.master_ref], \ - aligned_genes[other_ref], \ - aligned_genes[strain], \ - o_start-gene_start, o_end-gene_start) + count_coding_with_gaps(aligned_genes[gp.master_ref], + aligned_genes[other_ref], + 
aligned_genes[strain], + o_start-gene_start, + o_end-gene_start) # add to totals for region, gene, strain, and overall - if not gene_totals.has_key(gene): - gene_totals[gene] = {'syn':0, 'non':0, 'syn_ref':0, 'non_ref':0, \ - 'insert':0, 'delete':0, \ - 'insert_ref':0, 'delete_ref':0, \ - 'gene_delete':0, 'gene_delete_ref':0, \ - 'frameshift':0} + if gene not in gene_totals: + gene_totals[gene] = { + 'syn': 0, 'non': 0, 'syn_ref': 0, 'non_ref': 0, + 'insert': 0, 'delete': 0, + 'insert_ref': 0, 'delete_ref': 0, + 'gene_delete': 0, 'gene_delete_ref': 0, + 'frameshift': 0} gene_totals[gene]['syn'] += t_syn gene_totals[gene]['non'] += t_non gene_totals[gene]['syn_ref'] += t_syn_ref @@ -256,9 +262,9 @@ totals['gene_delete_ref'] += gene_delete_ref totals['frameshift'] += frameshift -##====== +# ====== # write output file -##====== +# ====== fn = gp.analysis_out_dir_absolute + tag + '/' + 'coding_changes_summary_' + \ tag + '.txt' @@ -271,30 +277,26 @@ for strain in strain_totals: for key in strain_totals[strain].keys(): - f.write(strain + sep + 'strain' + sep + \ + f.write(strain + sep + 'strain' + sep + str(strain_totals[strain][key]) + sep + key + '\n') for gene in gene_totals: for key in gene_totals[gene].keys(): - f.write(gene + sep + 'gene' + sep + \ + f.write(gene + sep + 'gene' + sep + str(gene_totals[gene][key]) + sep + key + '\n') for region in region_totals: for key in region_totals[region].keys(): - f.write(region + sep + 'region' + sep + \ + f.write(region + sep + 'region' + sep + str(region_totals[region][key]) + sep + key + '\n') -f.close() - - - +f.close() # new plan # for each region # for each site in region # is it in ref gene and/or strain orf? (keep track of four totals) -# # for each gene # get corresponding orfs in par and strain @@ -305,4 +307,3 @@ # - categories: # multiples of 3 # not multiples of 3 -> stop counting/ignore gene? 
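+#
+# editor's sketch of the four-way site tally described above; 'site_tally'
+# is an illustrative name, the helper calls mirror the ones used earlier:
+#     from collections import Counter
+#     site_tally = Counter()
+#     for site in range(start, end + 1):
+#         in_gene = overlap.contained_any(site, genes.values())
+#         in_orf = overlap.contained_any(master_to_strain_pos[site],
+#                                        orfs.keys())
+#         site_tally[(in_gene, in_orf)] += 1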
- diff --git a/code/analyze/to_update/count_introgressed_main.py b/code/analyze/to_update/count_introgressed_main.py index 2663c12..5cbc0d3 100644 --- a/code/analyze/to_update/count_introgressed_main.py +++ b/code/analyze/to_update/count_introgressed_main.py @@ -1,24 +1,12 @@ # counts total amount of sites introgressed on each chromosome -import re -import sys -import os -import math -import gzip -import itertools -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../align/') -import align_helpers -sys.path.insert(0, '../misc/') -import overlap -import read_table -import read_fasta -import write_fasta -import mystats +from misc import read_table -chrm_sizes = [230218, 813184, 316620, 1531933, 576874, 270161, 1090940, 562643, 439888, 745751, 666816, 1078177, 924431, 784333, 1091291, 948066] +chrm_sizes = [230218, 813184, 316620, 1531933, 576874, + 270161, 1090940, 562643, 439888, 745751, + 666816, 1078177, 924431, 784333, 1091291, 948066] tag = 'u3_i.001_tv_l1000_f.01' @@ -30,13 +18,13 @@ for region in d: chrm = d[region]['chromosome'] strain = d[region]['strain'] - regions_by_chrm[chrm].append((strain, \ - int(d[region]['start']), \ + regions_by_chrm[chrm].append((strain, + int(d[region]['start']), int(d[region]['end']))) hist = {} for chrm in gp.chrms: - print chrm + print(chrm) chrm_size = chrm_sizes[gp.chrms.index(chrm)] x = [0 for i in range(chrm_size)] for ri in range(len(regions_by_chrm[chrm])): @@ -56,7 +44,7 @@ total = 0 for chrm in gp.chrms: f.write(chrm + '\t') - chrm_size = chrm_sizes[gp.chrms.index(chrm)] + chrm_size = chrm_sizes[gp.chrms.index(chrm)] at_least_one = chrm_size - hist[chrm][0] total += at_least_one f.write(str(at_least_one) + '\t') diff --git a/code/analyze/to_update/frequency_of_introgression_main.py b/code/analyze/to_update/frequency_of_introgression_main.py index 8004fdb..89b2da0 100644 --- a/code/analyze/to_update/frequency_of_introgression_main.py +++ b/code/analyze/to_update/frequency_of_introgression_main.py @@ -1,25 +1,11 @@ -import re -import sys -import os -import copy -import itertools from collections import defaultdict -import gene_predictions -import predict -from filter_helpers import * -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../sim/') -import sim_analyze_hmm_bw as sim -sys.path.insert(0, '../misc/') -import mystats -import read_table -import read_fasta +from misc import read_table tag = 'u3_i.001_tv_l1000_f.01' species_from = 'par' -#strains3 = ['yjm1252', 'yjm1078', 'yjm248'] +# strains3 = ['yjm1252', 'yjm1078', 'yjm248'] fn = gp.analysis_out_dir_absolute + tag + '/' + \ 'introgressed_blocks_filtered_' + species_from + \ @@ -30,7 +16,7 @@ strains = set([]) for region_id in regions: strain = regions[region_id]['strain'] - #if strain not in strains3: + # if strain not in strains3: strains.add(strain) chrm = regions[region_id]['chromosome'] start = int(regions[region_id]['start']) @@ -50,4 +36,3 @@ for i in range(len(strains)): f.write(str(i) + '\t' + str(counts[i]) + '\n') f.close() - diff --git a/code/analyze/to_update/gene_overlap_main.py b/code/analyze/to_update/gene_overlap_main.py index 9c274e4..3e431e0 100644 --- a/code/analyze/to_update/gene_overlap_main.py +++ b/code/analyze/to_update/gene_overlap_main.py @@ -10,10 +10,10 @@ # bases within coding sequence are upper case. 
In addition, there is a # corresponding file S288c_CBS432_strain_chrX_start-end.genes.txt # listing the genes that overlap this region, and the indices of -# the bases they overlap, in this format: -# gene_name\t0-149\t25236-25385 +# the bases they overlap, in this format: +# gene_name\t0-149\t25236-25385 # gene_name\t200-600\t.... -# +# # also generate a file in results/tag/gene_alignments/ for each # introgressed gene, which contains one threeway alignment for each # strain in which the gene was called introgressed...followed by all @@ -27,93 +27,86 @@ # versions (gene_introgressed.fasta), and also to all of the versions # (gene_all.fasta). -# TODO: -## _annotated file should be .txt not .maf -## also modify so that 80 characters per line -## and extra row showing summary of which references match +# TODO: +# _annotated file should be .txt not .maf +# also modify so that 80 characters per line +# and extra row showing summary of which references match -import re import sys -import os -import copy -from gene_predictions import * +from gene_predictions import read_gene_file import predict -import pickle from collections import defaultdict -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../misc/') -import read_fasta -import overlap +from misc import overlap -##====== +# ====== # read in analysis parameters -##====== +# ====== args = predict.process_predict_args(sys.argv[1:]) gp_dir = '../' open_mode = 'w' -##====== +# ====== # read in reference gene coordinates -##====== +# ====== genes = {} for chrm in gp.chrms: fn_genes = gp.analysis_out_dir_absolute + '/' + \ gp.master_ref + '_chr' + chrm + '_genes.txt' - # + genes[chrm] = read_gene_file(fn_genes) -##====== +# ====== # do all the stuff -##====== +# ====== for species_from in args['states']: - ##====== + # ====== # read in introgressed regions for current state - ##====== + # ====== # strain chromosome predicted_species start end number_non_gap blocks_fn = gp.analysis_out_dir_absolute + args['tag'] + '/' + \ 'introgressed_blocks_' + species_from + '_' + args['tag'] + \ '_labeled.txt' - # introgressed regions keyed by strain and then chromosome: + # introgressed regions keyed by strain and then chromosome: # (region_id, start, end, number_non_gap) regions = predict.read_blocks(blocks_fn, labeled=True) - ##====== + # ====== # extract alignments and genes for introgressed regions - ##====== + # ====== - fn_genes_regions = gp.analysis_out_dir_absolute + '/' + args['tag'] + '/' + \ - 'genes_for_each_region_' + species_from + '_' + \ - args['tag'] + '.txt' + fn_genes_regions = gp.analysis_out_dir_absolute + '/' + args['tag'] + \ + '/' + 'genes_for_each_region_' + species_from + '_' + \ + args['tag'] + '.txt' f_genes_regions = open(fn_genes_regions, open_mode) f_genes_regions.write('region_id\tnumber_genes\tgenes\tfracs\n') d_regions_to_genes = defaultdict(lambda: defaultdict(float)) - #fn_regions_strains = gp.analysis_out_dir_absolute + '/' + args['tag'] + '/' + \ - # 'regions_for_each_strain_' species_from + '_' + \ - # args['tag'] + '.txt' - #f_regions_strains = open(fn_regions_strains, open_mode) - #f_regions_strains.write('strain\tregions\n') - #d_strains_to_regions = defaultdict(list) + # fn_regions_strains = gp.analysis_out_dir_absolute + '/' + args['tag']\ + # + '/' + 'regions_for_each_strain_' species_from\ + # + '_' + args['tag'] + '.txt' + # f_regions_strains = open(fn_regions_strains, open_mode) + # f_regions_strains.write('strain\tregions\n') + # d_strains_to_regions = defaultdict(list) - fn_genes_strains = 
gp.analysis_out_dir_absolute + '/' + args['tag'] + '/' + \ - 'genes_for_each_strain_' + species_from + '_' + \ - args['tag'] + '.txt' + fn_genes_strains = gp.analysis_out_dir_absolute + '/' + args['tag'] + \ + '/' + 'genes_for_each_strain_' + species_from + '_' + \ + args['tag'] + '.txt' f_genes_strains = open(fn_genes_strains, open_mode) f_genes_strains.write('strain\tnumber_genes\tgenes\tfracs\n') d_strains_to_genes = defaultdict(lambda: defaultdict(float)) - fn_strains_genes = gp.analysis_out_dir_absolute + '/' + args['tag'] + '/' + \ - 'strains_for_each_gene_' + species_from + '_' + \ - args['tag'] + '.txt' + fn_strains_genes = gp.analysis_out_dir_absolute + '/' + args['tag'] + \ + '/' + 'strains_for_each_gene_' + species_from + '_' + \ + args['tag'] + '.txt' f_strains_genes = open(fn_strains_genes, open_mode) f_strains_genes.write('gene\tnum_strains\tstrains\tfracs\n') d_genes_to_strains = defaultdict(lambda: defaultdict(float)) @@ -123,42 +116,42 @@ for entry in regions[strain][chrm]: region_id, start, end, number_non_gap = entry for gene in genes[chrm]: - o = overlap.overlap(start, end, \ - genes[chrm][gene][0], genes[chrm][gene][1]) + o = overlap.overlap(start, end, + genes[chrm][gene][0], + genes[chrm][gene][1]) if o > 0: - gene_length = float(genes[chrm][gene][1] - \ + gene_length = float(genes[chrm][gene][1] - genes[chrm][gene][0] + 1) frac_o = o / gene_length d_regions_to_genes[region_id][gene] += frac_o d_strains_to_genes[strain][gene] += frac_o d_genes_to_strains[gene][strain] += frac_o - for region in sorted(d_regions_to_genes.keys(), key=lambda x: int(x[1:])): g = sorted(d_regions_to_genes[region].keys()) f_genes_regions.write(region + '\t' + str(len(g)) + '\t') f_genes_regions.write(','.join(g) + '\t') - f_genes_regions.write(','.join([str(d_regions_to_genes[region][x]) \ + f_genes_regions.write(','.join([str(d_regions_to_genes[region][x]) for x in g]) + '\n') for strain in sorted(d_strains_to_genes.keys()): g = sorted(d_strains_to_genes[strain].keys()) f_genes_strains.write(strain + '\t' + str(len(g)) + '\t') f_genes_strains.write(','.join(g) + '\t') - f_genes_strains.write(','.join([str(d_strains_to_genes[strain][x]) \ + f_genes_strains.write(','.join([str(d_strains_to_genes[strain][x]) for x in g]) + '\n') for gene in sorted(d_genes_to_strains.keys()): s = sorted(d_genes_to_strains[gene].keys()) f_strains_genes.write(gene + '\t' + str(len(s)) + '\t') f_strains_genes.write(','.join(s) + '\t') - f_strains_genes.write(','.join([str(d_genes_to_strains[gene][x]) \ + f_strains_genes.write(','.join([str(d_genes_to_strains[gene][x]) for x in s]) + '\n') f_genes_regions.close() f_genes_strains.close() f_strains_genes.close() -""" +""" # produce region summmary file with all the same info, but also with # region ids (r1-rn), and with genes overlapping each region @@ -178,7 +171,8 @@ fn_align_prefix += '_'.join([refs[s][0] for s in args['species']]) + '_' # for annotated region files (output) -fn_region_prefix = gp.analysis_out_dir_absolute + '/' + args['tag'] + '/regions/' +fn_region_prefix = gp.analysis_out_dir_absolute + \ + '/' + args['tag'] + '/regions/' if not os.path.isdir(fn_region_prefix): os.makedirs(fn_region_prefix) @@ -193,19 +187,23 @@ write_region_summary_header(refs_ordered, f_region_summary) fn_genes_regions = gp.analysis_out_dir_absolute + '/' + args['tag'] + '/' + \ - 'genes_for_each_region_chr' + chrm + '_' + args['tag'] + '.txt' + 'genes_for_each_region_chr' + chrm + '_' \ + + args['tag'] + '.txt' f_genes_regions = open(fn_genes_regions, open_mode) 
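+# worked example of the gene-fraction bookkeeping used by these summary
+# tables (illustrative coordinates only; assumes overlap.overlap returns
+# the inclusive count of shared positions, as its use earlier in this
+# file implies): a region spanning sites 1000-2000 overlaps a gene at
+# 1500-2499 by overlap.overlap(1000, 2000, 1500, 2499) == 501 sites,
+# contributing 501 / float(2499 - 1500 + 1), about half the gene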
fn_regions_strains = gp.analysis_out_dir_absolute + '/' + args['tag'] + '/' + \ - 'regions_for_each_strain_chr' + chrm + '_' + args['tag'] + '.txt' + 'regions_for_each_strain_chr' + chrm + '_' + \ + args['tag'] + '.txt' f_regions_strains = open(fn_regions_strains, open_mode) fn_genes_strains = gp.analysis_out_dir_absolute + '/' + args['tag'] + '/' + \ - 'genes_for_each_strain_chr' + chrm + '_' + args['tag'] + '.txt' + 'genes_for_each_strain_chr' + chrm + '_' + \ + args['tag'] + '.txt' f_genes_strains = open(fn_genes_strains, open_mode) fn_strains_genes = gp.analysis_out_dir_absolute + '/' + args['tag'] + '/' + \ - 'strains_for_each_gene_chr' + chrm + '_' + args['tag'] + '.txt' + 'strains_for_each_gene_chr' + chrm + '_' + \ + args['tag'] + '.txt' f_strains_genes = open(fn_strains_genes, open_mode) # for keeping track of all genes introgressed in each strain, and the @@ -240,7 +238,7 @@ # loop through all strains that we've called introgression in, and # associate genes with the regions they overlap for strain in regions.keys(): - + print '***', strain, chrm sys.stdout.flush() # skip this strain x chromosome if there are no introgressed @@ -254,7 +252,7 @@ alignment_headers, alignment_seqs = read_fasta.read_fasta(fn_align) labels = ref_labels + [strain] - + # mark each site as matching each reference or not ref_match_by_site = get_ref_match_by_site(alignment_seqs, labels) # mark each site as in a gene or not @@ -281,7 +279,6 @@ # regions are indexed by (unaligned) master ref sequence write_region_alignment(alignment_headers, alignment_seqs, fn_region, \ entry[0], entry[1], 0) - # write region to file in annotated/readable format fn_region_annotated = fn_region_current_prefix + '_annotated' + \ @@ -290,34 +287,35 @@ write_region_alignment_annotated(labels, alignment_seqs, \ fn_region_annotated, \ entry[0], entry[1], 0, \ - genes, ref_match_by_site, + genes, ref_match_by_site, genes_by_site, \ introgressed_by_site, 100) #==== # region summary file with extra info #==== - + # strain chromosome predicted_species start end number_non_gap # number_match_ref1 number_match_ref2 number_match_only_ref1 # number_match_ref2_not_ref1 number_mismatch_all_ref write_region_summary_line(entry, strain, chrm, species_from, \ alignment_seqs, labels, \ - relative_start, relative_end, f_region_summary) + relative_start, relative_end, + f_region_summary) #==== # genes for each region summary file #==== # region_id num_genes gene frac_intd gene frac_intd - + frac_intd = write_genes_for_each_region_summary_line(entry[3], \ genes_by_site, \ genes, \ relative_start, \ relative_end, \ - alignment_seqs[0], \ + alignment_seqs[0], f_genes_regions) for gene in frac_intd: # keep track of all genes for each strain... 
@@ -331,14 +329,13 @@ gene_strains_dic[gene][strain] = 0 gene_strains_dic[gene][strain] += frac_intd[gene] - #==== # strains for each gene summary file #==== -# (could do this for one chromsoome at a time if we wanted) +# (could do this for one chromsoome at a time if we wanted) # gene num_strains strain frac_intd strain frac_intd - + write_strains_for_each_gene_lines(gene_strains_dic, f_strains_genes) #==== @@ -363,7 +360,5 @@ f_regions_strains.close() f_genes_strains.close() f_strains_genes.close() - - """ diff --git a/code/analyze/to_update/gene_predictions.py b/code/analyze/to_update/gene_predictions.py index 0507a44..0d118e3 100644 --- a/code/analyze/to_update/gene_predictions.py +++ b/code/analyze/to_update/gene_predictions.py @@ -1,14 +1,9 @@ import re -import sys import os -import copy import gzip -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../sim/') -import sim_analyze_hmm_bw as sim -sys.path.insert(0, '../misc/') -import write_fasta +from misc import write_fasta + def index_ignoring_gaps(s, i, s_start): '''returns the index of the ith (starting at 0) non-gap character in @@ -31,6 +26,7 @@ def index_ignoring_gaps(s, i, s_start): x += 1 return x + def get_ref_match_by_site(seqs, labels): # for master: matches _only_ that ref @@ -51,7 +47,7 @@ def get_ref_match_by_site(seqs, labels): if seqs[0][i] == seqs[-1][i]: ref_match_by_site[0][i] = labels[0][0] - + for r in range(1, nrefs): if seqs[r][i] == seqs[-1][i]: # matches this ref and master ref -> both blank @@ -69,10 +65,9 @@ def get_ref_match_by_site(seqs, labels): else: ref_match_by_site[r][i] = '.' ref_match_by_site[0][i] = '.' - return [''.join(s) for s in ref_match_by_site] - + def get_ref_match_by_site_2(seqs, labels): @@ -99,6 +94,7 @@ def get_ref_match_by_site_2(seqs, labels): return [''.join(s) for s in ref_match_by_site] + def get_genes_by_site(genes, seq): genes_by_site = [None for site in seq] @@ -110,6 +106,7 @@ def get_genes_by_site(genes, seq): genes_by_site[i] = gene_name return genes_by_site + def get_introgressed_by_site(regions, seq): introgressed_by_site = [' ' for site in seq] @@ -119,19 +116,20 @@ def get_introgressed_by_site(regions, seq): for i in range(start_ind, end_ind+1): introgressed_by_site[i] = 'i' return ''.join(introgressed_by_site) - + def write_region_alignment(headers, seqs, fn, start, end, master_ind): - + relative_start = max(0, index_ignoring_gaps(seqs[master_ind], start, 0)) relative_end = index_ignoring_gaps(seqs[master_ind], end, 0) - + region_seqs = [seq[relative_start:relative_end+1] for seq in seqs] write_fasta.write_fasta(headers, region_seqs, fn, gz=True) + def get_genes_in_region(start, end, genes): - + region_genes = [] for gene_name in genes: gene_start, gene_end = genes[gene_name] @@ -142,24 +140,27 @@ def get_genes_in_region(start, end, genes): region_genes.sort(key=lambda x: x[1]) return region_genes -def write_region_alignment_annotated(labels, seqs, fn, start, end, \ - master_ind, genes, ref_match_by_site, \ - genes_by_site, \ + +def write_region_alignment_annotated(labels, seqs, fn, start, end, + master_ind, genes, ref_match_by_site, + genes_by_site, introgressed_by_site, context): relative_start_with_context = \ max(0, index_ignoring_gaps(seqs[master_ind], start-context, 0)) relative_start = max(0, index_ignoring_gaps(seqs[master_ind], start, 0)) relative_end = index_ignoring_gaps(seqs[master_ind], end, 0) - relative_end_with_context = index_ignoring_gaps(seqs[master_ind], end+context, 0) - - region_seqs = 
[seq[relative_start_with_context:relative_end_with_context+1] \ - for seq in seqs] + relative_end_with_context = index_ignoring_gaps(seqs[master_ind], + end+context, 0) + + region_seqs = [ + seq[relative_start_with_context:relative_end_with_context+1] + for seq in seqs] # for reference matching lines ref_match_strings = [] for r in ref_match_by_site: - ref_match_strings.append(\ + ref_match_strings.append( r[relative_start_with_context:relative_end_with_context+1]) # for gene line @@ -168,10 +169,12 @@ def write_region_alignment_annotated(labels, seqs, fn, start, end, \ region_genes_set = list(set(region_genes)) try: region_genes_set.remove(None) - except: + except ValueError: pass region_genes_set.sort(key=lambda x: genes[x][1]) - gene_string = ''.join([' ' if entry == None else '=' for entry in region_genes]) + gene_string = ''.join([' ' + if entry is None else '=' + for entry in region_genes]) # for introgression line introgressed_string = \ @@ -188,7 +191,7 @@ def write_region_alignment_annotated(labels, seqs, fn, start, end, \ # assume master ref comes first f.write('matches only ' + labels[0] + '\n') # and assume ref seqs come before predict seq - for label in labels[1:-1]: + for label in labels[1:-1]: f.write('matches ' + label + ' and mismatches ' + labels[0] + '\n') f.write('genes: ' + ' '.join(region_genes_set) + '\n') f.write('introgressed\n\n') @@ -212,6 +215,7 @@ def write_region_alignment_annotated(labels, seqs, fn, start, end, \ return relative_start, relative_end + def read_gene_file(fn): f = open(fn, 'r') genes = {} @@ -223,6 +227,7 @@ def read_gene_file(fn): f.close() return genes + def write_gene_file(genes, fn): f = open(fn, 'w') for gene in genes: @@ -230,15 +235,17 @@ def write_gene_file(genes, fn): f.write(gene + '\t' + str(start) + '\t' + str(end) + '\n') f.close() + def write_region_summary_header(refs, f): - f.write('region_id\tstrain\tchromosome\tpredicted_species\tstart\tend\t' + \ + f.write('region_id\tstrain\tchromosome\tpredicted_species\tstart\tend\t' + 'number_non_gap\t') f.write('\t'.join(['number_match_' + ref for ref in refs]) + '\t') f.write('\t'.join(['number_match_only_' + ref for ref in refs]) + '\t') f.write('number_mismatch_all_refs\n') -def write_region_summary_line(region, strain, chrm, predicted_species, seqs, labels, - start, end, f): + +def write_region_summary_line(region, strain, chrm, predicted_species, + seqs, labels, start, end, f): # region_id [strain chromosome predicted_species start end number_non_gap] # number_match_ref1 number_match_ref2 number_match_only_ref1 @@ -246,8 +253,8 @@ def write_region_summary_line(region, strain, chrm, predicted_species, seqs, lab sep = '\t' - f.write(region[3] + sep + strain + sep + chrm + sep + predicted_species + \ - sep + str(region[0]) + sep + str(region[1]) + sep + \ + f.write(region[3] + sep + strain + sep + chrm + sep + predicted_species + + sep + str(region[0]) + sep + str(region[1]) + sep + str(region[2]) + sep) ids = [0] * (len(seqs) - 1) @@ -280,12 +287,13 @@ def write_region_summary_line(region, strain, chrm, predicted_species, seqs, lab continue for r in range(1, len(seqs) - 1): unique_ids[r] += match_refs[r] - + f.write(sep.join([str(x) for x in ids]) + sep) f.write(sep.join([str(x) for x in unique_ids]) + sep) f.write(str(mismatch_all) + '\n') f.flush() + def read_region_summary(fn): # region_id [strain chromosome predicted_species start end number_non_gap] # number_match_ref1 number_match_ref2 number_match_only_ref1 @@ -294,39 +302,41 @@ def read_region_summary(fn): f = open(fn, 'r') 
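+    # coordinates in this summary are unaligned master-reference
+    # positions; alignment columns are recovered with
+    # index_ignoring_gaps above, e.g. (a sketch assuming gp.gap_symbol
+    # is '-'): index_ignoring_gaps('a-bc', 2, 0) == 3, the column of
+    # the 2nd (0-based) non-gap character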
line = f.readline() d = {} - fields = ['strain', 'chromosome', 'predicted_species', 'start', 'end', \ - 'number_non_gap', 'number_match_ref1', 'number_match_ref2', \ - 'number_match_only_ref1', 'number_match_ref2_not_ref1', \ + fields = ['strain', 'chromosome', 'predicted_species', 'start', 'end', + 'number_non_gap', 'number_match_ref1', 'number_match_ref2', + 'number_match_only_ref1', 'number_match_ref2_not_ref1', 'number_mismatch_all_ref'] while line != '': line = line[:-1].split('\t') - #TODO actually fix the multiple header lines scattered throughout + # TODO actually fix the multiple header lines scattered throughout if line[0] != 'region_id': d[line[0]] = dict(zip(fields, line[1:])) line = f.readline() f.close() return d -def write_genes_for_each_region_summary_line(region_id, genes_by_site, gene_summary, \ + +def write_genes_for_each_region_summary_line(region_id, genes_by_site, + gene_summary, start, end, seq, f): - + # region_id num_genes gene frac_intd gene frac_intd genes = genes_by_site[start:end+1] genes_set = list(set(genes)) try: genes_set.remove(None) - except: + except ValueError: pass seq_region = seq[start:end+1] gene_site_counts = dict(zip(genes_set, [0]*len(genes_set))) for i in range(len(seq_region)): - if seq_region[i] != gp.gap_symbol and genes[i] != None: + if seq_region[i] != gp.gap_symbol and genes[i] is not None: gene_site_counts[genes[i]] += 1 frac_intd = {} for gene in genes_set: gene_length = gene_summary[gene][1] - gene_summary[gene][0] + 1 frac_intd[gene] = float(gene_site_counts[gene]) / gene_length - + sep = '\t' f.write(region_id + sep) f.write(str(len(genes_set))) @@ -337,6 +347,7 @@ def write_genes_for_each_region_summary_line(region_id, genes_by_site, gene_summ return frac_intd + def read_genes_for_each_region_summary(fn): # region_id num_genes gene frac_intd gene frac_intd @@ -348,18 +359,19 @@ def read_genes_for_each_region_summary(fn): gene_list = [] for i in range(2, len(line), 2): gene_list.append((line[i], line[i+1])) - d[line[0]] = {'num_genes':line[1], 'gene_list':gene_list} + d[line[0]] = {'num_genes': line[1], 'gene_list': gene_list} line = f.readline() f.close() return d + def write_regions_for_each_strain(regions, f): # strain num_regions region length region length sep = '\t' for strain in regions: f.write(strain + sep) - num_regions = sum([len(regions[strain][chrm]) \ + num_regions = sum([len(regions[strain][chrm]) for chrm in regions[strain].keys()]) f.write(str(num_regions)) for chrm in regions[strain].keys(): @@ -369,6 +381,7 @@ def write_regions_for_each_strain(regions, f): f.write('\n') f.flush() + def read_regions_for_each_strain(fn): # strain num_regions region length region length @@ -380,11 +393,12 @@ def read_regions_for_each_strain(fn): region_list = [] for i in range(2, len(line), 2): region_list.append((line[i], line[i+1])) - d[line[0]] = {'num_regions':line[1], 'region_list':region_list} + d[line[0]] = {'num_regions': line[1], 'region_list': region_list} line = f.readline() f.close() return d - + + def write_genes_for_each_strain(strain_genes_dic, f): # strain num_genes gene frac_intd gene frac_intd @@ -396,6 +410,7 @@ def write_genes_for_each_strain(strain_genes_dic, f): f.write('\n') f.flush() + def read_genes_for_each_strain(fn): # strain num_genes gene frac_intd gene frac_intd @@ -407,11 +422,12 @@ def read_genes_for_each_strain(fn): gene_list = [] for i in range(2, len(line), 2): gene_list.append((line[i], line[i+1])) - d[line[0]] = {'num_genes':line[1], 'gene_list':gene_list} + d[line[0]] = {'num_genes': line[1], 
'gene_list': gene_list}
         line = f.readline()
     f.close()
     return d
 
+
 def write_strains_for_each_gene_lines(gene_strains_dic, f):
 
     # (this is actually the same as above function, but it's confusing
@@ -426,6 +442,7 @@ def write_strains_for_each_gene_lines(gene_strains_dic, f):
         f.write('\n')
         f.flush()
 
+
 def read_strains_for_each_gene(fn):
 
     # gene num_strains strain frac_intd strain frac_intd
@@ -437,11 +454,12 @@ def read_strains_for_each_gene(fn):
         strain_list = []
         for i in range(2, len(line), 2):
             strain_list.append((line[i], line[i+1]))
-        d[line[0]] = {'num_strains':line[1], 'strain_list':strain_list}
+        d[line[0]] = {'num_strains': line[1], 'strain_list': strain_list}
         line = f.readline()
     f.close()
     return d
 
+
 def read_genes(fn, fn_genes):
 
     if os.path.isfile(fn_genes):
@@ -464,11 +482,12 @@ def read_genes(fn, fn_genes):
             break
 
         # starting with new gene
-        #assert line.strip().startswith('gene'), line
+        # assert line.strip().startswith('gene'), line
        skip_this_gene = False
 
         # regex for finding coordinates
-        m = re.search(r'[><]?(?P<start>[0-9]+)[.><,0-9]*\.\.[><]?(?P<end>[0-9]+)', line)
+        m = re.search(r'[><]?(?P<start>[0-9]+)'
+                      r'[.><,0-9]*\.\.[><]?(?P<end>[0-9]+)', line)
 
         # subtract one to index from zero TODO is this correct? end is
         # inclusive
@@ -495,20 +514,25 @@ def read_genes(fn, fn_genes):
         if gene_name != '':
             genes[gene_name] = (start, end)
         else:
-            print 'gene name not found: ' + line
+            print('gene name not found: ' + line)
 
     f.close()
     write_gene_file(genes, fn_genes)
     return genes
 
+
 """
 def summarize_gene_info(fn_all, fn_strains, fn_strains_g, \
                         introgressed_genes, gene_info, tag, threshold=0):
 
     f_all = open(fn_all, 'w')
-    f_all.write('gene\tchromosome\tstart\tend\tnumber_strains\taverage_introgressed_fraction\taverage_number_non_gap\taverage_ref_from_count\n')
+    f_all.write('gene\tchromosome\tstart\tend\tnumber_strains'
+                '\taverage_introgressed_fraction\taverage_number_non_gap'
+                '\taverage_ref_from_count\n')
 
-    f_gene_heading = 'region_id\tstrain\tstart\tend\tintrogressed_fraction\tnumber_non_gap\tref_from_count\n'
+    f_gene_heading = ('region_id\tstrain\tstart\tend\t'
+                      'introgressed_fraction\tnumber_non_gap'
+                      '\tref_from_count\n')
 
     strain_genes = {}
 
@@ -518,7 +542,8 @@ def summarize_gene_info(fn_all, fn_strains, fn_strains_g, \
         sum_introgressed_fraction = {}
         sum_number_non_gap = {}
         sum_ref_from_count = {}
-        fn_gene = gp.analysis_out_dir_absolute + tag + '/genes/' + gene + '.txt'
+        fn_gene = (gp.analysis_out_dir_absolute + tag +
+                   '/genes/' + gene + '.txt')
         if not os.path.exists(os.path.dirname(fn_gene)):
             os.makedirs(os.path.dirname(fn_gene))
         f_gene = open(fn_gene, 'w')
@@ -530,7 +555,8 @@ def summarize_gene_info(fn_all, fn_strains, fn_strains_g, \
             sum_introgressed_fraction[strain] = 0
             sum_number_non_gap[strain] = 0
             sum_ref_from_count[strain] = 0
-        sum_introgressed_fraction[strain] += entry['introgressed_fraction']
+        sum_introgressed_fraction[strain] += entry[
+            'introgressed_fraction']
         sum_number_non_gap[strain] += entry['number_non_gap']
         sum_ref_from_count[strain] += entry['ref_from_count']
         if strain not in strain_genes:
diff --git a/code/analyze/to_update/nucleotide_diversity_from_introgression_main.py b/code/analyze/to_update/nucleotide_diversity_from_introgression_main.py
index 6bda7a6..c53b29a 100644
--- a/code/analyze/to_update/nucleotide_diversity_from_introgression_main.py
+++ b/code/analyze/to_update/nucleotide_diversity_from_introgression_main.py
@@ -1,33 +1,25 @@
-## calculate nucleotide diversity for all sites and for all sites
-## excluding introgression; also calculate the same but only in coding
-## 
regions +# calculate nucleotide diversity for all sites and for all sites +# excluding introgression; also calculate the same but only in coding +# regions -import re import sys -import os import copy -import math import gzip -import itertools -sys.path.insert(0, '..') import global_params as gp -sys.path.insert(0, '../align/') -import align_helpers -sys.path.insert(0, '../misc/') -import overlap -import read_table -import read_fasta -import write_fasta -import mystats +from align import align_helpers +from misc import read_table +from misc import read_fasta + def try_int(s, default=-1): try: i = int(s) return i - except: + except ValueError: return default -def count_diffs(s, t, skip_char = 'N'): + +def count_diffs(s, t, skip_char='N'): assert len(s) == len(t) num = 0 den = 0 @@ -39,10 +31,11 @@ def count_diffs(s, t, skip_char = 'N'): den += 1 return num, den -## generate a sequence that has the current strain's base for each -## site in the reference sequence, and skip_char for any site where -## the base is a gap/unknown (this is all based on the alignment) -def referize(strain_seq, ref_ind_to_strain_ind, skip_char = 'N'): + +# generate a sequence that has the current strain's base for each +# site in the reference sequence, and skip_char for any site where +# the base is a gap/unknown (this is all based on the alignment) +def referize(strain_seq, ref_ind_to_strain_ind, skip_char='N'): s = [skip_char for r in ref_ind_to_strain_ind] for i in range(len(ref_ind_to_strain_ind)): si = ref_ind_to_strain_ind[i] @@ -52,6 +45,7 @@ def referize(strain_seq, ref_ind_to_strain_ind, skip_char = 'N'): s[i] = strain_seq[si] return s + def mark_excluded(seq, regions, fill='N'): seqi = copy.deepcopy(seq) for start, end in regions: @@ -59,6 +53,7 @@ def mark_excluded(seq, regions, fill='N'): seqi[i] = fill return seqi + def mark_included(seq, regions, fill='N'): s = [fill for r in seq] for start, end in regions: @@ -66,15 +61,17 @@ def mark_included(seq, regions, fill='N'): s[i] = seq[i] return s + tag = 'u3_i.001_tv_l1000_f.01' -######## -## read in introgressed regions, as well as strains and reference genes -######## +# ###### +# read in introgressed regions, as well as strains and reference genes +# ###### -## dictionary of introgressed regions keyed by chromosome and then -## strain -regions_by_chrm_and_strain = dict(zip(gp.chrms, [{} for i in range(len(gp.chrms))])) +# dictionary of introgressed regions keyed by chromosome and then +# strain +regions_by_chrm_and_strain = dict(zip(gp.chrms, + [{} for i in range(len(gp.chrms))])) fn_regions = gp.analysis_out_dir_absolute + tag + '/' + \ 'introgressed_blocks_filtered_par_' + tag + '_summary_plus.txt' d, labels = read_table.read_table_rows(fn_regions, '\t') @@ -82,20 +79,20 @@ def mark_included(seq, regions, fill='N'): for region in d: chrm = d[region]['chromosome'] strain = d[region]['strain'] - if not regions_by_chrm_and_strain[chrm].has_key(strain): + if strain not in regions_by_chrm_and_strain[chrm]: regions_by_chrm_and_strain[chrm][strain] = [] - regions_by_chrm_and_strain[chrm][strain].append((int(d[region]['start']), \ + regions_by_chrm_and_strain[chrm][strain].append((int(d[region]['start']), int(d[region]['end']))) -## read in all strains +# read in all strains strain_dirs = align_helpers.get_strains(gp.non_ref_dirs[gp.master_ref]) num_strains = len(strain_dirs) -## read in genes in reference sequence into dictionary keyed by -## chromosome +# read in genes in reference sequence into dictionary keyed by +# chromosome ref_genes = {} for chrm in 
gp.chrms: ref_genes[chrm] = [] - f = open(gp.analysis_out_dir_absolute + gp.master_ref + \ + f = open(gp.analysis_out_dir_absolute + gp.master_ref + '_chr' + chrm + '_genes.txt', 'r') line = f.readline() while line != '': @@ -104,9 +101,9 @@ def mark_included(seq, regions, fill='N'): line = f.readline() f.close() -######## -## calculate nucleotide diversity -######## +# ###### +# calculate nucleotide diversity +# ###### # all sites total_frac = 0 @@ -127,7 +124,7 @@ def mark_included(seq, regions, fill='N'): # total number of strain pairs num_comparisons = 0 -## loop through all strains +# loop through all strains for i in range(num_strains): strain_i, d_i = strain_dirs[i] strain_i_seqs = {} @@ -135,49 +132,53 @@ def mark_included(seq, regions, fill='N'): strain_i_seqs_coding = {} strain_i_seqs_coding_nonint = {} - ## for each + # for each for chrm in gp.chrms: - ## coordinate conversion between reference and current strain + # coordinate conversion between reference and current strain coord_fn = gp.analysis_out_dir_absolute + 'coordinates/' + \ gp.master_ref + '_to_' + strain_i + \ '_chr' + chrm + '.txt.gz' f_coord = gzip.open(coord_fn, 'rb') - ref_ind_to_strain_i_ind = [try_int(line[:-1]) for line in f_coord.readlines()] + ref_ind_to_strain_i_ind = [try_int(line[:-1]) + for line in f_coord.readlines()] - ## current strain fasta file for current chromosome + # current strain fasta file for current chromosome strain_fn = d_i + strain_i + '_chr' + chrm + gp.fasta_suffix - print strain_i, chrm - - ## get chromosome sequence for this strain relative to - ## reference strain (the base for this strain at each site in - ## the reference, based on original alignment); - ## gaps/unsequenced sites/etc marked as 'N' - strain_i_seqs[chrm] = referize(read_fasta.read_fasta(strain_fn)[1][0].lower(),\ - ref_ind_to_strain_i_ind) - - ## get version of sequence where everything that doesn't fall - ## within gene is replaced by 'N' - strain_i_seqs_coding[chrm] = mark_included(strain_i_seqs[chrm],\ + print(strain_i, chrm) + + # get chromosome sequence for this strain relative to + # reference strain (the base for this strain at each site in + # the reference, based on original alignment); + # gaps/unsequenced sites/etc marked as 'N' + strain_i_seqs[chrm] = referize( + read_fasta.read_fasta(strain_fn)[1][0].lower(), + ref_ind_to_strain_i_ind) + + # get version of sequence where everything that doesn't fall + # within gene is replaced by 'N' + strain_i_seqs_coding[chrm] = mark_included(strain_i_seqs[chrm], ref_genes[chrm]) - ## also get version of above sequences where introgressed sites are - ## replaced by 'N' + # also get version of above sequences where introgressed sites are + # replaced by 'N' strain_i_seqs_nonint[chrm] = copy.deepcopy(strain_i_seqs[chrm]) - strain_i_seqs_coding_nonint[chrm] = copy.deepcopy(strain_i_seqs_coding[chrm]) - if regions_by_chrm_and_strain[chrm].has_key(strain_i): - strain_i_seqs_nonint[chrm] = mark_excluded(strain_i_seqs[chrm],\ - regions_by_chrm_and_strain[chrm][strain_i]) + strain_i_seqs_coding_nonint[chrm] = copy.deepcopy( + strain_i_seqs_coding[chrm]) + if strain_i in regions_by_chrm_and_strain[chrm]: + strain_i_seqs_nonint[chrm] = mark_excluded( + strain_i_seqs[chrm], + regions_by_chrm_and_strain[chrm][strain_i]) strain_i_seqs_coding_nonint[chrm] = \ - mark_excluded(strain_i_seqs_coding[chrm],\ + mark_excluded(strain_i_seqs_coding[chrm], regions_by_chrm_and_strain[chrm][strain_i]) - ## loop through all strains to get second strain for current pair + # loop through all 
strains to get second strain for current pair for j in range(i+1, num_strains): strain_j, d_j = strain_dirs[j] - print strain_i, strain_j - ## keep track of total number of strain pairs we're looking - ## at, so we can divide total by that later + print(strain_i, strain_j) + # keep track of total number of strain pairs we're looking + # at, so we can divide total by that later num_comparisons += 1 num = 0 @@ -190,46 +191,49 @@ def mark_included(seq, regions, fill='N'): den_coding_nonint = 0 for chrm in gp.chrms: - ## do the same reading in of sequence for this strain, - ## relative to reference, and also excluding introgressed - ## sites + # do the same reading in of sequence for this strain, + # relative to reference, and also excluding introgressed + # sites coord_fn = gp.analysis_out_dir_absolute + 'coordinates/' + \ gp.master_ref + '_to_' + strain_j + \ '_chr' + chrm + '.txt.gz' f_coord = gzip.open(coord_fn, 'rb') - ref_ind_to_strain_ind = [try_int(line[:-1]) for line in f_coord.readlines()] - + ref_ind_to_strain_ind = [try_int(line[:-1]) + for line in f_coord.readlines()] + strain_fn = d_j + strain_j + '_chr' + chrm + gp.fasta_suffix - strain_j_seq = referize(read_fasta.read_fasta(strain_fn)[1][0].lower(),\ - ref_ind_to_strain_ind) + strain_j_seq = referize( + read_fasta.read_fasta(strain_fn)[1][0].lower(), + ref_ind_to_strain_ind) strain_j_seq_coding = mark_included(strain_j_seq, ref_genes[chrm]) strain_j_seq_nonint = copy.deepcopy(strain_j_seq) strain_j_seq_coding_nonint = copy.deepcopy(strain_j_seq_coding) - if regions_by_chrm_and_strain[chrm].has_key(strain_j): - strain_j_seq_nonint = mark_excluded(strain_j_seq,\ - regions_by_chrm_and_strain[chrm][strain_j]) - strain_j_seq_coding_nonint = mark_excluded(strain_j_seq_coding,\ - regions_by_chrm_and_strain[chrm][strain_j]) - - ## count sites that differ between the two strains - ## (ignoring any sites where one of the strains has 'N') - ## and add to appropriate running total - - ## all sites + if strain_j in regions_by_chrm_and_strain[chrm]: + strain_j_seq_nonint = mark_excluded( + strain_j_seq, regions_by_chrm_and_strain[chrm][strain_j]) + strain_j_seq_coding_nonint = mark_excluded( + strain_j_seq_coding, + regions_by_chrm_and_strain[chrm][strain_j]) + + # count sites that differ between the two strains + # (ignoring any sites where one of the strains has 'N') + # and add to appropriate running total + + # all sites num_chrm, den_chrm = count_diffs(strain_i_seqs[chrm], strain_j_seq) num += num_chrm den += den_chrm total_fracs[chrm] += float(num_chrm)/den_chrm # nonintrogressed - num_chrm_nonint, den_chrm_nonint = count_diffs(strain_i_seqs_nonint[chrm],\ - strain_j_seq_nonint) + num_chrm_nonint, den_chrm_nonint = count_diffs( + strain_i_seqs_nonint[chrm], strain_j_seq_nonint) num_nonint += num_chrm_nonint den_nonint += den_chrm_nonint total_fracs_nonint[chrm] += float(num_chrm_nonint)/den_chrm_nonint - ## all coding sites + # all coding sites num_chrm_coding, den_chrm_coding = \ count_diffs(strain_i_seqs_coding[chrm], strain_j_seq_coding) num_coding += num_chrm_coding @@ -238,20 +242,20 @@ def mark_included(seq, regions, fill='N'): # coding, nonintrogressed num_chrm_coding_nonint, den_chrm_coding_nonint = \ - count_diffs(strain_i_seqs_coding_nonint[chrm],\ + count_diffs(strain_i_seqs_coding_nonint[chrm], strain_j_seq_coding_nonint) num_coding_nonint += num_chrm_coding_nonint den_coding_nonint += den_chrm_coding_nonint total_fracs_coding_nonint[chrm] += \ float(num_chrm_coding_nonint)/den_chrm_coding_nonint - print num_comparisons, 
chrm, \ - total_fracs[chrm], \ - total_fracs_nonint[chrm], \ - 1 - total_fracs_nonint[chrm]/total_fracs[chrm], \ - total_fracs_coding[chrm], \ - total_fracs_coding_nonint[chrm], \ - 1 - total_fracs_coding_nonint[chrm]/total_fracs_coding[chrm] + print(num_comparisons, chrm, + total_fracs[chrm], + total_fracs_nonint[chrm], + 1 - total_fracs_nonint[chrm]/total_fracs[chrm], + total_fracs_coding[chrm], + total_fracs_coding_nonint[chrm], + 1 - total_fracs_coding_nonint[chrm]/total_fracs_coding[chrm]) # and keep track across all chromosomes total_frac += float(num)/den @@ -259,9 +263,10 @@ def mark_included(seq, regions, fill='N'): total_frac_coding += float(num_coding)/den_coding total_frac_coding_nonint += float(num_coding_nonint)/den_coding_nonint - print num_comparisons, total_frac, total_frac_nonint, \ - 1 - total_frac_nonint/total_frac, total_frac_coding, \ - total_frac_coding_nonint, 1 - total_frac_coding_nonint/total_frac_coding + print(num_comparisons, total_frac, total_frac_nonint, + 1 - total_frac_nonint/total_frac, total_frac_coding, + total_frac_coding_nonint, + 1 - total_frac_coding_nonint/total_frac_coding) sys.stdout.flush() # nucleotide diversity is the running total of fractions of sites that @@ -272,23 +277,23 @@ def mark_included(seq, regions, fill='N'): nuc_div_coding = total_frac_coding/num_comparisons nuc_div_coding_nonint = total_frac_coding_nonint/num_comparisons -print nuc_div -print nuc_div_nonint -print nuc_div_coding -print nuc_div_coding_nonint +print(nuc_div) +print(nuc_div_nonint) +print(nuc_div_coding) +print(nuc_div_coding_nonint) -######## -## write overall results and results for individual chromosome to file -######## +# ###### +# write overall results and results for individual chromosome to file +# ###### -f = open(gp.analysis_out_dir_absolute + tag + '/polymorphism/' + \ +f = open(gp.analysis_out_dir_absolute + tag + '/polymorphism/' + 'nucleotide_diversity_c.txt', 'w') f.write('chromosome\tpi\tpi_nonint\tpi_coding\tpi_coding_nonint\n') -f.write('all\t' + str(nuc_div) + '\t' + str(nuc_div_nonint) + \ +f.write('all\t' + str(nuc_div) + '\t' + str(nuc_div_nonint) + '\t' + str(nuc_div_coding) + '\t' + str(nuc_div_coding_nonint) + '\n') for chrm in gp.chrms: - f.write(chrm + '\t' + str(total_fracs[chrm]/num_comparisons) + '\t' + \ - str(total_fracs_nonint[chrm]/num_comparisons) + '\t' + \ - str(total_fracs_coding[chrm]/num_comparisons) + '\t' + \ + f.write(chrm + '\t' + str(total_fracs[chrm]/num_comparisons) + '\t' + + str(total_fracs_nonint[chrm]/num_comparisons) + '\t' + + str(total_fracs_coding[chrm]/num_comparisons) + '\t' + str(total_fracs_coding_nonint[chrm]/num_comparisons) + '\n') f.close() diff --git a/code/annotate/fix.py b/code/annotate/fix.py index 4c7ca15..76cf352 100644 --- a/code/annotate/fix.py +++ b/code/annotate/fix.py @@ -1,12 +1,6 @@ -import sys import os -#from orf import * -sys.path.insert(0, '../align') -import align_helpers -sys.path.insert(0, '..') -import global_params as gp -#d = '/tigress/AKEY/akey_vol2/aclark4/nobackup/100_genomes/genomes_gb/orfs/' +# d = '/tigress/AKEY/akey_vol2/aclark4/nobackup/100_genomes/genomes_gb/orfs/' d = '../../data/CBS432/orfs/' fns = os.listdir(d) for fn in fns: diff --git a/code/annotate/makeblastdb.py b/code/annotate/makeblastdb.py index 606969a..4e7e967 100644 --- a/code/annotate/makeblastdb.py +++ b/code/annotate/makeblastdb.py @@ -1,18 +1,13 @@ -import sys import os -#from orf import * -sys.path.insert(0, '../align') -import align_helpers -sys.path.insert(0, '..') import global_params as gp -#d = 
'/tigress/AKEY/akey_vol2/aclark4/nobackup/100_genomes/genomes_gb/orfs/' +# d = '/tigress/AKEY/akey_vol2/aclark4/nobackup/100_genomes/genomes_gb/orfs/' d = '../../data/CBS432/orfs/' -#d = '/tigress/AKEY/akey_vol2/aclark4/nobackup/100_genomes/genomes_gb/orfs/' +# d = '/tigress/AKEY/akey_vol2/aclark4/nobackup/100_genomes/genomes_gb/orfs/' fns = os.listdir(d) for fn in fns: cmd_string = gp.blast_install_path + 'makeblastdb' + \ ' -dbtype nucl' + \ ' -in ' + d + fn - print cmd_string + print(cmd_string) os.system(cmd_string) diff --git a/code/annotate/orfs_main.py b/code/annotate/orfs_main.py index 9c0a906..6ec3a05 100644 --- a/code/annotate/orfs_main.py +++ b/code/annotate/orfs_main.py @@ -3,49 +3,48 @@ import sys import os -#from orf import * -sys.path.insert(0, '../align') -import align_helpers -sys.path.insert(0, '..') +from align import align_helpers import global_params as gp -ref_fns = [gp.ref_dir[r] + gp.ref_fn_prefix[r] + '_chr' + '?' + \ - gp.fasta_suffix \ +ref_fns = [gp.ref_dir[r] + gp.ref_fn_prefix[r] + '_chr' + '?' + + gp.fasta_suffix for r in gp.alignment_ref_order] # get all non-reference strains of cerevisiae and paradoxus s = align_helpers.get_strains(align_helpers.flatten(gp.non_ref_dirs.values())) # and get paradoxus reference as well -s.append((gp.ref_fn_prefix[gp.alignment_ref_order[1]], gp.ref_dir[gp.alignment_ref_order[1]])) +s.append((gp.ref_fn_prefix[gp.alignment_ref_order[1]], + gp.ref_dir[gp.alignment_ref_order[1]])) strain_fn = '*_chr?' + gp.fasta_suffix f = open('orfs.sh', 'w') -for i in range(78,94): +for i in range(78, 94): strain, d = s[i] - print strain + print(strain) current_strain_fn = strain_fn.replace('*', strain) for chrm in gp.chrms: - print chrm + print(chrm) sys.stdout.flush() - + current_strain_chrm_fn = current_strain_fn.replace('?', chrm) - orf_fn = strain + '_chr' + chrm + \ - '_orfs' + gp.fasta_suffix + orf_fn = strain + '_chr' + chrm + '_orfs' + gp.fasta_suffix orf_d = d + '/orfs/' if not os.path.isdir(orf_d): os.makedirs(orf_d) - cmd_string = gp.orffinder_install_path + '/ORFfinder' + \ - ' -in ' + d + current_strain_chrm_fn + \ - ' -s 0' + \ - ' -out ' + orf_d + orf_fn + \ - ' -outfmt 1 -n true; \n' - #print cmd_string + cmd_string = (gp.orffinder_install_path + '/ORFfinder' + + ' -in ' + d + current_strain_chrm_fn + + ' -s 0' + + ' -out ' + orf_d + orf_fn + + ' -outfmt 1 -n true; \n') + # print(cmd_string) os.system(cmd_string) f.write(cmd_string) f.close() -# "../../../../software/ORFfinder -in /tigress/AKEY/akey_vol2/aclark4/nobackup/100_genomes/genomes_gb/yjm248_chrI.fa -out a.txt -outfmt 1 -n true" +# "../../../../software/ORFfinder \ +# -in /tigress/AKEY/akey_vol2/aclark4/nobackup/\ +# 100_genomes/genomes_gb/yjm248_chrI.fa -out a.txt -outfmt 1 -n true" diff --git a/code/beer_strains/fastq_to_fasta.py b/code/beer_strains/fastq_to_fasta.py index 838af93..d7959ac 100644 --- a/code/beer_strains/fastq_to_fasta.py +++ b/code/beer_strains/fastq_to_fasta.py @@ -1,12 +1,15 @@ # take fastq files containing reads and quality information, along -# with reference genome, and convert to fasta file ... or vcf file and then fasta?? +# with reference genome, and convert to fasta file ... +# or vcf file and then fasta?? 
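+#
+# rough shape of this script (the later stages are unfinished):
+#   1. index the reference and align read pairs with bwa mem -> .sam
+#   2. (intended) base-quality recalibration with GATK BaseRecalibrator
+#   3. (intended) variant calling and conversion back to per-strain fasta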
 import os
 import sys
 
 fastq_dir = '/net/dunham/vol2/Giang/DunhamBeer/DunhamBeer'
 
-quality_chars = list('!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~')
+quality_chars = list('!"#$%&\'()*+,-./0123456789:;<=>?@'
+                     'ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`'
+                     'abcdefghijklmnopqrstuvwxyz{|}~')
 char_to_score = dict(zip(quality_chars, range(1, len(quality_chars))))
 
 fastq_dir = '/net/dunham/vol2/Giang/DunhamBeer/DunhamBeer/'
@@ -16,34 +19,41 @@
     if '.1.fastq' in l and 'stats' not in l and l[0] != 'N':
         fns.append(l[:-8])
 
-ref_fasta = '/net/akey/vol2/aclark4/nobackup/100_genomes/genomes/S288c_SGD-R64.fa'
+ref_fasta = ('/net/akey/vol2/aclark4/nobackup/'
+             '100_genomes/genomes/S288c_SGD-R64.fa')
 
 #####
 # align reads with bwa
 #####
 
 samdir = '/net/akey/vol2/aclark4/nobackup/introgression/data/beer/dunham/sam/'
-os.system('module load bwa/latest') # this doesn't actually work because it makes a new shell instance every time - TODO fix this
-cmd = 'bwa index ' + ref_fasta
-#print cmd
-#os.system(cmd)
+# this doesn't actually work because it makes a new shell instance every time
+# TODO fix this
+os.system('module load bwa/latest')
+cmd = 'bwa index ' + ref_fasta
+# print cmd
+# os.system(cmd)
 for fn in fns:
-    cmd = 'bwa mem ' + ref_fasta + ' ' + fastq_dir + fn + '.1.fastq ' + fastq_dir + fn + '.2.fastq' + ' > ' + samdir + fn + '.sam'
-    print cmd
+    cmd = 'bwa mem ' + ref_fasta + ' ' + fastq_dir + fn + \
+          '.1.fastq ' + fastq_dir + fn + '.2.fastq' + ' > ' + \
+          samdir + fn + '.sam'
+    print(cmd)
     os.system(cmd)
 sys.exit()
 
-#####
+# ###
 # run base recalibrator
-#####
+# ###
 
-outdir = '/net/akey/vol2/aclark4/nobackup/introgression/data/beer/dunham/fasta/'
+outdir = ('/net/akey/vol2/aclark4/nobackup/introgression/'
+          'data/beer/dunham/fasta/')
 
 for fn in fns:
     # -knownSites database of previously known polymorphisms
-    os.system('java -jar ~/software/GenomeAnalysisTK.jar -T BaseRecalibrator -R ' + ref_fasta + ' -I ' + fastq_dir + fn + ' -o ' + outdir + fn[:-1] + 'a')
-
-#####
-# run
-#####
-
+    os.system('java -jar ~/software/GenomeAnalysisTK.jar '
+              '-T BaseRecalibrator -R ' + ref_fasta + ' -I ' +
+              fastq_dir + fn + ' -o ' + outdir + fn[:-1] + 'a')
+
+# ###
+# run
+# ###
diff --git a/code/beer_strains/vcf_to_fasta.py b/code/beer_strains/vcf_to_fasta.py
index 01296d1..689ffe4 100644
--- a/code/beer_strains/vcf_to_fasta.py
+++ b/code/beer_strains/vcf_to_fasta.py
@@ -1,5 +1,6 @@
 import sys
 
+
 def read_vcf(fn):
 
     f = open(fn, 'r')
@@ -14,14 +15,15 @@ def read_vcf(fn):
     f.close()
     return v
 
+
 def vcf_to_fasta(v, fn_ref, fn_out):
 
     f_ref = open(fn_ref, 'r')
-    f_out = open(fn_out, 'w')
     line = f_ref.readline()
     while line != '':
-
+
         line = f_ref.readline()
-
+
+
 v = read_vcf(sys.argv[1])
 vcf_to_fasta(v, sys.argv[2])
diff --git a/code/config.yaml b/code/config.yaml
new file mode 100644
index 0000000..5a0fc47
--- /dev/null
+++ b/code/config.yaml
@@ -0,0 +1,138 @@
+---
+# biological parameters
+mu: 1.84e-10
+
+
+# should we leave the alignments already completed in the alignments
+# directory alone?
+resume_alignment: False
+
+HMM_symbols:
+  match: '+'
+  mismatch: '-'
+  unknown: '?'
+  unsequenced: 'n'
+  gap: '-'
+  unaligned: '?'
+  masked: 'x'
+
+output_root: /tigress/tcomi/aclark4_temp/results
+input_root: /tigress/AKEY/akey_vol2/aclark4/nobackup
+
+paths:
+  # optional log file
+  # log_file: introgression.log
+  fasta_suffix: .fa  # suffix for _all_ fasta files
+  # suffix for _all_ alignment files
+  # this needs to match the suffix output by mugsy
+  alignment_suffix: .maf
+
+  masks: /tigress/tcomi/aclark4_temp/par4/masked/
+  alignments: /tigress/tcomi/aclark4_temp/par4/
+
+  test_strains:
+    - "__INPUT_ROOT__/100_genomes/genomes_gb/\
+      {strain}_chr{chrom}.fa"
+
+  simulations:
+    sim_base: __OUTPUT_ROOT__/sim
+    prefix: sim_out_
+    suffix: .txt
+
+  analysis:
+    analysis_base: __OUTPUT_ROOT__/analysis
+    regions: __ANALYSIS_BASE__/regions/{state}.fa.gz
+    region_index: __ANALYSIS_BASE__/regions/{state}.pkl
+    genes: __ANALYSIS_BASE__/genes/
+    blocks: __ANALYSIS_BASE__/blocks_{state}.txt
+    labeled_blocks: __ANALYSIS_BASE__/blocks_{state}_labeled.txt
+    quality_blocks: __ANALYSIS_BASE__/blocks_{state}_quality.txt
+    hmm_initial: __ANALYSIS_BASE__/hmm_initial.txt
+    hmm_trained: __ANALYSIS_BASE__/hmm_trained.txt
+    probabilities: __ANALYSIS_BASE__/probabilities.txt.gz
+    alignment: __ALIGNMENTS__/{prefix}_{strain}_chr{chrom}_mafft.maf
+    positions: __ANALYSIS_BASE__/positions.txt.gz
+    masked_intervals: __MASKS__/{strain}_chr{chrom}_intervals.txt
+    introgressed: __ANALYSIS_BASE__/blocks_{state}_filter1.txt
+    introgressed_intermediate: "__ANALYSIS_BASE__/\
+      blocks_{state}_filter1inter.txt"
+    ambiguous: __ANALYSIS_BASE__/blocks_{state}_filter2.txt
+    ambiguous_intermediate: "__ANALYSIS_BASE__/\
+      blocks_{state}_filter2inter.txt"
+    filter_sweep: __ANALYSIS_BASE__/filter2_thresholds.txt
+    # strain_info is a tsv file with strain, _, _, geographic origin,
+    # environmental origin, population
+    strain_info: "/home/tcomi/projects/aclark4_introgression/\
+      100_genomes_info.txt"
+    state_counts: __ANALYSIS_BASE__/state_counts.txt
+
+  # software install locations
+  software:
+    root_install: /tigress/anneec/software
+    mugsy: __ROOT_INSTALL__/mugsy/
+    tcoffee: "__ROOT_INSTALL__/\
+      T-COFFEE_installer_Version_11.00.8cbe486_linux_x64/bin/"
+    mafft: __ROOT_INSTALL__/mafft/bin/
+    ms: __ROOT_INSTALL__/msdir/
+    # including dustmasker
+    blast: "__ROOT_INSTALL__/ncbi-blast-2.7.1+-src/\
+      c++/ReleaseMT/bin/"
+    orffinder: __ROOT_INSTALL__/
+    ldselect: __ROOT_INSTALL__/ldSelect/
+    structure: __ROOT_INSTALL__/structure/
+
+chromosomes: ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII',
+              'IX', 'X', 'XI', 'XII', 'XIII', 'XIV', 'XV', 'XVI']
+
+# can optionally list all strains to consider
+# if blank will glob with TEST_STRAINS paths
+# strains:
+
+# can provide a prefix for the alignment files
+# if blank will be the reference and known state names joined with '_'
+
+analysis_params:
+  convergence_threshold: 0.001
+  # threshold can be 'viterbi' or a float to threshold HMM probabilities
+  threshold: viterbi
+  input_root: /tigress/AKEY/akey_vol2/aclark4/nobackup
+  filter_threshold: 0.98
+
+  # master known state, prepended to list of known states
+  reference:
+    name: S288c
+    base_dir: __INPUT_ROOT__/100_genomes/genomes/S288c_SGD-R64/
+    gene_bank_dir: __INPUT_ROOT__/S288c/
+    interval_name: S288c_SGD-R64  # if different than name
+
+  known_states:
+    - name: CBS432
+      base_dir: /tigress/anneec/projects/introgression/data/CBS432/
+      gene_bank_dir: __INPUT_ROOT__/CBS432/
+      expected_length: 10000
+      expected_fraction: 0.025
+
+    - name: N_45
+      base_dir: __INPUT_ROOT__/para_sgrp/strains/N_45/
+      gene_bank_dir: null
+      expected_length: 10000
+      expected_fraction: 0.025
+
+    - name: 
DBVPG6304 + base_dir: __INPUT_ROOT__/para_sgrp/strains/DBVPG6304/ + gene_bank_dir: null + expected_length: 10000 + expected_fraction: 0.025 + + - name: UWOPS91_917_1 + base_dir: __INPUT_ROOT__/para_sgrp/strains/UWOPS91_917_1/ + gene_bank_dir: null + expected_length: 10000 + expected_fraction: 0.025 + + unknown_states: + - name: unknown + expected_length: 1000 + expected_fraction: 0.01 + + gene_bank_all: __INPUT_ROOT__/100_genomes/sequence.gb diff --git a/code/environment.yml b/code/environment.yml index 7dca4f3..e3ba0e6 100644 --- a/code/environment.yml +++ b/code/environment.yml @@ -1,20 +1,24 @@ ---- -name: introgression3 +name: introgression channels: + - conda-forge - defaults dependencies: - atomicwrites=1.3.0=py_0 - attrs=18.2.0=py37h28b3542_0 - blas=1.0=mkl - - ca-certificates=2019.1.23=0 - - certifi=2018.11.29=py37_0 + - ca-certificates=2019.5.15=0 + - certifi=2019.3.9=py37_0 - coverage=4.5.2=py37h7b6447c_0 + - entrypoints=0.3=py37_1000 + - flake8=3.7.7=py37_0 - intel-openmp=2019.1=144 - libedit=3.1.20181209=hc058e9b_0 - libffi=3.2.1=hd88cf55_4 - libgcc-ng=8.2.0=hdf63c60_1 - libgfortran-ng=7.3.0=hdf63c60_0 - libstdcxx-ng=8.2.0=hdf63c60_1 + - llvmlite=0.28.0=py37hd408876_0 + - mccabe=0.6.1=py_1 - mkl=2019.1=144 - mkl_fft=1.0.10=py37ha843d7b_0 - mkl_random=1.0.2=py37hd81dba3_0 @@ -22,14 +26,18 @@ dependencies: - ncurses=6.1=he6710b0_1 - numpy=1.15.4=py37h7e9f1db_0 - numpy-base=1.15.4=py37hde5b4d6_0 - - openssl=1.1.1a=h7b6447c_0 + - openssl=1.1.1c=h7b6447c_1 - pip=19.0.1=py37_0 - pluggy=0.8.1=py37_0 - py=1.7.0=py37_0 + - pycodestyle=2.5.0=py_0 + - pyflakes=2.1.1=py_0 - pytest=4.2.1=py37_0 - pytest-cov=2.6.1=py37_0 + - pytest-flake8=1.0.4=py37_0 - pytest-mock=1.10.0=py37_0 - python=3.7.2=h0371630_0 + - pyyaml=5.1=py37h7b6447c_0 - readline=7.0=h7b6447c_5 - setuptools=40.8.0=py37_0 - six=1.12.0=py37_0 @@ -37,4 +45,7 @@ dependencies: - tk=8.6.8=hbc83047_0 - wheel=0.32.3=py37_0 - xz=5.2.4=h14c3975_4 + - yaml=0.1.7=had09818_2 - zlib=1.2.11=h7b6447c_3 + - pip: + - click==7.0 diff --git a/code/global_params.py b/code/global_params.py index f01ec9d..a21ef1e 100644 --- a/code/global_params.py +++ b/code/global_params.py @@ -1,33 +1,34 @@ -#==== +# ==== # biological parameters -#==== +# ==== mu = 1.84 * 10 ** -10 -#==== +# ==== # file extensions -#==== +# ==== # suffix for _all_ fasta files fasta_suffix = '.fa' -# suffix for _all_ alignment files; this needs to match the suffix output by mugsy +# suffix for _all_ alignment files +# this needs to match the suffix output by mugsy alignment_suffix = '.maf' -#==== +# ==== # sequence locations/names -#==== +# ==== -## now specified in setup_args file +# now specified in setup_args file -#==== +# ==== # alignment files -#==== +# ==== -## alignments directory now specified in setup_args file +# alignments directory now specified in setup_args file mask_dir = '../alignments/masked/' -#mask_dir = '/tigress/tcomi/aclark4_temp/par4/masked/' +# mask_dir = '/tigress/tcomi/aclark4_temp/par4/masked/' # should we leave the alignments already completed in the alignments # directory alone? @@ -36,9 +37,9 @@ # master_ref now automatically assumed to be first # reference specified in setup_args file -#==== +# ==== # HMM -#==== +# ==== match_symbol = '+' mismatch_symbol = '-' @@ -49,13 +50,12 @@ unaligned_symbol = '?' 
 masked_symbol = 'x'
 
-#====
+# ====
 # simulations
-#====
+# ====
 
 # output directory for simulations
 sim_out_dir_absolute = '/tigress/tcomi/aclark4_temp/results/sim'
-#sim_out_dir_absolute = '/tigress/AKEY/akey_vol2/aclark4/projects/introgression/results/sim/'
 
 # prefix for simulation output
 sim_out_prefix = 'sim_out_'
@@ -63,9 +63,9 @@
 # suffix for simulation output
 sim_out_suffix = '.txt'
 
-#====
+# ====
 # analysis
-#====
+# ====
 
 analysis_out_dir_absolute = \
     '/tigress/AKEY/akey_vol2/aclark4/projects/introgression/results/analysis/'
@@ -74,20 +74,22 @@
 genes_out_dir_absolute = analysis_out_dir_absolute + '/genes/'
 
-#====
+# ====
 # software install locations
-#====
+# ====
 
 mugsy_install_path = '/tigress/anneec/software/mugsy/'
 
-tcoffee_install_path = '/tigress/anneec/software/T-COFFEE_installer_Version_11.00.8cbe486_linux_x64/bin/'
+tcoffee_install_path = ('/tigress/anneec/software/'
+                        'T-COFFEE_installer_Version_11.00.8cbe486_linux_x64'
+                        '/bin/')
 
 mafft_install_path = '/tigress/anneec/software/mafft/bin/'
 
 ms_install_path = '/tigress/anneec/software/msdir/'
 
 # including dustmasker
-blast_install_path = '/tigress/anneec/software/ncbi-blast-2.7.1+-src/c++/ReleaseMT/bin/'
+blast_install_path = ('/tigress/anneec/software/'
+                      'ncbi-blast-2.7.1+-src/c++/ReleaseMT/bin/')
 
 orffinder_install_path = '/tigress/anneec/software/'
@@ -95,11 +97,12 @@
 structure_install_path = '/tigress/anneec/software/structure/'
 
-#====
+# ====
 # other
-#====
+# ====
 
-chrms = ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X', 'XI', 'XII', 'XIII', 'XIV', 'XV', 'XVI']
-#chrms = ['I']
+chrms = ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII',
+         'IX', 'X', 'XI', 'XII', 'XIII', 'XIV', 'XV', 'XVI']
+# chrms = ['I']
 
 chrms_ara = dict(zip(chrms, range(1, len(chrms)+1)))
diff --git a/code/hmm/hmm_bw.py b/code/hmm/hmm_bw.py
index 26c8e37..e787916 100644
--- a/code/hmm/hmm_bw.py
+++ b/code/hmm/hmm_bw.py
@@ -1,5 +1,6 @@
 import numpy as np
 from typing import List, Dict, Tuple
+import logging as log
 
 
 class HMM:
@@ -22,7 +23,7 @@ def set_hidden_states(self, states: List[str]) -> None:
     def set_observed_states(self, states: List[str]) -> None:
         '''
         Sets the observed states of the HMM to the supplied list of strings
-        If not supplied will set to list of keys provided by emissions 
+        If not supplied will set to list of keys provided by emissions
         '''
         self.observed_states = states
 
@@ -86,46 +87,43 @@ def set_initial_p(self, initial_p: List[float]) -> None:
         '''
         self.initial_p = np.array(initial_p)
 
-        assert np.isclose(np.sum(initial_p), 1), f"{initial_p} {sum(initial_p)}"
+        assert np.isclose(np.sum(initial_p), 1), \
+            f"{initial_p} {sum(initial_p)}"
 
     def print_results(self, iterations: int, LL: float) -> None:
         '''
         Write current state of HMM to stdout
         '''
-        print(
-            f'''Iterations: {iterations}
+        message = f'Iterations: {iterations}\n\nLog Likelihood:\n{LL:.30e}'
 
-Log Likelihood:
-{LL:.30e}
-
-Initial State Probabilities:'''
-        )
+        message += '\n\nInitial State Probabilities:\n'
         for i in range(len(self.hidden_states)):
-            print(f"{self.hidden_states[i]}={self.initial_p[i]:.30e}")
-        print()
-        print("Transition Probabilities:")
+            message += f'{self.hidden_states[i]}={self.initial_p[i]:.30e}\n'
+
+        message += '\nTransition Probabilities:\n'
        for i in range(len(self.hidden_states)):
             for j in range(len(self.hidden_states)):
-                print(f"{self.hidden_states[i]},{self.hidden_states[j]}\
-                    ={self.transitions[i][j]:.30e}")
-        print()
-        print("Emission Probabilities:")
+                message += (f'{self.hidden_states[i]},'
+                            f'{self.hidden_states[j]}'
+                            f'={self.transitions[i][j]:.30e}\n')
+
+        message += '\nEmission Probabilities:\n'
         for i in range(len(self.hidden_states)):
             for k in sorted(self.observed_states):
-                print(f"{self.hidden_states[i]},{k}=\
-                    {self.emissions[i, self.symbol_to_ind[k]]:.30e}")
-        print()
+                message += (f'{self.hidden_states[i]},{k}='
+                            f'{self.emissions[i, self.symbol_to_ind[k]]:.30e}'
+                            '\n')
+        message += '\n'
+        log.debug(message)
 
     def train(self,
-              improvement_frac: float = .01,
-              max_iterations: int = None) -> None:
+              improvement_frac: float = 0.01,
+              max_iterations: int = None) -> None:
         '''
         Train the hmm until either the max iterations is reached or the log
         likelihood fails to improve beyond the improvement factor
         '''
         # calculate current log likelihood
-        print("calculating alpha")
+        log.debug('calculating alpha')
         alpha = self.forward()
         LL = self.log_likelihood(alpha)
@@ -141,32 +139,32 @@ def train(self,
                 and iterations < max_iterations)\
                 or LL - prev_LL > threshold:
 
-            print(f"Iteration {iterations}")
+            log.info(f'Iteration {iterations}')
 
-            print("calculating beta")
+            log.debug('calculating beta')
             beta = self.backward()
 
-            print("calculating gamma")
+            log.debug('calculating gamma')
             gamma = self.state_probs(alpha, beta)
 
-            print("calculating xi")
+            log.debug('calculating xi')
             xi = self.bw(alpha, beta)
 
-            print("updating parameters")
+            log.debug('updating parameters')
             self.initial_p = self.initial_probabilities(gamma)
             self.transitions = self.transition_probabilities(xi, gamma)
             self.emissions = self.emission_probabilities(gamma)
 
             assert np.isclose(np.sum(self.initial_p), 1), \
-                f"{beta}\n{np.sum(self.initial_p)} {self.initial_p}"
+                f'{beta}\n{np.sum(self.initial_p)} {self.initial_p}'
             for t in self.transitions:
                 assert np.isclose(np.sum(t), 1), \
-                    f"{xi} {gamma} {np.sum(t)} {t}"
+                    f'{xi} {gamma} {np.sum(t)} {t}'
             for e in self.emissions:
-                assert np.isclose(np.sum(e), 1), f"{np.sum(e.values())} {e}"
+                assert np.isclose(np.sum(e), 1), f'{np.sum(e)} {e}'
 
             iterations += 1
-            print("calculating alpha")
+            log.debug("calculating alpha")
             alpha = self.forward()
 
             prev_LL = LL
@@ -176,11 +174,11 @@ def train(self,
         self.print_results(iterations, LL)
 
         if LL < prev_LL and not np.isclose(LL, prev_LL):
-            # NOTE does not stop execution
-            print('PROBLEM: log-likelihood stopped increasing; \
-                stopping training now')
+            log.error('PROBLEM: log-likelihood stopped increasing; '
+                      'stopping training now')
+
+            return
 
-        print(f"finished in {iterations} iterations")
+        log.info(f'finished in {iterations} iterations')
 
     def log_likelihood(self, alpha: np.array) -> float:
         '''
@@ -291,14 +289,16 @@ def forward(self) -> np.array:
         # Markov process was at state j at time t
         # returns array of size observations, observations[0], hidden_states
         # determine emission probabilities for each measured value
-        emis = np.transpose(np.log(self.emissions[:, self.observations]))
+        with np.errstate(divide='ignore'):
+            emis = np.transpose(np.log(self.emissions[:, self.observations]))
         trans = np.log(self.transitions)
 
         alpha = np.empty((len(self.observations),
                           len(self.observations[0]),
                           len(self.hidden_states)), float)
         # initialize to initial probability * observed emission
-        alpha[:, 0, :] = np.log(self.initial_p[None, :]) + emis[0, :, :]
+        with np.errstate(divide='ignore'):
+            alpha[:, 0, :] = np.log(self.initial_p[None, :]) + emis[0, :, :]
         # recursively fill array
         for i in range(1, len(self.observations[0])):
             alpha[:, i, :] = np.logaddexp.reduce(alpha[:, i-1, :][:, :, None] +
@@ -312,8 +312,9 @@
         '''
         # probability that the sequence from t+1 to end was observed
         # and Markov process was at state j at time t
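+        # (all work is done in log space; np.errstate silences the
+        # benign log(0) = -inf warnings raised for zero-probability
+        # transition and emission entries)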
-        emis = np.transpose(np.log(self.emissions[:, self.observations]))
-        trans = np.log(self.transitions)
+        with np.errstate(divide='ignore'):
+            emis = np.transpose(np.log(self.emissions[:, self.observations]))
+            trans = np.log(self.transitions)
 
         beta = np.zeros((len(self.observations),
                          len(self.observations[0]),
                          len(self.hidden_states)), float)
@@ -327,7 +328,7 @@
 
     def calculate_max_states(self) -> Tuple[np.array, np.array]:
         '''
-        Find the maximum likelihood hidden states and the corresponding 
+        Find the maximum likelihood hidden states and the corresponding
         log probability for each state.
         Returned tuple is (probability, states)
         '''
@@ -337,12 +338,14 @@
                            len(self.hidden_states)), int)
 
         # build array of emissions based on observations
-        emissions = np.log(np.transpose(self.emissions)[self.observations])
+        with np.errstate(divide='ignore'):
+            emissions = np.log(np.transpose(self.emissions)[self.observations])
+
+            trans_emis = np.log(self.transitions[None, :, :]) +\
+                emissions[:, None, :]
 
-        trans_emis = np.log(self.transitions[None, :, :]) +\
-            emissions[:, None, :]
+            probabilities[0, :] = np.log(self.initial_p) + emissions[0]
 
-        probabilities[0, :] = np.log(self.initial_p) + emissions[0]
         states[0, :] = -1
 
         for i in range(1, len(emissions)):
diff --git a/code/misc/binary_search.py b/code/misc/binary_search.py
index 73c4cee..79594ca 100644
--- a/code/misc/binary_search.py
+++ b/code/misc/binary_search.py
@@ -1,7 +1,9 @@
 import bisect
+from typing import List
 
-def present(a, x):
-    'Locate the leftmost value exactly equal to x'
+
+def present(a: List[int], x: int) -> bool:
+    'Locate the leftmost value exactly equal to x in a'
     i = bisect.bisect_left(a, x)
     if i != len(a) and a[i] == x:
         return True
diff --git a/code/misc/config_utils.py b/code/misc/config_utils.py
new file mode 100644
index 0000000..2efeb5d
--- /dev/null
+++ b/code/misc/config_utils.py
@@ -0,0 +1,226 @@
+import re
+from copy import copy
+from typing import Dict, List
+import logging as log
+
+
+'''
+config_utils.py
+
+Helper functions for working with yaml config files
+'''
+
+
+def clean_config(config: Dict,
+                 valid_replacements: Dict[str, str] = None) -> Dict:
+    '''
+    Performs substitution of variables in string recursively replacing
+    strings of the form __.+__ with the matching key. Nested variables
+    with the same name replace parent values.
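+    For example, with hypothetical values:
+    >>> clean_config({'root': '/data', 'out': '__ROOT__/results'})
+    {'root': '/data', 'out': '/data/results'}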
diff --git a/code/misc/config_utils.py b/code/misc/config_utils.py
new file mode 100644
index 0000000..2efeb5d
--- /dev/null
+++ b/code/misc/config_utils.py
@@ -0,0 +1,226 @@
+import re
+from copy import copy
+from typing import Dict, List
+import logging as log
+
+
+'''
+config_utils.py
+
+Helper functions for working with yaml config files
+'''
+
+
+def clean_config(config: Dict,
+                 valid_replacements: Dict[str, str] = None) -> Dict:
+    '''
+    Performs substitution of variables in strings, recursively replacing
+    strings of the form __.+__ with the matching key. Nested variables
+    with the same name replace parent values.
+
+    config is the possibly nested dict with values to replace
+    valid_replacements are the valid entries for performing replacements
+    '''
+    result = {}
+    if valid_replacements is None:
+        valid_replacements = dict()
+    len_values = len(config)
+    while config:
+        # want to look at valid replacements first,
+        # to possibly replace their values
+        keys = config.keys()
+        keys = list([k for k in keys if k in valid_replacements] +
+                    [k for k in keys if k not in valid_replacements])
+
+        for key in keys:
+            value = config[key]
+            if isinstance(value, str):
+                value = replace_entry(value, valid_replacements)
+                if value is None:
+                    continue  # don't remove
+                result[key] = value
+                valid_replacements[key] = value
+
+            elif isinstance(value, dict):
+                result[key] = clean_config(value,
+                                           copy(valid_replacements))
+
+            elif isinstance(value, list):
+                result[key] = clean_list(value,
+                                         valid_replacements)
+
+            else:
+                result[key] = value
+                valid_replacements[key] = str(value)
+
+            config.pop(key)
+
+        if len_values == len(config):
+            raise Exception('Failed to dereference all keys, remaining '
+                            f'values are:\n {print_dict(config)}')
+
+        len_values = len(config)
+
+    return result
+
+
+def clean_list(config: List,
+               valid_replacements: Dict[str, str] = None) -> List:
+    '''
+    Performs substitution on a list of config objects
+    '''
+    result = []
+    for value in config:
+        if isinstance(value, str):
+            output = replace_entry(value, valid_replacements)
+            if output is None:
+                raise Exception(f'Failed to dereference list entry: "{value}"')
+            result.append(output)
+
+        elif isinstance(value, list):
+            result.append(clean_list(value, valid_replacements))
+
+        elif isinstance(value, dict):
+            result.append(clean_config(value, copy(valid_replacements)))
+
+        else:
+            result.append(value)
+
+    return result
+
+
+def replace_entry(value: str, valid_replacements: Dict[str, str]) -> str:
+    '''
+    Replace instances of __.+__ with the key in valid_replacements.
+    If valid_replacements is None or the key is not found, return None.
+    Else return the (possibly) substituted string with all instances of /+
+    replaced with / (common in path replacements)
+    '''
+    replacements = re.findall('__(.+?)__', value)
+    for replacement in set(replacements):
+        replace = replacement.lower()
+        if valid_replacements is None or replace not in valid_replacements:
+            return None
+        value = re.sub(f'__{replacement}__',
+                       valid_replacements[replace],
+                       value)
+    return re.sub('/+', '/', value)
+
+
+def print_dict(d: Dict, lvl: int = 0) -> str:
+    '''
+    Return a pretty representation of the dictionary d.
+    lvl is the starting amount to indent the line
+    '''
+    result = ''
+    for k, v in d.items():
+        if isinstance(v, dict):
+            result += ' ' * lvl + f'{k} -\n'
+            result += print_dict(v, lvl+1)
+        elif isinstance(v, list):
+            result += ' ' * lvl + f'{k} -\n'
+            result += print_list(v, lvl+1)
+        else:
+            result += ' ' * lvl + f'{k} - {v}\n'
+    return result
+
+
+def print_list(lst: List, lvl: int = 0) -> str:
+    '''
+    Return a pretty representation of the list lst.
+    lvl is the starting amount to indent the line
+    '''
+    result = ''
+    for i, v in enumerate(lst):
+        if isinstance(v, dict):
+            result += ' ' * lvl + f'{i}:\n' + print_dict(v, lvl+1)
+        elif isinstance(v, list):
+            result += ' ' * lvl + f'{i}:\n' + print_list(v, lvl+1)
+        else:
+            result += ' ' * lvl + f'{i}:\t{v},\n'
+    return result
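To make the substitution rules concrete, a small usage sketch (the config keys are hypothetical, and the import assumes `code/misc` is on `sys.path`):

```python
from config_utils import clean_config

config = {
    'output_root': '/data/results/',
    'genes': '__OUTPUT_ROOT__/genes/',
}
# __OUTPUT_ROOT__ is looked up under the lowercased key 'output_root',
# and replace_entry collapses the doubled slash afterwards.
cleaned = clean_config(config)
assert cleaned['genes'] == '/data/results/genes/'
```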
+
+
+def merge_dicts(parent: Dict, new: Dict) -> Dict:
+    '''
+    Merge the new dict into parent. Existing items are overwritten,
+    dicts are merged recursively, lists are combined as sets.
+    '''
+
+    for k, v in new.items():
+        if k in parent:
+            if isinstance(v, dict):
+                parent[k] = merge_dicts(parent[k], v)
+
+            elif isinstance(v, list):
+                parent[k] = merge_lists(parent[k], v)
+
+            else:
+                parent[k] = v
+        else:
+            parent[k] = v
+
+    return parent
+
+
+def merge_lists(parent: List, new: List) -> List:
+    '''
+    Merge the new list into parent. If a new item isn't in the list, add it.
+    Overwriting and nesting are not supported as they seem ill-defined.
+    '''
+    for v in new:
+        if v not in parent:
+            parent.append(v)
+
+    return parent
+
+
+def get_nested(config: Dict, keys: str):
+    '''
+    Return the value of the nested keys, or None if the key is invalid.
+    keys is a period-separated list of keys as a string
+    '''
+    if config is None:
+        return None
+    keys = keys.split('.')
+    value = config
+    try:
+        for k in keys:
+            value = value[k]
+    except KeyError:
+        return None
+    return value
+
+
+def check_wildcards(path: str, wildcards: str) -> bool:
+    '''
+    Check if the supplied path contains all required wildcards.
+    wildcards are provided as a comma-separated string.
+    Returns True if all wildcards are present in path, e.g. {wildcard} in path,
+    else raises a ValueError with the unfound wildcard
+    '''
+    for wildcard in wildcards.split(','):
+        if f'{{{wildcard}}}' not in path:
+            err = f'{{{wildcard}}} not found in {path}'
+            log.exception(err)
+            raise ValueError(err)
+
+    return True
+
+
+def validate(config: Dict,
+             path: str,
+             exception: str,
+             value: str = None):
+    '''
+    Validate the supplied value, raising an exception if no value is found.
+    config: the config dictionary to look up
+    path: the path in the nested config dict
+    exception: string to display if no value is found
+    value: starting value. Values of None or '' will cause lookup into config
+    '''
+
+    if value is None or value == '':
+        value = get_nested(config, path)
+
+    if value is None:
+        log.exception(exception)
+        raise ValueError(exception)
+
+    return value
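A short sketch of how the lookup and merge helpers behave (hypothetical keys; same import assumption as above):

```python
from config_utils import check_wildcards, get_nested, merge_dicts

config = {'paths': {'blocks': 'blocks_{state}.txt'}}

# Dotted-path lookup returns None instead of raising on missing keys.
assert get_nested(config, 'paths.blocks') == 'blocks_{state}.txt'
assert get_nested(config, 'paths.missing') is None

# Later dicts win on conflicts; nested dicts merge recursively.
merged = merge_dicts({'a': 1, 'sub': {'x': 1}}, {'a': 2, 'sub': {'y': 2}})
assert merged == {'a': 2, 'sub': {'x': 1, 'y': 2}}

# check_wildcards returns True, or raises ValueError on a missing wildcard.
assert check_wildcards('blocks_{state}.txt', 'state')
```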
diff --git a/code/misc/mystats.py b/code/misc/mystats.py
index c18d7a9..f71e2fb 100644
--- a/code/misc/mystats.py
+++ b/code/misc/mystats.py
@@ -1,43 +1,48 @@
 import math
 import numpy.random

-def mean(l):
-    l = filter(lambda x: x != 'NA' and not math.isnan(x), l)
-    if len(l) == 0:
-        #TODO float('nan') ?
+
+def mean(values):
+    values = [x for x in values if x != 'NA' and not math.isnan(x)]
+    if len(values) == 0:
+        # TODO float('nan') ?
         return 'NA'
-    return float(sum(l)) / len(l)
+    return float(sum(values)) / len(values)
+

-def std_dev(l):
-    l = filter(lambda x: x != 'NA' and not math.isnan(x), l)
-    if len(l) == 0:
+def std_dev(values):
+    values = [x for x in values if x != 'NA' and not math.isnan(x)]
+    if len(values) == 0:
         return 'NA'
-    if len(l) == 1:
+    if len(values) == 1:
         return 0
-    m = mean(l)
-    return math.sqrt(sum([(x - m)**2 for x in l]) / (len(l) - 1))
+    m = mean(values)
+    return math.sqrt(sum([(x - m)**2 for x in values]) / (len(values) - 1))
+

-def std_err(l):
-    l = filter(lambda x: x != 'NA' and not math.isnan(x), l)
-    if len(l) == 0:
+def std_err(values):
+    values = [x for x in values if x != 'NA' and not math.isnan(x)]
+    if len(values) == 0:
         return 'NA'
-    return std_dev(l) / math.sqrt(len(l))
+    return std_dev(values) / math.sqrt(len(values))

-def bootstrap(l, n = 100, alpha = .05):
-    l = filter(lambda x: x != 'NA' and not math.isnan(x), l)
-    x = len(l)
+
+def bootstrap(values, n=100, alpha=.05):
+    values = [x for x in values if x != 'NA' and not math.isnan(x)]
+    x = len(values)
     if x == 0:
         return 'NA', 'NA'
     a = []
     for i in range(n):
-        a.append(mean(numpy.random.choice(l, size = x, replace = True)))
+        a.append(mean(numpy.random.choice(values, size=x, replace=True)))
     a.sort()
-    #print len(a), a.count(0)
-    #print mean(a)
+    # print len(a), a.count(0)
+    # print mean(a)
     return a[int(alpha * n * .5)], a[int((1 - alpha * .5) * n)]

-def median(l):
-    m = sorted(l)
+
+def median(values):
+    m = sorted(values)
     x = len(m)
     if x % 2 == 0:
-        return mean([m[x/2], m[x/2-1]])
+        return mean([m[x//2], m[x//2-1]])
diff --git a/code/misc/overlap.py b/code/misc/overlap.py
index d19ed2a..06001fb 100644
--- a/code/misc/overlap.py
+++ b/code/misc/overlap.py
@@ -17,10 +17,11 @@ def overlap(start1, end1, start2, end2):
             return end2 - start1 + 1
     else:
         return 0

-    #if start1 < start2:
-    #    return max(end1 - start2 + 1, 0) - max(end1 - end2, 0)
-    #return max(end2 - start1 + 1, 0) - max(end2 - end1, 0)
+    # if start1 < start2:
+    #     return max(end1 - start2 + 1, 0) - max(end1 - end2, 0)
+    # return max(end2 - start1 + 1, 0) - max(end2 - end1, 0)
+

 def overlap_any(start1, end1, coords):
     for start2, end2 in coords:
@@ -28,24 +29,28 @@ def overlap_any(start1, end1, coords):
         return True
     return False

+
 def contained(i, start, end):
     return i >= start and i <= end

+
 def contained_any(i, coords):
     for start2, end2 in coords:
         if contained(i, start2, end2):
             return True
     return False

+
 def contained_any_named(i, coords):
     for start2, end2 in coords.keys():
         if contained(i, start2, end2):
             return coords[(start2, end2)]
     return None

+
 def overlap_region(start1, end1, start2, end2):
     o_start = max(start1, start2)
     o_end = min(end1, end2)
     if o_start > o_end:
-        return -1, -1 # disjoint ranges
+        return -1, -1  # disjoint ranges
     return o_start, o_end
diff --git a/code/misc/read_fasta.py b/code/misc/read_fasta.py
index 4554396..a422f30 100644
--- a/code/misc/read_fasta.py
+++ b/code/misc/read_fasta.py
@@ -1,8 +1,14 @@
 import gzip
 import numpy as np
+from typing import Tuple, List

-def read_fasta(fn, gz=False):
+
+def read_fasta(fn: str, gz: bool = False) -> Tuple[
+        List[str], np.array]:
+    '''
+    Read the provided fasta file, returning the
+    headers (lines starting with >) and sequences
+    '''
     headers = []
     seqs = []
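The interval helpers treat coordinates as inclusive on both ends, which the sentinel in `overlap_region` relies on; a quick illustrative check (import paths assumed as above, with `code/misc` on `sys.path`):

```python
from mystats import mean
from overlap import contained, overlap_region

# [2, 5] and [4, 9] share the inclusive positions 4 and 5.
assert overlap_region(2, 5, 4, 9) == (4, 5)
# Disjoint ranges return the (-1, -1) sentinel.
assert overlap_region(2, 3, 5, 9) == (-1, -1)
assert contained(4, 2, 5)

# mean drops 'NA' and NaN entries before averaging.
assert mean([1.0, float('nan'), 3.0]) == 2.0
```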
diff --git a/code/misc/read_maf.py b/code/misc/read_maf.py
index 69bcac4..7950f08 100644
--- a/code/misc/read_maf.py
+++ b/code/misc/read_maf.py
@@ -1,6 +1,7 @@
 import re

-def read_mugsy(fn, required_mult = 1):
+
+def read_mugsy(fn, required_mult=1):
     f = open(fn, 'r')
     line = f.readline()
     while line[0] == '#':
@@ -9,9 +10,9 @@ def read_mugsy(fn, required_mult=1):
     while line != '':
         assert line[0] == 'a', line
         block = {}
-        m = re.search('a score=(?P<score>[0-9]+) ' +\
-                      'label=(?P