diff --git a/DanceMapper.py b/DanceMapper.py index d3e18e3..2fac9ce 100644 --- a/DanceMapper.py +++ b/DanceMapper.py @@ -1,7 +1,10 @@ - +import os import numpy as np import sys, argparse, itertools import datetime +import warnings +#suppress warning from cython when using a different version of numpy vs what cython compiled +warnings.filterwarnings("ignore", message="numpy.dtype size changed") # get path to functions needed for mutstring I/O import externalpaths @@ -15,6 +18,8 @@ import accessoryFunctions as aFunc from BernoulliMixture import BernoulliMixture +import concat_profile_mut_N as concat_prof_mut +import de_cat_N7_N13 as de_cat @@ -22,7 +27,7 @@ class DanceMap(object): - def __init__(self, modfile=None, untfile=None, profilefile=None, seqlen=None, ignorebg=False, **kwargs): + def __init__(self, modfile=None, untfile=None, profilefile=None, seqlen=None, ignorebg=False, concat=False, **kwargs): """Define important global parameters""" # reads contains positions that are 'read' @@ -56,7 +61,21 @@ def __init__(self, modfile=None, untfile=None, profilefile=None, seqlen=None, ig # contains final BMsolution, if fitting done self.BMsolution = None - + + # Arrays for ring experiments + self.ring_read = None + self.ring_comut = None + self.ring_inotj = None + self.null_read = None + self.null_comut = None + self.null_inotj = None + + self.concat=concat + if self.concat: + self.inactivate_n7 = kwargs["inactivate_n7"] + #print "Inactivate N7: {} ".format(self.inactivate_n7) + else: + self.inactivate_n7 = None if profilefile is not None: self.init_profile(profilefile, ignorebg) @@ -71,6 +90,7 @@ def __init__(self, modfile=None, untfile=None, profilefile=None, seqlen=None, ig if modfile is not None: self.readPrimaryData(modfile, **kwargs) + # if provided, now update self.profile.backprofile with rate computed # using reads filtered according to same criteria for primary data if untfile is not None: @@ -81,6 +101,8 @@ def __init__(self, modfile=None, untfile=None, profilefile=None, seqlen=None, ig self.initializeActiveCols(**kwargs) + + def init_profile(self, profilefile, ignorebg=False): self.profile = ReactivityProfile(profilefile) @@ -99,8 +121,15 @@ def init_profile(self, profilefile, ignorebg=False): def readPrimaryData(self, modfilename, minreadcoverage=None, undersample=-1, **kwargs): - # Determine mincoverage quality filter + + + #Hacky way to ensure primers are NaN'd out of the default read coverage. Before this the + #only options NaN'd out in the normProfile were Nts where the background was greater than + # .02. All the rest, including primers, remained not NaN. + if self.profile is not None: + self.profile.normalize() + if minreadcoverage is None and self.minreadcoverage is None: minreadcoverage = self.seqlen @@ -109,8 +138,13 @@ def readPrimaryData(self, modfilename, minreadcoverage=None, undersample=-1, **k if self.profile is not None: minreadcoverage -= np.sum(np.isnan(self.profile.normprofile)) + + self.minreadcoverage = int(round(minreadcoverage*0.75)) - + + # Avoid erroneous doubling of minreadcoverage due to concatenation + if self.concat: + self.minreadcoverage = int(round(self.minreadcoverage / 2)) print('No mincoverage specified. 
Using default 75% coverage = {} nts'.format(self.minreadcoverage)) @@ -193,7 +227,6 @@ def initializeActiveCols(self, invalidcols=[], inactivecols=[], invalidrate=0.00 # copy so we don't modify argument invalidcols = list(invalidcols) - if verbal and len(invalidcols)>0: print("Nts {} set invalid by user".format(self.ntindices[invalidcols])) @@ -201,6 +234,8 @@ def initializeActiveCols(self, invalidcols=[], inactivecols=[], invalidrate=0.00 # supplement invalid cols from profile, checking for nans profilenan = [] if self.profile is not None: + + for i, val in enumerate(self.profile.normprofile): if val != val and i not in invalidcols: invalidcols.append(i) @@ -214,6 +249,7 @@ def initializeActiveCols(self, invalidcols=[], inactivecols=[], invalidrate=0.00 signal = np.sum(self.mutations, axis=0, dtype=np.float) lowsignal = [] + for i in np.where(signal < invalidrate*self.numreads)[0]: if i not in invalidcols: lowsignal.append(i) @@ -259,6 +295,9 @@ def initializeActiveCols(self, invalidcols=[], inactivecols=[], invalidrate=0.00 # determine low reactivity cols lowrx = [] + + + if self.profile is not None: with np.errstate(invalid='ignore'): for i in np.where(self.profile.subprofile < minrxbg)[0]: @@ -270,6 +309,7 @@ def initializeActiveCols(self, invalidcols=[], inactivecols=[], invalidrate=0.00 if verbal and len(lowrx) > 0: print("Nts {} set to inactive due to low modification rate".format(self.ntindices[lowrx])) + # handle G and U nts if maskG: @@ -301,6 +341,29 @@ def initializeActiveCols(self, invalidcols=[], inactivecols=[], invalidrate=0.00 + #IF N7 and inactivate_n7 (check kwargs) then + #For column in N7 columns if column not in invalid and not in inactive, append + if self.concat: + if self.inactivate_n7: + for i, nt in enumerate(self.sequence[len(self.sequence)/2:]): + + i_adjust = i + len(self.sequence) / 2 + + + + if i_adjust not in self.invalid_columns and i_adjust not in inactive: + + inactive.append(i_adjust) + + else: + for i, nt in enumerate(self.sequence[len(self.sequence)/2:]): + i_adjust = i + len(self.sequence) / 2 + + if i_adjust in inactive and i_adjust in self.invalid_columns: + print("Error! nt in both inactive and invalid") + #elif i_adjust not in self.invalid_columns and i_adjust not in inactive: + # print("{} is active".format(i_adjust)) + inactive.sort() self.inactive_columns = np.array(inactive, dtype=int) @@ -804,10 +867,89 @@ def qualityCheck(self, bm=None): print('-----------------------------------------\n') - - - def computeNormalizedReactivities(self, oldDMS=False): - """From converged mu params and profile, compute normalized params""" + def splitProfile(self, maxProfile): + """ Splits profile in two. Divides each attribute in half amongst + two different profiles and returns the value. 
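+
+        A minimal sketch of the intended round trip (docstring example only;
+        assumes an even-length concatenated profile, with joinProfile below
+        as the inverse operation):
+
+            N13, N7 = self.splitProfile(maxProfile)
+            # len(N13.sequence) == len(N7.sequence) == len(maxProfile.sequence)/2
+            rejoined = self.joinProfile(N13, N7)
+            # rejoined attributes match maxProfile ordering: N1/3 half, then N7 half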
+ """ + + sequence = maxProfile.sequence + nts = maxProfile.nts + rawprofile, rawerror = maxProfile.profile('raw', True) + subprofile, suberror = maxProfile.profile('sub', True) + backprofile, backerror = maxProfile.profile('back', True) + normprofile, normerror = maxProfile.profile('norm', True) + + N13Profile = self.profile.copy() + N7Profile = self.profile.copy() + + N13Profile.sequence = sequence[:len(sequence)/2] + N13Profile.nts = nts[:len(nts)/2] + N13Profile.rawprofile = rawprofile[:len(rawprofile)/2] + N13Profile.rawerror = rawerror[:len(rawerror)/2] + N13Profile.backprofile = backprofile[:len(backprofile)/2] + N13Profile.backerror = backerror[:len(backerror)/2] + N13Profile.normprofile = normprofile[:len(normprofile)/2] + N13Profile.subprofile = subprofile[:len(subprofile)/2] + N13Profile.suberror = suberror[:len(suberror)/2] + + + N7Profile.sequence = sequence[len(sequence)/2:] + N7Profile.nts = nts[len(nts)/2:] + N7Profile.rawprofile = rawprofile[len(rawprofile)/2:] + N7Profile.rawerror = rawerror[len(rawerror)/2:] + N7Profile.backprofile = backprofile[len(backprofile)/2:] + N7Profile.backerror = backerror[len(backerror)/2:] + N7Profile.normprofile = normprofile[len(normprofile)/2:] + N7Profile.subprofile = subprofile[len(subprofile)/2:] + N7Profile.suberror = suberror[len(suberror)/2:] + + + return N13Profile, N7Profile + + def joinProfile(self, N13prof, N7prof): + """ + Joins two profiles together. Concatenates each attribute. + """ + cat_prof = N13prof.copy() + + sequenceN13 = N13prof.sequence + ntsN13 = N13prof.nts + rawprofileN13, rawerrorN13 = N13prof.profile('raw', True) + subprofileN13, suberrorN13 = N13prof.profile('sub', True) + backprofileN13, backerrorN13 = N13prof.profile('back', True) + normprofileN13, normerrorN13 = N13prof.profile('norm', True) + + sequenceN7 = N7prof.sequence + ntsN7 = N7prof.nts + rawprofileN7, rawerrorN7 = N7prof.profile('raw', True) + subprofileN7, suberrorN7 = N7prof.profile('sub', True) + backprofileN7, backerrorN7 = N7prof.profile('back', True) + normprofileN7, normerrorN7 = N7prof.profile('norm', True) + + + cat_prof.sequence = np.concatenate((sequenceN13, sequenceN7)) + cat_prof.nts = np.concatenate((ntsN13, ntsN7)) + cat_prof.rawprofile = np.concatenate((rawprofileN13, rawprofileN7)) + cat_prof.rawerror = np.concatenate((rawerrorN13, rawerrorN7)) + cat_prof.subprofile = np.concatenate((subprofileN13, subprofileN7)) + cat_prof.suberror = np.concatenate((suberrorN13, suberrorN7)) + cat_prof.backprofile = np.concatenate((backprofileN13, backprofileN7)) + cat_prof.backerror = np.concatenate((backerrorN13, backerrorN7)) + cat_prof.normprofile = np.concatenate((normprofileN13, normprofileN7)) + cat_prof.normerror = np.concatenate((normerrorN13, normerrorN7)) + + + + return cat_prof + + + #Normalize profile + def computeNormalizedReactivities(self, oldDMS=False, concat=False): + """From converged mu params and profile, compute normalized params + If the data run is concatenated - will normalize the first half + of the data in accordance with standard N1/3 normalization + and the second half with N7 normalization. 
+ """ model = self.BMsolution @@ -818,38 +960,119 @@ def computeNormalizedReactivities(self, oldDMS=False): maxProfile = self.profile.copy() maxProfile.rawprofile = np.max(model.mu, axis=0) maxProfile.backgroundSubtract(normalize=False) + + + if concat: + + #Split maxProfile into N1/3 half and N7 half + N13maxProfile, N7maxProfile = self.splitProfile(maxProfile) + + if not oldDMS: + normfactors = N13maxProfile.normalize(eDMS=True) + normFactorN7 = N7maxProfile.normalize(eDMS=True, N7=True) + + else: + normfactors = maxProfile.normalize(oldDMS=True) + + - if not oldDMS: - normfactors = maxProfile.normalize(eDMS=True) else: - normfactors = maxProfile.normalize(oldDMS=True) + if not oldDMS: + normfactors = maxProfile.normalize(eDMS=True) + else: + normfactors = maxProfile.normalize(oldDMS=True) - print(normfactors) + # now create new normalized profiles profiles = [] + + + + for p in xrange(model.pdim): - prof = self.profile.copy() - prof.rawprofile = np.copy(model.mu[p,:]) - prof.backgroundSubtract(normalize=False) + + + #If concat + if concat: + catprofile = self.profile.copy() + N13prof, N7prof = self.splitProfile(catprofile) + + + N13prof.rawprofile = np.copy(model.mu[p,:][:len(model.mu[p,:])/2]) + N7prof.rawprofile = np.copy(model.mu[p,:][len(model.mu[p,:])/2:]) + + N13prof.backgroundSubtract(normalize=False) + N7prof.backgroundSubtract(normalize=False) + + for i,nt in enumerate(N13prof.sequence): + try: + N13prof.normprofile[i] = N13prof.subprofile[i]/normfactors[nt] + except KeyError: + prof.normprofile[i] = np.nan + + + for i, nt in enumerate(N7prof.sequence): + try: + if not np.isnan(N7prof.subprofile[i]): + if nt in normFactorN7 and np.isfinite(normFactorN7[nt]): + if i + 1 < len(N7prof.subprofile): + if N7prof.subprofile[i] < 0: + print("Setting NT {} to 3.32 since the background subtracted value is less than 0".format(i)) + N7prof.normprofile[i] = 3.32 + elif N7prof.sequence[i+1] == 'A' or N7prof.sequence[i+1] == 'N': + N7prof.normprofile[i] = np.log2(normFactorN7['g_pur']/N7prof.subprofile[i]) + else: + N7prof.normprofile[i] = np.log2(normFactorN7['g_pyr']/N7prof.subprofile[i]) + + + else: + N7prof.normprofile[i] = np.nan + + + + + + except Exception as e: + print("Error, exception e: ", e) + + + #CHECK N7 and N13 outputs + + cat_profile = self.joinProfile(N13prof, N7prof) + profiles.append(cat_profile) + + #Else + else: + prof = self.profile.copy() + prof.rawprofile = np.copy(model.mu[p,:]) + prof.backgroundSubtract(normalize=False) + + for i,nt in enumerate(prof.sequence): + try: + prof.normprofile[i] = prof.subprofile[i]/normfactors[nt] + except KeyError: + prof.normprofile[i] = np.nan + + - for i,nt in enumerate(prof.sequence): - try: - prof.normprofile[i] = prof.subprofile[i]/normfactors[nt] - except KeyError: - prof.normprofile[i] = np.nan + + + + profiles.append(prof) - profiles.append(prof) + + return profiles - def writeReactivities(self, output, oldDMS=False): + def writeReactivities(self, output, oldDMS=False, concat=False): """Print out model reactivities self.profile must be defined """ @@ -861,8 +1084,7 @@ def writeReactivities(self, output, oldDMS=False): return # compute normalized parameters - profiles = self.computeNormalizedReactivities(oldDMS) - + profiles = self.computeNormalizedReactivities(oldDMS, concat) with open(output, 'w') as OUT: @@ -887,6 +1109,8 @@ def writeReactivities(self, output, oldDMS=False): else: for prof in profiles: muline += '{0:.4f}\t{1:.4f}\t\t'.format(prof.normprofile[nt], prof.rawprofile[nt]) + + muline += 
'{0:.4f}'.format(self.profile.backprofile[nt]) @@ -924,7 +1148,7 @@ def readModelFromFile(self, fname, verbal=True): def _sample_RINGs(self, window=1, corrtype='apc', bgfile=None, assignprob=0.9, mincount=10, - subtractwindow=True, montecarlo=False, verbal=True): + subtractwindow=True, montecarlo=False, verbal=True, N7=False, concat=False, fasta=None): """Assign sample reads based on posterior prob and return list of RINGexperiment objs window = correlation window corrtype = metric for computing correlations @@ -932,7 +1156,8 @@ def _sample_RINGs(self, window=1, corrtype='apc', bgfile=None, assignprob=0.9, m assignprob = posterior prob. used for assigning reads to models. If -1, assign reads as MAP subtractwindow = exclude nt window when assigning read for that window montecarlo = sample reads using MC logic - verbal = verbal""" + verbal = verbal + N7 = N7""" # setup the activestatus mask. Assign reads using both active & inactive cols @@ -943,54 +1168,105 @@ def _sample_RINGs(self, window=1, corrtype='apc', bgfile=None, assignprob=0.9, m # fill in the matrices if montecarlo: + #deprecated should probably remove if verbal: print('Using MC for sample RING read assignment') raise AttributeError('Monte Carlo option has been removed') + read, comut, inotj = aFunc.fillRINGMatrix_montecarlo(self.reads, self.mutations, activestatus, self.BMsolution.mu, self.BMsolution.p, window, self.reads.shape[0], subtractwindow) + else: if verbal: print('Using {:.3f} as posterior prob for sample RING read assignment'.format(assignprob)) - - read, comut, inotj = aFunc.fillRINGMatrix(self.reads, self.mutations, activestatus, + #check to see if we've already filled the matrices for this window to prevent redundant calculation + if self.ring_read is None and self.ring_comut is None and self.ring_inotj is None: + self.ring_read, self.ring_comut, self.ring_inotj = aFunc.fillRINGMatrix(self.reads, self.mutations, activestatus, self.BMsolution.mu, self.BMsolution.p, window, assignprob, subtractwindow) - - - + relist = [] # populate RINGexperiment objects - for p in xrange(self.BMsolution.pdim): - - ring = RINGexperiment(arraysize=self.seqlen, corrtype=corrtype, verbal=verbal) - - ring.sequence = self.sequence - - ring.window = window - ring.ex_readarr = read[p] - ring.ex_comutarr = comut[p] - ring.ex_inotjarr = inotj[p] - - # fill bg arrays (only need to do once; copy for >0 models) - if bgfile is not None: - if p==0: - ring.initDataMatrices('bg', bgfile, window=window, - mincoverage=self.minreadcoverage, verbal=verbal) - else: - ring.bg_readarr = relist[0].bg_readarr - ring.bg_comutarr = relist[0].bg_comutarr - ring.bg_inotjarr = relist[0].bg_inotjarr - - ring.computeCorrelationMatrix(mincount=mincount, verbal=verbal, ignorents=self.invalid_columns) - - #ring.writeDataMatrices('ex', 't-{}-{}-{}-{}'.format(p,subtractwindow,assignprob,montecarlo)) - - relist.append(ring) + # handle N7 rings + if self.concat: + for p in xrange(self.BMsolution.pdim): + + ring = RINGexperiment(fasta=fasta, arraysize=self.seqlen, corrtype=corrtype, verbal=verbal, concat=concat, N7=N7) + + if N7: + indices_to_remove = [x for x in range(0, (len(self.sequence)/2))] + ring.sequence = self.sequence[:len(self.sequence)/2] + toignore = [x -( len(self.sequence) / 2) for x in self.invalid_columns if x >= len(self.sequence)/2] + + elif concat: + indices_to_remove = np.intp([]) + ring.sequence = self.sequence[:len(self.sequence)/2] + toignore = self.invalid_columns + + else: + indices_to_remove = [x for x in range(len(self.sequence)/2, 
len(self.sequence)+1)] + ring.sequence = self.sequence[:len(self.sequence)/2] + toignore = [x for x in self.invalid_columns if x < len(self.sequence)/2] + + cur_read = np.copy(self.ring_read[p]) + cur_comut = np.copy(self.ring_comut[p]) + cur_inotj = np.copy(self.ring_inotj[p]) + + for elem in [0, 1]: + cur_read = np.delete(cur_read, indices_to_remove, axis=elem) + cur_comut = np.delete(cur_comut, indices_to_remove, axis=elem) + cur_inotj = np.delete(cur_inotj, indices_to_remove, axis=elem) + + + ring.window = window + ring.ex_readarr = cur_read + ring.ex_comutarr = cur_comut + ring.ex_inotjarr = cur_inotj + + # fill bg arrays (only need to do once; copy for >0 models) + if bgfile is not None: + if p==0: + ring.initDataMatrices('bg', bgfile, window=window, + mincoverage=self.minreadcoverage, verbal=verbal) + else: + ring.bg_readarr = relist[0].bg_readarr + ring.bg_comutarr = relist[0].bg_comutarr + ring.bg_inotjarr = relist[0].bg_inotjarr + + ring.computeCorrelationMatrix(mincount=mincount, verbal=verbal, ignorents=toignore) + + + relist.append(ring) + else: + for p in xrange(self.BMsolution.pdim): + + ring = RINGexperiment(arraysize=self.seqlen, corrtype=corrtype, verbal=verbal) + + ring.sequence = self.sequence + + ring.window = window + ring.ex_readarr = self.ring_read[p] + ring.ex_comutarr = self.ring_comut[p] + ring.ex_inotjarr = self.ring_inotj[p] + + # fill bg arrays (only need to do once; copy for >0 models) + if bgfile is not None: + if p==0: + ring.initDataMatrices('bg', bgfile, window=window, + mincoverage=self.minreadcoverage, verbal=verbal) + else: + ring.bg_readarr = relist[0].bg_readarr + ring.bg_comutarr = relist[0].bg_comutarr + ring.bg_inotjarr = relist[0].bg_inotjarr + + ring.computeCorrelationMatrix(mincount=mincount, verbal=verbal, ignorents=self.invalid_columns) + + relist.append(ring) if verbal: print('\n') @@ -999,7 +1275,7 @@ def _sample_RINGs(self, window=1, corrtype='apc', bgfile=None, assignprob=0.9, m def _null_RINGs(self, window=1, corrtype='g', assignprob=0.9, mincount=10, - subtractwindow=True, montecarlo=False, verbal=False): + subtractwindow=True, montecarlo=False, verbal=False, N7=False, concat=False, fasta=None): """Create null (uncorrelated) model and assign reads based on posterior prob to determine null-model correlations. 
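
    Conceptually (a summary of the existing behavior, not new logic): reads
    are assigned under an uncorrelated model built from the fitted mu and p
    parameters, so any apparent (i,j) correlations measured here arise from
    the clustering read-assignment procedure itself. computeRINGs then uses
    significantDifference against these null counts to decide which sample
    (i,j) pairs to keep.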
Returns list of RINGexperiment objs @@ -1031,6 +1307,7 @@ def _null_RINGs(self, window=1, corrtype='g', assignprob=0.9, mincount=10, # fill in the matrices if montecarlo: + if verbal: print('Using MC for null RING read assignment') @@ -1038,39 +1315,80 @@ def _null_RINGs(self, window=1, corrtype='g', assignprob=0.9, mincount=10, read, comut, inotj = aFunc.fillRINGMatrix_montecarlo(nullEM.reads, nullEM.mutations, activestatus, mu, self.BMsolution.p, window, self.reads.shape[0], subtractwindow) + else: if verbal: print('Using {:.3f} as posterior prob for null RING read assignment'.format(assignprob)) - read, comut, inotj = aFunc.fillRINGMatrix(nullEM.reads, nullEM.mutations, activestatus, + if self.null_read is None and self.null_comut is None and self.null_inotj is None: + self.null_read, self.null_comut, self.null_inotj = aFunc.fillRINGMatrix(nullEM.reads, nullEM.mutations, activestatus, mu, self.BMsolution.p, window, assignprob, subtractwindow) + relist = [] # populate RINGexperiment objects - for p in xrange(self.BMsolution.pdim): - - ring = RINGexperiment(arraysize=self.seqlen, corrtype=corrtype, verbal=verbal) - - ring.sequence = self.sequence - - ring.window = window - ring.ex_readarr = read[p] - ring.ex_comutarr = comut[p] - ring.ex_inotjarr = inotj[p] - - ring.computeCorrelationMatrix(mincount=mincount, ignorents=self.invalid_columns, verbal=verbal) - #ring.writeDataMatrices('ex', 'null-{}.txt'.format(p)) - relist.append(ring) + if self.concat: + for p in xrange(self.BMsolution.pdim): + + ring = RINGexperiment(arraysize=self.seqlen, corrtype=corrtype, verbal=verbal) + + if N7: + indices_to_remove = [x for x in range(0, (len(self.sequence)/2))] + ring.sequence = self.sequence[:(len(self.sequence)/2)] + #get nts to ignore in N7 only, divide by 2 to adjust frame + toignore = [x - (len(self.sequence) / 2) for x in self.invalid_columns if x >= len(self.sequence)/2] + elif concat: + indices_to_remove = np.intp([]) + ring.sequence = self.sequence + toignore = self.invalid_columns + else: + indices_to_remove = [x for x in range(len(self.sequence)/2, len(self.sequence)+1)] + ring.sequence = self.sequence[:(len(self.sequence)/2)] + toignore = [x for x in self.invalid_columns if x < len(self.sequence)/2] + + cur_read = np.copy(self.null_read[p]) + cur_comut = np.copy(self.null_comut[p]) + cur_inotj = np.copy(self.null_inotj[p]) + + + for elem in [0, 1]: + cur_read = np.delete(cur_read, indices_to_remove, axis=elem) + cur_comut = np.delete(cur_comut, indices_to_remove, axis=elem) + cur_inotj = np.delete(cur_inotj, indices_to_remove, axis=elem) + + + ring.window = window + ring.ex_readarr = cur_read + ring.ex_comutarr = cur_comut + ring.ex_inotjarr = cur_inotj + + + ring.computeCorrelationMatrix(mincount=mincount, ignorents=toignore, verbal=verbal) + relist.append(ring) + else: + for p in xrange(self.BMsolution.pdim): + + ring = RINGexperiment(arraysize=self.seqlen, corrtype=corrtype, verbal=verbal) + + ring.sequence = self.sequence + + ring.window = window + ring.ex_readarr = self.null_read[p] + ring.ex_comutarr = self.null_comut[p] + ring.ex_inotjarr = self.null_inotj[p] + + ring.computeCorrelationMatrix(mincount=mincount, ignorents=self.invalid_columns, verbal=verbal) + relist.append(ring) return relist def computeRINGs(self, window=1, bgfile=None, assignprob=0.9, mincount=10, - subtractwindow=True, montecarlo=False, nulldifftest=True, verbal=True): + subtractwindow=True, montecarlo=False, nulldifftest=True, verbal=True, N7=False, concat=False, fasta=None): """Compute RINGs based on posterior 
prob and mask out (i,j) pairs that are observed in null (uncorrelated) model. Return list of RINGexperiment objects @@ -1082,11 +1400,15 @@ def computeRINGs(self, window=1, bgfile=None, assignprob=0.9, mincount=10, subtractwindow = exclude nt window when assigning read for that window montecarlo = sample reads using MC logic verbal = verbal""" - + if self.concat: + length = len(self.sequence) / 2 + else: + length = len(self.sequence) + # compute rings from experimental data sample = self._sample_RINGs(window=window, bgfile=bgfile, assignprob=assignprob, mincount=mincount, - subtractwindow=subtractwindow, montecarlo=montecarlo, verbal=verbal) + subtractwindow=subtractwindow, montecarlo=montecarlo, verbal=verbal, N7=N7, concat=concat, fasta=fasta) if not nulldifftest: return sample @@ -1094,7 +1416,7 @@ def computeRINGs(self, window=1, bgfile=None, assignprob=0.9, mincount=10, # compute correlations from null model (clustering only w/o correlations) null = self._null_RINGs(window=window, assignprob=assignprob, mincount=10, - subtractwindow=subtractwindow, montecarlo=montecarlo) + subtractwindow=subtractwindow, montecarlo=montecarlo, N7=N7, concat=concat, fasta=fasta) @@ -1121,7 +1443,7 @@ def computeRINGs(self, window=1, bgfile=None, assignprob=0.9, mincount=10, sample_p = sample[p] null_p = null[p] - for i,j in itertools.combinations(range(self.seqlen), 2): + for i,j in itertools.combinations(range(length), 2): nulldiff = sample_p.significantDifference(i,j, null_p.ex_readarr[i,j], null_p.ex_inotjarr[i,j], null_p.ex_inotjarr[j,i], null_p.ex_comutarr[i,j]) @@ -1156,6 +1478,70 @@ def writeReadsMutations(self, outputfile): out.write('\n') +#################################################################################### + +# Concat file helper functions +def cleanup(*args): + for f in args: os.remove(f) + +def check_any_missing(*files): + if not all(os.path.exists(f) for f in files): + raise ValueError("Error: " + ", ".join(files) + " not located in the same directory.") + +def insert_prefix(prefix, name): + if "/" in prefix: + spl_path = prefix.split("/") + prefix = "/".join(spl_path[:-1]) + return prefix + "/" + name + spl_path[-1] + else: + return name + prefix + +def make_fasta(profile, fasta, fa_name): + with open(fasta, "w") as fasta_f: + N1_profile = ReactivityProfile(profile) + fasta_f.write('>{}\n'.format(fa_name)) + fasta_f.write(''.join(N1_profile.sequence)) + +def make_concatenated_N7_files(modified_parsed, untreated_parsed, profile): + mod_mut = args.modified_parsed + mod_mutga = mod_mut + "ga" + prof_txt = args.profile + prof_txtga = prof_txt + "ga" + unt_mut = None + unt_mutga = None + + + # Ensure .mut and .mutga / .txt and .txtga are located in the same folder + check_any_missing(mod_mut, mod_mutga) + check_any_missing(prof_txt, prof_txtga) + + # Enables proper temp file production in cases where user passes the prefix as a path instead of + # a simple string. 
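+    # e.g. a user may pass "/path/prefix" rather than just "prefix".
+    #
+    # Illustration of insert_prefix with hypothetical values (matches the
+    # helper defined above):
+    #   insert_prefix("out.mut", ".temp_mod_concat_")       -> ".temp_mod_concat_out.mut"
+    #   insert_prefix("/path/out.mut", ".temp_mod_concat_") -> "/path/.temp_mod_concat_out.mut"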
ie "/path/prefix" instead of "prefix" + mod_output = insert_prefix(args.outputprefix + ".mut", ".temp_mod_concat_") + prof_output = insert_prefix(args.outputprefix + ".txt" , ".temp_prof_concat_") + fasta_output = insert_prefix(args.outputprefix + ".fa" , ".temp_fasta_") + + + # Concats the .mut and .mutga as well as .txt and .txtga files together into temp files informed + # by user defined prefix + concat_prof_mut.concat_mut(mod_mut[:mod_mut.index(".")], mod_output, prof_txt[:prof_txt.index(".")]) + concat_prof_mut.concat_profile(prof_txt[:prof_txt.index(".")], prof_output) + + # Create a temp fasta to from the profile provided to satisfy N7 ring dependency + make_fasta(args.profile, fasta_output, prof_txt[:prof_txt.index(".")]) + + + # Create temp file for untreated .mut and .mutga if applicable + if args.untreated_parsed: + unt_output = insert_prefix(args.outputprefix + ".mut", ".temp_unt_concat_") + unt_mut = args.untreated_parsed + unt_mutga = unt_mut + "ga" + + check_any_missing(unt_mut, unt_mutga) + concat_prof_mut.concat_mut(unt_mut[:unt_mut.index(".")], unt_output, prof_txt[:prof_txt.index(".")]) + + + return mod_output, unt_output, prof_output, fasta_output #################################################################################### @@ -1232,6 +1618,10 @@ def parseArguments(): optional.add_argument('--oldDMSnorm', action='store_true', help='Use old style (pre-eDMS) normalization') optional.add_argument('--suppressverbal', action='store_false', help='Suppress verbal output') optional.add_argument('--outputprefix', type=str, default='emfit', help='Write output files with this prefix (default=emfit)') + + optional.add_argument('--concat', action='store_true', default=False, help='Concatenate mut and mutga files together internally for processing. Must place .mut and .mutga files as well as profile.txt and .txtga files in same directory.') + + optional.add_argument('--activate_n7', action='store_true', default=False, help='Flag to activate N7 columns during DanceMap clustering. 
Default = False.')

    parser._action_groups.append(optional)

@@ -1272,13 +1662,6 @@

    return args

-
-
-
-
-
-
-
if __name__=='__main__':

    # Log file messaging for keeping track of run info

@@ -1294,18 +1677,35 @@

    args = parseArguments()
    print('Arguments = {}\n\n'.format(args))

+    if args.concat:
+        mod_output, unt_output, prof_output, fasta_output = make_concatenated_N7_files(args.modified_parsed, args.untreated_parsed, args.profile)
+
+        # Modify the DanceMap call to accommodate the concatenated files
+        DM = DanceMap(modfile=mod_output, untfile=unt_output,
+                      profilefile=prof_output,
+                      minrxbg = args.minrxbg,
+                      maskG = args.maskG,
+                      maskU = args.maskU,
+                      maskN = args.maskN,
+                      concat=args.concat,
+                      minreadcoverage=args.mincoverage,
+                      undersample=args.undersample,
+                      ignorebg=args.ignore_untreated,
+                      verbal=args.suppressverbal,
+                      inactivate_n7=not args.activate_n7)
+    else:
+        DM = DanceMap(modfile=args.modified_parsed, untfile=args.untreated_parsed,
+                      profilefile=args.profile,
+                      minrxbg = args.minrxbg,
+                      maskG = args.maskG,
+                      maskU = args.maskU,
+                      maskN = args.maskN,
+                      minreadcoverage=args.mincoverage,
+                      undersample=args.undersample,
+                      ignorebg=args.ignore_untreated,
+                      verbal=args.suppressverbal)

-    DM = DanceMap(modfile=args.modified_parsed, untfile=args.untreated_parsed,
-                  profilefile=args.profile,
-                  minrxbg = args.minrxbg,
-                  maskG = args.maskG,
-                  maskU = args.maskU,
-                  maskN = args.maskN,
-                  minreadcoverage=args.mincoverage,
-                  undersample=args.undersample,
-                  ignorebg=args.ignore_untreated,
-                  verbal=args.suppressverbal)
-

    if args.fit:

        DM.findBestModel(args.maxcomponents, trials=args.trials,

@@ -1314,8 +1714,15 @@

                         verbal=args.suppressverbal,
                         writeintermediate = args.writeintermediates)

-        DM.writeReactivities(args.outputprefix+'-reactivities.txt', oldDMS=args.oldDMSnorm)
-        DM.BMsolution.writeModel(args.outputprefix+'.bm')
+        if args.concat:
+            DM.BMsolution.writeModel(args.outputprefix+'-concat.bm')
+            DM.writeReactivities(args.outputprefix+'-concat-reactivities.txt', oldDMS=args.oldDMSnorm, concat=args.concat)
+            de_cat.decat_react(args.outputprefix+'-concat-reactivities.txt', args.outputprefix+'-N13-reactivities.txt', args.outputprefix+'-N7-reactivities.txt')
+            de_cat.decat_bm(args.outputprefix+'-concat.bm', args.outputprefix+'-N13.bm', args.outputprefix+'-N7.bm')
+
+        else:
+            DM.writeReactivities(args.outputprefix+'-reactivities.txt', oldDMS=args.oldDMSnorm)
+            DM.BMsolution.writeModel(args.outputprefix+'.bm')

@@ -1327,12 +1734,16 @@

                         verbal = args.suppressverbal,
                         writeintermediate = args.writeintermediates,
                         forcefit = True)
-
-        DM.writeReactivities(args.outputprefix+'-reactivities.txt', oldDMS=args.oldDMSnorm)
-        DM.BMsolution.writeModel(args.outputprefix+'.bm')
-
+        if args.concat:
+            DM.BMsolution.writeModel(args.outputprefix+'-concat.bm')
+            DM.writeReactivities(args.outputprefix+'-concat-reactivities.txt', oldDMS=args.oldDMSnorm, concat=args.concat)
+            de_cat.decat_react(args.outputprefix+'-concat-reactivities.txt', args.outputprefix+'-N13-reactivities.txt', args.outputprefix+'-N7-reactivities.txt')
+            de_cat.decat_bm(args.outputprefix+'-concat.bm', args.outputprefix+'-N13.bm', args.outputprefix+'-N7.bm')
+        else:
+            DM.writeReactivities(args.outputprefix+'-reactivities.txt', oldDMS=args.oldDMSnorm)
+            DM.BMsolution.writeModel(args.outputprefix+'.bm')


    elif args.readfromfile is not None:

@@ -1345,19 +1756,31 @@

    if args.suppressverbal:
        print('--------------Computing RINGs--------------')

+        if args.concat:
+            N1_RE_list 
= DM.computeRINGs(window=args.window, bgfile=args.untreated_parsed, subtractwindow=args.inclwindow, mincount=args.mincount, assignprob=args.readprob_cut, verbal=args.suppressverbal) + N1N7_RE_list = DM.computeRINGs(fasta=fasta_output, bgfile=unt_output, subtractwindow=args.inclwindow, mincount=args.mincount, assignprob=args.readprob_cut, verbal=args.suppressverbal, concat=args.concat ) + N7_RE_list = DM.computeRINGs(bgfile=args.untreated_parsed + "ga", subtractwindow=args.inclwindow, mincount=args.mincount, assignprob=args.readprob_cut, verbal=args.suppressverbal, N7=True) - RE_list = DM.computeRINGs(window=args.window, bgfile=args.untreated_parsed, subtractwindow=args.inclwindow, mincount=args.mincount, + for i,model in enumerate(N1_RE_list): + model.writeCorrelations('{}-{}-N1rings.txt'.format(args.outputprefix, i), chi2cut=args.chisq_cut) + for i,model in enumerate(N1N7_RE_list): + model.writeCorrelations('{}-{}-N1N7rings.txt'.format(args.outputprefix, i), chi2cut=args.chisq_cut) + for i,model in enumerate(N7_RE_list): + model.writeCorrelations('{}-{}-N7rings.txt'.format(args.outputprefix, i), chi2cut=args.chisq_cut) + + else: + RE_list = DM.computeRINGs(window=args.window, bgfile=args.untreated_parsed, subtractwindow=args.inclwindow, mincount=args.mincount, assignprob=args.readprob_cut, verbal=args.suppressverbal) - for i,model in enumerate(RE_list): - model.writeCorrelations('{0}-{1}-rings.txt'.format(args.outputprefix, i), + for i,model in enumerate(RE_list): + model.writeCorrelations('{0}-{1}-rings.txt'.format(args.outputprefix, i), chi2cut=args.chisq_cut) - + #concat rings are computed in 3 separate ring exps N1/N17/N7 if args.pairmap: - profiles = DM.computeNormalizedReactivities(args.oldDMSnorm) + profiles = DM.computeNormalizedReactivities(args.oldDMSnorm, concat=args.concat) if args.suppressverbal: print('--------------Computing PAIRs--------------') @@ -1381,6 +1804,10 @@ def parseArguments(): - + if args.concat: + cleanup(mod_output, prof_output, fasta_output) + if args.untreated_parsed: + cleanup(unt_output) + # vim: tabstop=8 expandtab shiftwidth=4 softtabstop=4 diff --git a/README.md b/README.md index 3f4d0d5..f136b38 100644 --- a/README.md +++ b/README.md @@ -106,6 +106,80 @@ use the following flags: --oldDMSnorm --pm_secondary_reactivity 0.5 --mincount 50 +DanceMapper.py usage related to N7 +-------------- +To use DanceMapper.py to deconvolve N7-G reactivities in addition to traditional N1/3 reactivities +ShapeMapper 2.3+ must be run with --dms and --N7 flags in addition to the --output-parsed-mutations +option to produce N7-G related files. + +Input: + + parsed.mut + file output by ShapeMapper + + profile.txt + file output by ShapeMapper + + --concat + flag directing DanceMapper to concatenate the parsed.mut and parsed.mutga files to deconvolve + N7-G reactivities in addition to N1/3 reactivities + +*Note* The parsed.mut file must have a corresponding parsed.mutga file located in the same directory. +Additionally, the profile.txt file must have a corresponding profile.txtga file located in the same +directory as well. When ShapeMapper 2.3+ is run with the --N7 flag the parsed.mut and profile.txt as +well as the corresponding parsed.mutga and profile.txtga files are all produced in the same directory +by default. + +Output: + + concat.bm file + save file of the Bernoulli mixture model encompassing both N1/3 and N7-G nucleotides. Only + generated when using the --fit option + + N13.bm file + save file of the Bernoulli mixture model encompassing the N1/3 nucleotides. 
Only generated
+    when using the --fit option
+
+  N7.bm file
+    save file of the Bernoulli mixture model encompassing the N7 nucleotides. Only generated
+    when using the --fit option
+
+  -concat-reactivities.txt file
+    normalized N1/3 and N7-G reactivities for each structure. Only generated when using the
+    --fit option
+
+  -N13-reactivities.txt file
+    normalized N1/3 reactivities for each structure. Only generated when using the --fit
+    option
+
+  -N7-reactivities.txt file
+    normalized N7 reactivities for each structure. Only generated when using the --fit
+    option
+
+  [i]-N1rings.txt file
+    N1/3-N1/3 RINGs for state i (window=1). Only generated when using the --ring option
+
+  [i]-N7rings.txt file
+    N7-N7 RINGs for state i (window=1). Only generated when using the --ring option
+
+  [i]-N1N7rings.txt file
+    N1/3-N7 RINGs for state i (window=1). Only generated when using the --ring option
+
+
+*Additional Notes*
+The --pairmap flag is not currently compatible with the --concat flag.
+
+To visualize deconvolved N1/3 and N7-G data, it is currently recommended to use the
+dance-reactivites_profile_converter.py script. This script splits the
+-concat-reactivities.txt file into profile.txt and profile.txtga files corresponding
+to the states of the deconvolved model, e.g.:
+
+"python dance-reactivites_profile_converter.py -concat-reactivities.txt --output output_prefix"
+
+These profile files can then be visualized via arcplot.
+
+

foldClusters.py
----------------
Script for performing RNAstructure modeling based on clustered reactivities and plotting results
diff --git a/changelog.txt b/changelog.txt
index 7be88de..edc46fd 100644
--- a/changelog.txt
+++ b/changelog.txt
@@ -1,17 +1,52 @@
-Version 1.1
------------
+Version 1.2
+August 30, 2023
+Lucas Kearns, Thomas Miller
+***********************************************************************

--Fixed internal issues so untreated sample is now optional. Additionally added
-the --ignore_untreated flag to ignore any untreated sample information in the
-profile.txt file
+ADDED:
+Added --concat flag. This will concatenate mut and corresponding mutga files
+together. By default N7-Gs do not contribute to clustering but are set to
+inactive. This means that although N7-G positions do not drive clustering, they
+are still solved for in the final model.

--Updated mincount cutoffs for pair/ring analysis to 10 to be compatible with
-pair/ringmapper v1.2
+Added --activate_n7 flag. Allows N7 nucleotides to contribute to DanceMap
+clustering. By default N7-G nucleotides will not contribute to clustering.
+
+Added ability for concatenated DanceMapper runs to use RingMapper on solved models
+and generate N1/3-N1/3, N1/3-N7G, and N7G-N7G rings.
+
+CHANGED:
+
+Updated eDMS normalization to reflect new pur and pyr normalization cases
+for N7-G. Changed eDMS so highly protected N7-G regions (areas where background
+is higher than raw) are assigned a normalized reactivity of 3.32.
+
+FIXED:
+
+Debugged min coverage calculation to properly calculate the 75% coverage
+threshold.
+
+Version 1.1
+February 1, 2023
+Anthony Mustoe
+***********************************************************************
+
+ADDED:
+
+Added --ignore_untreated flag to ignore any untreated sample information in the
+profile.txt file.
+
+CHANGED:

--Updated DMS normalization to eDMS by default. 
The --oldDMSnorm flag can be used to perform normalization using original method --Changed dependency to consolidated StructureAnalysisTools package +Updated mincount cutoffs for pair/ring analysis to 10 to be compatible with +pair/ringmapper v1.2 + +Changed dependency to consolidated StructureAnalysisTools package + +FIXED: +Fixed internal issues so untreated sample is now optional. diff --git a/compute_readprobs.py b/compute_readprobs.py new file mode 100644 index 0000000..46119b3 --- /dev/null +++ b/compute_readprobs.py @@ -0,0 +1,30 @@ + + +import DanceMapper, accessoryFunctions +import sys +import numpy as np + + +DM = DanceMapper.DanceMap(modfile=sys.argv[1], profilefile=sys.argv[2]) +DM.readModelFromFile(sys.argv[3]) + + +llmat = np.zeros((DM.BMsolution.pdim, DM.reads.shape[0]), dtype=np.float64) + +accessoryFunctions.loglikelihoodmatrix(llmat, DM.reads, DM.mutations, np.array(DM.active_columns, dtype=np.int32), DM.BMsolution.mu, DM.BMsolution.p) + +llmat = np.exp(llmat) +s = np.sum(llmat, axis=0) +llmat /= s + +bins = np.linspace(0,1,11) + +hist, edge = np.histogram(llmat[0,:], bins=bins) + +hist = np.array(hist, dtype=float)/np.sum(hist) + +for i in range(10): + print("{} {:.3f}".format(i, hist[i])) + + + diff --git a/concat_profile_mut_N.py b/concat_profile_mut_N.py new file mode 100644 index 0000000..f08d5b6 --- /dev/null +++ b/concat_profile_mut_N.py @@ -0,0 +1,91 @@ +import numpy as np +import sys + + +#This function opens the .mut and .mutga strings and just concats +#them together. +def concat_mut(inputmut, output, profile): + read_length = 0 + with open("{}.txt".format(profile), 'r') as prof: + lines = prof.readlines() + last_line = lines[-1].split() + read_length = int(last_line[0]) + + + + with open("{}.mut".format(inputmut), 'r') as mut: + with open("{}.mutga".format(inputmut), 'r') as mutGA: + #with open("{}.mut".format(output), 'w') as concat: + with open("{}".format(output), 'w') as concat: + parsedLines = mut.read().splitlines() + parsedLinesGA = mutGA.read().splitlines() + length = len(parsedLines) + for i in range(length): + splGA = parsedLinesGA[i].split() + spl = parsedLines[i].split() + if(splGA[4] == "INCLUDED" and spl[4] == "INCLUDED"): + to_add = [] + to_add += (read_length - (int(spl[3]) + 1)) * ["0"] + to_add += (int(spl[2])) * ["0"] + + + if(len(to_add) > 0): + spl[8] = spl[8] + "".join(to_add) + splGA[8] + spl[6] = spl[6] + "".join(to_add) + splGA[6] + spl[7] = spl[7] + "".join(to_add) + splGA[7] + spl[3] = str(int(spl[3]) + read_length) + else: + spl[8] = spl[8] + splGA[8] + spl[6] = spl[6] + splGA[6] + spl[7] = spl[7] + splGA[7] + spl[3] = str(int(spl[3]) + read_length) + + newline = " ".join(spl) + concat.write(newline + "\n") + +# Concatenates the N1/3 and N7 profile again for further downstream processing +def concat_profile(inputProf, profOut): + with open("{}.txt".format(inputProf), 'r') as N1: + with open("{}.txtga".format(inputProf), 'r') as N7: + #with open("{}.txt".format(profOut), 'w') as out: + with open("{}".format(profOut), 'w') as out: + linesN1 = [line.rstrip() for line in N1] + linesN7 = [line.rstrip() for line in N7][1:] + + + nt_length = (len(linesN1) - 1) + + for i in range(len(linesN7)): + spl_line = linesN7[i].split() + newnum = int(spl_line[0]) + nt_length + newnum = str(newnum) + spl_line[0] = newnum + if spl_line[1] == 'G': + spl_line[1] = 'N' + elif spl_line[1] == 'g': + spl_line[1] = 'n' + + newline = " ".join(spl_line) + linesN1.append(newline) + + + for line in linesN1: + out.write(line + "\n") + + + + + + +if(__name__ == 
"__main__"): + #Rudimentary arg parsing. Basically do -m for for concat of muts + if(sys.argv[1] == "-h"): + print("USAGE: concat_profile_mut.py -m Mut Output Profile") + print("USAGE: concat_profile_mut.py -p Profile Output") + elif(sys.argv[1] == "-m"): + concat_mut(sys.argv[2], sys.argv[3], sys.argv[4]) + #-p for concat of profile + elif(sys.argv[1] == "-p"): + concat_profile(sys.argv[2], sys.argv[3]) + + diff --git a/dance-reactivites_profile_converter.py b/dance-reactivites_profile_converter.py new file mode 100644 index 0000000..1668f7f --- /dev/null +++ b/dance-reactivites_profile_converter.py @@ -0,0 +1,120 @@ +import argparse +import numpy as np +from numpy import isnan, nan, sqrt + +################################################################################ +#Processes an N1/3-N7 concatenated dance-reactivities.txt file and produces +#a .txt .txtga profile file for each predicted model in the ensemble. These .txtga files #may be fed into arcplot etc. +################################################################################ + +#Parser function. Finds total number of models, length of sequence, +#and extracts N7 lines. +def parseLines(filename): + num_models = 0 + Nt_Length = 0 + reactFile = open(filename, 'r') + lines = [line.rstrip() for line in reactFile] + fline = lines[0] + num_models = int(fline.split()[0]) + lline = lines[len(lines) - 1] + Nt_Length = int(lline.split()[0]) / 2 + reactFile.close() + + + N7index = int(Nt_Length) + 3 + lines = lines[3:] + return num_models, Nt_Length, lines + +#Extracts a two dimensional list of the raw and normalized reactivities as well +#as the sequence. +def findReactivities(lines, num_Models, Nt_Length): + + + seq = [] + norm = [] + + for i in range(num_Models): + norm.append([]) + + + for i in range(len(lines)): + splLine = lines[i].split() + index = 2 + lIndex = 0 + seq.append(splLine[1]) + for j in range(len(norm)): + norm[lIndex].append(splLine[index]) + lIndex += 1 + index += 2 + + norm_N13 = [] + norm_N7 = [] + + for i in range(len(norm)): + norm_N13.append(norm[i][:int(Nt_Length)]) + norm_N7.append(norm[i][int(Nt_Length):]) + + return norm_N13, norm_N7, seq + + +#Iterates through extracted reactivities and generates .txtga profiles. 
+#def make_txtga(seq, raw, norm, output): +def make_txtga(norm_N13, norm_N7, seq, output): + for i in range(len(norm_N13)): + lines_N13 = [] + lines_N7 = [] + lines_N13.append("Nucleotide Sequence Modified_mutations Modified_read_depth Modified_effective_depth Modified_rate Modified_off_target_mapped_depth Modified_low_mapq_mapped_depth Modified_mapped_depth Untreated_mutations Untreated_read_depth Untreated_effective_depth Untreated_rate Untreated_off_target_mapped_depth Untreated_low_mapq_mapped_depth Untreated_mapped_depth Denatured_mutations Denatured_read_depth Denatured_effective_depth Denatured_rate Denatured_off_target_mapped_depth Denatured_low_mapq_mapped_depth Denatured_mapped_depth Reactivity_profile Std_err HQ_profile HQ_stderr Norm_profile Norm_stderr") + lines_N7.append("Nucleotide Sequence Modified_mutations Modified_read_depth Modified_effective_depth Modified_rate Modified_off_target_mapped_depth Modified_low_mapq_mapped_depth Modified_mapped_depth Untreated_mutations Untreated_read_depth Untreated_effective_depth Untreated_rate Untreated_off_target_mapped_depth Untreated_low_mapq_mapped_depth Untreated_mapped_depth Denatured_mutations Denatured_read_depth Denatured_effective_depth Denatured_rate Denatured_off_target_mapped_depth Denatured_low_mapq_mapped_depth Denatured_mapped_depth Reactivity_profile Std_err HQ_profile HQ_stderr Norm_profile Norm_stderr") + for j in range(len(norm_N13[i])): + line_N13 = [] + line_N13.append(str(j + 1)) + line_N13.append(" {} ".format(seq[j])) + line_N13.append(" 0 " * 25) + line_N13.append(str(norm_N13[i][j])) + line_N13.append(" 0") + + line_N13 = ''.join(line_N13) + + line_N7 = [] + line_N7.append(str(j + 1)) + if seq[j] == "N": + seq[j] = "G" + line_N7.append(" {} ".format(seq[j])) + line_N7.append(" 0 " * 25) + line_N7.append(str(norm_N7[i][j])) + line_N7.append(" 0") + line_N7 = ''.join(line_N7) + + + + + lines_N13.append(line_N13) + lines_N7.append(line_N7) + + if output == None: + oFile_N13 = open("dance_map_model_{}_profile.txt".format(i), "w") + oFile_N7 = open("dance_map_model_{}_profile.txtga".format(i), "w") + for line in lines_N13: + oFile_N13.write(line + "\n") + for line in lines_N7: + oFile_N7.write(line + "\n") + + else: + oFile_N13 = open("{}_{}_profile.txt".format(output, i), "w") + oFile_N7 = open("{}_{}_profile.txtga".format(output, i), "w") + for line in lines_N13: + oFile_N13.write(line + "\n") + for line in lines_N7: + oFile_N7.write(line + "\n") + + oFile_N13.close() + oFile_N7.close() + +if __name__=="__main__": + parser = argparse.ArgumentParser() + parser.add_argument("reactivities", help = "Path to the concatenated dance-reactivities.txt") + parser.add_argument("--output", help = "Output file(s) prefix", default=None) + args=parser.parse_args() + num_Models, Nt_Length, lines = parseLines(args.reactivities) + norm_N13, norm_N7, seq = findReactivities(lines, num_Models, Nt_Length) + make_txtga(norm_N13, norm_N7, seq, args.output) diff --git a/de_cat_N7_N13.py b/de_cat_N7_N13.py new file mode 100644 index 0000000..fbbc9cb --- /dev/null +++ b/de_cat_N7_N13.py @@ -0,0 +1,119 @@ +import sys +import re + +#################### +#Script that deconcatenates N13/N7 for bm, reactivity, and ring files. 
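+#Example standalone usage (hypothetical file names; the __main__ block below
+#selects the mode from the input file name):
+#  python de_cat_N7_N13.py out-concat-reactivities.txt N13-react.txt N7-react.txt
+#  python de_cat_N7_N13.py out-concat.bm N13.bm N7.bm
+#  python de_cat_N7_N13.py out-0-rings.txt out-0-N13rings   (writes out-0-N13rings.txt)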
+#################### +def decat_bm(filename, N13output, N7output): + bmfile = open(filename, "r") + eIndex = 0 + lines = bmfile.readlines() + for i in range(len(lines)): + if lines[i] == "# Initial P\n": + eIndex = i + eIndex = i - 1 + + bmfile.close() + + header = lines[0:6] + + react_lines = lines[6:eIndex] + #for line in react_lines: + # print(line) + + midpt = int(len(react_lines) / 2 ) + N13 = react_lines[0:midpt] + N7 = react_lines[midpt:] + + + + #print("Length: ", int(len(react_lines) / 2)) + + N13_file = open(N13output, "w") + for line in header: + N13_file.write(line) + for line in N13: + N13_file.write(line) + + N13_file.close() + + N7_file = open(N7output, "w") + for line in header: + N7_file.write(line) + for line in N7: + N7_file.write(line) + + +def decat_react(filename, N13output, N7output): + reactfile = open(filename, "r") + lines = reactfile.readlines() + reactfile.close() + + header = lines[0:3] + + + react_lines = lines[3:] + midpt = int(len(react_lines) / 2 ) + N13 = react_lines[0:midpt] + N7 = react_lines[midpt:] + + N13_file = open(N13output, "w") + for line in header: + N13_file.write(line) + for line in N13: + N13_file.write(line) + N13_file.close() + + N7_file = open(N7output, "w") + for line in header: + N7_file.write(line) + for line in N7: + N7_file.write(line) + N7_file.close() + + +def decat_rings(filename, output): + ringFile = open(filename, "r") + lines = ringFile.readlines() + + concatLength = int(lines[0].split()[0]) + ntLength = int(concatLength / 2) + + splFLine = lines[0].split() + splFLine[0] = str(ntLength) + splFLine.append("\n") + splFLine = " ".join(splFLine) + toAdd = [] + toAdd.append(splFLine) + toAdd.append(lines[1]) + + for index in range(2, len(lines)): + splLine = lines[index].split() + i = int(splLine[0]) + j = int(splLine[1]) + if ( i < ntLength and j < ntLength): + toAdd.append(lines[index]) + + ringFile.close() + + ringoutput = open( output + ".txt", "w") + for line in toAdd: + ringoutput.write(line) + + ringoutput.close() + + +if __name__=="__main__": + if (sys.argv[1] == "-h"): + print("Usage: input N13_output, N7_output") + + else: + if (re.search('.reactivities\.txt', sys.argv[1]) != None): + #print("reactivities.txt file input, de - concatenating reactivities") + decat_react(sys.argv[1], sys.argv[2], sys.argv[3]) + elif (re.search('.\.bm', sys.argv[1]) != None): + #print(".bm file input, de - concatenating bm") + decat_bm(sys.argv[1], sys.argv[2], sys.argv[3]) + elif (re.search('.rings\.txt', sys.argv[1]) != None or re.search('.ringmap\.txt', sys.argv[1])): + #print("rings.txt or ringmap.txt file input, de - concatenating rings") + decat_rings(sys.argv[1], sys.argv[2]) diff --git a/plotClusters.py b/plotClusters.py index 389625e..3a76f20 100644 --- a/plotClusters.py +++ b/plotClusters.py @@ -12,8 +12,8 @@ from scipy import stats from BernoulliMixture import BernoulliMixture - import externalpaths + sys.path.append(externalpaths.structureanalysistools()) from ReactivityProfile import ReactivityProfile @@ -278,14 +278,17 @@ def plotClusterProfile(clustobj, out=None, modelNums=None): fig, ax = plot.subplots() for i,p in enumerate(clustobj.p): - + if i == 0: + alpha_ = 1.0 + else: + alpha_ = 0.5 if modelNums is not None and i not in modelNums: continue with np.errstate(invalid='ignore'): mask = clustobj.rawprofiles[i]>-1 - ax.step(xvals[mask], clustobj.rawprofiles[i][mask], label='p={0:.2f}'.format(p), where='mid') + ax.step(xvals[mask], clustobj.rawprofiles[i][mask], alpha=alpha_, label='p={0:.2f}'.format(p), where='mid') for c in 
mergeColumns(clustobj.inactive_columns): ax.axvspan(c[0],c[1], color='gray', alpha=0.2) @@ -330,14 +333,17 @@ def plotClusterComparison(clust1, clust2, name1='', name2='', out=None, align=Fa for i,p in enumerate(clust1.p): - + if i == 0: + alpha_ = 1.0 + else: + alpha_ = 0.5 with np.errstate(invalid='ignore'): mask = (clust1.rawprofiles[i]>-1) & (clust2.rawprofiles[i]>-1) - ax[i].step(xvals[mask], clust1.rawprofiles[i][mask], where='mid', + ax[i].step(xvals[mask], clust1.rawprofiles[i][mask],alpha=alpha_, where='mid', label='{0} p={1:.2f}'.format(name1, p)) - ax[i].step(xvals[mask], clust2.rawprofiles[c2idx[i]][mask], where='mid', + ax[i].step(xvals[mask], clust2.rawprofiles[c2idx[i]][mask], alpha=alpha_, where='mid', label='{0} p={1:.2f}'.format(name2, clust2.p[c2idx[i]])) @@ -428,21 +434,28 @@ def mergeColumns(columns): def plotReactProfile(rpclust, out=None, modelNums=None, ptype=None): + + fig, ax = plot.subplots() xvals = rpclust.profiles[0].nts - + for i,p in enumerate(rpclust.population): - + if i == 0: + alpha_ = 1.0 + else: + alpha_ = 0.5 if modelNums is not None and i not in modelNums: continue with np.errstate(invalid='ignore'): - mask = rpclust.profiles[i].normprofile >-1 + mask = np.isfinite(rpclust.profiles[i].normprofile) - prof = rpclust.profiles[i].profile(ptype) - ax.step(xvals[mask], prof[mask], where='mid', label='p={:.2f}'.format(p)) - + #convert invalid positions to 0 in our profile to ensure the steps align with the data + prof = np.nan_to_num(rpclust.profiles[i].profile(ptype)) + + ax.step(xvals, prof,alpha=alpha_, where='mid', label='p={:.2f}'.format(p)) + for c in mergeColumns(np.where(np.isnan(rpclust.profiles[0].normprofile))[0]): ax.axvspan(c[0],c[1], color='gray', alpha=0.6) @@ -479,7 +492,7 @@ def plotReactProfileComparison(rp1, rp2, name1='', name2='', ptype=None, out=Non xvals = rp1.profiles[0].nts - + fig, ax = plot.subplots(nrows=max(2,len(rp1.population)), ncols=1) if align: @@ -489,15 +502,18 @@ def plotReactProfileComparison(rp1, rp2, name1='', name2='', ptype=None, out=Non for i,p in enumerate(rp1.population): - + if i == 0: + alpha_ = 1.0 + else: + alpha_ = 0.5 with np.errstate(invalid='ignore'): - mask = (rp1.profiles[i].rawprofile>-1) & (rp2.profiles[i].rawprofile>-1) + mask = np.isfinite(rp1.profiles[i].rawprofile) & np.isfinite(rp2.profiles[i].rawprofile) - prof1 = rp1.profiles[i].profile(ptype) - prof2 = rp2.profiles[i].profile(ptype) + prof1 = np.nan_to_num(rp1.profiles[i].profile(ptype)) + prof2 = np.nan_to_num(rp2.profiles[i].profile(ptype)) - ax[i].step(xvals[mask], prof1[mask], where='mid', label='{0} p={1:.2f}'.format(name1, p)) - ax[i].step(xvals[mask], prof2[mask], where='mid', label='{0} p={1:.2f}'.format(name2, rp2.population[c2idx[i]])) + ax[i].step(xvals, prof1, alpha=alpha_, where='mid', label='{0} p={1:.2f}'.format(name1, p)) + ax[i].step(xvals, prof2, alpha=alpha_, where='mid', label='{0} p={1:.2f}'.format(name2, rp2.population[c2idx[i]])) for c in mergeColumns(rp1.inactive_columns): diff --git a/version.py b/version.py index 52d2bb8..827b5e0 100644 --- a/version.py +++ b/version.py @@ -1,5 +1,5 @@ -v = '1.1' +v = '1.2' def print_version(): print("Version {}".format(v))