diff --git a/LibraryTesting/BadVariants.txt b/LibraryTesting/BadVariants.txt new file mode 100644 index 0000000..e69de29 diff --git a/libs/IMlibs.py b/libs/IMlibs.py index 071f890..cdc9db5 100644 --- a/libs/IMlibs.py +++ b/libs/IMlibs.py @@ -12,13 +12,13 @@ import pandas as pd import scipy.io as sio import matplotlib.pyplot as plt -import CPlibs +#import CPlibs def spawnMatlabJob(matlabFunctionCallString): try: #construct the command-line matlab call functionCallString = "try," - functionCallString = functionCallString + "addpath('{0}', '{1}');".format('/home/sarah/array_image_tools_SKD/libs', '/home/sarah/array_image_tools_SKD/scripts') #placeholder TEMP DEBUG CHANGE + functionCallString = functionCallString + "addpath('{0}', '{1}');".format('/home/namita/Code/array_image_tools_SKD/libs', '/home/namita/Code/array_image_tools_SKD/scripts') #placeholder TEMP DEBUG CHANGE functionCallString = functionCallString + matlabFunctionCallString + ';' functionCallString = functionCallString + "catch e," functionCallString = functionCallString + "disp(getReport(e,'extended'));" @@ -142,7 +142,8 @@ def getFPfromCPsignal(signalNamesByTileDict): return bindingSeriesFilenameDict def pasteTogetherSignal(cpSeqFilename, signal, outfilename): - tmpfilename = 'signal_'+str(time.time()) + output_directory = os.path.dirname(outfilename) + tmpfilename = os.path.join(output_directory, 'signal_'+str(time.time())) np.savetxt(tmpfilename, signal, fmt='%s') os.system("paste %s %s > %s"%(cpSeqFilename, tmpfilename, outfilename+'.tmp')) os.system("mv %s %s"%(outfilename+'.tmp', outfilename)) @@ -218,19 +219,19 @@ def sortConcatenateCPsignal(reducedSignalNamesByTileDict, barcode_col, sortedAll return def compressBarcodes(sortedAllCPsignalFile, barcode_col, seq_col, compressedBarcodeFile): - script = '/home/sarah/array_image_tools_SKD/scripts/compressBarcodes.py' + script = '/home/namita/Code/array_image_tools_SKD/scripts/compressBarcodes.py' print "python %s -i %s -o %s -c %d -C %d"%(script, 
sortedAllCPsignalFile, compressedBarcodeFile, barcode_col, seq_col) os.system("python %s -i %s -o %s -c %d -C %d"%(script, sortedAllCPsignalFile, compressedBarcodeFile, barcode_col, seq_col)) return def barcodeToSequenceMap(compressedBarcodeFile, libraryDesignFile, outFile): - script = '/home/sarah/array_image_tools_SKD/scripts/findSeqDistribution.py' + script = '/home/namita/Code/array_image_tools_SKD/scripts/findSeqDistribution.py' print "python %s -b %s -l %s -o %s "%(script, compressedBarcodeFile, libraryDesignFile, outFile) os.system("python %s -b %s -l %s -o %s "%(script, compressedBarcodeFile, libraryDesignFile, outFile)) return def matchCPsignalToLibrary(barcodeToSequenceFilename, sortedAllCPsignalFile, sequenceToLibraryFilename): - script = '/home/sarah/array_image_tools_SKD/scripts/matchCPsignalLibrary.py' + script = '/home/namita/Code/array_image_tools_SKD/scripts/matchCPsignalLibrary.py' print "python %s -b %s -i %s -o %s "%(script, barcodeToSequenceFilename, sortedAllCPsignalFile, sequenceToLibraryFilename) os.system("python %s -b %s -i %s -o %s "%(script, barcodeToSequenceFilename, sortedAllCPsignalFile, sequenceToLibraryFilename)) return @@ -338,9 +339,18 @@ def loadCPseqSignal(filename): """ cols = ['tileID','filter','read1_seq','read1_quality','read2_seq','read2_quality','index1_seq','index1_quality','index2_seq', 'index2_quality','all_cluster_signal','binding_series'] table = pd.read_csv(filename, sep='\t', header=None, names=cols, index_col=False) + #count = 0 + # + #binding_series = np.zeros((len(table),8)) + #for series in table['binding_series']: + # if (len(np.array(series.split(','), dtype=float)) ==8): + # binding_series[count] = np.array(series.split(','), dtype=float) + # count += 1 binding_series = np.array([np.array(series.split(','), dtype=float) for series in table['binding_series']]) + for col in range(binding_series.shape[1]): table[col] = binding_series[:, col] + return table def loadNullScores(signalNamesByTileDict, filterSet): @@ 
-361,7 +371,15 @@ def loadBindingCurveFromCPsignal(filename): then return the binding series, normalized by all cluster image. """ table = pd.read_table(filename, usecols=(10,11)) - binding_series = np.array([np.array(series.split(','), dtype=float) for series in table['binding_series']]) + count = 0 + binding_series = np.zeros((len(table),8)) + for series in table['binding_series']: + if (len(np.array(series.split(','), dtype=float)) ==8): + binding_series[count] = np.array(series.split(','), dtype=float) + count += 1 + + #binding_series = np.array([np.array(series.split(','), dtype=float) for series in table['binding_series']]) + return binding_series, np.array(table['all_cluster_signal']) def loadFittedCPsignal(filename): diff --git a/libs/IMlibs.pyc b/libs/IMlibs.pyc index 81515fb..1eba5aa 100644 Binary files a/libs/IMlibs.pyc and b/libs/IMlibs.pyc differ diff --git a/libs/variantFun.py b/libs/variantFun.py index 308b26a..5354126 100644 --- a/libs/variantFun.py +++ b/libs/variantFun.py @@ -10,36 +10,85 @@ import plotfun import scipy.io as sio import matplotlib.pyplot as plt -import CPlibs +#import CPlibs +import scikits.bootstrap +import scipy as scipy +from multiprocessing import Pool +import functools class Parameters(): def __init__(self): - # save the units of concentration given in the binding series self.RT = 0.582 # kcal/mol at 20 degrees celsius self.min_deltaG = -12 self.max_deltaG = -3 - + + def perVariantInfo(table): - variants = np.arange(0, np.max(table['variant_number'])) - columns = [name for name in table][:12] + ['numTests', 'numRejects', 'kd', 'dG', 'fmax', 'fmin'] + variants = range(0,int(np.max(table['variant_number']))) + wd = '/home/namita/RigidAnalysis' + filename = os.path.join(wd, 'BadVariants.txt') + f = open(filename, 'w') + + columns = [name for name in table][:12] + ['numTests', 'numRejects', 'kd', 'dG', 'fmax', 'fmin','qvalue', 'errdG_L', 'errdG_R', ] newtable = pd.DataFrame(columns=columns, index=np.arange(len(variants))) - for i, 
variant in enumerate(variants): - if i%1000==0: print 'computing iteration %d'%i - sub_table = table[table['variant_number']==variant] - if len(sub_table) > 0: - sub_table_filtered = filterFitParameters(sub_table)[['kd', 'dG', 'fmax', 'fmin']] - newtable.iloc[i]['numTests'] = len(sub_table_filtered) - newtable.iloc[i]['numRejects'] = len(sub_table) - len(sub_table_filtered) - newtable.iloc[i]['kd':] = np.median(sub_table_filtered, 0) - newtable.iloc[i][:'total_length'] = sub_table.iloc[0][:'total_length'] - return newtable + pool = Pool(processes=20) + datapervar= pool.map(functools.partial(generateVarStats, table, f), variants) + pool.close() + f.close() + print('here!') + for variant in variants: + #this is really slow, there's got to be a better way of unpacking datapervar + if (variant % 1000) == 0: + print(variant) + newtable.iloc[variant] = datapervar[variant].iloc[0] + + return newtable + +def generateVarStats(table,f,variant): + sub_table = table[table['variant_number']==variant] + #how do you intialize a list? + # print(variant) + #doesn't like 15356 + columns = [name for name in table][:12] + ['numTests', 'numRejects', 'kd', 'dG', 'fmax', 'fmin','qvalue', 'errdG_L', 'errdG_R', ] + newtable_pervar = pd.DataFrame(columns=columns, index = xrange(0,1)) + if len(sub_table) > 0: + sub_table_filtered = filterFitParameters(sub_table)[['kd', 'dG', 'fmax','qvalue', 'fmin']] + if not(sub_table_filtered.empty): + newtable_pervar.iloc[0]['numTests'] = len(sub_table_filtered) + newtable_pervar.iloc[0]['numRejects'] = len(sub_table) - len(sub_table_filtered) + newtable_pervar.iloc[0]['kd':'qvalue'] = np.median(sub_table_filtered, 0) + newtable_pervar.iloc[0][:'total_length'] = sub_table.iloc[0][:'total_length'] + if len(sub_table_filtered) > 1: + #get bootstrapped error bars on mean, would have liked to do median but get errors... 
+ try: + if (variant % 1000) == 0: + print(variant) + bootval = scikits.bootstrap.ci(data=sub_table_filtered['dG'], statfunction=np.median, method='pi') + newtable_pervar.iloc[0]['errdG_L'] = bootval[0]-np.median(sub_table_filtered, 0)[1] + newtable_pervar.iloc[0]['errdG_R'] = bootval[1]-np.median(sub_table_filtered, 0)[1] + except IndexError: + print('index error') + print(variant) + indexbad = np.array(variant) + #np.savetxt(filename, indexbad)  # BUG: 'filename' is undefined in this function's scope; would raise NameError + #could also use + f.write('%d\t\n'%variant) + + #save to text file the name of the variant, could do an np.savetxt, + + + return newtable_pervar def filterFitParameters(sub_table): - sub_table = sub_table[sub_table['fit_success']==1] + #sub_table = sub_table[sub_table['fit_success']==1] sub_table = sub_table[sub_table['rsq']>0.5] - sub_table = sub_table[sub_table['fraction_consensus']>=67] # 2/3 majority + #find where the binding series starts + firstconc = int(np.where(sub_table.columns == 0)[0]) + numconcs = 8 + #filters out anything that has a single NaN concentration + sub_table = sub_table[sub_table.iloc[:,firstconc:firstconc+numconcs].isnull().sum(1) == 0 ] return sub_table def bindingCurve(concentrations, kd, fmax=None, fmin=None): @@ -262,7 +311,6 @@ def plot_length_changes_helices(table, variant_table, topology, loop=None, recep # choose just one sequence of that topology to look at seq1 = variant_table[criteria_central]['junction_sequence'].iloc[0] criteria_central = np.all((criteria_central, np.array(variant_table['junction_sequence']) == seq1), axis=0) - sub_table = variant_table[criteria_central] helix_context = np.unique(sub_table['helix_context']) total_lengths = np.array([8,9,10,11,12]) @@ -286,4 +334,11 @@ ax.set_ylabel('delta delta G') ax.set_ylim((-1, 5)) plt.tight_layout() - return \ No newline at end of file + return + + + + + + + diff --git a/scripts/analyzeVariants.py b/scripts/analyzeVariants.py index 1f45496..c9aca43
100644 --- a/scripts/analyzeVariants.py +++ b/scripts/analyzeVariants.py @@ -21,6 +21,8 @@ import pandas as pd import variantFun import IMlibs +import multiprocessing +import plotfun parameters = variantFun.Parameters() #set up command line argument parser @@ -38,14 +40,16 @@ args = parser.parse_args() # outdirectory -imageDirectory = 'imageAnalysis/reduced_signals/barcode_mapping/figs' +#imageDirectory = 'imageAnalysis/reduced_signals/barcode_mapping/figs' +imageDirectory = '/home/namita/RigidAnalysis/figs' + # load concentrations xValuesFilename, fluor_dirs_red, fluor_dirs, concentrations = IMlibs.loadMapFile(args.CPfluor_dirs_to_quantify) # load table fittedBindingFilename = args.CPfitted table = IMlibs.loadFittedCPsignal(fittedBindingFilename) -table['dG'] = parameters.RT*np.log(table['kd']*1E-9) +#table['dG'] = parameters.RT*np.log(table['kd']*1E-9) # get another dict that gives info on a per variant basis variantFittedFilename = os.path.splitext(fittedBindingFilename)[0]+'.perVariant.fitParameters' diff --git a/scripts/findSeqDistribution.py b/scripts/findSeqDistribution.py index 1f4b826..815ec4e 100644 --- a/scripts/findSeqDistribution.py +++ b/scripts/findSeqDistribution.py @@ -10,6 +10,8 @@ import numpy as np from optparse import OptionParser import matplotlib.pyplot as plt +import matplotlib +matplotlib.use('Agg') import pandas as pd import sys import os