From bae4fde1d39f29d19bda29b55f79e6a4033e29e9 Mon Sep 17 00:00:00 2001 From: nbisaria Date: Wed, 31 Dec 2014 10:23:18 -0800 Subject: [PATCH 1/2] Added Boostrapping, fixed issues with missing values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Edited IMlibs to deal with the fact that some of the binding series data didn’t have 8 points and was interfering with the conversion into an array. All directories for saving are pointing to my home directory and will need to be changed. Edited variantFun to do the bootstrapping (which is still slow), and extract the data to a new table. Right now 20% of the data has NaNs for error bars, they are either varaitns that only have one cluster, or variants that for some reason have NaN as a variant number. I’m investigating the second reason currently. The filtering of variants is just the value > 0.5, but should be changed depending on what is working well for you. --- LibraryTesting/BadVariants.txt | 0 libs/IMlibs.py | 35 ++++++-- libs/IMlibs.pyc | Bin 19921 -> 21106 bytes libs/variantFun.py | 156 +++++++++++++++++++++++++++++---- scripts/analyzeVariants.py | 8 +- scripts/findSeqDistribution.py | 2 + 6 files changed, 173 insertions(+), 28 deletions(-) create mode 100644 LibraryTesting/BadVariants.txt diff --git a/LibraryTesting/BadVariants.txt b/LibraryTesting/BadVariants.txt new file mode 100644 index 0000000..e69de29 diff --git a/libs/IMlibs.py b/libs/IMlibs.py index 071f890..bd0fcca 100644 --- a/libs/IMlibs.py +++ b/libs/IMlibs.py @@ -12,13 +12,13 @@ import pandas as pd import scipy.io as sio import matplotlib.pyplot as plt -import CPlibs +#import CPlibs def spawnMatlabJob(matlabFunctionCallString): try: #construct the command-line matlab call functionCallString = "try," - functionCallString = functionCallString + "addpath('{0}', '{1}');".format('/home/sarah/array_image_tools_SKD/libs', '/home/sarah/array_image_tools_SKD/scripts') #placeholder TEMP DEBUG CHANGE + functionCallString = functionCallString + "addpath('{0}', '{1}');".format('/home/namita/Code/array_image_tools_SKD/libs', '/home/namita/Code/array_image_tools_SKD/scripts') #placeholder TEMP DEBUG CHANGE functionCallString = functionCallString + matlabFunctionCallString + ';' functionCallString = functionCallString + "catch e," functionCallString = functionCallString + "disp(getReport(e,'extended'));" @@ -142,7 +142,8 @@ def getFPfromCPsignal(signalNamesByTileDict): return bindingSeriesFilenameDict def pasteTogetherSignal(cpSeqFilename, signal, outfilename): - tmpfilename = 'signal_'+str(time.time()) + output_directory = os.path.dirname(outfilename) + tmpfilename = os.path.join(output_directory, 'signal_'+str(time.time())) np.savetxt(tmpfilename, signal, fmt='%s') os.system("paste %s %s > %s"%(cpSeqFilename, tmpfilename, outfilename+'.tmp')) os.system("mv %s %s"%(outfilename+'.tmp', outfilename)) @@ -218,19 +219,19 @@ def sortConcatenateCPsignal(reducedSignalNamesByTileDict, barcode_col, sortedAll return def compressBarcodes(sortedAllCPsignalFile, barcode_col, seq_col, compressedBarcodeFile): - script = '/home/sarah/array_image_tools_SKD/scripts/compressBarcodes.py' + script = '/home/namita/Code/array_image_tools_SKD/scripts/compressBarcodes.py' print "python %s -i %s -o %s -c %d -C %d"%(script, sortedAllCPsignalFile, compressedBarcodeFile, barcode_col, seq_col) os.system("python %s -i %s -o %s -c %d -C %d"%(script, sortedAllCPsignalFile, compressedBarcodeFile, barcode_col, seq_col)) return def barcodeToSequenceMap(compressedBarcodeFile, libraryDesignFile, outFile): - script = '/home/sarah/array_image_tools_SKD/scripts/findSeqDistribution.py' + script = '/home/namita/Code/array_image_tools_SKD/scripts/findSeqDistribution.py' print "python %s -b %s -l %s -o %s "%(script, compressedBarcodeFile, libraryDesignFile, outFile) os.system("python %s -b %s -l %s -o %s "%(script, compressedBarcodeFile, libraryDesignFile, outFile)) return def matchCPsignalToLibrary(barcodeToSequenceFilename, sortedAllCPsignalFile, sequenceToLibraryFilename): - script = '/home/sarah/array_image_tools_SKD/scripts/matchCPsignalLibrary.py' + script = '/home/namita/Code/array_image_tools_SKD/scripts/matchCPsignalLibrary.py' print "python %s -b %s -i %s -o %s "%(script, barcodeToSequenceFilename, sortedAllCPsignalFile, sequenceToLibraryFilename) os.system("python %s -b %s -i %s -o %s "%(script, barcodeToSequenceFilename, sortedAllCPsignalFile, sequenceToLibraryFilename)) return @@ -338,9 +339,21 @@ def loadCPseqSignal(filename): """ cols = ['tileID','filter','read1_seq','read1_quality','read2_seq','read2_quality','index1_seq','index1_quality','index2_seq', 'index2_quality','all_cluster_signal','binding_series'] table = pd.read_csv(filename, sep='\t', header=None, names=cols, index_col=False) + #we ended here!! + #count = 0 + # + #binding_series = np.zeros((len(table),8)) + #for series in table['binding_series']: + # if (len(np.array(series.split(','), dtype=float)) ==8): + # binding_series[count] = np.array(series.split(','), dtype=float) + # count += 1 + #import pdb; pdb.set_trace() binding_series = np.array([np.array(series.split(','), dtype=float) for series in table['binding_series']]) + for col in range(binding_series.shape[1]): + print('newversion') table[col] = binding_series[:, col] + return table def loadNullScores(signalNamesByTileDict, filterSet): @@ -361,7 +374,15 @@ def loadBindingCurveFromCPsignal(filename): then return the binding series, normalized by all cluster image. """ table = pd.read_table(filename, usecols=(10,11)) - binding_series = np.array([np.array(series.split(','), dtype=float) for series in table['binding_series']]) + count = 0 + binding_series = np.zeros((len(table),8)) + for series in table['binding_series']: + if (len(np.array(series.split(','), dtype=float)) ==8): + binding_series[count] = np.array(series.split(','), dtype=float) + count += 1 + + #binding_series = np.array([np.array(series.split(','), dtype=float) for series in table['binding_series']]) + return binding_series, np.array(table['all_cluster_signal']) def loadFittedCPsignal(filename): diff --git a/libs/IMlibs.pyc b/libs/IMlibs.pyc index 81515fb3ce7ed94b027bdfeb85e3ddaa04a5d51c..1eba5aa10ee36228b66b80aa5cba7e0629e209a0 100644 GIT binary patch delta 4421 zcmbVPdrX_x6+g!>umKyB7(XxuY;24}F>3;}0{H{Kc-AfOTKQAR!!+vHPcj8&6=r`s&(C(sY>@ywDqqpP1!k@&~0g} z@nQIW{(bk}^SI|e&hc-a#(%zmb;b{?Z0BE-w~oNIkCDE&(Nl0fR`+1Z2uA=8I6;^c zsR3bDWDN+5B5OgEDdGZAu1GBi0|+Y|K{()1oL+5k!~h4pAS%=tABajt>OfQ};s;?@ zq#lGrkpPJ5l41=YoQgDps8OT|#2Q6{Al51p0^w4m8APojVGwRbA|O19w1Ds`5(VK? zq!mP+A~6ttMb?3+SELO@07^Xj80peE4{qEU^AgJ@Eu14K}fFM$Xta-$MyRxwx) zBCHxWfQTs438F=jE)Y>gx|dC~Dib$>h^cnl%^=n()}wV9K0#L-txWuDx$$&-A~l{V zrsDlMk(Pf7*VPu{pWAgR8qgj!oi;>vaQAJ6biNRuIFu=*@|jdTksFyFPfrvJ@xj!g zWmOTmcdJReyKIk9kIWU@KUO@vFQ$bny}Ddl2$sux?LS_ziEuwi2xf4P8ljdXxi`e!7PiG5^fhYuw2!-3r99_Jp;a!{cx0Q`S4Eb{)q4uX^mx=P zzv>O#R!t4L>Eh&caad&X>5*bCe^0T3=Q2Jyn#rc=#H6G23JMB34#cZoztzr?Sc75h zlJ^ASl$@)3R8OQ6{F7ezUG^!#mm7;K$RlvEjsu&4nZC z2z*F_XJmEb6M6*SZuH{MNGKt1YPwyI;JKy%{#*&R2KVR@TnM`GW%)|*2l%kehQfM8 z&xM-s741^Ut}CeP!cF*VITXID^Wo(!q;ntj;Tu|WB!*a$ds>Dqlv*HppM0?;rpNg; z(K>un#-jxnAJxyMf>Y7)uPDtYsJmE3P)JV=kL0q+quihl%}72oSzO&om}^~&@5t}8 zW?b99z1Q(Dsc3_oUpJI^kSrwCe0f!p zt2fxy=ED!PFSmJgrKRWEEAX;B+5V!QC6sYDek^|&m-Z*f=OPzeyHmT9^K!7GnXZVb z4u8))Pv$2CtBc-S9Zp2;QpZP#-o%eJ;lP)B$a3hC{E`!x=tAh?xB%4c#{=Iak z?8_XUNM-ku9|e^fO)+e-Z87-Ji~)oE-R5`EAkS>M1sCPjEdf1Y+`F}=bePq0J_od* z4ejW|N~}XCj_|zyr&xK38kUL`DlVulpA=xzvY8>w5MpR$XkoZY5alQ))yY|8_IC!}BT1kbM}dD%sBNOB zP&(e*=*@q+b`}?{9PE+ldAI^-Y4yTi-d}bMX;EP?Zy6)Fb-zGdF1O1Qpyi8)rUlJ$<|3}Mp zN~m(W(qE0ow93Tqk?NoiZpl_u@g$a%%7fcqN4sp_aY4^9=-TPP$F=P{|AIcH>lf_c zl0s%@8Rq4{Kr`jz+&~0hlk)@didv3?mHQUu`2ioEm46)w(&b^_rK?1ulPr(67{jv+-(z^5 z;RS|M3>w23hBp}AWO$e1BEw|{1IM(S!HR=~l}zmnF@|n}|E!+wCA@XW=^G_87DFXn gHvj$SK!;J@H{_e*VwZ>|7lv!*44Mg*wNB{r; delta 3217 zcma)8eQZ-z6hEikTDq2%v90UYt#n)4bsgK3Ip)A^z=jgRZ9GB45?u$Fb*&<6I7li z#h~&vDFJ2Eq!g506A!3qnv{vTa=0}jN;L_A@@NtURi;TDD6b|FQ01D`gR0OZ z3d*NR1E@+(W9L)r@ zKVYY>qH`v)P;PwY0aK&gSjktW0(#YR*2GgI{Z{s=$@ubY9w)=bS2thj`xq2L1|Wm5 z4q+?6R)n%KD`YIN&I+Ua)6W4v0b4DInLJpRFHgZJXc-^rd?*{Tf``qfq&66x9Ou6w z%Vi=vSVpb%ZFvlBG*IDRWlZM)G7?UDOF9!Dj!1OIGJ&{AoD3T_Wrh;T_}#JLxW3N} zr>5>BECvl}lRc)?oII-WdF?JyLr<>T_cUUPzO3A3l9@0atSZ4L>3G!-rqT<34<5*Q zW_6fv!~Q@3-=HIbr%jDz%MVnS<3T!B+e6Np<)&J%)Rg1fTEkYm%2eaET0g$S8e}4E zDokWDadqRxTpgyv!B_BMS{n+PD2M6uP&K}rn+O$}uV-alHNH>x*4^WOcq+&Lo#-Wz zN<5K^N1}-L(_{66Ha-v z=9J+C?U+M_ds#hWW$2Cd8vCfeCCEp)x23XnpG5N(!&O#ztfdGccdF$q;v@8F>+PoV zZJF!EX|!kVgZLqpw)L3myxdm7kJ^#8PMonvrj?+KL4n7qu)Wg%%++Jr);#W35dZN>)ZiUP=80FgMkUm?^wXWxm0F33!824LMO71QnuezZtip8>OW23+~ z3<+eQdl%f|lZZ3<00;}TVx@K(~Qn>@IIQaA0bJ!LT%iNgFy|?T~tk6bB zWXgZ$H4e$`K04ML4W}`Y)YKU+lawu5{w50_>~)shXi&A>=f7~ z@TkCJ0?!KU7kEM7MS<4^-V`_}a6;guz?TAF3;ZMik}R8m9a|X-gt`PG0<8>}_Rf40 zPA+e|ZY}$=SzLJe*Ntv;SsZl^ufyta^JR18y9*sQ{hdbhj7F!^;dB%@9B#YA^Ec_8 B3lRVS diff --git a/libs/variantFun.py b/libs/variantFun.py index 308b26a..83f3a98 100644 --- a/libs/variantFun.py +++ b/libs/variantFun.py @@ -10,36 +10,93 @@ import plotfun import scipy.io as sio import matplotlib.pyplot as plt -import CPlibs +#import CPlibs +import scikits.bootstrap +import scipy as scipy +from multiprocessing import Pool +import functools class Parameters(): def __init__(self): - # save the units of concentration given in the binding series self.RT = 0.582 # kcal/mol at 20 degrees celsius self.min_deltaG = -12 self.max_deltaG = -3 - + + def perVariantInfo(table): - variants = np.arange(0, np.max(table['variant_number'])) - columns = [name for name in table][:12] + ['numTests', 'numRejects', 'kd', 'dG', 'fmax', 'fmin'] + #variants_1 = np.arange(0, np.max(table['variant_number']),dtype=int) + variants = range(0,int(np.max(table['variant_number']))) + #something is wrong with 595,681 + wd = '/home/namita/RigidAnalysis' + filename = os.path.join(wd, 'BadVariants.txt') + f = open(filename, 'w') + + columns = [name for name in table][:12] + ['numTests', 'numRejects', 'kd', 'dG', 'fmax', 'fmin','qvalue', 'errdG_L', 'errdG_R', ] newtable = pd.DataFrame(columns=columns, index=np.arange(len(variants))) - for i, variant in enumerate(variants): - if i%1000==0: print 'computing iteration %d'%i - sub_table = table[table['variant_number']==variant] - if len(sub_table) > 0: - sub_table_filtered = filterFitParameters(sub_table)[['kd', 'dG', 'fmax', 'fmin']] - newtable.iloc[i]['numTests'] = len(sub_table_filtered) - newtable.iloc[i]['numRejects'] = len(sub_table) - len(sub_table_filtered) - newtable.iloc[i]['kd':] = np.median(sub_table_filtered, 0) - newtable.iloc[i][:'total_length'] = sub_table.iloc[0][:'total_length'] - return newtable + pool = Pool(processes=20) + datapervar= pool.map(functools.partial(generateVarStats, table, f), variants) + #newtable.iloc[variants] = pool.map(functools.partial(generateVarStats, table), variants) + pool.close() + f.close() + print('here!') + for variant in variants: + #this is really slow, there's got to be a better way of unpacking datapervar + if (variant % 1000) == 0: + print(variant) + newtable.iloc[variant] = datapervar[variant].iloc[0] + + return newtable + +def versionNum(): + print('v3') + + + +def generateVarStats(table,f,variant): + sub_table = table[table['variant_number']==variant] + #how do you intialize a list? + # print(variant) + #doesn't like 15356 + columns = [name for name in table][:12] + ['numTests', 'numRejects', 'kd', 'dG', 'fmax', 'fmin','qvalue', 'errdG_L', 'errdG_R', ] + newtable_pervar = pd.DataFrame(columns=columns, index = xrange(0,1)) + if len(sub_table) > 0: + sub_table_filtered = filterFitParameters(sub_table)[['kd', 'dG', 'fmax','qvalue', 'fmin']] + if not(sub_table_filtered.empty): + newtable_pervar.iloc[0]['numTests'] = len(sub_table_filtered) + newtable_pervar.iloc[0]['numRejects'] = len(sub_table) - len(sub_table_filtered) + newtable_pervar.iloc[0]['kd':'qvalue'] = np.median(sub_table_filtered, 0) + newtable_pervar.iloc[0][:'total_length'] = sub_table.iloc[0][:'total_length'] + if len(sub_table_filtered) > 1: + #get bootstrapped error bars on mean, would have liked to do median but get errors... + try: + if (variant % 1000) == 0: + print(variant) + bootval = scikits.bootstrap.ci(data=sub_table_filtered['dG'], statfunction=np.median, method='pi') + newtable_pervar.iloc[0]['errdG_L'] = bootval[0]-np.median(sub_table_filtered, 0)[1] + newtable_pervar.iloc[0]['errdG_R'] = bootval[1]-np.median(sub_table_filtered, 0)[1] + except IndexError: + print('value error') + print(variant) + indexbad = np.array(variant) + np.savetxt(filename, indexbad) + #could also use + f.write('%d\t\n'%variant) + + #save to text file the name of the variant, could do an np.savetxt, + + + return newtable_pervar def filterFitParameters(sub_table): - sub_table = sub_table[sub_table['fit_success']==1] + #sub_table = sub_table[sub_table['fit_success']==1] sub_table = sub_table[sub_table['rsq']>0.5] - sub_table = sub_table[sub_table['fraction_consensus']>=67] # 2/3 majority + #find where the binding series starts + firstconc = int(np.where(sub_table.columns == 0)[0]) + numconcs = 8 + #filters out anything that has a single NaN concentration + sub_table = sub_table[sub_table.iloc[:,firstconc:firstconc+numconcs].isnull().sum(1) == 0 ] return sub_table def bindingCurve(concentrations, kd, fmax=None, fmin=None): @@ -262,7 +319,6 @@ def plot_length_changes_helices(table, variant_table, topology, loop=None, recep # choose just one sequence of that topology to look at seq1 = variant_table[criteria_central]['junction_sequence'].iloc[0] criteria_central = np.all((criteria_central, np.array(variant_table['junction_sequence']) == seq1), axis=0) - sub_table = variant_table[criteria_central] helix_context = np.unique(sub_table['helix_context']) total_lengths = np.array([8,9,10,11,12]) @@ -286,4 +342,66 @@ def plot_length_changes_helices(table, variant_table, topology, loop=None, recep ax.set_ylabel('delta delta G') ax.set_ylim((-1, 5)) plt.tight_layout() - return \ No newline at end of file + return + +#def helixContextAnalysis(table, variant_table, topology, loop = 'goodLoop', receptor = 'R1', offset = 0): +# if loop is None: +# loop = 'goodLoop' +# if receptor is None: +# receptor='R1' +# if offset is None: +# offset = 0 # amount to change helix_one_length by from default +# couldPlot = True +# criteria_central = np.all((np.array(variant_table['receptor'] == receptor), +# np.array(variant_table['loop']==loop)), +# axis=0) +# length = 10 +# if topology == '': +# criteria_central = np.all((criteria_central, np.array(variant_table['junction_sequence'] =='_'),np.array(variant_table['total_length']==length)), axis=0) +# else: +# criteria_central = np.all((criteria_central, np.array(variant_table['topology']==topology),np.array(variant_table['total_length']==length)), axis=0) +# # choose just one sequence of that topology to look at +# #seq1 = variant_table[criteria_central]['junction_sequence'].iloc[0] +# #criteria_central = np.all((criteria_central, np.array(variant_table['junction_sequence']) == seq1), axis=0) +# sub_table = variant_table[criteria_central] +# helix_context = np.unique(sub_table['helix_context']) +# delta_G_initial = variant_table.loc[0]['dG'] +# newtable = pd.DataFrame(columns=['helix_context','dG', 'ddG', 'likelihood'], index = xrange(0,len(sub_table))) +# count = 0 +# for i, sequence in enumerate(helix_context): +# dG = sub_table[np.all((np.array(sub_table['helix_context']==sequence)),axis=0)]['dG'] +# count = count + len(dG) +# delta_deltaG = dG - delta_G_initial +# newtable.iloc[i]['helix_context'] = sequence +# newtable.iloc[i]['dG'] = dG.iloc[0] +# newtable.iloc[i]['ddG'] = delta_deltaG +# +# +# #next time load mploadtext. +# for i, sequence in enumerate(np.unique(newtable['helix_context'])): +# if sequence == 'h02': +# newtable.iloc[i]['likelihood'] = -4.62 +# if sequence == 'h06': +# newtable.iloc[i]['likelihood'] = -5.16 +# if sequence == 'h08': +# newtable.iloc[i]['likelihood'] = -5.30 +# if sequence == 'h10': +# newtable.iloc[i]['likelihood'] = -5.54 +# if sequence == 'h12': +# newtable.iloc[i]['likelihood'] = -5.64 +# if sequence == 'h14': +# newtable.iloc[i]['likelihood'] = -5.67 +# if sequence == 'h16': +# newtable.iloc[i]['likelihood'] = -6.15 +# if sequence == 'h21': +# newtable.iloc[i]['likelihood'] = -6.44 +# if sequence == 'h25': +# newtable.iloc[i]['likelihood'] = -6.57 +# if sequence == 'h28': +# newtable.iloc[i]['likelihood'] = -6.76 + + + + + + diff --git a/scripts/analyzeVariants.py b/scripts/analyzeVariants.py index 1f45496..c9aca43 100644 --- a/scripts/analyzeVariants.py +++ b/scripts/analyzeVariants.py @@ -21,6 +21,8 @@ import pandas as pd import variantFun import IMlibs +import multiprocessing +import plotfun parameters = variantFun.Parameters() #set up command line argument parser @@ -38,14 +40,16 @@ args = parser.parse_args() # outdirectory -imageDirectory = 'imageAnalysis/reduced_signals/barcode_mapping/figs' +#imageDirectory = 'imageAnalysis/reduced_signals/barcode_mapping/figs' +imageDirectory = '/home/namita/RigidAnalysis/figs' + # load concentrations xValuesFilename, fluor_dirs_red, fluor_dirs, concentrations = IMlibs.loadMapFile(args.CPfluor_dirs_to_quantify) # load table fittedBindingFilename = args.CPfitted table = IMlibs.loadFittedCPsignal(fittedBindingFilename) -table['dG'] = parameters.RT*np.log(table['kd']*1E-9) +#table['dG'] = parameters.RT*np.log(table['kd']*1E-9) # get another dict that gives info on a per variant basis variantFittedFilename = os.path.splitext(fittedBindingFilename)[0]+'.perVariant.fitParameters' diff --git a/scripts/findSeqDistribution.py b/scripts/findSeqDistribution.py index 1f4b826..815ec4e 100644 --- a/scripts/findSeqDistribution.py +++ b/scripts/findSeqDistribution.py @@ -10,6 +10,8 @@ import numpy as np from optparse import OptionParser import matplotlib.pyplot as plt +import matplotlib +matplotlib.use('Agg') import pandas as pd import sys import os From 567a6ad03e12107e660d271bd657148befe22c8b Mon Sep 17 00:00:00 2001 From: nbisaria Date: Tue, 6 Jan 2015 19:12:20 -0800 Subject: [PATCH 2/2] Cleaned up comments Got ride of non-sensical comments and debugging code --- libs/IMlibs.py | 3 --- libs/variantFun.py | 67 ++-------------------------------------------- 2 files changed, 2 insertions(+), 68 deletions(-) diff --git a/libs/IMlibs.py b/libs/IMlibs.py index bd0fcca..cdc9db5 100644 --- a/libs/IMlibs.py +++ b/libs/IMlibs.py @@ -339,7 +339,6 @@ def loadCPseqSignal(filename): """ cols = ['tileID','filter','read1_seq','read1_quality','read2_seq','read2_quality','index1_seq','index1_quality','index2_seq', 'index2_quality','all_cluster_signal','binding_series'] table = pd.read_csv(filename, sep='\t', header=None, names=cols, index_col=False) - #we ended here!! #count = 0 # #binding_series = np.zeros((len(table),8)) @@ -347,11 +346,9 @@ def loadCPseqSignal(filename): # if (len(np.array(series.split(','), dtype=float)) ==8): # binding_series[count] = np.array(series.split(','), dtype=float) # count += 1 - #import pdb; pdb.set_trace() binding_series = np.array([np.array(series.split(','), dtype=float) for series in table['binding_series']]) for col in range(binding_series.shape[1]): - print('newversion') table[col] = binding_series[:, col] return table diff --git a/libs/variantFun.py b/libs/variantFun.py index 83f3a98..5354126 100644 --- a/libs/variantFun.py +++ b/libs/variantFun.py @@ -25,9 +25,7 @@ def __init__(self): def perVariantInfo(table): - #variants_1 = np.arange(0, np.max(table['variant_number']),dtype=int) variants = range(0,int(np.max(table['variant_number']))) - #something is wrong with 595,681 wd = '/home/namita/RigidAnalysis' filename = os.path.join(wd, 'BadVariants.txt') f = open(filename, 'w') @@ -36,7 +34,6 @@ def perVariantInfo(table): newtable = pd.DataFrame(columns=columns, index=np.arange(len(variants))) pool = Pool(processes=20) datapervar= pool.map(functools.partial(generateVarStats, table, f), variants) - #newtable.iloc[variants] = pool.map(functools.partial(generateVarStats, table), variants) pool.close() f.close() print('here!') @@ -47,12 +44,7 @@ def perVariantInfo(table): newtable.iloc[variant] = datapervar[variant].iloc[0] return newtable - -def versionNum(): - print('v3') - - - + def generateVarStats(table,f,variant): sub_table = table[table['variant_number']==variant] #how do you intialize a list? @@ -344,62 +336,7 @@ def plot_length_changes_helices(table, variant_table, topology, loop=None, recep plt.tight_layout() return -#def helixContextAnalysis(table, variant_table, topology, loop = 'goodLoop', receptor = 'R1', offset = 0): -# if loop is None: -# loop = 'goodLoop' -# if receptor is None: -# receptor='R1' -# if offset is None: -# offset = 0 # amount to change helix_one_length by from default -# couldPlot = True -# criteria_central = np.all((np.array(variant_table['receptor'] == receptor), -# np.array(variant_table['loop']==loop)), -# axis=0) -# length = 10 -# if topology == '': -# criteria_central = np.all((criteria_central, np.array(variant_table['junction_sequence'] =='_'),np.array(variant_table['total_length']==length)), axis=0) -# else: -# criteria_central = np.all((criteria_central, np.array(variant_table['topology']==topology),np.array(variant_table['total_length']==length)), axis=0) -# # choose just one sequence of that topology to look at -# #seq1 = variant_table[criteria_central]['junction_sequence'].iloc[0] -# #criteria_central = np.all((criteria_central, np.array(variant_table['junction_sequence']) == seq1), axis=0) -# sub_table = variant_table[criteria_central] -# helix_context = np.unique(sub_table['helix_context']) -# delta_G_initial = variant_table.loc[0]['dG'] -# newtable = pd.DataFrame(columns=['helix_context','dG', 'ddG', 'likelihood'], index = xrange(0,len(sub_table))) -# count = 0 -# for i, sequence in enumerate(helix_context): -# dG = sub_table[np.all((np.array(sub_table['helix_context']==sequence)),axis=0)]['dG'] -# count = count + len(dG) -# delta_deltaG = dG - delta_G_initial -# newtable.iloc[i]['helix_context'] = sequence -# newtable.iloc[i]['dG'] = dG.iloc[0] -# newtable.iloc[i]['ddG'] = delta_deltaG -# -# -# #next time load mploadtext. -# for i, sequence in enumerate(np.unique(newtable['helix_context'])): -# if sequence == 'h02': -# newtable.iloc[i]['likelihood'] = -4.62 -# if sequence == 'h06': -# newtable.iloc[i]['likelihood'] = -5.16 -# if sequence == 'h08': -# newtable.iloc[i]['likelihood'] = -5.30 -# if sequence == 'h10': -# newtable.iloc[i]['likelihood'] = -5.54 -# if sequence == 'h12': -# newtable.iloc[i]['likelihood'] = -5.64 -# if sequence == 'h14': -# newtable.iloc[i]['likelihood'] = -5.67 -# if sequence == 'h16': -# newtable.iloc[i]['likelihood'] = -6.15 -# if sequence == 'h21': -# newtable.iloc[i]['likelihood'] = -6.44 -# if sequence == 'h25': -# newtable.iloc[i]['likelihood'] = -6.57 -# if sequence == 'h28': -# newtable.iloc[i]['likelihood'] = -6.76 - +