MustoeLab · lucaskearns · Nov 25, 2024
diff --git a/DanceMapper.py b/DanceMapper.py
diff --git a/README.md b/README.md
@@ -106,6 +106,80 @@ use the following flags:
 --oldDMSnorm --pm_secondary_reactivity 0.5 --mincount 50
 
 
+DanceMapper.py usage related to N7
+--------------
+To use DanceMapper.py to deconvolve N7-G reactivities in addition to traditional N1/3 reactivities
+ShapeMapper 2.3+ must be run with --dms and --N7 flags in addition to the --output-parsed-mutations
+option to produce N7-G related files.
+
+Input:
+
+    parsed.mut 
+        file output by ShapeMapper
+
+    profile.txt 
+        file output by ShapeMapper
+
+    --concat
+        flag directing DanceMapper to concatenate the parsed.mut and parsed.mutga files to deconvolve
+        N7-G reactivities in addition to N1/3 reactivities
+
+*Note* The parsed.mut file must have a corresponding parsed.mutga file located in the same directory.
+Additionally, the profile.txt file must have a corresponding profile.txtga file located in the same
+directory as well. When ShapeMapper 2.3+ is run with the --N7 flag the parsed.mut and profile.txt as
+well as the corresponding parsed.mutga and profile.txtga files are all produced in the same directory
+by default.
+
+Output:
+
+    concat.bm file 
+        save file of the Bernoulli mixture model encompassing both N1/3 and N7-G nucleotides. Only 
+        generated when using the --fit option
+
+    N13.bm file 
+        save file of the Bernoulli mixture model encompassing the N1/3 nucleotides. Only generated 
+        when using the --fit option
+
+    N7.bm file 
+        save file of the Bernoulli mixture model encompassing the N7 nucleotides. Only generated 
+        when using the --fit option
+
+    -concat-reactivities.txt file
+        normalized N1/3 and N7-G reactivities for each structure. Only generated when using the 
+        --fit option
+
+    -N13-reactivities.txt file
+        normalized N1/3 reactivities for each structure. Only generated when using the --fit
+        option
+
+    -N7-reactivities.txt file
+        normalized N7 reactivities for each structure. Only generated when using the --fit
+        option
+
+    [i]-N1rings.txt file
+        N1/3-N1/3 RINGs for state i (window=1). Only generated when using the --ring option
+
+    [i]-N7rings.txt file
+        N7-N7 RINGs for state i (window=1). Only generated when using the --ring option
+
+    [i]-N1N7rings.txt file
+        N1/3-N7 RINGs for state i (window=1). Only generated when using the --ring option
+
+
+*Additional Notes*
+--pairmap flag not currently compatible with --concat flag
+
+In order to visualize deconvolved N1/3 and N7-G data it is currently recommended that the 
+dance-reactivites_profile_converter.py script be used. This script will split the
+-concat-reactivities.txt file into profile.txt and profile.txtga files corresponding
+to the states of the deconvolved model. eg:
+
+"python dance-reactivites_profile_converter.py -concat-reactivities.txt --output output_prefix"
+
+These profile files can then be visualized via arcplot.
+
+
+
 foldClusters.py
 ----------------
 Script for performing RNAstructure modeling based on clustered reactivities and plotting results 

diff --git a/changelog.txt b/changelog.txt
@@ -1,17 +1,52 @@
-Version 1.1
------------
+Version 1.2
+August 30, 2023
+Lucas Kearns <lucas.kearns@bcm.edu>, Thomas Miller <thomas.miller2@bcm.edu>
+***********************************************************************
 
--Fixed internal issues so untreated sample is now optional. Additionally added 
-the --ignore_untreated flag to ignore any untreated sample information in the
-profile.txt file
+ADDED:
 
+Added --concat flag. This will concatenate mut and corresponding mutga files
+together. By default N7-Gs do not contribute to clustering but are set to
+inactive. This means that although N7-G positions do not drive clustering they will be 
+solved for in the final model.
 
--Updated mincount cutoffs for pair/ring analysis to 10 to be compatible with
-pair/ringmapper v1.2
+Added --activate_n7 flag. Allows n7 nucleotides to contribute to DanceMap
+clustering. By default N7-G nucleotides will not contribute to clustering.
+
+Added ability for concatenated DanceMapper runs to use RingMapper on solved models
+and generate N1/3-N1/3, N1/3-N7G, and N7G-N7G rings.
+
+CHANGED:
+
+Updated eDMS normalization to reflect new pur and pyr normalization cases
+for N7-G. Changed eDMS so highly protected N7-G regions (ares where bg is higher than
+raw) are assigned a normalized reactivity of 3.32.
+
+FIXED:
+
+Debugged min coverage calculation to properly calculate 75% coverage
+threshold.
 
+Version 1.1
+February 1, 2023
+Anthony Mustoe <anthony.mustoe@bcm.edu>
+***********************************************************************
+
+ADDED:
+
+Added --ignore_untreated flag to ignore any untreated sample information in the
+profile.txt file.
+
+CHANGED:
 
--Updated DMS normalization to eDMS by default. The --oldDMSnorm flag can be used 
+Updated DMS normalization to eDMS by default. The --oldDMSnorm flag can be used 
 to perform normalization using original method
 
--Changed dependency to consolidated StructureAnalysisTools package
+Updated mincount cutoffs for pair/ring analysis to 10 to be compatible with
+pair/ringmapper v1.2
+
+Changed dependency to consolidated StructureAnalysisTools package
+
+FIXED:
 
+Fixed internal issues so untreated sample is now optional. 
diff --git a/compute_readprobs.py b/compute_readprobs.py
@@ -0,0 +1,30 @@
+
+
+import DanceMapper, accessoryFunctions
+import sys
+import numpy as np
+
+
+DM = DanceMapper.DanceMap(modfile=sys.argv[1], profilefile=sys.argv[2])
+DM.readModelFromFile(sys.argv[3])
+
+
+llmat = np.zeros((DM.BMsolution.pdim, DM.reads.shape[0]), dtype=np.float64)
+
+accessoryFunctions.loglikelihoodmatrix(llmat, DM.reads, DM.mutations, np.array(DM.active_columns, dtype=np.int32), DM.BMsolution.mu, DM.BMsolution.p)
+
+llmat = np.exp(llmat)
+s = np.sum(llmat, axis=0)
+llmat /= s
+
+bins = np.linspace(0,1,11)
+
+hist, edge = np.histogram(llmat[0,:], bins=bins)
+
+hist = np.array(hist, dtype=float)/np.sum(hist)
+
+for i in range(10):
+    print("{} {:.3f}".format(i, hist[i]))
+
+
+
diff --git a/concat_profile_mut_N.py b/concat_profile_mut_N.py
@@ -0,0 +1,91 @@
+import numpy as np
+import sys
+
+
+#This function opens the .mut and .mutga strings and just concats
+#them together. 
+def concat_mut(inputmut, output, profile):
+   read_length = 0
+   with open("{}.txt".format(profile), 'r') as prof:
+      lines = prof.readlines()
+      last_line = lines[-1].split()
+      read_length = int(last_line[0])
+
+
+
+   with open("{}.mut".format(inputmut), 'r') as mut:
+      with open("{}.mutga".format(inputmut), 'r') as mutGA:
+         #with open("{}.mut".format(output), 'w') as concat:
+         with open("{}".format(output), 'w') as concat:
+            parsedLines = mut.read().splitlines()
+            parsedLinesGA = mutGA.read().splitlines()
+            length = len(parsedLines)
+            for i in range(length):
+               splGA = parsedLinesGA[i].split()
+               spl = parsedLines[i].split()
+               if(splGA[4] == "INCLUDED" and spl[4] == "INCLUDED"):
+                  to_add = []
+                  to_add += (read_length - (int(spl[3]) + 1)) * ["0"]
+                  to_add += (int(spl[2])) * ["0"]
+
+
+                  if(len(to_add) > 0):         
+                     spl[8] = spl[8] + "".join(to_add) + splGA[8]
+                     spl[6] = spl[6] + "".join(to_add) + splGA[6]
+                     spl[7] = spl[7] + "".join(to_add) + splGA[7]
+                     spl[3] = str(int(spl[3]) + read_length)
+                  else:
+                     spl[8] = spl[8] + splGA[8]
+                     spl[6] = spl[6] + splGA[6]
+                     spl[7] = spl[7] + splGA[7]
+                     spl[3] = str(int(spl[3]) + read_length)
+
+                  newline = " ".join(spl)
+                  concat.write(newline + "\n")
+
+# Concatenates the N1/3 and N7 profile again for further downstream processing
+def concat_profile(inputProf, profOut):
+   with open("{}.txt".format(inputProf), 'r') as N1:
+      with open("{}.txtga".format(inputProf), 'r') as N7:
+         #with open("{}.txt".format(profOut), 'w') as out:
+         with open("{}".format(profOut), 'w') as out:
+            linesN1 = [line.rstrip() for line in N1]
+            linesN7 = [line.rstrip() for line in N7][1:]
+
+
+            nt_length = (len(linesN1) - 1)
+
+            for i in range(len(linesN7)):
+               spl_line = linesN7[i].split()
+               newnum = int(spl_line[0]) + nt_length
+               newnum = str(newnum)
+               spl_line[0] = newnum
+               if spl_line[1] == 'G':
+                  spl_line[1] = 'N'
+               elif spl_line[1] == 'g':
+                  spl_line[1] = 'n'
+
+               newline = " ".join(spl_line)
+               linesN1.append(newline)
+
+
+            for line in linesN1:
+               out.write(line + "\n")
+
+
+
+
+
+
+if(__name__ == "__main__"):
+   #Rudimentary arg parsing. Basically do -m for for concat of muts
+   if(sys.argv[1] == "-h"):
+      print("USAGE: concat_profile_mut.py -m Mut Output Profile")
+      print("USAGE: concat_profile_mut.py -p Profile Output")
+   elif(sys.argv[1] == "-m"):
+      concat_mut(sys.argv[2], sys.argv[3], sys.argv[4])
+   #-p for concat of profile
+   elif(sys.argv[1] == "-p"):
+      concat_profile(sys.argv[2], sys.argv[3])
+
+
diff --git a/dance-reactivites_profile_converter.py b/dance-reactivites_profile_converter.py
@@ -0,0 +1,120 @@
+import argparse
+import numpy as np
+from numpy import isnan, nan, sqrt
+
+################################################################################
+#Processes an N1/3-N7 concatenated  dance-reactivities.txt file and produces
+#a .txt .txtga profile file for each predicted model in the ensemble. These .txtga files #may be fed into arcplot etc.
+################################################################################
+
+#Parser function. Finds total number of models, length of sequence,
+#and extracts N7 lines.
+def parseLines(filename):
+   num_models = 0
+   Nt_Length = 0
+   reactFile = open(filename, 'r')
+   lines = [line.rstrip() for line in reactFile]
+   fline = lines[0]
+   num_models  = int(fline.split()[0])
+   lline = lines[len(lines) - 1]
+   Nt_Length =  int(lline.split()[0]) / 2
+   reactFile.close()
+
+
+   N7index = int(Nt_Length) + 3
+   lines = lines[3:]
+   return num_models, Nt_Length, lines
+
+#Extracts a two dimensional list of the raw and normalized reactivities as well
+#as the sequence.
+def findReactivities(lines, num_Models, Nt_Length):
+
+
+   seq = []
+   norm = []
+
+   for i in range(num_Models):
+      norm.append([])
+
+
+   for i  in range(len(lines)):
+      splLine = lines[i].split()
+      index = 2
+      lIndex = 0 
+      seq.append(splLine[1])
+      for j in range(len(norm)):
+         norm[lIndex].append(splLine[index]) 
+         lIndex += 1
+         index += 2
+
+   norm_N13 = []
+   norm_N7 = []
+
+   for i in range(len(norm)):
+      norm_N13.append(norm[i][:int(Nt_Length)])
+      norm_N7.append(norm[i][int(Nt_Length):])
+
+   return norm_N13, norm_N7, seq
+
+
+#Iterates through extracted reactivities and generates .txtga profiles.
+#def make_txtga(seq, raw, norm, output):
+def make_txtga(norm_N13, norm_N7, seq, output):
+   for i in range(len(norm_N13)):
+      lines_N13 = []
+      lines_N7 = []
+      lines_N13.append("Nucleotide  Sequence Modified_mutations   Modified_read_depth  Modified_effective_depth   Modified_rate  Modified_off_target_mapped_depth Modified_low_mapq_mapped_depth   Modified_mapped_depth   Untreated_mutations  Untreated_read_depth Untreated_effective_depth  Untreated_rate     Untreated_off_target_mapped_depth   Untreated_low_mapq_mapped_depth  Untreated_mapped_depth      Denatured_mutations  Denatured_read_depth Denatured_effective_depth  Denatured_rate Denatured_off_target_mapped_depth   Denatured_low_mapq_mapped_depth  Denatured_mapped_depth  Reactivity_profile   Std_err  HQ_profile  HQ_stderr   Norm_profile   Norm_stderr")
+      lines_N7.append("Nucleotide  Sequence Modified_mutations   Modified_read_depth  Modified_effective_depth   Modified_rate  Modified_off_target_mapped_depth Modified_low_mapq_mapped_depth   Modified_mapped_depth   Untreated_mutations  Untreated_read_depth Untreated_effective_depth  Untreated_rate     Untreated_off_target_mapped_depth   Untreated_low_mapq_mapped_depth  Untreated_mapped_depth      Denatured_mutations  Denatured_read_depth Denatured_effective_depth  Denatured_rate Denatured_off_target_mapped_depth   Denatured_low_mapq_mapped_depth  Denatured_mapped_depth  Reactivity_profile   Std_err  HQ_profile  HQ_stderr   Norm_profile   Norm_stderr")
+      for j in range(len(norm_N13[i])):
+         line_N13 = []
+         line_N13.append(str(j + 1))
+         line_N13.append(" {} ".format(seq[j]))
+         line_N13.append(" 0 " * 25)
+         line_N13.append(str(norm_N13[i][j]))
+         line_N13.append(" 0")
+
+         line_N13 = ''.join(line_N13)
+
+         line_N7 = []
+         line_N7.append(str(j + 1))
+         if seq[j] == "N":
+            seq[j] = "G"
+         line_N7.append(" {} ".format(seq[j]))
+         line_N7.append(" 0 " * 25)
+         line_N7.append(str(norm_N7[i][j]))
+         line_N7.append(" 0")
+         line_N7 = ''.join(line_N7)
+
+
+
+
+         lines_N13.append(line_N13)
+         lines_N7.append(line_N7)
+
+      if output == None:
+         oFile_N13 = open("dance_map_model_{}_profile.txt".format(i), "w")
+         oFile_N7 = open("dance_map_model_{}_profile.txtga".format(i), "w")
+         for line in lines_N13:
+            oFile_N13.write(line + "\n")
+         for line in lines_N7:
+            oFile_N7.write(line + "\n")
+
+      else:
+         oFile_N13 = open("{}_{}_profile.txt".format(output, i), "w")
+         oFile_N7 = open("{}_{}_profile.txtga".format(output, i), "w")
+         for line in lines_N13:
+            oFile_N13.write(line + "\n")
+         for line in lines_N7:
+            oFile_N7.write(line + "\n")
+
+      oFile_N13.close()
+      oFile_N7.close()
+
+if __name__=="__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("reactivities", help = "Path to the concatenated dance-reactivities.txt")
+    parser.add_argument("--output", help = "Output file(s) prefix", default=None)
+    args=parser.parse_args()
+    num_Models, Nt_Length, lines = parseLines(args.reactivities)
+    norm_N13, norm_N7, seq = findReactivities(lines, num_Models, Nt_Length) 
+    make_txtga(norm_N13, norm_N7, seq, args.output)