benjaminjack · hadley · Oct 1, 2020
diff --git a/NAMESPACE b/NAMESPACE
@@ -9,5 +9,4 @@ export(quantitate)
 import(dplyr)
 import(stringr)
 import(tidyr)
-importFrom(lazyeval,interp)
 importFrom(stats,setNames)
diff --git a/R/make_area_table.R b/R/make_area_table.R
@@ -28,35 +28,39 @@ make_area_table <- function(msf_file, min_conf = "High", prot_regex = "^>([a-zA-
   # Access MSF database file
   # my_db <- src_sqlite(msf_file)
   my_db <- DBI::dbConnect(RSQLite::SQLite(), msf_file)
+  on.exit(DBI::dbDisconnect(my_db))
 
   Events <- tbl(my_db, "Events") %>%
-    select_(~-RT, ~-LeftRT, ~-RightRT, ~-SN, ~-FileID, ~-Intensity)
+    select(-"RT", -"LeftRT", -"RightRT", -"SN", -"FileID", -"Intensity")
 
   EventAreaAnnotations <- tbl(my_db, "EventAreaAnnotations") %>%
-    select_(~EventID, ~QuanResultID)
+    select("EventID", "QuanResultID")
 
   PrecursorIonAreaSearchSpectra <- tbl(my_db, "PrecursorIonAreaSearchSpectra")
 
   # Here is where all the scan information, including mass and charge
   SpectrumHeaders <- tbl(my_db, "SpectrumHeaders") %>%
-    select_(~SpectrumID, ~FirstScan, ~Mass, ~Charge, ~RetentionTime, ~UniqueSpectrumID) %>%
+    select("SpectrumID", "FirstScan", "Mass", "Charge", "RetentionTime", "UniqueSpectrumID") %>%
     collect(n = Inf)
 
   # Grab intensities
   MassPeaks <- tbl(my_db, "MassPeaks") %>%
-    select_(~MassPeakID, ~Intensity) %>%
+    select("MassPeakID", "Intensity") %>%
     collect(n = Inf)
 
   # Here are all the areas
   # m/z is really just one of many m/z's that could be retrieved from the msf file.
   # All of the m/z's are calculated slightly differently. I just picked one since
   # I don't use m/z's for my work.
-  events_joined <- inner_join(Events, EventAreaAnnotations, by = "EventID") %>%
+  events_joined <- Events %>%
+    inner_join(EventAreaAnnotations, by = "EventID") %>%
     inner_join(PrecursorIonAreaSearchSpectra, by = "QuanResultID") %>%
     collect(n = Inf)  %>%
-    group_by_(~SearchSpectrumID) %>%
-    summarize_(.dots = setNames(list(~sum(Area), ~mean(Mass)),
-                               c("Area", "m_z")))
+    group_by(.data$SearchSpectrumID) %>%
+    summarize(
+      Area = sum(.data$Area),
+      m_z = mean(.data$Mass)
+    )
 
   # NOTE: mass and m/z are probably not correct right now! I have check them in more detail
 
@@ -68,34 +72,22 @@ make_area_table <- function(msf_file, min_conf = "High", prot_regex = "^>([a-zA-
 
   pep_table <- make_pep_table(msf_file, min_conf, prot_regex, collapse)
 
-  # Rename columns for more consistent column naming
-  old_names <- list(~peptide_id,
-                    ~SpectrumID,
-                    ~protein_desc,
-                    ~sequence,
-                    ~Area,
-                    ~Mass,
-                    ~m_z,
-                    ~Charge,
-                    ~Intensity,
-                    ~FirstScan)
-  new_names <- c("peptide_id",
-                 "spectrum_id",
-                 "protein_desc",
-                 "sequence",
-                 "area",
-                 "mass",
-                 "m_z",
-                 "charge",
-                 "intensity",
-                 "first_scan")
-
-  # Join peptide info to mass/area/charge/etc.
-  auc_table <- right_join(spectra, pep_table, by=c("SpectrumID" = "spectrum_id")) %>%
-    select_(.dots = setNames(old_names, new_names))
-
-  DBI::dbDisconnect(my_db)
+  auc_table <- spectra %>%
+    # Join peptide info to mass/area/charge/etc.
+    right_join(pep_table, by = c("SpectrumID" = "spectrum_id")) %>%
+    # Rename columns for more consistent column naming
+    select(
+      "peptide_id" = "peptide_id",
+      "spectrum_id" = "SpectrumID",
+      "protein_desc" = "protein_desc",
+      "sequence" = "sequence",
+      "area" = "Area",
+      "mass" = "Mass",
+      "m_z" = "m_z",
+      "charge" = "Charge",
+      "intensity" = "Intensity",
+      "first_scan" = "FirstScan"
+    )
 
   return(auc_table)
-
 }
diff --git a/R/make_pep_table.R b/R/make_pep_table.R
@@ -50,47 +50,49 @@ make_pep_table <- function(msf_file,
   # Access MSF database file
   # my_db <- dplyr::src_sqlite(msf_file)
   my_db <- DBI::dbConnect(RSQLite::SQLite(), msf_file)
+  on.exit(DBI::dbDisconnect(my_db))
 
   # First check file version (only Proteome Discoverer 1.4.x is supported)
   schema <- tbl(my_db, "SchemaInfo") %>%
-    select_(~SoftwareVersion) %>%
+    select("SoftwareVersion") %>%
     collect()
   if (!str_detect(as.character(schema$SoftwareVersion[[1]]), "^1\\.4\\..*$")) {
     warning("Only ThermoFisher MSF files generated by Proteome Discoverer 1.4.x are supported. parsemsf functions may produce unexpected results.")
   }
 
   # This table maps PeptideIDs to ProteinIDs
   PeptidesProteins <- tbl(my_db, "PeptidesProteins") %>%
-    select_(~PeptideID, ~ProteinID)
+    select("PeptideID", "ProteinID")
 
   # Here are the actual peptide sequences with corresponding SpectrumIDs
   Peptides <- tbl(my_db, "Peptides") %>%
-    select_(~PeptideID, ~SpectrumID, ~ConfidenceLevel, ~Sequence) %>%
-    filter_(~ ConfidenceLevel >= confidence)
+    select("PeptideID", "SpectrumID", "ConfidenceLevel", "Sequence") %>%
+    filter(.data$ConfidenceLevel >= confidence)
 
   # Protein descriptions are contained in this table
   ProteinAnnotations <- tbl(my_db, "ProteinAnnotations") %>%
-    select_(~ProteinID, ~Description)
-
-  # Lazy-eval formula for protein name matching
-  prot_match <- lazyeval::interp(~str_match(Description, prot_regex)[,2], prot_regex = prot_regex)
+    select("ProteinID", "Description")
 
   # Build a peptide table
-  pep_table <- inner_join(PeptidesProteins, Peptides, by = c("PeptideID" = "PeptideID")) %>%
+  pep_table <- PeptidesProteins %>%
+    inner_join(Peptides, by = c("PeptideID" = "PeptideID")) %>%
     inner_join(ProteinAnnotations, by = "ProteinID") %>%
     collect(n = Inf) %>%
     # Extract protein ID; assumes that protein ID is immediately after ">" and ends with a space
-    mutate_(.dots = setNames(list(prot_match), c("Proteins"))) %>%
-    group_by_(~PeptideID, ~SpectrumID, ~Sequence)
+    mutate(Proteins = str_match(.data$Description, prot_regex)[,2]) %>%
+    group_by(.data$PeptideID, .data$SpectrumID, .data$Sequence)
+
   # Collapse peptides that map to multiple proteins into a single row
   if (collapse == TRUE) {
     pep_table <- pep_table %>%
-      summarize_(.dots = setNames(list(~paste(Proteins, sep = "", collapse = "; "),
-                                       ~paste(ProteinID, sep = "", collapse = "; ")),
-                                  c("Proteins",
-                                    "ProteinID")))
+      summarize(
+        Proteins = paste0(.data$Proteins, collapse = "; "),
+        ProteinID = paste0(.data$ProteinID, collapse = "; ")
+      )
   }
-  pep_table <- pep_table %>% select_(~PeptideID, ~SpectrumID, ~Sequence, ~Proteins, ~ProteinID)
+
+  pep_table <- pep_table %>%
+    select("PeptideID", "SpectrumID", "Sequence", "Proteins", "ProteinID")
   # Append custom fields
   CustomFields <- tbl(my_db, "CustomDataFields")
 
@@ -99,33 +101,25 @@ make_pep_table <- function(msf_file,
   CustomPeptides <- tbl(my_db, sql("SELECT FieldID, PeptideID, CAST(FieldValue as REAL) AS FieldValue FROM CustomDataPeptides"))
 
   # Spread custom fields as separate columns
-  custom_data <- left_join(CustomPeptides, CustomFields, by = "FieldID") %>%
-    select_(~PeptideID, ~DisplayName, ~FieldValue) %>%
+  custom_data <- CustomPeptides %>%
+    left_join(CustomFields, by = "FieldID") %>%
+    select("PeptideID", "DisplayName", "FieldValue") %>%
     collect(n = Inf) %>%
-    spread_("DisplayName", "FieldValue")
-
-  # Join to peptide table
-  pep_table <- left_join(pep_table, custom_data, by = "PeptideID")
-
-  old_names <- list(~PeptideID,
-                    ~SpectrumID,
-                    ~ProteinID,
-                    ~Proteins,
-                    ~Sequence,
-                    ~PEP,
-                    ~`q-Value`)
-  new_names <- c("peptide_id",
-                 "spectrum_id",
-                 "protein_id",
-                 "protein_desc",
-                 "sequence",
-                 "pep_score",
-                 "q_value")
-
-  # Rename columns for a more consistent column name style
-  pep_table <- select_(pep_table, .dots = setNames(old_names, new_names))
-
-  DBI::dbDisconnect(my_db)
+    spread("DisplayName", "FieldValue")
+
+  pep_table <- pep_table %>%
+    # Join to peptide table
+    left_join(custom_data, by = "PeptideID") %>%
+    # Rename columns for a more consistent column name style
+    select(
+      peptide_id = "PeptideID",
+      spectrum_id = "SpectrumID",
+      protein_id = "ProteinID",
+      protein_desc = "Proteins",
+      sequence = "Sequence",
+      pep_score = "PEP",
+      q_value = "q-Value"
+    )
 
   return(pep_table)
 

diff --git a/R/map_peptides.R b/R/map_peptides.R
@@ -25,21 +25,23 @@
 map_peptides <- function(msf_file, min_conf = "High", prot_regex = "") {
 
   pep_table <- make_pep_table(msf_file, min_conf, collapse = FALSE) %>%
-    rename_(.dots = setNames(list(~sequence), c("peptide_sequence")))
+    rename(peptide_sequence = "sequence")
 
   my_db <- DBI::dbConnect(RSQLite::SQLite(), msf_file)
 
   prots <- tbl(my_db, "Proteins") %>%
-    select_(~ProteinID, ~Sequence) %>%
+    select(.data$ProteinID, .data$Sequence) %>%
     collect()
 
-  pep_table <- inner_join(pep_table, prots, by = c("protein_id" = "ProteinID")) %>%
-    rename_(.dots = setNames(list(~Sequence), c("protein_sequence"))) %>%
+  pep_table <- pep_table %>%
+    inner_join(prots, by = c("protein_id" = "ProteinID")) %>%
+    rename(protein_sequence = "Sequence") %>%
     rowwise() %>%
-    mutate_(.dots = setNames(list(~str_locate(protein_sequence, peptide_sequence)[1],
-                                  ~str_locate(protein_sequence, peptide_sequence)[2]),
-                             c("start",
-                               "end")))
+    mutate(
+      start = str_locate(.data$protein_sequence, .data$peptide_sequence)[1],
+      end = str_locate(.data$protein_sequence, .data$peptide_sequence)[2]
+    )
+
   DBI::dbDisconnect(my_db)
 
   return(pep_table)

diff --git a/R/parsemsf.R b/R/parsemsf.R
@@ -4,7 +4,6 @@
 #' @name parsemsf
 #'
 #' @import dplyr
-#' @importFrom lazyeval interp
 #' @import tidyr
 #' @importFrom stats setNames
 #' @import stringr

diff --git a/R/quantitate.R b/R/quantitate.R
@@ -20,30 +20,30 @@
 merge_top_peptides <- function(df, num_reps, match_peps = TRUE) {
 
   df %>%
-    group_by_(~tech_rep, ~sequence) %>%
-    distinct_(.dots = list(~area)) %>% # Remove peptides with identical areas
-    summarize_(.dots = setNames(list(~sum(area, na.rm = T)),
-                                c("area"))) %>% # Sum across peptides with different charges
-    group_by_(~tech_rep) %>%
-    filter_(~ min_rank(desc(area)) <= 3) %>% # Find top 3 peptides
-    ungroup() -> df
+    group_by(.data$tech_rep, .data$sequence) %>%
+    distinct(.data$area) %>% # Remove peptides with identical areas
+    summarize(area = sum(.data$area, na.rm = T)) %>% # Sum across peptides with different charges
+    group_by(.data$tech_rep) %>%
+    filter(min_rank(desc(.data$area)) <= 3) %>% # Find top 3 peptides
+    ungroup() ->
+    df
 
   if (match_peps == TRUE) {
-    df %>% group_by_(~sequence) %>%
-      mutate_(.dots = setNames(list(~n()), c("n"))) %>% # Count number of matched peptides
+    df %>%
+      group_by(.data$sequence) %>%
+      mutate(n = n()) %>% # Count number of matched peptides
       ungroup() %>%
-      filter_(~ n >= num_reps) -> df # Remove any unmatched peptides
+      filter(.data$n >= num_reps) ->
+      df # Remove any unmatched peptides
   }
 
-  dots <- setNames(list(~mean(area, na.rm = T),
-                        ~sd(area, na.rm = T),
-                        ~ n()/num_reps),
-                   c("area_mean",
-                     "area_sd",
-                     "peps_per_rep"))
-
   df %>%
-    summarize_(.dots = dots) -> matched_areas # Compute mean areas from top peptides
+    summarize(
+      area_mean = mean(.data$area, na.rm = T),
+      area_sd = sd(.data$area, na.rm = T),
+      peps_per_rep = n() / num_reps
+    ) ->
+    matched_areas # Compute mean areas from top peptides
 
   return(matched_areas)
 }
@@ -82,37 +82,41 @@ quantitate <- function(reps, normalize = T, match_peps = T, relabel = c()) {
   for (i in 1:length(reps)) {
     message(paste("Now processing: ", reps[[i]]))
     reps_df[[i]] <- make_area_table(reps[[i]]) %>%
-      mutate_(.dots = setNames(list(i), c("tech_rep")))
+      mutate(tech_rep = list(i))
   }
 
   # Combine into single dataframe with all technical replicates
   combined <- bind_rows(reps_df)
 
   # Rename some protein groups
   if (length(relabel) > 0) {
-    # Rename lazy eval function
-    rename_lazy = interp(~str_replace_all(protein_desc, relabel), relabel = relabel)
     combined %>%
-      mutate_(.dots = setNames(list(rename_lazy), c("protein_desc"))) -> combined
+      mutate(protein_desc = str_replace_all(.data$protein_desc, relabel)) ->
+      combine
   }
 
   # Check if we should normalize to total area for a given replicate
   # This accounts for any variability in how the sample was injected
   message("Quantitating...")
   if (normalize == TRUE) {
-    # Lazy mutate list
-    normalize_lazy = list(~sum(area, na.rm = T), ~ area/total_area)
     combined %>%
-      group_by_(~tech_rep) %>%
-      mutate_(.dots = setNames(normalize_lazy, c("total_area", "area"))) %>%
-      ungroup() -> combined
+      group_by(.data$tech_rep) %>%
+      mutate(
+        total_area = sum(.data$area, na.rm = T),
+        area = .data$area / .data$total_area
+      ) %>%
+      ungroup() ->
+      combined
   }
 
   # Quantitate using top three most abundant areas
   combined %>%
-    group_by_(~protein_desc) %>%
-    do_(~merge_top_peptides(., num_reps, match_peps = match_peps)) -> combined
+    group_by(.data$protein_desc) %>%
+    do(merge_top_peptides(., num_reps, match_peps = match_peps)) ->
+    combined
 
   return(combined)
-
 }
+
+
+globalVariables(c(".", "sd"))