From 6592d37ef0d250e62fff96b34bfe49d33e740889 Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Thu, 1 Oct 2020 14:06:56 -0500 Subject: [PATCH] Convert lazy eval to tidy eval So that you package continues to work with dbplyr 2.0.0, which removes all the lazyeval methods. --- NAMESPACE | 1 - R/make_area_table.R | 64 ++++++++++++++++--------------------- R/make_pep_table.R | 78 +++++++++++++++++++++------------------------ R/map_peptides.R | 18 ++++++----- R/parsemsf.R | 1 - R/quantitate.R | 64 ++++++++++++++++++++----------------- 6 files changed, 108 insertions(+), 118 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index e776c85..5133d1e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -9,5 +9,4 @@ export(quantitate) import(dplyr) import(stringr) import(tidyr) -importFrom(lazyeval,interp) importFrom(stats,setNames) diff --git a/R/make_area_table.R b/R/make_area_table.R index 850e391..970f0b1 100644 --- a/R/make_area_table.R +++ b/R/make_area_table.R @@ -28,35 +28,39 @@ make_area_table <- function(msf_file, min_conf = "High", prot_regex = "^>([a-zA- # Access MSF database file # my_db <- src_sqlite(msf_file) my_db <- DBI::dbConnect(RSQLite::SQLite(), msf_file) + on.exit(DBI::dbDisconnect(my_db)) Events <- tbl(my_db, "Events") %>% - select_(~-RT, ~-LeftRT, ~-RightRT, ~-SN, ~-FileID, ~-Intensity) + select(-"RT", -"LeftRT", -"RightRT", -"SN", -"FileID", -"Intensity") EventAreaAnnotations <- tbl(my_db, "EventAreaAnnotations") %>% - select_(~EventID, ~QuanResultID) + select("EventID", "QuanResultID") PrecursorIonAreaSearchSpectra <- tbl(my_db, "PrecursorIonAreaSearchSpectra") # Here is where all the scan information, including mass and charge SpectrumHeaders <- tbl(my_db, "SpectrumHeaders") %>% - select_(~SpectrumID, ~FirstScan, ~Mass, ~Charge, ~RetentionTime, ~UniqueSpectrumID) %>% + select("SpectrumID", "FirstScan", "Mass", "Charge", "RetentionTime", "UniqueSpectrumID") %>% collect(n = Inf) # Grab intensities MassPeaks <- tbl(my_db, "MassPeaks") %>% - select_(~MassPeakID, ~Intensity) %>% + select("MassPeakID", "Intensity") %>% collect(n = Inf) # Here are all the areas # m/z is really just one of many m/z's that could be retrieved from the msf file. # All of the m/z's are calculated slightly differently. I just picked one since # I don't use m/z's for my work. - events_joined <- inner_join(Events, EventAreaAnnotations, by = "EventID") %>% + events_joined <- Events %>% + inner_join(EventAreaAnnotations, by = "EventID") %>% inner_join(PrecursorIonAreaSearchSpectra, by = "QuanResultID") %>% collect(n = Inf) %>% - group_by_(~SearchSpectrumID) %>% - summarize_(.dots = setNames(list(~sum(Area), ~mean(Mass)), - c("Area", "m_z"))) + group_by(.data$SearchSpectrumID) %>% + summarize( + Area = sum(.data$Area), + m_z = mean(.data$Mass) + ) # NOTE: mass and m/z are probably not correct right now! I have check them in more detail @@ -68,34 +72,22 @@ make_area_table <- function(msf_file, min_conf = "High", prot_regex = "^>([a-zA- pep_table <- make_pep_table(msf_file, min_conf, prot_regex, collapse) - # Rename columns for more consistent column naming - old_names <- list(~peptide_id, - ~SpectrumID, - ~protein_desc, - ~sequence, - ~Area, - ~Mass, - ~m_z, - ~Charge, - ~Intensity, - ~FirstScan) - new_names <- c("peptide_id", - "spectrum_id", - "protein_desc", - "sequence", - "area", - "mass", - "m_z", - "charge", - "intensity", - "first_scan") - - # Join peptide info to mass/area/charge/etc. - auc_table <- right_join(spectra, pep_table, by=c("SpectrumID" = "spectrum_id")) %>% - select_(.dots = setNames(old_names, new_names)) - - DBI::dbDisconnect(my_db) + auc_table <- spectra %>% + # Join peptide info to mass/area/charge/etc. + right_join(pep_table, by = c("SpectrumID" = "spectrum_id")) %>% + # Rename columns for more consistent column naming + select( + "peptide_id" = "peptide_id", + "spectrum_id" = "SpectrumID", + "protein_desc" = "protein_desc", + "sequence" = "sequence", + "area" = "Area", + "mass" = "Mass", + "m_z" = "m_z", + "charge" = "Charge", + "intensity" = "Intensity", + "first_scan" = "FirstScan" + ) return(auc_table) - } diff --git a/R/make_pep_table.R b/R/make_pep_table.R index b30a161..d53185e 100644 --- a/R/make_pep_table.R +++ b/R/make_pep_table.R @@ -50,10 +50,11 @@ make_pep_table <- function(msf_file, # Access MSF database file # my_db <- dplyr::src_sqlite(msf_file) my_db <- DBI::dbConnect(RSQLite::SQLite(), msf_file) + on.exit(DBI::dbDisconnect(my_db)) # First check file version (only Proteome Discoverer 1.4.x is supported) schema <- tbl(my_db, "SchemaInfo") %>% - select_(~SoftwareVersion) %>% + select("SoftwareVersion") %>% collect() if (!str_detect(as.character(schema$SoftwareVersion[[1]]), "^1\\.4\\..*$")) { warning("Only ThermoFisher MSF files generated by Proteome Discoverer 1.4.x are supported. parsemsf functions may produce unexpected results.") @@ -61,36 +62,37 @@ make_pep_table <- function(msf_file, # This table maps PeptideIDs to ProteinIDs PeptidesProteins <- tbl(my_db, "PeptidesProteins") %>% - select_(~PeptideID, ~ProteinID) + select("PeptideID", "ProteinID") # Here are the actual peptide sequences with corresponding SpectrumIDs Peptides <- tbl(my_db, "Peptides") %>% - select_(~PeptideID, ~SpectrumID, ~ConfidenceLevel, ~Sequence) %>% - filter_(~ ConfidenceLevel >= confidence) + select("PeptideID", "SpectrumID", "ConfidenceLevel", "Sequence") %>% + filter(.data$ConfidenceLevel >= confidence) # Protein descriptions are contained in this table ProteinAnnotations <- tbl(my_db, "ProteinAnnotations") %>% - select_(~ProteinID, ~Description) - - # Lazy-eval formula for protein name matching - prot_match <- lazyeval::interp(~str_match(Description, prot_regex)[,2], prot_regex = prot_regex) + select("ProteinID", "Description") # Build a peptide table - pep_table <- inner_join(PeptidesProteins, Peptides, by = c("PeptideID" = "PeptideID")) %>% + pep_table <- PeptidesProteins %>% + inner_join(Peptides, by = c("PeptideID" = "PeptideID")) %>% inner_join(ProteinAnnotations, by = "ProteinID") %>% collect(n = Inf) %>% # Extract protein ID; assumes that protein ID is immediately after ">" and ends with a space - mutate_(.dots = setNames(list(prot_match), c("Proteins"))) %>% - group_by_(~PeptideID, ~SpectrumID, ~Sequence) + mutate(Proteins = str_match(.data$Description, prot_regex)[,2]) %>% + group_by(.data$PeptideID, .data$SpectrumID, .data$Sequence) + # Collapse peptides that map to multiple proteins into a single row if (collapse == TRUE) { pep_table <- pep_table %>% - summarize_(.dots = setNames(list(~paste(Proteins, sep = "", collapse = "; "), - ~paste(ProteinID, sep = "", collapse = "; ")), - c("Proteins", - "ProteinID"))) + summarize( + Proteins = paste0(.data$Proteins, collapse = "; "), + ProteinID = paste0(.data$ProteinID, collapse = "; ") + ) } - pep_table <- pep_table %>% select_(~PeptideID, ~SpectrumID, ~Sequence, ~Proteins, ~ProteinID) + + pep_table <- pep_table %>% + select("PeptideID", "SpectrumID", "Sequence", "Proteins", "ProteinID") # Append custom fields CustomFields <- tbl(my_db, "CustomDataFields") @@ -99,33 +101,25 @@ make_pep_table <- function(msf_file, CustomPeptides <- tbl(my_db, sql("SELECT FieldID, PeptideID, CAST(FieldValue as REAL) AS FieldValue FROM CustomDataPeptides")) # Spread custom fields as separate columns - custom_data <- left_join(CustomPeptides, CustomFields, by = "FieldID") %>% - select_(~PeptideID, ~DisplayName, ~FieldValue) %>% + custom_data <- CustomPeptides %>% + left_join(CustomFields, by = "FieldID") %>% + select("PeptideID", "DisplayName", "FieldValue") %>% collect(n = Inf) %>% - spread_("DisplayName", "FieldValue") - - # Join to peptide table - pep_table <- left_join(pep_table, custom_data, by = "PeptideID") - - old_names <- list(~PeptideID, - ~SpectrumID, - ~ProteinID, - ~Proteins, - ~Sequence, - ~PEP, - ~`q-Value`) - new_names <- c("peptide_id", - "spectrum_id", - "protein_id", - "protein_desc", - "sequence", - "pep_score", - "q_value") - - # Rename columns for a more consistent column name style - pep_table <- select_(pep_table, .dots = setNames(old_names, new_names)) - - DBI::dbDisconnect(my_db) + spread("DisplayName", "FieldValue") + + pep_table <- pep_table %>% + # Join to peptide table + left_join(custom_data, by = "PeptideID") %>% + # Rename columns for a more consistent column name style + select( + peptide_id = "PeptideID", + spectrum_id = "SpectrumID", + protein_id = "ProteinID", + protein_desc = "Proteins", + sequence = "Sequence", + pep_score = "PEP", + q_value = "q-Value" + ) return(pep_table) diff --git a/R/map_peptides.R b/R/map_peptides.R index 604cda4..650c328 100644 --- a/R/map_peptides.R +++ b/R/map_peptides.R @@ -25,21 +25,23 @@ map_peptides <- function(msf_file, min_conf = "High", prot_regex = "") { pep_table <- make_pep_table(msf_file, min_conf, collapse = FALSE) %>% - rename_(.dots = setNames(list(~sequence), c("peptide_sequence"))) + rename(peptide_sequence = "sequence") my_db <- DBI::dbConnect(RSQLite::SQLite(), msf_file) prots <- tbl(my_db, "Proteins") %>% - select_(~ProteinID, ~Sequence) %>% + select(.data$ProteinID, .data$Sequence) %>% collect() - pep_table <- inner_join(pep_table, prots, by = c("protein_id" = "ProteinID")) %>% - rename_(.dots = setNames(list(~Sequence), c("protein_sequence"))) %>% + pep_table <- pep_table %>% + inner_join(prots, by = c("protein_id" = "ProteinID")) %>% + rename(protein_sequence = "Sequence") %>% rowwise() %>% - mutate_(.dots = setNames(list(~str_locate(protein_sequence, peptide_sequence)[1], - ~str_locate(protein_sequence, peptide_sequence)[2]), - c("start", - "end"))) + mutate( + start = str_locate(.data$protein_sequence, .data$peptide_sequence)[1], + end = str_locate(.data$protein_sequence, .data$peptide_sequence)[2] + ) + DBI::dbDisconnect(my_db) return(pep_table) diff --git a/R/parsemsf.R b/R/parsemsf.R index c7d710a..f96c1bd 100644 --- a/R/parsemsf.R +++ b/R/parsemsf.R @@ -4,7 +4,6 @@ #' @name parsemsf #' #' @import dplyr -#' @importFrom lazyeval interp #' @import tidyr #' @importFrom stats setNames #' @import stringr diff --git a/R/quantitate.R b/R/quantitate.R index 117cb21..d16f757 100644 --- a/R/quantitate.R +++ b/R/quantitate.R @@ -20,30 +20,30 @@ merge_top_peptides <- function(df, num_reps, match_peps = TRUE) { df %>% - group_by_(~tech_rep, ~sequence) %>% - distinct_(.dots = list(~area)) %>% # Remove peptides with identical areas - summarize_(.dots = setNames(list(~sum(area, na.rm = T)), - c("area"))) %>% # Sum across peptides with different charges - group_by_(~tech_rep) %>% - filter_(~ min_rank(desc(area)) <= 3) %>% # Find top 3 peptides - ungroup() -> df + group_by(.data$tech_rep, .data$sequence) %>% + distinct(.data$area) %>% # Remove peptides with identical areas + summarize(area = sum(.data$area, na.rm = T)) %>% # Sum across peptides with different charges + group_by(.data$tech_rep) %>% + filter(min_rank(desc(.data$area)) <= 3) %>% # Find top 3 peptides + ungroup() -> + df if (match_peps == TRUE) { - df %>% group_by_(~sequence) %>% - mutate_(.dots = setNames(list(~n()), c("n"))) %>% # Count number of matched peptides + df %>% + group_by(.data$sequence) %>% + mutate(n = n()) %>% # Count number of matched peptides ungroup() %>% - filter_(~ n >= num_reps) -> df # Remove any unmatched peptides + filter(.data$n >= num_reps) -> + df # Remove any unmatched peptides } - dots <- setNames(list(~mean(area, na.rm = T), - ~sd(area, na.rm = T), - ~ n()/num_reps), - c("area_mean", - "area_sd", - "peps_per_rep")) - df %>% - summarize_(.dots = dots) -> matched_areas # Compute mean areas from top peptides + summarize( + area_mean = mean(.data$area, na.rm = T), + area_sd = sd(.data$area, na.rm = T), + peps_per_rep = n() / num_reps + ) -> + matched_areas # Compute mean areas from top peptides return(matched_areas) } @@ -82,7 +82,7 @@ quantitate <- function(reps, normalize = T, match_peps = T, relabel = c()) { for (i in 1:length(reps)) { message(paste("Now processing: ", reps[[i]])) reps_df[[i]] <- make_area_table(reps[[i]]) %>% - mutate_(.dots = setNames(list(i), c("tech_rep"))) + mutate(tech_rep = list(i)) } # Combine into single dataframe with all technical replicates @@ -90,29 +90,33 @@ quantitate <- function(reps, normalize = T, match_peps = T, relabel = c()) { # Rename some protein groups if (length(relabel) > 0) { - # Rename lazy eval function - rename_lazy = interp(~str_replace_all(protein_desc, relabel), relabel = relabel) combined %>% - mutate_(.dots = setNames(list(rename_lazy), c("protein_desc"))) -> combined + mutate(protein_desc = str_replace_all(.data$protein_desc, relabel)) -> + combine } # Check if we should normalize to total area for a given replicate # This accounts for any variability in how the sample was injected message("Quantitating...") if (normalize == TRUE) { - # Lazy mutate list - normalize_lazy = list(~sum(area, na.rm = T), ~ area/total_area) combined %>% - group_by_(~tech_rep) %>% - mutate_(.dots = setNames(normalize_lazy, c("total_area", "area"))) %>% - ungroup() -> combined + group_by(.data$tech_rep) %>% + mutate( + total_area = sum(.data$area, na.rm = T), + area = .data$area / .data$total_area + ) %>% + ungroup() -> + combined } # Quantitate using top three most abundant areas combined %>% - group_by_(~protein_desc) %>% - do_(~merge_top_peptides(., num_reps, match_peps = match_peps)) -> combined + group_by(.data$protein_desc) %>% + do(merge_top_peptides(., num_reps, match_peps = match_peps)) -> + combined return(combined) - } + + +globalVariables(c(".", "sd"))