Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,4 @@ export(quantitate)
import(dplyr)
import(stringr)
import(tidyr)
importFrom(lazyeval,interp)
importFrom(stats,setNames)
64 changes: 28 additions & 36 deletions R/make_area_table.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,35 +28,39 @@ make_area_table <- function(msf_file, min_conf = "High", prot_regex = "^>([a-zA-
# Access MSF database file
# my_db <- src_sqlite(msf_file)
my_db <- DBI::dbConnect(RSQLite::SQLite(), msf_file)
on.exit(DBI::dbDisconnect(my_db))

Events <- tbl(my_db, "Events") %>%
select_(~-RT, ~-LeftRT, ~-RightRT, ~-SN, ~-FileID, ~-Intensity)
select(-"RT", -"LeftRT", -"RightRT", -"SN", -"FileID", -"Intensity")

EventAreaAnnotations <- tbl(my_db, "EventAreaAnnotations") %>%
select_(~EventID, ~QuanResultID)
select("EventID", "QuanResultID")

PrecursorIonAreaSearchSpectra <- tbl(my_db, "PrecursorIonAreaSearchSpectra")

# Here is where all the scan information, including mass and charge
SpectrumHeaders <- tbl(my_db, "SpectrumHeaders") %>%
select_(~SpectrumID, ~FirstScan, ~Mass, ~Charge, ~RetentionTime, ~UniqueSpectrumID) %>%
select("SpectrumID", "FirstScan", "Mass", "Charge", "RetentionTime", "UniqueSpectrumID") %>%
collect(n = Inf)

# Grab intensities
MassPeaks <- tbl(my_db, "MassPeaks") %>%
select_(~MassPeakID, ~Intensity) %>%
select("MassPeakID", "Intensity") %>%
collect(n = Inf)

# Here are all the areas
# m/z is really just one of many m/z's that could be retrieved from the msf file.
# All of the m/z's are calculated slightly differently. I just picked one since
# I don't use m/z's for my work.
events_joined <- inner_join(Events, EventAreaAnnotations, by = "EventID") %>%
events_joined <- Events %>%
inner_join(EventAreaAnnotations, by = "EventID") %>%
inner_join(PrecursorIonAreaSearchSpectra, by = "QuanResultID") %>%
collect(n = Inf) %>%
group_by_(~SearchSpectrumID) %>%
summarize_(.dots = setNames(list(~sum(Area), ~mean(Mass)),
c("Area", "m_z")))
group_by(.data$SearchSpectrumID) %>%
summarize(
Area = sum(.data$Area),
m_z = mean(.data$Mass)
)

# NOTE: mass and m/z are probably not correct right now! I have check them in more detail

Expand All @@ -68,34 +72,22 @@ make_area_table <- function(msf_file, min_conf = "High", prot_regex = "^>([a-zA-

pep_table <- make_pep_table(msf_file, min_conf, prot_regex, collapse)

# Rename columns for more consistent column naming
old_names <- list(~peptide_id,
~SpectrumID,
~protein_desc,
~sequence,
~Area,
~Mass,
~m_z,
~Charge,
~Intensity,
~FirstScan)
new_names <- c("peptide_id",
"spectrum_id",
"protein_desc",
"sequence",
"area",
"mass",
"m_z",
"charge",
"intensity",
"first_scan")

# Join peptide info to mass/area/charge/etc.
auc_table <- right_join(spectra, pep_table, by=c("SpectrumID" = "spectrum_id")) %>%
select_(.dots = setNames(old_names, new_names))

DBI::dbDisconnect(my_db)
auc_table <- spectra %>%
# Join peptide info to mass/area/charge/etc.
right_join(pep_table, by = c("SpectrumID" = "spectrum_id")) %>%
# Rename columns for more consistent column naming
select(
"peptide_id" = "peptide_id",
"spectrum_id" = "SpectrumID",
"protein_desc" = "protein_desc",
"sequence" = "sequence",
"area" = "Area",
"mass" = "Mass",
"m_z" = "m_z",
"charge" = "Charge",
"intensity" = "Intensity",
"first_scan" = "FirstScan"
)

return(auc_table)

}
78 changes: 36 additions & 42 deletions R/make_pep_table.R
Original file line number Diff line number Diff line change
Expand Up @@ -50,47 +50,49 @@ make_pep_table <- function(msf_file,
# Access MSF database file
# my_db <- dplyr::src_sqlite(msf_file)
my_db <- DBI::dbConnect(RSQLite::SQLite(), msf_file)
on.exit(DBI::dbDisconnect(my_db))

# First check file version (only Proteome Discoverer 1.4.x is supported)
schema <- tbl(my_db, "SchemaInfo") %>%
select_(~SoftwareVersion) %>%
select("SoftwareVersion") %>%
collect()
if (!str_detect(as.character(schema$SoftwareVersion[[1]]), "^1\\.4\\..*$")) {
warning("Only ThermoFisher MSF files generated by Proteome Discoverer 1.4.x are supported. parsemsf functions may produce unexpected results.")
}

# This table maps PeptideIDs to ProteinIDs
PeptidesProteins <- tbl(my_db, "PeptidesProteins") %>%
select_(~PeptideID, ~ProteinID)
select("PeptideID", "ProteinID")

# Here are the actual peptide sequences with corresponding SpectrumIDs
Peptides <- tbl(my_db, "Peptides") %>%
select_(~PeptideID, ~SpectrumID, ~ConfidenceLevel, ~Sequence) %>%
filter_(~ ConfidenceLevel >= confidence)
select("PeptideID", "SpectrumID", "ConfidenceLevel", "Sequence") %>%
filter(.data$ConfidenceLevel >= confidence)

# Protein descriptions are contained in this table
ProteinAnnotations <- tbl(my_db, "ProteinAnnotations") %>%
select_(~ProteinID, ~Description)

# Lazy-eval formula for protein name matching
prot_match <- lazyeval::interp(~str_match(Description, prot_regex)[,2], prot_regex = prot_regex)
select("ProteinID", "Description")

# Build a peptide table
pep_table <- inner_join(PeptidesProteins, Peptides, by = c("PeptideID" = "PeptideID")) %>%
pep_table <- PeptidesProteins %>%
inner_join(Peptides, by = c("PeptideID" = "PeptideID")) %>%
inner_join(ProteinAnnotations, by = "ProteinID") %>%
collect(n = Inf) %>%
# Extract protein ID; assumes that protein ID is immediately after ">" and ends with a space
mutate_(.dots = setNames(list(prot_match), c("Proteins"))) %>%
group_by_(~PeptideID, ~SpectrumID, ~Sequence)
mutate(Proteins = str_match(.data$Description, prot_regex)[,2]) %>%
group_by(.data$PeptideID, .data$SpectrumID, .data$Sequence)

# Collapse peptides that map to multiple proteins into a single row
if (collapse == TRUE) {
pep_table <- pep_table %>%
summarize_(.dots = setNames(list(~paste(Proteins, sep = "", collapse = "; "),
~paste(ProteinID, sep = "", collapse = "; ")),
c("Proteins",
"ProteinID")))
summarize(
Proteins = paste0(.data$Proteins, collapse = "; "),
ProteinID = paste0(.data$ProteinID, collapse = "; ")
)
}
pep_table <- pep_table %>% select_(~PeptideID, ~SpectrumID, ~Sequence, ~Proteins, ~ProteinID)

pep_table <- pep_table %>%
select("PeptideID", "SpectrumID", "Sequence", "Proteins", "ProteinID")
# Append custom fields
CustomFields <- tbl(my_db, "CustomDataFields")

Expand All @@ -99,33 +101,25 @@ make_pep_table <- function(msf_file,
CustomPeptides <- tbl(my_db, sql("SELECT FieldID, PeptideID, CAST(FieldValue as REAL) AS FieldValue FROM CustomDataPeptides"))

# Spread custom fields as separate columns
custom_data <- left_join(CustomPeptides, CustomFields, by = "FieldID") %>%
select_(~PeptideID, ~DisplayName, ~FieldValue) %>%
custom_data <- CustomPeptides %>%
left_join(CustomFields, by = "FieldID") %>%
select("PeptideID", "DisplayName", "FieldValue") %>%
collect(n = Inf) %>%
spread_("DisplayName", "FieldValue")

# Join to peptide table
pep_table <- left_join(pep_table, custom_data, by = "PeptideID")

old_names <- list(~PeptideID,
~SpectrumID,
~ProteinID,
~Proteins,
~Sequence,
~PEP,
~`q-Value`)
new_names <- c("peptide_id",
"spectrum_id",
"protein_id",
"protein_desc",
"sequence",
"pep_score",
"q_value")

# Rename columns for a more consistent column name style
pep_table <- select_(pep_table, .dots = setNames(old_names, new_names))

DBI::dbDisconnect(my_db)
spread("DisplayName", "FieldValue")

pep_table <- pep_table %>%
# Join to peptide table
left_join(custom_data, by = "PeptideID") %>%
# Rename columns for a more consistent column name style
select(
peptide_id = "PeptideID",
spectrum_id = "SpectrumID",
protein_id = "ProteinID",
protein_desc = "Proteins",
sequence = "Sequence",
pep_score = "PEP",
q_value = "q-Value"
)

return(pep_table)

Expand Down
18 changes: 10 additions & 8 deletions R/map_peptides.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,21 +25,23 @@
map_peptides <- function(msf_file, min_conf = "High", prot_regex = "") {

pep_table <- make_pep_table(msf_file, min_conf, collapse = FALSE) %>%
rename_(.dots = setNames(list(~sequence), c("peptide_sequence")))
rename(peptide_sequence = "sequence")

my_db <- DBI::dbConnect(RSQLite::SQLite(), msf_file)

prots <- tbl(my_db, "Proteins") %>%
select_(~ProteinID, ~Sequence) %>%
select(.data$ProteinID, .data$Sequence) %>%
collect()

pep_table <- inner_join(pep_table, prots, by = c("protein_id" = "ProteinID")) %>%
rename_(.dots = setNames(list(~Sequence), c("protein_sequence"))) %>%
pep_table <- pep_table %>%
inner_join(prots, by = c("protein_id" = "ProteinID")) %>%
rename(protein_sequence = "Sequence") %>%
rowwise() %>%
mutate_(.dots = setNames(list(~str_locate(protein_sequence, peptide_sequence)[1],
~str_locate(protein_sequence, peptide_sequence)[2]),
c("start",
"end")))
mutate(
start = str_locate(.data$protein_sequence, .data$peptide_sequence)[1],
end = str_locate(.data$protein_sequence, .data$peptide_sequence)[2]
)

DBI::dbDisconnect(my_db)

return(pep_table)
Expand Down
1 change: 0 additions & 1 deletion R/parsemsf.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
#' @name parsemsf
#'
#' @import dplyr
#' @importFrom lazyeval interp
#' @import tidyr
#' @importFrom stats setNames
#' @import stringr
Expand Down
64 changes: 34 additions & 30 deletions R/quantitate.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,30 +20,30 @@
merge_top_peptides <- function(df, num_reps, match_peps = TRUE) {

df %>%
group_by_(~tech_rep, ~sequence) %>%
distinct_(.dots = list(~area)) %>% # Remove peptides with identical areas
summarize_(.dots = setNames(list(~sum(area, na.rm = T)),
c("area"))) %>% # Sum across peptides with different charges
group_by_(~tech_rep) %>%
filter_(~ min_rank(desc(area)) <= 3) %>% # Find top 3 peptides
ungroup() -> df
group_by(.data$tech_rep, .data$sequence) %>%
distinct(.data$area) %>% # Remove peptides with identical areas
summarize(area = sum(.data$area, na.rm = T)) %>% # Sum across peptides with different charges
group_by(.data$tech_rep) %>%
filter(min_rank(desc(.data$area)) <= 3) %>% # Find top 3 peptides
ungroup() ->
df

if (match_peps == TRUE) {
df %>% group_by_(~sequence) %>%
mutate_(.dots = setNames(list(~n()), c("n"))) %>% # Count number of matched peptides
df %>%
group_by(.data$sequence) %>%
mutate(n = n()) %>% # Count number of matched peptides
ungroup() %>%
filter_(~ n >= num_reps) -> df # Remove any unmatched peptides
filter(.data$n >= num_reps) ->
df # Remove any unmatched peptides
}

dots <- setNames(list(~mean(area, na.rm = T),
~sd(area, na.rm = T),
~ n()/num_reps),
c("area_mean",
"area_sd",
"peps_per_rep"))

df %>%
summarize_(.dots = dots) -> matched_areas # Compute mean areas from top peptides
summarize(
area_mean = mean(.data$area, na.rm = T),
area_sd = sd(.data$area, na.rm = T),
peps_per_rep = n() / num_reps
) ->
matched_areas # Compute mean areas from top peptides

return(matched_areas)
}
Expand Down Expand Up @@ -82,37 +82,41 @@ quantitate <- function(reps, normalize = T, match_peps = T, relabel = c()) {
for (i in 1:length(reps)) {
message(paste("Now processing: ", reps[[i]]))
reps_df[[i]] <- make_area_table(reps[[i]]) %>%
mutate_(.dots = setNames(list(i), c("tech_rep")))
mutate(tech_rep = list(i))
}

# Combine into single dataframe with all technical replicates
combined <- bind_rows(reps_df)

# Rename some protein groups
if (length(relabel) > 0) {
# Rename lazy eval function
rename_lazy = interp(~str_replace_all(protein_desc, relabel), relabel = relabel)
combined %>%
mutate_(.dots = setNames(list(rename_lazy), c("protein_desc"))) -> combined
mutate(protein_desc = str_replace_all(.data$protein_desc, relabel)) ->
combine
}

# Check if we should normalize to total area for a given replicate
# This accounts for any variability in how the sample was injected
message("Quantitating...")
if (normalize == TRUE) {
# Lazy mutate list
normalize_lazy = list(~sum(area, na.rm = T), ~ area/total_area)
combined %>%
group_by_(~tech_rep) %>%
mutate_(.dots = setNames(normalize_lazy, c("total_area", "area"))) %>%
ungroup() -> combined
group_by(.data$tech_rep) %>%
mutate(
total_area = sum(.data$area, na.rm = T),
area = .data$area / .data$total_area
) %>%
ungroup() ->
combined
}

# Quantitate using top three most abundant areas
combined %>%
group_by_(~protein_desc) %>%
do_(~merge_top_peptides(., num_reps, match_peps = match_peps)) -> combined
group_by(.data$protein_desc) %>%
do(merge_top_peptides(., num_reps, match_peps = match_peps)) ->
combined

return(combined)

}


globalVariables(c(".", "sd"))