Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
205 changes: 178 additions & 27 deletions R/clean_Spectronaut.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,60 +4,211 @@
#' @return `data.table`
#' @keywords internal
.cleanRawSpectronaut = function(msstats_object, intensity,
calculateAnomalyScores,
anomalyModelFeatures) {
calculateAnomalyScores,
anomalyModelFeatures,
peptideSequenceColumn = "EG.ModifiedSequence",
heavyLabels = NULL) {
FFrgLossType = FExcludedFromQuantification = NULL

spec_input = getInputFile(msstats_object, "input")
.validateSpectronautInput(spec_input)
.validateSpectronautInput(spec_input, peptideSequenceColumn)
spec_input = .addSpectronautColumnsIfMissing(spec_input)
spec_input = spec_input[FFrgLossType == "noloss", ]

f_charge_col = .findAvailable(c("FCharge", "FFrgZ"), colnames(spec_input))
f_charge_col = .findAvailable(c("FCharge", "FFrgZ"), colnames(spec_input), fall_back = "FCharge")
pg_qval_col = .findAvailable(c("PGQvalue"), colnames(spec_input))
interference_col = .findAvailable(c("FPossibleInterference"),
interference_col = .findAvailable(c("FPossibleInterference"),
colnames(spec_input))
exclude_col = .findAvailable(c("FExcludedFromQuantification"),
exclude_col = .findAvailable(c("FExcludedFromQuantification"),
colnames(spec_input))
intensity_column_mapping = c(
"PeakArea" = "FPeakArea",
"NormalizedPeakArea" = "FNormalizedPeakArea",
"MS1Quantity" = "FGMS1Quantity"
)
intensity = match.arg(intensity, names(intensity_column_mapping))
intensity_column = intensity_column_mapping[[intensity]]
cols = c("PGProteinGroups", "EGModifiedSequence", "FGCharge", "FFrgIon",
f_charge_col, "RFileName", "RCondition", "RReplicate",
intensity_col = .resolveSpectronautIntensityColumn(intensity, colnames(spec_input))
peptide_col = .findAvailable(.standardizeColnames(peptideSequenceColumn),
colnames(spec_input))
protein_col = .findAvailable(c("PGProteinGroups", "PGProteinAccessions"),
colnames(spec_input), fall_back = "PGProteinGroups")

cols = c(protein_col, peptide_col, "FGCharge", "FFrgIon",
f_charge_col, "RFileName", "RCondition", "RReplicate",
"EGQvalue", pg_qval_col, interference_col, exclude_col,
intensity_column)
intensity_col)
if (calculateAnomalyScores){
cols = c(cols, anomalyModelFeatures)
}
cols = intersect(cols, colnames(spec_input))
spec_input = spec_input[, cols, with = FALSE]

data.table::setnames(
spec_input,
c("PGProteinGroups", "EGModifiedSequence", "FGCharge", "FFrgIon",
f_charge_col, "RFileName", intensity_column,
spec_input,
c(protein_col, peptide_col, "FGCharge", "FFrgIon",
f_charge_col, "RFileName", intensity_col,
"RCondition", "RReplicate"),
c("ProteinName", "PeptideSequence", "PrecursorCharge", "FragmentIon",
"ProductCharge", "Run", "Intensity", "Condition", "BioReplicate"),
"ProductCharge", "Run", "Intensity", "Condition", "BioReplicate"),
skip_absent = TRUE)

spec_input = .assignSpectronautIsotopeLabelType(
spec_input, heavyLabels)

.logSuccess("Spectronaut", "clean")
spec_input
}

#' Helper method to validate input has necessary columns
#' @param spec_input dataframe input
#' @param peptideSequenceColumn character, name of the column containing peptide
#' sequences, passed from user
#' @noRd
.validateSpectronautInput = function(spec_input) {
required_columns = c(
"FFrgLossType", "FExcludedFromQuantification", "PGProteinGroups",
"FGCharge")
.validateSpectronautInput = function(spec_input, peptideSequenceColumn) {
# Only FGCharge is truly required
required_columns = c("FGCharge")
missing_columns = setdiff(required_columns, colnames(spec_input))
if (length(missing_columns) > 0) {
msg = paste("The following columns are missing from the input data:",
msg = paste("The following columns are missing from the input data:",
paste(missing_columns, sep = ", ", collapse = ", "))
getOption("MSstatsLog")("ERROR", msg)
stop(msg)
}
# Ensure at least one protein name column is present
if (!any(c("PGProteinGroups", "PGProteinAccessions") %in% colnames(spec_input))) {
msg = paste("The following columns are missing from the input data:",
"PGProteinGroups")
getOption("MSstatsLog")("ERROR", msg)
stop(msg)
}
# Ensure at least one protein name column is present
if (!(.standardizeColnames(peptideSequenceColumn) %in% colnames(spec_input))) {
msg = paste("The following column are missing from the input data:",
peptideSequenceColumn)
getOption("MSstatsLog")("ERROR", msg)
stop(msg)
}
}

#' Add synthetic columns that are absent in protein turnover Spectronaut reports.
#'
#' Spectronaut's protein turnover export omits several columns that the
#' standard MSstats pipeline expects. Rather than requiring callers to patch
#' the data frame before passing it in (the "hacks" documented in the design
#' document), we synthesize sensible defaults here so the rest of the pipeline
#' sees a consistent schema.
#'
#' Columns synthesized when absent:
#' \describe{
#' \item{FFrgLossType}{"noloss" — the filter \code{spec_input[FFrgLossType == "noloss"]}
#' keeps all rows, which is the correct behavior when the column is missing.}
#' \item{FExcludedFromQuantification}{FALSE — no rows are excluded.}
#' \item{FFrgIon}{NA — fragment ion identity is not available at the precursor
#' level used by MS1-based protein turnover workflows.}
#' \item{FCharge}{NA — product charge is not applicable when fragment-level
#' columns are absent.}
#' }
#'
#' @param spec_input `data.table` with standardized column names.
#' @return `data.table` with missing columns added.
#' @keywords internal
#' @noRd
.addSpectronautColumnsIfMissing = function(spec_input) {
if (!("FFrgLossType" %in% colnames(spec_input))) {
spec_input[, FFrgLossType := "noloss"]
}
if (!("FExcludedFromQuantification" %in% colnames(spec_input))) {
spec_input[, FExcludedFromQuantification := FALSE]
}
if (!("FFrgIon" %in% colnames(spec_input))) {
spec_input[, FFrgIon := NA_character_]
}
if (!("FCharge" %in% colnames(spec_input))) {
spec_input[, FCharge := NA_integer_]
}
spec_input
}


#' Resolve the Spectronaut intensity column from user input.
#'
#' Accepts either a legacy enum alias or a raw (standardized) column name.
#' The legacy aliases map to their canonical standardized column names so that
#' old code continues to work unchanged. When the user passes a raw column
#' name (e.g. \code{"FGMS1Quantity"} or \code{"FGMS2Quantity"}), it is used
#' directly after verifying that the column exists.
#'
#' @param intensity Character scalar: enum alias or raw column name.
#' @param available_cols Character vector of available standardized column names.
#' @return The resolved standardized column name.
#' @keywords internal
#' @noRd
.resolveSpectronautIntensityColumn = function(intensity, available_cols) {
legacy_mapping = c(
"PeakArea" = "FPeakArea",
"NormalizedPeakArea" = "FNormalizedPeakArea"
)

if (intensity %in% names(legacy_mapping)) {
resolved = legacy_mapping[[intensity]]
} else {
resolved = .standardizeColnames(intensity)
}

if (!(resolved %in% available_cols)) {
stop(paste0(
"Intensity column '", intensity, "' not found in input data. ", collapse = ", ")
)
}
resolved
}


#' Assign IsotopeLabelType based on heavy label detection.
#'
#' When \code{heavyLabel} is provided, each row is classified as heavy
#' (\code{"H"}), light (\code{"L"}), or unlabeled (\code{NA}) by inspecting
#' the labeled sequence column for the presence of the label tag.
#'
#' In Spectronaut protein turnover reports, heavy peptides appear in
#' \code{FG.LabeledSequence} with a bracketed modification, e.g.
#' \code{_PEPTIDEK[Lys6]_}. Any sequence that contains
#' \code{[<heavyLabel>]} is classified as heavy; all others are light.
#' Sequences that do not have amino acids that can carry the label
#' are classified as \code{NA}. For example, if \code{heavyLabels} is
#' \code{"Lys6"}, then \code{PEPTIDEZ} is classified as NA since it
#' has no lysine residues that could be labeled.
#' When \code{heavyLabel} is \code{NULL} the column is left untouched so
#' that the downstream \code{columns_to_fill} default of \code{"L"} applies,
#' preserving backwards compatibility.
#'
#' @param spec_input `data.table` after column renaming.
#' @param heavyLabels Character scalar heavy label name (e.g. \code{"Lys6"}),
#' or \code{NULL}.
#' @return `data.table` with \code{IsotopeLabelType} column added or updated.
#' @keywords internal
#' @noRd
.assignSpectronautIsotopeLabelType = function(spec_input, heavyLabels) {
IsotopeLabelType = PeptideSequence = StrippedSequence = NULL
if (is.null(heavyLabels)) {
return(spec_input)
}

bare_amino_acids = sub("\\[.*\\]", "", heavyLabels)
bare_amino_acids_pattern = paste0(bare_amino_acids, collapse = "|")
heavy_pattern = paste0(heavyLabels, collapse = "|")
heavy_brackets_escaped_pattern = paste(
gsub("([\\[\\]])", "\\\\\\1", heavy_pattern, perl = TRUE),
collapse = "|"
)

spec_input[, StrippedSequence := gsub("\\[.*?\\]", "", PeptideSequence)]

spec_input[, IsotopeLabelType := data.table::fcase(
grepl(heavy_brackets_escaped_pattern, PeptideSequence, perl = TRUE), "H",
grepl(bare_amino_acids_pattern, StrippedSequence, perl = TRUE), "L",
default = NA_character_
)]

spec_input[, StrippedSequence := NULL]

for (i in seq_along(heavyLabels)) {
escaped = gsub("([\\[\\]])", "\\\\\\1", heavyLabels[i], perl = TRUE)
spec_input[, PeptideSequence := gsub(escaped, bare_amino_acids[i], PeptideSequence, perl = TRUE)]
}
spec_input
}
58 changes: 43 additions & 15 deletions R/converters_SpectronauttoMSstatsFormat.R
Original file line number Diff line number Diff line change
@@ -1,9 +1,28 @@
#' Import Spectronaut files
#'
#'
#' @param input name of Spectronaut output, which is long-format. ProteinName, PeptideSequence, PrecursorCharge, FragmentIon, ProductCharge, IsotopeLabelType, Condition, BioReplicate, Run, Intensity, F.ExcludedFromQuantification are required. Rows with F.ExcludedFromQuantification=True will be removed.
#' @param annotation name of 'annotation.txt' data which includes Condition, BioReplicate, Run. If annotation is already complete in Spectronaut, use annotation=NULL (default). It will use the annotation information from input.
#' @param intensity 'PeakArea'(default) uses not normalized MS2 peak area. 'NormalizedPeakArea' uses MS2 peak area normalized by Spectronaut.
#' 'MS1Quantity' uses MS1 level quantification, which should be used if MS2 is unreliable.
#' @param intensity Intensity column to use. Accepts legacy enum values
#' \code{'PeakArea'} (default, uses F.PeakArea), \code{'NormalizedPeakArea'}
#' (uses F.NormalizedPeakArea). Can also be any raw
#' Spectronaut column name passed as a string (e.g.
#' \code{"FG.MS1Quantity"}); the column name is standardized internally.
#' For protein turnover workflows the recommended default is
#' \code{"FG.MS1Quantity"}.
#' @param peptideSequenceColumn Name of the Spectronaut column that contains the
#' peptide sequence. Defaults to \code{"EG.ModifiedSequence"}. The value is
#' standardized internally (dots and spaces removed) before column lookup.
#' @param heavyLabels Character list identifying the heavy isotope labels as it
#' appears inside square brackets in the peptide sequence column, e.g.
#' \code{c("Lys6")} matches peptides containing \code{[Lys6]}.
#' \code{c("Lys6", "Arg10")} matches peptides containing either \code{[Lys6]} or \code{[Arg10]}.
#' Supports any novel label name reported by Spectronaut (e.g. \code{"Leu6"},
#' \code{"Phe10"}). When provided, peptides are
#' classified as heavy (\code{IsotopeLabelType = "H"}), light
#' (\code{IsotopeLabelType = "L"}), or unlabeled
#' (\code{IsotopeLabelType = NA}) based on its labeled sequence. When
#' \code{NULL} (default) all peptides receive \code{IsotopeLabelType = "L"}.
#' Useful for protein turnover experiments.
#' @param excludedFromQuantificationFilter Remove rows with F.ExcludedFromQuantification=TRUE Default is TRUE.
#' @param filter_with_Qvalue FALSE(default) will not perform any filtering. TRUE will filter out the intensities that have greater than qvalue_cutoff in EG.Qvalue column. Those intensities will be replaced with zero and will be considered as censored missing values for imputation purpose.
#' @param qvalue_cutoff Cutoff for EG.Qvalue. default is 0.01.
Expand Down Expand Up @@ -33,8 +52,10 @@
#' head(spectronaut_imported)
#'
SpectronauttoMSstatsFormat = function(
input, annotation = NULL,
intensity = c('PeakArea', 'NormalizedPeakArea', 'MS1Quantity'),
input, annotation = NULL,
intensity = 'PeakArea',
peptideSequenceColumn = "EG.ModifiedSequence",
heavyLabels = NULL,
excludedFromQuantificationFilter = TRUE,
filter_with_Qvalue = FALSE, qvalue_cutoff = 0.01,
useUniquePeptide = TRUE, removeFewMeasurements=TRUE,
Expand All @@ -47,10 +68,11 @@ SpectronauttoMSstatsFormat = function(
use_log_file = TRUE, append = FALSE, verbose = TRUE,
log_file_path = NULL, ...
) {

validation_config = list(
input = input,
annotation = annotation,
intensity = intensity,
input = input,
annotation = annotation,
intensity = intensity,
excludedFromQuantificationFilter = excludedFromQuantificationFilter,
filter_with_Qvalue = filter_with_Qvalue,
qvalue_cutoff = qvalue_cutoff,
Expand Down Expand Up @@ -84,8 +106,10 @@ SpectronauttoMSstatsFormat = function(
"MSstats", "Spectronaut", ...)

input = MSstatsConvert::MSstatsClean(input, intensity = intensity,
calculateAnomalyScores,
anomalyModelFeatures)
calculateAnomalyScores,
anomalyModelFeatures,
peptideSequenceColumn = peptideSequenceColumn,
heavyLabels = heavyLabels)
annotation = MSstatsConvert::MSstatsMakeAnnotation(input, annotation)

pq_filter = list(score_column = "PGQvalue",
Expand Down Expand Up @@ -113,20 +137,24 @@ SpectronauttoMSstatsFormat = function(
drop_column = TRUE
)

feature_columns = c("PeptideSequence", "PrecursorCharge",
feature_columns = c("PeptideSequence", "PrecursorCharge",
"FragmentIon", "ProductCharge")

fill_isotope_label_type = if (is.null(heavyLabels))
list("IsotopeLabelType" = "L") else list()

input = MSstatsConvert::MSstatsPreprocess(
input,
annotation,
input,
annotation,
feature_columns,
remove_shared_peptides = useUniquePeptide,
remove_single_feature_proteins = removeProtein_with1Feature,
feature_cleaning = list(remove_features_with_few_measurements = removeFewMeasurements,
summarize_multiple_psms = summaryforMultipleRows),
score_filtering = list(pgq = pq_filter,
score_filtering = list(pgq = pq_filter,
psm_q = qval_filter),
exact_filtering = list(excluded_quant = excluded_quant_filter),
columns_to_fill = list("IsotopeLabelType" = "L"),
columns_to_fill = fill_isotope_label_type,
anomaly_metrics = anomalyModelFeatures)
input[, Intensity := ifelse(Intensity == 0, NA, Intensity)]

Expand Down
30 changes: 23 additions & 7 deletions R/utils_balanced_design.R
Original file line number Diff line number Diff line change
Expand Up @@ -94,19 +94,35 @@
measurement_col, group_col)
result[, 2:4, with = FALSE]
} else {
labels = unique(input[["IsotopeLabelType"]])
labels = na.omit(unique(input[["IsotopeLabelType"]]))
groups = unique(input[[group_col]])
by_group = vector("list", length(groups))
measurements = unique(input[[measurement_col]])
for (group_id in seq_along(groups)) {
group = groups[group_id]
group_filter = input[[group_col]] == group
by_group[[group_id]] = data.table::as.data.table(
expand.grid(
labels = labels,
features = unique(input[[feature_col]][group_filter]),
measurements = unique(input[[measurement_col]][group_filter])
))
non_na_filter = group_filter & !is.na(input[["IsotopeLabelType"]])
na_filter = group_filter & is.na(input[["IsotopeLabelType"]])
non_na_features = unique(input[[feature_col]][non_na_filter])
na_label_features = unique(input[[feature_col]][na_filter])
parts = list()
if (length(non_na_features) > 0) {
parts[[length(parts) + 1]] = data.table::as.data.table(
expand.grid(
labels = labels,
features = non_na_features,
measurements = unique(input[[measurement_col]][group_filter])
))
}
if (length(na_label_features) > 0) {
parts[[length(parts) + 1]] = data.table::as.data.table(
expand.grid(
labels = NA,
features = na_label_features,
measurements = unique(input[[measurement_col]][group_filter])
))
}
by_group[[group_id]] = data.table::rbindlist(parts)
by_group[[group_id]]$group = group
}
result = data.table::rbindlist(by_group)
Expand Down
Loading
Loading