Vitek-Lab · tonywu1999 · Apr 2, 2026 · Mar 30, 2026 · Apr 1, 2026 · Apr 1, 2026
diff --git a/R/clean_Spectronaut.R b/R/clean_Spectronaut.R
@@ -4,60 +4,211 @@
 #' @return `data.table`
 #' @keywords internal
 .cleanRawSpectronaut = function(msstats_object, intensity,
-                                calculateAnomalyScores, 
-                                anomalyModelFeatures) {
+                                calculateAnomalyScores,
+                                anomalyModelFeatures,
+                                peptideSequenceColumn = "EG.ModifiedSequence",
+                                heavyLabels = NULL) {
   FFrgLossType = FExcludedFromQuantification = NULL
-  
+
   spec_input = getInputFile(msstats_object, "input")
-  .validateSpectronautInput(spec_input)
+  .validateSpectronautInput(spec_input, peptideSequenceColumn)
+  spec_input = .addSpectronautColumnsIfMissing(spec_input)
   spec_input = spec_input[FFrgLossType == "noloss", ]
-
-  f_charge_col = .findAvailable(c("FCharge", "FFrgZ"), colnames(spec_input))
+  
+  f_charge_col = .findAvailable(c("FCharge", "FFrgZ"), colnames(spec_input), fall_back = "FCharge")
   pg_qval_col = .findAvailable(c("PGQvalue"), colnames(spec_input))
-  interference_col = .findAvailable(c("FPossibleInterference"), 
+  interference_col = .findAvailable(c("FPossibleInterference"),
                                     colnames(spec_input))
-  exclude_col = .findAvailable(c("FExcludedFromQuantification"), 
+  exclude_col = .findAvailable(c("FExcludedFromQuantification"),
                                colnames(spec_input))
-  intensity_column_mapping = c(
-      "PeakArea" = "FPeakArea",
-      "NormalizedPeakArea" = "FNormalizedPeakArea",
-      "MS1Quantity" = "FGMS1Quantity"
-  )
-  intensity = match.arg(intensity, names(intensity_column_mapping))
-  intensity_column = intensity_column_mapping[[intensity]]
-  cols = c("PGProteinGroups", "EGModifiedSequence", "FGCharge", "FFrgIon", 
-           f_charge_col, "RFileName", "RCondition", "RReplicate", 
+  intensity_col = .resolveSpectronautIntensityColumn(intensity, colnames(spec_input))
+  peptide_col = .findAvailable(.standardizeColnames(peptideSequenceColumn),
+                               colnames(spec_input))
+  protein_col = .findAvailable(c("PGProteinGroups", "PGProteinAccessions"),
+                               colnames(spec_input), fall_back = "PGProteinGroups")
+
+  cols = c(protein_col, peptide_col, "FGCharge", "FFrgIon",
+           f_charge_col, "RFileName", "RCondition", "RReplicate",
            "EGQvalue", pg_qval_col, interference_col, exclude_col,
-           intensity_column)
+           intensity_col)
   if (calculateAnomalyScores){
     cols = c(cols, anomalyModelFeatures)
   }
   cols = intersect(cols, colnames(spec_input))
   spec_input = spec_input[, cols, with = FALSE]
+
   data.table::setnames(
-    spec_input, 
-    c("PGProteinGroups", "EGModifiedSequence", "FGCharge", "FFrgIon",
-      f_charge_col, "RFileName", intensity_column, 
+    spec_input,
+    c(protein_col, peptide_col, "FGCharge", "FFrgIon",
+      f_charge_col, "RFileName", intensity_col,
       "RCondition", "RReplicate"),
     c("ProteinName", "PeptideSequence", "PrecursorCharge", "FragmentIon",
-      "ProductCharge", "Run", "Intensity", "Condition", "BioReplicate"), 
+      "ProductCharge", "Run", "Intensity", "Condition", "BioReplicate"),
     skip_absent = TRUE)
+
+  spec_input = .assignSpectronautIsotopeLabelType(
+    spec_input, heavyLabels)
+
   .logSuccess("Spectronaut", "clean")
   spec_input
 }
 
 #' Helper method to validate input has necessary columns
 #' @param spec_input dataframe input
+#' @param peptideSequenceColumn character, name of the column containing peptide 
+#' sequences, passed from user
 #' @noRd
-.validateSpectronautInput = function(spec_input) {
-    required_columns = c(
-        "FFrgLossType", "FExcludedFromQuantification", "PGProteinGroups",
-        "FGCharge")
+.validateSpectronautInput = function(spec_input, peptideSequenceColumn) {
+    # Only FGCharge is truly required
+    required_columns = c("FGCharge")
     missing_columns = setdiff(required_columns, colnames(spec_input))
     if (length(missing_columns) > 0) {
-        msg = paste("The following columns are missing from the input data:", 
+        msg = paste("The following columns are missing from the input data:",
                     paste(missing_columns, sep = ", ", collapse = ", "))
         getOption("MSstatsLog")("ERROR", msg)
         stop(msg)
     }
+    # Ensure at least one protein name column is present
+    if (!any(c("PGProteinGroups", "PGProteinAccessions") %in% colnames(spec_input))) {
+        msg = paste("The following columns are missing from the input data:",
+                    "PGProteinGroups")
+        getOption("MSstatsLog")("ERROR", msg)
+        stop(msg)
+    }
+    # Ensure at least one protein name column is present
+    if (!(.standardizeColnames(peptideSequenceColumn) %in% colnames(spec_input))) {
+        msg = paste("The following column are missing from the input data:",
+                    peptideSequenceColumn)
+        getOption("MSstatsLog")("ERROR", msg)
+        stop(msg)
+    }
+}
+
+#' Add synthetic columns that are absent in protein turnover Spectronaut reports.
+#'
+#' Spectronaut's protein turnover export omits several columns that the
+#' standard MSstats pipeline expects.  Rather than requiring callers to patch
+#' the data frame before passing it in (the "hacks" documented in the design
+#' document), we synthesize sensible defaults here so the rest of the pipeline
+#' sees a consistent schema.
+#'
+#' Columns synthesized when absent:
+#' \describe{
+#'   \item{FFrgLossType}{"noloss" — the filter \code{spec_input[FFrgLossType == "noloss"]}
+#'     keeps all rows, which is the correct behavior when the column is missing.}
+#'   \item{FExcludedFromQuantification}{FALSE — no rows are excluded.}
+#'   \item{FFrgIon}{NA — fragment ion identity is not available at the precursor
+#'     level used by MS1-based protein turnover workflows.}
+#'   \item{FCharge}{NA — product charge is not applicable when fragment-level
+#'     columns are absent.}
+#' }
+#'
+#' @param spec_input `data.table` with standardized column names.
+#' @return `data.table` with missing columns added.
+#' @keywords internal
+#' @noRd
+.addSpectronautColumnsIfMissing = function(spec_input) {
+  if (!("FFrgLossType" %in% colnames(spec_input))) {
+    spec_input[, FFrgLossType := "noloss"]
+  }
+  if (!("FExcludedFromQuantification" %in% colnames(spec_input))) {
+    spec_input[, FExcludedFromQuantification := FALSE]
+  }
+  if (!("FFrgIon" %in% colnames(spec_input))) {
+    spec_input[, FFrgIon := NA_character_]
+  }
+  if (!("FCharge" %in% colnames(spec_input))) {
+    spec_input[, FCharge := NA_integer_]
+  }
+  spec_input
+}
+
+
+#' Resolve the Spectronaut intensity column from user input.
+#'
+#' Accepts either a legacy enum alias or a raw (standardized) column name.
+#' The legacy aliases map to their canonical standardized column names so that
+#' old code continues to work unchanged.  When the user passes a raw column
+#' name (e.g. \code{"FGMS1Quantity"} or \code{"FGMS2Quantity"}), it is used
+#' directly after verifying that the column exists.
+#'
+#' @param intensity Character scalar: enum alias or raw column name.
+#' @param available_cols Character vector of available standardized column names.
+#' @return The resolved standardized column name.
+#' @keywords internal
+#' @noRd
+.resolveSpectronautIntensityColumn = function(intensity, available_cols) {
+  legacy_mapping = c(
+    "PeakArea"           = "FPeakArea",
+    "NormalizedPeakArea" = "FNormalizedPeakArea"
+  )
+
+  if (intensity %in% names(legacy_mapping)) {
+    resolved = legacy_mapping[[intensity]]
+  } else {
+    resolved = .standardizeColnames(intensity)
+  }
+
+  if (!(resolved %in% available_cols)) {
+    stop(paste0(
+      "Intensity column '", intensity, "' not found in input data. ", collapse = ", ")
+    )
+  }
+  resolved
+}
+
+
+#' Assign IsotopeLabelType based on heavy label detection.
+#'
+#' When \code{heavyLabel} is provided, each row is classified as heavy
+#' (\code{"H"}), light (\code{"L"}), or unlabeled (\code{NA}) by inspecting
+#' the labeled sequence column for the presence of the label tag.
+#'
+#' In Spectronaut protein turnover reports, heavy peptides appear in
+#' \code{FG.LabeledSequence} with a bracketed modification, e.g.
+#' \code{_PEPTIDEK[Lys6]_}.  Any sequence that contains
+#' \code{[<heavyLabel>]} is classified as heavy; all others are light.
+#' Sequences that do not have amino acids that can carry the label
+#' are classified as \code{NA}.  For example, if \code{heavyLabels} is 
+#' \code{"Lys6"}, then \code{PEPTIDEZ} is classified as NA since it
+#' has no lysine residues that could be labeled.
+#' When \code{heavyLabel} is \code{NULL} the column is left untouched so
+#' that the downstream \code{columns_to_fill} default of \code{"L"} applies,
+#' preserving backwards compatibility.
+#'
+#' @param spec_input `data.table` after column renaming.
+#' @param heavyLabels Character scalar heavy label name (e.g. \code{"Lys6"}),
+#'   or \code{NULL}.
+#' @return `data.table` with \code{IsotopeLabelType} column added or updated.
+#' @keywords internal
+#' @noRd
+.assignSpectronautIsotopeLabelType = function(spec_input, heavyLabels) {
+    IsotopeLabelType = PeptideSequence = StrippedSequence = NULL
+    if (is.null(heavyLabels)) {
+        return(spec_input)
+    }
+
+    bare_amino_acids = sub("\\[.*\\]", "", heavyLabels)
+    bare_amino_acids_pattern = paste0(bare_amino_acids, collapse = "|")
+    heavy_pattern = paste0(heavyLabels, collapse = "|")
+    heavy_brackets_escaped_pattern = paste(
+        gsub("([\\[\\]])", "\\\\\\1", heavy_pattern, perl = TRUE),
+        collapse = "|"
+    )
+
+    spec_input[, StrippedSequence := gsub("\\[.*?\\]", "", PeptideSequence)]
+
+    spec_input[, IsotopeLabelType := data.table::fcase(
+        grepl(heavy_brackets_escaped_pattern, PeptideSequence, perl = TRUE), "H",
+        grepl(bare_amino_acids_pattern, StrippedSequence, perl = TRUE), "L",
+        default = NA_character_
+    )]
+
+    spec_input[, StrippedSequence := NULL]
+
+    for (i in seq_along(heavyLabels)) {
+        escaped = gsub("([\\[\\]])", "\\\\\\1", heavyLabels[i], perl = TRUE)
+        spec_input[, PeptideSequence := gsub(escaped, bare_amino_acids[i], PeptideSequence, perl = TRUE)]
+    }
+    spec_input
 }
diff --git a/R/converters_SpectronauttoMSstatsFormat.R b/R/converters_SpectronauttoMSstatsFormat.R
@@ -1,9 +1,28 @@
 #' Import Spectronaut files
-#' 
+#'
 #' @param input name of Spectronaut output, which is long-format. ProteinName, PeptideSequence, PrecursorCharge, FragmentIon, ProductCharge, IsotopeLabelType, Condition, BioReplicate, Run, Intensity, F.ExcludedFromQuantification are required. Rows with F.ExcludedFromQuantification=True will be removed.
 #' @param annotation name of 'annotation.txt' data which includes Condition, BioReplicate, Run. If annotation is already complete in Spectronaut, use annotation=NULL (default). It will use the annotation information from input.
-#' @param intensity 'PeakArea'(default) uses not normalized MS2 peak area. 'NormalizedPeakArea' uses MS2 peak area normalized by Spectronaut.
-#' 'MS1Quantity' uses MS1 level quantification, which should be used if MS2 is unreliable.
+#' @param intensity Intensity column to use. Accepts legacy enum values
+#'   \code{'PeakArea'} (default, uses F.PeakArea), \code{'NormalizedPeakArea'}
+#'   (uses F.NormalizedPeakArea).  Can also be any raw
+#'   Spectronaut column name passed as a string (e.g.
+#'   \code{"FG.MS1Quantity"}); the column name is standardized internally.
+#'   For protein turnover workflows the recommended default is
+#'   \code{"FG.MS1Quantity"}.
+#' @param peptideSequenceColumn Name of the Spectronaut column that contains the
+#' peptide sequence.  Defaults to \code{"EG.ModifiedSequence"}. The value is 
+#' standardized internally (dots and spaces removed) before column lookup.
+#' @param heavyLabels Character list identifying the heavy isotope labels as it
+#'   appears inside square brackets in the peptide sequence column, e.g.
+#'   \code{c("Lys6")} matches peptides containing \code{[Lys6]}.  
+#'   \code{c("Lys6", "Arg10")} matches peptides containing either \code{[Lys6]} or \code{[Arg10]}.
+#'   Supports any novel label name reported by Spectronaut (e.g. \code{"Leu6"},
+#'   \code{"Phe10"}).  When provided, peptides are
+#'   classified as heavy (\code{IsotopeLabelType = "H"}), light
+#'   (\code{IsotopeLabelType = "L"}), or unlabeled
+#'   (\code{IsotopeLabelType = NA}) based on its labeled sequence.  When
+#'   \code{NULL} (default) all peptides receive \code{IsotopeLabelType = "L"}.
+#'   Useful for protein turnover experiments.
 #' @param excludedFromQuantificationFilter Remove rows with F.ExcludedFromQuantification=TRUE Default is TRUE.
 #' @param filter_with_Qvalue FALSE(default) will not perform any filtering. TRUE will filter out the intensities that have greater than qvalue_cutoff in EG.Qvalue column. Those intensities will be replaced with zero and will be considered as censored missing values for imputation purpose.
 #' @param qvalue_cutoff Cutoff for EG.Qvalue. default is 0.01.
@@ -33,8 +52,10 @@
 #' head(spectronaut_imported)
 #' 
 SpectronauttoMSstatsFormat = function(
-        input, annotation = NULL, 
-        intensity = c('PeakArea', 'NormalizedPeakArea', 'MS1Quantity'),
+        input, annotation = NULL,
+        intensity = 'PeakArea',
+        peptideSequenceColumn = "EG.ModifiedSequence",
+        heavyLabels = NULL,
         excludedFromQuantificationFilter = TRUE,
         filter_with_Qvalue = FALSE, qvalue_cutoff = 0.01, 
         useUniquePeptide = TRUE, removeFewMeasurements=TRUE,
@@ -47,10 +68,11 @@ SpectronauttoMSstatsFormat = function(
         use_log_file = TRUE, append = FALSE, verbose = TRUE, 
         log_file_path = NULL, ...
 ) {
+
     validation_config = list(
-        input = input, 
-        annotation = annotation, 
-        intensity = intensity, 
+        input = input,
+        annotation = annotation,
+        intensity = intensity,
         excludedFromQuantificationFilter = excludedFromQuantificationFilter,
         filter_with_Qvalue = filter_with_Qvalue, 
         qvalue_cutoff = qvalue_cutoff, 
@@ -84,8 +106,10 @@ SpectronauttoMSstatsFormat = function(
                                           "MSstats", "Spectronaut", ...)
 
     input = MSstatsConvert::MSstatsClean(input, intensity = intensity,
-                                         calculateAnomalyScores, 
-                                         anomalyModelFeatures)
+                                         calculateAnomalyScores,
+                                         anomalyModelFeatures,
+                                         peptideSequenceColumn = peptideSequenceColumn,
+                                         heavyLabels = heavyLabels)
     annotation = MSstatsConvert::MSstatsMakeAnnotation(input, annotation)
 
     pq_filter = list(score_column = "PGQvalue", 
@@ -113,20 +137,24 @@ SpectronauttoMSstatsFormat = function(
         drop_column = TRUE
     )
 
-    feature_columns = c("PeptideSequence", "PrecursorCharge", 
+    feature_columns = c("PeptideSequence", "PrecursorCharge",
                         "FragmentIon", "ProductCharge")
+
+    fill_isotope_label_type = if (is.null(heavyLabels)) 
+        list("IsotopeLabelType" = "L") else list()
+
     input = MSstatsConvert::MSstatsPreprocess(
-        input, 
-        annotation, 
+        input,
+        annotation,
         feature_columns,
         remove_shared_peptides = useUniquePeptide,
         remove_single_feature_proteins = removeProtein_with1Feature,
         feature_cleaning = list(remove_features_with_few_measurements = removeFewMeasurements,
                                 summarize_multiple_psms = summaryforMultipleRows),
-        score_filtering = list(pgq = pq_filter, 
+        score_filtering = list(pgq = pq_filter,
                                psm_q = qval_filter),
         exact_filtering = list(excluded_quant = excluded_quant_filter),
-        columns_to_fill = list("IsotopeLabelType" = "L"),
+        columns_to_fill = fill_isotope_label_type,
         anomaly_metrics = anomalyModelFeatures)
     input[, Intensity := ifelse(Intensity == 0, NA, Intensity)]
 

diff --git a/R/utils_balanced_design.R b/R/utils_balanced_design.R
@@ -94,19 +94,35 @@
                              measurement_col, group_col)
         result[, 2:4, with = FALSE]
     } else {
-        labels = unique(input[["IsotopeLabelType"]])
+        labels = na.omit(unique(input[["IsotopeLabelType"]]))
         groups = unique(input[[group_col]])
         by_group = vector("list", length(groups))
         measurements = unique(input[[measurement_col]])
         for (group_id in seq_along(groups)) {
             group = groups[group_id]
             group_filter = input[[group_col]] == group
-            by_group[[group_id]] = data.table::as.data.table(
-                expand.grid(
-                    labels = labels,
-                    features = unique(input[[feature_col]][group_filter]),
-                    measurements = unique(input[[measurement_col]][group_filter])
-                ))
+            non_na_filter = group_filter & !is.na(input[["IsotopeLabelType"]])
+            na_filter = group_filter & is.na(input[["IsotopeLabelType"]])
+            non_na_features = unique(input[[feature_col]][non_na_filter])
+            na_label_features = unique(input[[feature_col]][na_filter])
+            parts = list()
+            if (length(non_na_features) > 0) {
+                parts[[length(parts) + 1]] = data.table::as.data.table(
+                    expand.grid(
+                        labels = labels,
+                        features = non_na_features,
+                        measurements = unique(input[[measurement_col]][group_filter])
+                    ))
+            }
+            if (length(na_label_features) > 0) {
+                parts[[length(parts) + 1]] = data.table::as.data.table(
+                    expand.grid(
+                        labels = NA,
+                        features = na_label_features,
+                        measurements = unique(input[[measurement_col]][group_filter])
+                    ))
+            }
+            by_group[[group_id]] = data.table::rbindlist(parts)
             by_group[[group_id]]$group = group
         }
         result = data.table::rbindlist(by_group)