UMCUGenetics · mraves2 · Feb 13, 2026 · Feb 13, 2026 · Feb 13, 2026 · Mar 12, 2026
diff --git a/DIMS/CollectFilled.R b/DIMS/CollectFilled.R
@@ -27,37 +27,17 @@ for (scanmode in scanmodes) {
   repl_pattern <- get(load(pattern_file))
   # calculate Z-scores
   if (z_score == 1) {
-    outlist_stats <- calculate_zscores(outlist_total)
-    nr_removed_samples <- length(which(repl_pattern[] == "character(0)"))
-    order_index_int <- order(colnames(outlist_stats)[8:(length(repl_pattern) - nr_removed_samples + 7)])
-    outlist_stats_more <- cbind(
-      outlist_stats[, 1:7],
-      outlist_stats[, (length(repl_pattern) - nr_removed_samples + 8):(length(repl_pattern) - nr_removed_samples + 8 + 6)],
-      outlist_stats[, 8:(length(repl_pattern) - nr_removed_samples + 7)][order_index_int],
-      outlist_stats[, (length(repl_pattern) - nr_removed_samples + 5 + 10):ncol(outlist_stats)]
-    )
-    # sort Z-score columns and append to peak group list
-    tmp_index <- grep("_Zscore", colnames(outlist_stats_more), fixed = TRUE)
-    tmp_index_order <- order(colnames(outlist_stats_more[, tmp_index]))
-    tmp <- outlist_stats_more[, tmp_index[tmp_index_order]]
-    outlist_stats_more <- outlist_stats_more[, -tmp_index]
-    outlist_stats_more <- cbind(outlist_stats_more, tmp)
-    outlist_total <- outlist_stats_more
+    outlist_stats <- calculate_zscores_peakgrouplist(outlist_total)
   }
+  # calculate ppm deviation
+  outlist_withppm <- calculate_ppm_deviation(outlist_stats)
+  #  put columns in correct order
+  outlist_ident <- order_columns_peakgrouplist(outlist_withppm)
 
-  # make a copy of the outlist
-  outlist_ident <- outlist_total
-  # select identified peak groups if ppm deviation is within limits
-  if (z_score == 1) {
-    outlist_ident$ppmdev <- as.numeric(outlist_ident$ppmdev)
-    outlist_ident <- outlist_ident[which(outlist_ident["ppmdev"] >= -ppm & outlist_ident["ppmdev"] <= ppm), ]
-  }
-
-  # Extra output in Excel-readable format:
-  remove_columns <- c("fq.best", "fq.worst", "mzmin.pgrp", "mzmax.pgrp")
-  remove_colindex <- which(colnames(outlist_ident) %in% remove_columns)
-  outlist_ident <- outlist_ident[, -remove_colindex]
+  # generate output in Excel-readable format:
+  remove_columns <- c("mzmin.pgrp", "mzmax.pgrp")
+  outlist_ident <- outlist_ident[, -which(colnames(outlist_ident) %in% remove_columns)]
   write.table(outlist_ident, file = paste0("outlist_identified_", scanmode, ".txt"), sep = "\t", row.names = FALSE)
-  # output in RData format
+  # export output in RData format
   save(outlist_ident, file = paste0("outlist_identified_", scanmode, ".RData"))
 }
diff --git a/DIMS/preprocessing/collect_filled_functions.R b/DIMS/preprocessing/collect_filled_functions.R
@@ -1,6 +1,6 @@
 # CollectFilled functions
 
-collapse <- function(column_label, peakgroup_list, index_dup) {
+collapse_information <- function(column_label, peakgroup_list, index_dup) {
   #' Collapse identification info for peak groups with the same mass
   #'
   #' @param column_label: Name of column in peakgroup_list (string)
@@ -36,23 +36,23 @@ merge_duplicate_rows <- function(peakgroup_list) {
     # get the index for the peak group which is double
     peaklist_index <- which(peakgroup_list[, "mzmed.pgrp"] == peakgroup_list[index_dup[1], "mzmed.pgrp"])
     single_peakgroup <- peakgroup_list[peaklist_index[1], , drop = FALSE]
-
-    # use function collapse to concatenate info
-    single_peakgroup[, "assi_HMDB"] <- collapse("assi_HMDB", peakgroup_list, peaklist_index)
-    single_peakgroup[, "iso_HMDB"] <- collapse("iso_HMDB", peakgroup_list, peaklist_index)
-    single_peakgroup[, "HMDB_code"] <- collapse("HMDB_code", peakgroup_list, peaklist_index)
-    single_peakgroup[, "all_hmdb_ids"] <- collapse("all_hmdb_ids", peakgroup_list, peaklist_index)
-    single_peakgroup[, "sec_hmdb_ids"] <- collapse("sec_hmdb_ids", peakgroup_list, peaklist_index)
+    
+    # use function collapse_information to concatenate info
+    single_peakgroup[, "assi_HMDB"] <- collapse_information("assi_HMDB", peakgroup_list, peaklist_index)
+    single_peakgroup[, "iso_HMDB"] <- collapse_information("iso_HMDB", peakgroup_list, peaklist_index)
+    single_peakgroup[, "HMDB_code"] <- collapse_information("HMDB_code", peakgroup_list, peaklist_index)
+    single_peakgroup[, "all_hmdb_ids"] <- collapse_information("all_hmdb_ids", peakgroup_list, peaklist_index)
+    single_peakgroup[, "sec_hmdb_ids"] <- collapse_information("sec_hmdb_ids", peakgroup_list, peaklist_index)
     if (single_peakgroup[, "sec_hmdb_ids"] == ";") single_peakgroup[, "sec_hmdb_ids"] < NA
-
+    
     # keep track of deduplicated entries
     collect <- rbind(collect, single_peakgroup)
     remove <- c(remove, peaklist_index)
-
+    
     # remove current entry from index
     index_dup <- index_dup[-which(peakgroup_list[index_dup, "mzmed.pgrp"] == peakgroup_list[index_dup[1], "mzmed.pgrp"])]
   }
-
+  
   # remove duplicate entries
   if (!is.null(remove)) {
     peakgroup_list <- peakgroup_list[-remove, ]
@@ -62,20 +62,17 @@ merge_duplicate_rows <- function(peakgroup_list) {
   return(peakgroup_list_dedup)
 }
 
-calculate_zscores <- function(peakgroup_list) {
+calculate_zscores_peakgrouplist <- function(peakgroup_list) {
   #' Calculate Z-scores for peak groups based on average and standard deviation of controls
   #'
   #' @param peakgroup_list: Peak group list (matrix)
-  #' @param sort_col: Column to sort on (string)
-  #' @param adducts: Parameter indicating whether there are adducts in the list (boolean)
   #'
   #' @return peakgroup_list_dedup: de-duplicated peak group list (matrix)
-
+  
   case_label <- "P"
   control_label <- "C"
   # get index for new column names
   startcol <- ncol(peakgroup_list) + 3
-
   # calculate mean and standard deviation for Control group
   ctrl_cols <- grep(control_label, colnames(peakgroup_list), fixed = TRUE)
   case_cols <- grep(case_label, colnames(peakgroup_list), fixed = TRUE)
@@ -95,25 +92,51 @@ calculate_zscores <- function(peakgroup_list) {
                      peakgroup_list$avg.ctrls) / peakgroup_list$sd.ctrls
     peakgroup_list <- cbind(peakgroup_list, zscores_1col)
   }
-
+  
   # apply new column names to columns at end plus avg and sd columns
   colnames(peakgroup_list)[startcol:ncol(peakgroup_list)] <- colnames_zscores
+
+  return(peakgroup_list)
+}
 
-  # add ppm deviation column
-  zscore_cols <- grep("Zscore", colnames(peakgroup_list), fixed = TRUE)
-  # calculate ppm deviation
+calculate_ppm_deviation <- function(peakgroup_list) {
+  #' Calculate ppm deviation between observed mass and expected theoretical mass
+  #'
+  #' @param peakgroup_list: Peak group list (matrix)
+  #'
+  #' @return peakgroup_list_ppm: peak group list with ppm column (matrix)
+
+  # make sure values in columns mzmed.pgrp and theormz_HMDB are numeric
+  peakgroup_list$mzmed.pgrp <- as.numeric(peakgroup_list$mzmed.pgrp)
+  peakgroup_list$theormz_HMDB <- as.numeric(peakgroup_list$theormz_HMDB)
+
+  # calculate ppm deviation 
   for (row_index in seq_len(nrow(peakgroup_list))) {
-    if (!is.na(peakgroup_list$theormz_HMDB[row_index]) &&
-        !is.null(peakgroup_list$theormz_HMDB[row_index]) &&
-        (peakgroup_list$theormz_HMDB[row_index] != "")) {
-      peakgroup_list$ppmdev[row_index] <- 10^6 * (as.numeric(as.vector(peakgroup_list$mzmed.pgrp[row_index])) -
-                                                  as.numeric(as.vector(peakgroup_list$theormz_HMDB[row_index]))) /
-                                                  as.numeric(as.vector(peakgroup_list$theormz_HMDB[row_index]))
-    } else {
-      peakgroup_list$ppmdev[row_index] <- NA
-    }
+    observed_mz <- peakgroup_list$mzmed.pgrp[row_index]
+    theor_mz <- peakgroup_list$theormz_HMDB[row_index]
+    peakgroup_list$ppmdev[row_index] <- 10^6 * (observed_mz - theor_mz) / theor_mz
   }
 
   return(peakgroup_list)
 }
 
+order_columns_peakgrouplist <- function(peakgroup_list) {
+  #' Put columns in peak group list in correct order
+  #'
+  #' @param peakgroup_list: Peak group list (matrix)
+  #'
+  #' @return peakgroup_ordered: peak group list with columns in correct order (matrix)
+
+  original_colnames <- colnames(peakgroup_list)
+  mass_columns <- c(grep("mzm", original_colnames), grep("nrsamples", original_colnames))
+  descriptive_columns <- c(grep("assi_HMDB", original_colnames):grep("avg.int", original_colnames), grep("ppmdev", original_colnames))
+  intensity_columns <- c((grep("nrsamples", original_colnames) + 1):(grep("assi_HMDB", original_colnames) - 1))
+  # if no Z-scores have been calculated, the following two variables will be empty without consequences for outlist_total
+  control_columns <- grep ("ctrls", original_colnames)
+  zscore_columns <- grep("_Zscore", original_colnames)
+  # create peak group list with columns in correct order
+  peakgroup_ordered <- peakgroup_list[ , c(mass_columns, descriptive_columns, intensity_columns, control_columns, zscore_columns)]
+
+  return(peakgroup_ordered)
+}
+
diff --git a/DIMS/tests/testthat/_snaps/generate_violin_plots.md b/DIMS/tests/testthat/_snaps/generate_violin_plots.md
@@ -10,12 +10,12 @@
 
 # save_prob_scores_to_excel: Saving the probability score dataframe as an Excel file
 
-        Disease P2025M1 P2025M2 P2025M3 P2025M4
-    1 Disease A  10.900   -10.9   49.90   -49.9
-    2 Disease B   0.953     0.0    2.29     0.0
-    3 Disease C  12.100     0.0    0.00    12.1
-    4 Disease D   0.000   -12.5    0.00    18.2
-    5 Disease E  44.300     0.0    0.00    28.1
-    6 Disease F   0.000   -77.4  -77.40     0.0
-    7 Disease G -38.700    38.7   38.70   -38.7
+        Disease P2025M1            P2025M2            P2025M3 P2025M4
+    1 Disease A  10.900 -10.90000000000000  49.90000000000000   -49.9
+    2 Disease B   0.953   0.00000000000000   2.29000000000000     0.0
+    3 Disease C  12.100   0.00000000000000   0.00000000000000    12.1
+    4 Disease D   0.000 -12.50000000000000   0.00000000000000    18.2
+    5 Disease E  44.300   0.00000000000000   0.00000000000000    28.1
+    6 Disease F   0.000 -77.40000000000001 -77.40000000000001     0.0
+    7 Disease G -38.700  38.70000000000000  38.70000000000000   -38.7
 
diff --git a/DIMS/tests/testthat/fixtures/test_peakgroup_list.txt b/DIMS/tests/testthat/fixtures/test_peakgroup_list.txt
@@ -0,0 +1,5 @@
+"mzmed.pgrp"	"nrsamples"	"C101.1"	"C102.1"	"P2.1"	"P3.1"	"assi_HMDB"	"all_hmdb_names"	"iso_HMDB"	"HMDB_code"	"all_hmdb_ids"	"sec_hmdb_ids"	"theormz_HMDB"	"avg.int"	"avg.ctrls"	"sd.ctrls"	"C101.1_Zscore"	"C102.1_Zscore"	"P2.1_Zscore"	"P3.1_Zscore"	"ppmdev"
+"1"	300.199680958642	0.451108327135444	1000	5000	10000	50000	"A"	"A;X"	NA	"HMDB1234567"	"HMDB1234567;HMDB1234567"	NA	300.1996476	16500	3000	2828.42712474619	9000	13000	90000	130000	0.111112214857712
+"2"	300.000315890415	0.498603057814762	2000	6000	20000	60000	"B"	"B;Y"	NA	"HMDB1234567_1"	"HMDB1234567_1;HMDB1234567_1"	NA	300.00017417	22000	4000	2828.42712474619	10000	14000	1e+05	140000	0.473299680976197
+"3"	300.254185894039	0.589562055887654	3000	7000	30000	70000	"C"	"C;Z"	NA	"HMDB1234567_2"	"HMDB1234567_2;HMDB1234567_2"	NA	300.25413357	27500	5000	2828.42712474619	11000	15000	110000	150000	0.17426158930175
+"4"	300.755745105678	0.277923040557653	4000	8000	40000	80000	"D"	"D;V"	NA	"HMDB1234567_7"	"HMDB1234567_7;HMDB1234567_7"	NA	300.75568892	33000	6000	2828.42712474619	12000	16000	120000	160000	0.186787674436346
diff --git a/DIMS/tests/testthat/test_collect_filled.R b/DIMS/tests/testthat/test_collect_filled.R
@@ -0,0 +1,77 @@
+# unit tests for CollectFilled
+# functions: collapse_information, merge_duplicate_rows, calculate_zscores_peakgrouplist,
+#            calculate_ppm_deviation, order_columns_peakgrouplist
+source("../../preprocessing/collect_filled_functions.R")
+
+testthat::test_that("Values for duplicate rows are correctly collapsed", {
+  test_matrix <- matrix(letters[1:8], nrow = 2, ncol = 4)
+  colnames(test_matrix) <- paste0("column", 1:4)
+
+  expect_equal(collapse_information("column1", test_matrix, c(1,2)), "a;b", TRUE)
+})
+
+testthat::test_that("Duplicate rows in a peak group list are correctly merged", {
+  # Copy/symlink the files to the current location for the function
+  test_files <- list.files("fixtures/", "test_peakgroup_list", full.names = TRUE)
+  file.symlink(file.path(test_files), getwd())
+
+  # read in peakgroup_list, create duplicate row 
+  test_peakgroup_list <- read.table("./test_peakgroup_list.txt", sep = "\t")
+  test_peakgroup_list_dup <- test_peakgroup_list[c(1, 2, 2, 3), ]
+
+  # after merging duplicate rows, the test peak group list should have 3 rows
+  expect_equal(nrow(merge_duplicate_rows(test_peakgroup_list_dup)), 3, TRUE, tolerance = 0.001)
+  expect_equal(merge_duplicate_rows(test_peakgroup_list_dup)[3, "all_hmdb_ids"], 
+               paste(test_peakgroup_list_dup[2, "all_hmdb_ids"], test_peakgroup_list_dup[3, "all_hmdb_ids"], sep = ";"), 
+               TRUE)
+})
+
+testthat::test_that("Z-scores are correctly calculated in CollectFilled", {
+  # read in peakgroup_list; remove avg.int, std.int, noise and Zscore columns
+  test_peakgroup_list <- read.table("./test_peakgroup_list.txt", sep = "\t")
+  # remove avg.ctrls, sd.ctrls and Z-score columns
+  test_peakgroup_list_noz <- test_peakgroup_list[ , -grep("avg.ctrls", colnames(test_peakgroup_list))]
+  test_peakgroup_list_noz <- test_peakgroup_list_noz[ , -grep("sd.ctrls", colnames(test_peakgroup_list_noz))]
+  test_peakgroup_list_noz <- test_peakgroup_list_noz[ , -grep("_Zscore", colnames(test_peakgroup_list_noz))]
+
+  # after calculate_zscores_peakgrouplist, there should be 4 columns with _Zscore in the name
+  expect_equal(length(grep("_Zscore", colnames(calculate_zscores_peakgrouplist(test_peakgroup_list_noz)))), 4, TRUE, tolerance = 0.001)
+
+  # after calculate_zscores_peakgrouplist, the 4 columns with _Zscore in the name should be filled non-zero
+  expect_equal(calculate_zscores_peakgrouplist(test_peakgroup_list_noz)$C101.1_Zscore[1], -0.7071, TRUE, tolerance = 0.00001)
+  expect_equal(calculate_zscores_peakgrouplist(test_peakgroup_list_noz)$P2.1_Zscore[4], 12.0208, TRUE, tolerance = 0.00001)
+
+})
+
+testthat::test_that("ppm deviation values are correctly calculated in CollectFilled", {
+  # read in peakgroup_list
+  test_peakgroup_list <- read.table("./test_peakgroup_list.txt", sep = "\t")
+
+  # store ppm deviation values
+  test_ppm_values <- test_peakgroup_list$ppmdev
+
+  # after calculate_ppm_deviation, there ppm values in the new column should approximate the old ones
+  expect_equal(calculate_ppm_deviation(test_peakgroup_list)$ppmdev, test_ppm_values, TRUE, tolerance = 0.001)
+
+  # calculate_ppm_deviation should give NA if there is no value for theormz_HMDB
+  test_peakgroup_list$theormz_HMDB[1] <- NA
+  expect_identical(is.na(calculate_ppm_deviation(test_peakgroup_list)$ppmdev[1]), TRUE)
+
+})
+
+testthat::test_that("columns in peak group list are corretly sorted", {
+  # read in peakgroup_list
+  test_peakgroup_list <- read.table("./test_peakgroup_list.txt", sep = "\t")
+  # original order of columns
+  original_column_order <- colnames(test_peakgroup_list)
+  # after ordering, column names should be re-ordered
+  test_column_order <- original_column_order[c(1, 2, 7:14, 21, 3:6, 15:20)]
+
+  expect_identical(colnames(order_columns_peakgrouplist(test_peakgroup_list)), test_column_order)
+
+  # Remove copied/symlinked files
+  files_remove <- list.files("./", "test_peakgroup_list.txt", full.names = TRUE)
+  file.remove(files_remove)
+
+})
+