Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 9 additions & 29 deletions DIMS/CollectFilled.R
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -27,37 +27,17 @@ for (scanmode in scanmodes) {
repl_pattern <- get(load(pattern_file))
# calculate Z-scores
if (z_score == 1) {
outlist_stats <- calculate_zscores(outlist_total)
nr_removed_samples <- length(which(repl_pattern[] == "character(0)"))
order_index_int <- order(colnames(outlist_stats)[8:(length(repl_pattern) - nr_removed_samples + 7)])
outlist_stats_more <- cbind(
outlist_stats[, 1:7],
outlist_stats[, (length(repl_pattern) - nr_removed_samples + 8):(length(repl_pattern) - nr_removed_samples + 8 + 6)],
outlist_stats[, 8:(length(repl_pattern) - nr_removed_samples + 7)][order_index_int],
outlist_stats[, (length(repl_pattern) - nr_removed_samples + 5 + 10):ncol(outlist_stats)]
)
# sort Z-score columns and append to peak group list
tmp_index <- grep("_Zscore", colnames(outlist_stats_more), fixed = TRUE)
tmp_index_order <- order(colnames(outlist_stats_more[, tmp_index]))
tmp <- outlist_stats_more[, tmp_index[tmp_index_order]]
outlist_stats_more <- outlist_stats_more[, -tmp_index]
outlist_stats_more <- cbind(outlist_stats_more, tmp)
outlist_total <- outlist_stats_more
outlist_stats <- calculate_zscores_peakgrouplist(outlist_total)
}
# calculate ppm deviation
outlist_withppm <- calculate_ppm_deviation(outlist_stats)
# put columns in correct order
outlist_ident <- order_columns_peakgrouplist(outlist_withppm)

# make a copy of the outlist
outlist_ident <- outlist_total
# select identified peak groups if ppm deviation is within limits
if (z_score == 1) {
outlist_ident$ppmdev <- as.numeric(outlist_ident$ppmdev)
outlist_ident <- outlist_ident[which(outlist_ident["ppmdev"] >= -ppm & outlist_ident["ppmdev"] <= ppm), ]
}

# Extra output in Excel-readable format:
remove_columns <- c("fq.best", "fq.worst", "mzmin.pgrp", "mzmax.pgrp")
remove_colindex <- which(colnames(outlist_ident) %in% remove_columns)
outlist_ident <- outlist_ident[, -remove_colindex]
# generate output in Excel-readable format:
remove_columns <- c("mzmin.pgrp", "mzmax.pgrp")
outlist_ident <- outlist_ident[, -which(colnames(outlist_ident) %in% remove_columns)]
write.table(outlist_ident, file = paste0("outlist_identified_", scanmode, ".txt"), sep = "\t", row.names = FALSE)
# output in RData format
# export output in RData format
save(outlist_ident, file = paste0("outlist_identified_", scanmode, ".RData"))
}
81 changes: 52 additions & 29 deletions DIMS/preprocessing/collect_filled_functions.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# CollectFilled functions

collapse <- function(column_label, peakgroup_list, index_dup) {
collapse_information <- function(column_label, peakgroup_list, index_dup) {
#' Collapse identification info for peak groups with the same mass
#'
#' @param column_label: Name of column in peakgroup_list (string)
Expand Down Expand Up @@ -36,23 +36,23 @@ merge_duplicate_rows <- function(peakgroup_list) {
# get the index for the peak group which is double
peaklist_index <- which(peakgroup_list[, "mzmed.pgrp"] == peakgroup_list[index_dup[1], "mzmed.pgrp"])
single_peakgroup <- peakgroup_list[peaklist_index[1], , drop = FALSE]

# use function collapse to concatenate info
single_peakgroup[, "assi_HMDB"] <- collapse("assi_HMDB", peakgroup_list, peaklist_index)
single_peakgroup[, "iso_HMDB"] <- collapse("iso_HMDB", peakgroup_list, peaklist_index)
single_peakgroup[, "HMDB_code"] <- collapse("HMDB_code", peakgroup_list, peaklist_index)
single_peakgroup[, "all_hmdb_ids"] <- collapse("all_hmdb_ids", peakgroup_list, peaklist_index)
single_peakgroup[, "sec_hmdb_ids"] <- collapse("sec_hmdb_ids", peakgroup_list, peaklist_index)
# use function collapse_information to concatenate info
single_peakgroup[, "assi_HMDB"] <- collapse_information("assi_HMDB", peakgroup_list, peaklist_index)
single_peakgroup[, "iso_HMDB"] <- collapse_information("iso_HMDB", peakgroup_list, peaklist_index)
single_peakgroup[, "HMDB_code"] <- collapse_information("HMDB_code", peakgroup_list, peaklist_index)
single_peakgroup[, "all_hmdb_ids"] <- collapse_information("all_hmdb_ids", peakgroup_list, peaklist_index)
single_peakgroup[, "sec_hmdb_ids"] <- collapse_information("sec_hmdb_ids", peakgroup_list, peaklist_index)
if (single_peakgroup[, "sec_hmdb_ids"] == ";") single_peakgroup[, "sec_hmdb_ids"] < NA

# keep track of deduplicated entries
collect <- rbind(collect, single_peakgroup)
remove <- c(remove, peaklist_index)

# remove current entry from index
index_dup <- index_dup[-which(peakgroup_list[index_dup, "mzmed.pgrp"] == peakgroup_list[index_dup[1], "mzmed.pgrp"])]
}

# remove duplicate entries
if (!is.null(remove)) {
peakgroup_list <- peakgroup_list[-remove, ]
Expand All @@ -62,20 +62,17 @@ merge_duplicate_rows <- function(peakgroup_list) {
return(peakgroup_list_dedup)
}

calculate_zscores <- function(peakgroup_list) {
calculate_zscores_peakgrouplist <- function(peakgroup_list) {
#' Calculate Z-scores for peak groups based on average and standard deviation of controls
#'
#' @param peakgroup_list: Peak group list (matrix)
#' @param sort_col: Column to sort on (string)
#' @param adducts: Parameter indicating whether there are adducts in the list (boolean)
#'
#' @return peakgroup_list_dedup: de-duplicated peak group list (matrix)

case_label <- "P"
control_label <- "C"
# get index for new column names
startcol <- ncol(peakgroup_list) + 3

# calculate mean and standard deviation for Control group
ctrl_cols <- grep(control_label, colnames(peakgroup_list), fixed = TRUE)
case_cols <- grep(case_label, colnames(peakgroup_list), fixed = TRUE)
Expand All @@ -95,25 +92,51 @@ calculate_zscores <- function(peakgroup_list) {
peakgroup_list$avg.ctrls) / peakgroup_list$sd.ctrls
peakgroup_list <- cbind(peakgroup_list, zscores_1col)
}

# apply new column names to columns at end plus avg and sd columns
colnames(peakgroup_list)[startcol:ncol(peakgroup_list)] <- colnames_zscores

return(peakgroup_list)
}

# add ppm deviation column
zscore_cols <- grep("Zscore", colnames(peakgroup_list), fixed = TRUE)
# calculate ppm deviation
calculate_ppm_deviation <- function(peakgroup_list) {
#' Calculate ppm deviation between observed mass and expected theoretical mass
#'
#' @param peakgroup_list: Peak group list (matrix)
#'
#' @return peakgroup_list_ppm: peak group list with ppm column (matrix)

# make sure values in columns mzmed.pgrp and theormz_HMDB are numeric
peakgroup_list$mzmed.pgrp <- as.numeric(peakgroup_list$mzmed.pgrp)
peakgroup_list$theormz_HMDB <- as.numeric(peakgroup_list$theormz_HMDB)

# calculate ppm deviation
for (row_index in seq_len(nrow(peakgroup_list))) {
if (!is.na(peakgroup_list$theormz_HMDB[row_index]) &&
!is.null(peakgroup_list$theormz_HMDB[row_index]) &&
(peakgroup_list$theormz_HMDB[row_index] != "")) {
peakgroup_list$ppmdev[row_index] <- 10^6 * (as.numeric(as.vector(peakgroup_list$mzmed.pgrp[row_index])) -
as.numeric(as.vector(peakgroup_list$theormz_HMDB[row_index]))) /
as.numeric(as.vector(peakgroup_list$theormz_HMDB[row_index]))
} else {
peakgroup_list$ppmdev[row_index] <- NA
}
observed_mz <- peakgroup_list$mzmed.pgrp[row_index]
theor_mz <- peakgroup_list$theormz_HMDB[row_index]
peakgroup_list$ppmdev[row_index] <- 10^6 * (observed_mz - theor_mz) / theor_mz
}

return(peakgroup_list)
}

order_columns_peakgrouplist <- function(peakgroup_list) {
#' Put columns in peak group list in correct order
#'
#' @param peakgroup_list: Peak group list (matrix)
#'
#' @return peakgroup_ordered: peak group list with columns in correct order (matrix)

original_colnames <- colnames(peakgroup_list)
mass_columns <- c(grep("mzm", original_colnames), grep("nrsamples", original_colnames))
descriptive_columns <- c(grep("assi_HMDB", original_colnames):grep("avg.int", original_colnames), grep("ppmdev", original_colnames))
intensity_columns <- c((grep("nrsamples", original_colnames) + 1):(grep("assi_HMDB", original_colnames) - 1))
# if no Z-scores have been calculated, the following two variables will be empty without consequences for outlist_total
control_columns <- grep ("ctrls", original_colnames)
zscore_columns <- grep("_Zscore", original_colnames)
# create peak group list with columns in correct order
peakgroup_ordered <- peakgroup_list[ , c(mass_columns, descriptive_columns, intensity_columns, control_columns, zscore_columns)]

return(peakgroup_ordered)
}

16 changes: 8 additions & 8 deletions DIMS/tests/testthat/_snaps/generate_violin_plots.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,12 @@

# save_prob_scores_to_excel: Saving the probability score dataframe as an Excel file

Disease P2025M1 P2025M2 P2025M3 P2025M4
1 Disease A 10.900 -10.9 49.90 -49.9
2 Disease B 0.953 0.0 2.29 0.0
3 Disease C 12.100 0.0 0.00 12.1
4 Disease D 0.000 -12.5 0.00 18.2
5 Disease E 44.300 0.0 0.00 28.1
6 Disease F 0.000 -77.4 -77.40 0.0
7 Disease G -38.700 38.7 38.70 -38.7
Disease P2025M1 P2025M2 P2025M3 P2025M4
1 Disease A 10.900 -10.90000000000000 49.90000000000000 -49.9
2 Disease B 0.953 0.00000000000000 2.29000000000000 0.0
3 Disease C 12.100 0.00000000000000 0.00000000000000 12.1
4 Disease D 0.000 -12.50000000000000 0.00000000000000 18.2
5 Disease E 44.300 0.00000000000000 0.00000000000000 28.1
6 Disease F 0.000 -77.40000000000001 -77.40000000000001 0.0
7 Disease G -38.700 38.70000000000000 38.70000000000000 -38.7

5 changes: 5 additions & 0 deletions DIMS/tests/testthat/fixtures/test_peakgroup_list.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"mzmed.pgrp" "nrsamples" "C101.1" "C102.1" "P2.1" "P3.1" "assi_HMDB" "all_hmdb_names" "iso_HMDB" "HMDB_code" "all_hmdb_ids" "sec_hmdb_ids" "theormz_HMDB" "avg.int" "avg.ctrls" "sd.ctrls" "C101.1_Zscore" "C102.1_Zscore" "P2.1_Zscore" "P3.1_Zscore" "ppmdev"
"1" 300.199680958642 0.451108327135444 1000 5000 10000 50000 "A" "A;X" NA "HMDB1234567" "HMDB1234567;HMDB1234567" NA 300.1996476 16500 3000 2828.42712474619 9000 13000 90000 130000 0.111112214857712
"2" 300.000315890415 0.498603057814762 2000 6000 20000 60000 "B" "B;Y" NA "HMDB1234567_1" "HMDB1234567_1;HMDB1234567_1" NA 300.00017417 22000 4000 2828.42712474619 10000 14000 1e+05 140000 0.473299680976197
"3" 300.254185894039 0.589562055887654 3000 7000 30000 70000 "C" "C;Z" NA "HMDB1234567_2" "HMDB1234567_2;HMDB1234567_2" NA 300.25413357 27500 5000 2828.42712474619 11000 15000 110000 150000 0.17426158930175
"4" 300.755745105678 0.277923040557653 4000 8000 40000 80000 "D" "D;V" NA "HMDB1234567_7" "HMDB1234567_7;HMDB1234567_7" NA 300.75568892 33000 6000 2828.42712474619 12000 16000 120000 160000 0.186787674436346
77 changes: 77 additions & 0 deletions DIMS/tests/testthat/test_collect_filled.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# unit tests for CollectFilled
# functions: collapse_information, merge_duplicate_rows, calculate_zscores_peakgrouplist,
# calculate_ppm_deviation, order_columns_peakgrouplist
source("../../preprocessing/collect_filled_functions.R")

testthat::test_that("Values for duplicate rows are correctly collapsed", {
test_matrix <- matrix(letters[1:8], nrow = 2, ncol = 4)
colnames(test_matrix) <- paste0("column", 1:4)

expect_equal(collapse_information("column1", test_matrix, c(1,2)), "a;b", TRUE)
})

testthat::test_that("Duplicate rows in a peak group list are correctly merged", {
# Copy/symlink the files to the current location for the function
test_files <- list.files("fixtures/", "test_peakgroup_list", full.names = TRUE)
file.symlink(file.path(test_files), getwd())

# read in peakgroup_list, create duplicate row
test_peakgroup_list <- read.table("./test_peakgroup_list.txt", sep = "\t")
test_peakgroup_list_dup <- test_peakgroup_list[c(1, 2, 2, 3), ]

# after merging duplicate rows, the test peak group list should have 3 rows
expect_equal(nrow(merge_duplicate_rows(test_peakgroup_list_dup)), 3, TRUE, tolerance = 0.001)
expect_equal(merge_duplicate_rows(test_peakgroup_list_dup)[3, "all_hmdb_ids"],
paste(test_peakgroup_list_dup[2, "all_hmdb_ids"], test_peakgroup_list_dup[3, "all_hmdb_ids"], sep = ";"),
TRUE)
})

testthat::test_that("Z-scores are correctly calculated in CollectFilled", {
# read in peakgroup_list; remove avg.int, std.int, noise and Zscore columns
test_peakgroup_list <- read.table("./test_peakgroup_list.txt", sep = "\t")
# remove avg.ctrls, sd.ctrls and Z-score columns
test_peakgroup_list_noz <- test_peakgroup_list[ , -grep("avg.ctrls", colnames(test_peakgroup_list))]
test_peakgroup_list_noz <- test_peakgroup_list_noz[ , -grep("sd.ctrls", colnames(test_peakgroup_list_noz))]
test_peakgroup_list_noz <- test_peakgroup_list_noz[ , -grep("_Zscore", colnames(test_peakgroup_list_noz))]

# after calculate_zscores_peakgrouplist, there should be 4 columns with _Zscore in the name
expect_equal(length(grep("_Zscore", colnames(calculate_zscores_peakgrouplist(test_peakgroup_list_noz)))), 4, TRUE, tolerance = 0.001)

# after calculate_zscores_peakgrouplist, the 4 columns with _Zscore in the name should be filled non-zero
expect_equal(calculate_zscores_peakgrouplist(test_peakgroup_list_noz)$C101.1_Zscore[1], -0.7071, TRUE, tolerance = 0.00001)
expect_equal(calculate_zscores_peakgrouplist(test_peakgroup_list_noz)$P2.1_Zscore[4], 12.0208, TRUE, tolerance = 0.00001)

})

testthat::test_that("ppm deviation values are correctly calculated in CollectFilled", {
# read in peakgroup_list
test_peakgroup_list <- read.table("./test_peakgroup_list.txt", sep = "\t")

# store ppm deviation values
test_ppm_values <- test_peakgroup_list$ppmdev

# after calculate_ppm_deviation, there ppm values in the new column should approximate the old ones
expect_equal(calculate_ppm_deviation(test_peakgroup_list)$ppmdev, test_ppm_values, TRUE, tolerance = 0.001)

# calculate_ppm_deviation should give NA if there is no value for theormz_HMDB
test_peakgroup_list$theormz_HMDB[1] <- NA
expect_identical(is.na(calculate_ppm_deviation(test_peakgroup_list)$ppmdev[1]), TRUE)

})

testthat::test_that("columns in peak group list are corretly sorted", {
# read in peakgroup_list
test_peakgroup_list <- read.table("./test_peakgroup_list.txt", sep = "\t")
# original order of columns
original_column_order <- colnames(test_peakgroup_list)
# after ordering, column names should be re-ordered
test_column_order <- original_column_order[c(1, 2, 7:14, 21, 3:6, 15:20)]

expect_identical(colnames(order_columns_peakgrouplist(test_peakgroup_list)), test_column_order)

# Remove copied/symlinked files
files_remove <- list.files("./", "test_peakgroup_list.txt", full.names = TRUE)
file.remove(files_remove)

})

Loading