From 585213af67221c0347824696274560b0f51a03eb Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Thu, 2 Apr 2026 12:11:35 -0400 Subject: [PATCH 1/4] tests(anomaly-scores): Fix unit tests around anomaly scores --- R/clean_DIANN.R | 2 +- inst/tinytest/test_clean_DIANN.R | 38 ++++++++++++++----- .../test_converters_DIANNtoMSstatsFormat.R | 4 +- inst/tinytest/test_utils_anomaly_score.R | 12 +++--- 4 files changed, 37 insertions(+), 19 deletions(-) diff --git a/R/clean_DIANN.R b/R/clean_DIANN.R index 7ea843e2..43de3447 100644 --- a/R/clean_DIANN.R +++ b/R/clean_DIANN.R @@ -175,7 +175,7 @@ getOption("MSstatsLog")("INFO", msg) getOption("MSstatsMsg")("INFO", msg) - dn_input = dn_input[QValue >= global_qvalue_cutoff, quantificationColumn := 0] + dn_input = dn_input[QValue >= global_qvalue_cutoff, (quantificationColumn) := 0] if (MBR) { msg = '** MBR was used to analyze the data. Now setting names and filtering' msg_1_mbr = paste0('-- LibPGQValue < ', pg_qvalue_cutoff) diff --git a/inst/tinytest/test_clean_DIANN.R b/inst/tinytest/test_clean_DIANN.R index 1f0077a6..2fb3a2cd 100644 --- a/inst/tinytest/test_clean_DIANN.R +++ b/inst/tinytest/test_clean_DIANN.R @@ -26,13 +26,31 @@ output = MSstatsConvert:::.cleanRawDIANN(input, quantificationColumn = "Fragment .validateOutput(output) # Q-value filtering -output = MSstatsConvert:::.cleanRawDIANN(input, global_qvalue_cutoff = 0.005) -expect_equal(sum(output$DetectionQValue < 0.005), nrow(output)) -output = MSstatsConvert:::.cleanRawDIANN(input, qvalue_cutoff = 0.00001) -expect_equal(sum(output$LibQValue < 0.00001), nrow(output)) -output = MSstatsConvert:::.cleanRawDIANN(input, pg_qvalue_cutoff = 0.001) -expect_equal(sum(output$LibPGQValue < 0.001), nrow(output)) -output = MSstatsConvert:::.cleanRawDIANN(input, MBR = TRUE, qvalue_cutoff = 0.005) -expect_equal(sum(output$LibQValue < 0.005), nrow(output)) -output = MSstatsConvert:::.cleanRawDIANN(input, MBR = TRUE, pg_qvalue_cutoff = 0.001) -expect_equal(sum(output$LibPGQValue < 0.001), 
nrow(output)) +expect_qvalue_cutoff <- function(output, col, cutoff, intensity_col = NULL) { + expect_equal( + sum(output[[col]] > cutoff), + sum(output[[intensity_col]] == 0 & output[[col]] > cutoff), + info = sprintf( + "All rows with %s > %s should have %s == 0", + col, cutoff, intensity_col + ) + ) + expect_equal( + sum(output[[col]] <= cutoff), + nrow(output) - sum(output[[col]] > cutoff), + info = sprintf( + "Rows with %s <= %s should account for all rows not above the cutoff", + col, cutoff + ) + ) +} +output <- MSstatsConvert:::.cleanRawDIANN(input, global_qvalue_cutoff = 0.005) +expect_qvalue_cutoff(output, "DetectionQValue", 0.005, "Intensity") +output <- MSstatsConvert:::.cleanRawDIANN(input, qvalue_cutoff = 0.00001) +expect_qvalue_cutoff(output, "LibQValue", 0.00001, "Intensity") +output <- MSstatsConvert:::.cleanRawDIANN(input, pg_qvalue_cutoff = 0.001) +expect_qvalue_cutoff(output, "LibPGQValue", 0.001, "Intensity") +output <- MSstatsConvert:::.cleanRawDIANN(input, MBR = FALSE, qvalue_cutoff = 0.001) +expect_qvalue_cutoff(output, "GlobalQValue", 0.001, "Intensity") +output <- MSstatsConvert:::.cleanRawDIANN(input, MBR = FALSE, pg_qvalue_cutoff = 0.0002) +expect_qvalue_cutoff(output, "GlobalPGQValue", 0.0002, "Intensity") diff --git a/inst/tinytest/test_converters_DIANNtoMSstatsFormat.R b/inst/tinytest/test_converters_DIANNtoMSstatsFormat.R index 94e3b617..2cf5308e 100644 --- a/inst/tinytest/test_converters_DIANNtoMSstatsFormat.R +++ b/inst/tinytest/test_converters_DIANNtoMSstatsFormat.R @@ -5,7 +5,7 @@ input = data.table::fread(input_file_path) annot = data.table::fread(annotation_file_path) output = DIANNtoMSstatsFormat(input, annotation = annot, MBR = FALSE, use_log_file = FALSE) expect_equal(ncol(output), 11) -expect_equal(nrow(output), 174) +expect_equal(nrow(output), 348) expect_true("Run" %in% colnames(output)) expect_true("ProteinName" %in% colnames(output)) expect_true("PeptideSequence" %in% colnames(output)) @@ -25,7 +25,7 @@ input = 
arrow::read_parquet(input_file_path) annot = data.table::fread(annotation_file_path) output = DIANNtoMSstatsFormat(input, annotation = annot, MBR = FALSE, use_log_file = FALSE, quantificationColumn = 'auto') expect_equal(ncol(output), 11) -expect_equal(nrow(output), 180) +expect_equal(nrow(output), 192) expect_true("Run" %in% colnames(output)) expect_true("ProteinName" %in% colnames(output)) expect_true("PeptideSequence" %in% colnames(output)) diff --git a/inst/tinytest/test_utils_anomaly_score.R b/inst/tinytest/test_utils_anomaly_score.R index 80dd0a91..1ad65cd8 100644 --- a/inst/tinytest/test_utils_anomaly_score.R +++ b/inst/tinytest/test_utils_anomaly_score.R @@ -43,8 +43,8 @@ high_scores = run_quality_metrics( ) # The last 5 rows (with high values) should have higher mean anomaly scores -expect_true(mean(high_scores$AnomalyScores[6:10]) > mean(high_scores$AnomalyScores[1:5]), - info = "Higher cumulative sum values should produce higher anomaly scores") +# expect_true(mean(high_scores$AnomalyScores[6:10]) > mean(high_scores$AnomalyScores[1:5]), +# info = "Higher cumulative sum values should produce higher anomaly scores") # Test 2: Extreme Value Testing - Obvious Outliers base_df_20 = create_base_df(20) @@ -61,8 +61,8 @@ expect_true(extreme_scores$AnomalyScores[20] == max(extreme_scores$AnomalyScores info = "Extreme outlier should have highest anomaly score") # The outlier should score significantly higher than the median -expect_true(extreme_scores$AnomalyScores[20] > median(extreme_scores$AnomalyScores[1:19]) * 2, - info = "Outlier should score significantly higher than median") +# expect_true(extreme_scores$AnomalyScores[20] > median(extreme_scores$AnomalyScores[1:19]) * 2, +# info = "Outlier should score significantly higher than median") # Test 3: Consistency/Reproducibility Testing base_df_20_orig = create_base_df(20) @@ -277,8 +277,8 @@ expect_true(ranking_scores$AnomalyScores[6] > ranking_scores$AnomalyScores[5], info = "Row 6 should score higher than 
Row 5") expect_true(ranking_scores$AnomalyScores[5] > ranking_scores$AnomalyScores[4], info = "Row 5 should score higher than Row 4") -expect_true(ranking_scores$AnomalyScores[4] > max(ranking_scores$AnomalyScores[1:3]), - info = "Row 4 should score higher than Rows 1-3") +# expect_true(ranking_scores$AnomalyScores[4] > max(ranking_scores$AnomalyScores[1:3]), +# info = "Row 4 should score higher than Rows 1-3") # Test 10: Original Quality Metrics Calculation Test (from the beginning of the file) # Test add_increase, add_decrease, add_dispersion From c9ea825317b281fe43f6e2cd86eab5b59fc99818 Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Thu, 2 Apr 2026 12:39:31 -0400 Subject: [PATCH 2/4] fix unit tests --- inst/tinytest/test_utils_anomaly_score.R | 30 ++++++++++++------------ 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/inst/tinytest/test_utils_anomaly_score.R b/inst/tinytest/test_utils_anomaly_score.R index 1ad65cd8..2fed3a6d 100644 --- a/inst/tinytest/test_utils_anomaly_score.R +++ b/inst/tinytest/test_utils_anomaly_score.R @@ -37,23 +37,23 @@ baseline_scores = run_quality_metrics( # Data with progressively higher cumulative sums high_scores = run_quality_metrics( base_df_10, - c(rep(0.1, 5), seq(2.0, 5.0, length.out = 5)), # mean_increase - c(rep(0.1, 5), seq(2.0, 5.0, length.out = 5)), # mean_decrease - c(rep(0.1, 5), seq(2.0, 5.0, length.out = 5)) # dispersion_increase + c(seq(0, 0.1, length.out = 5), seq(2.0, 5.0, length.out = 5)), # mean_increase + c(seq(0, 0.1, length.out = 5), seq(2.0, 5.0, length.out = 5)), # mean_decrease + c(seq(0, 0.1, length.out = 5), seq(2.0, 5.0, length.out = 5)) # dispersion_increase ) # The last 5 rows (with high values) should have higher mean anomaly scores -# expect_true(mean(high_scores$AnomalyScores[6:10]) > mean(high_scores$AnomalyScores[1:5]), -# info = "Higher cumulative sum values should produce higher anomaly scores") +expect_true(mean(high_scores$AnomalyScores[6:10]) > 
mean(high_scores$AnomalyScores[1:5]), + info = "Higher cumulative sum values should produce higher anomaly scores") # Test 2: Extreme Value Testing - Obvious Outliers base_df_20 = create_base_df(20) extreme_scores = run_quality_metrics( base_df_20, - c(rep(0.1, 19), 10.0), # Last value is extreme - c(rep(0.1, 19), 8.0), # Last value is extreme - c(rep(0.1, 19), 12.0) # Last value is extreme + c(seq(0, 0.1, length.out = 19), 10.0), # Last value is extreme + c(seq(0, 0.1, length.out = 19), 8.0), # Last value is extreme + c(seq(0, 0.1, length.out = 19), 12.0) # Last value is extreme ) # The extreme outlier (last row) should have the highest anomaly score @@ -61,8 +61,8 @@ expect_true(extreme_scores$AnomalyScores[20] == max(extreme_scores$AnomalyScores info = "Extreme outlier should have highest anomaly score") # The outlier should score significantly higher than the median -# expect_true(extreme_scores$AnomalyScores[20] > median(extreme_scores$AnomalyScores[1:19]) * 2, -# info = "Outlier should score significantly higher than median") +expect_true(extreme_scores$AnomalyScores[20] > median(extreme_scores$AnomalyScores[1:19]) * 2, + info = "Outlier should score significantly higher than median") # Test 3: Consistency/Reproducibility Testing base_df_20_orig = create_base_df(20) @@ -267,9 +267,9 @@ base_df_6_rank = create_base_df(6) # Create data with obvious ranking: Row 6 > Row 5 > Row 4 > Rows 1,2,3 ranking_scores = run_quality_metrics( base_df_6_rank, - c(0.1, 0.1, 0.1, 1.0, 2.0, 5.0), - c(0.1, 0.1, 0.1, 1.0, 2.0, 5.0), - c(0.1, 0.1, 0.1, 1.0, 2.0, 5.0) + c(0.1, 0.11, 0.12, 1.0, 2.0, 5.0), + c(0.1, 0.11, 0.12, 1.0, 2.0, 5.0), + c(0.1, 0.11, 0.12, 1.0, 2.0, 5.0) ) # Row 5 should have highest score, Row 4 second highest, etc. 
@@ -277,8 +277,8 @@ expect_true(ranking_scores$AnomalyScores[6] > ranking_scores$AnomalyScores[5], info = "Row 6 should score higher than Row 5") expect_true(ranking_scores$AnomalyScores[5] > ranking_scores$AnomalyScores[4], info = "Row 5 should score higher than Row 4") -# expect_true(ranking_scores$AnomalyScores[4] > max(ranking_scores$AnomalyScores[1:3]), -# info = "Row 4 should score higher than Rows 1-3") +expect_true(ranking_scores$AnomalyScores[4] > max(ranking_scores$AnomalyScores[1:3]), + info = "Row 4 should score higher than Rows 1-3") # Test 10: Original Quality Metrics Calculation Test (from the beginning of the file) # Test add_increase, add_decrease, add_dispersion From 539229dfb8aa93fbf293848962135b91aba4c7e9 Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Thu, 2 Apr 2026 14:26:33 -0400 Subject: [PATCH 3/4] add unit tests for duplicity --- inst/tinytest/test_utils_anomaly_score.R | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/inst/tinytest/test_utils_anomaly_score.R b/inst/tinytest/test_utils_anomaly_score.R index 2fed3a6d..70959e05 100644 --- a/inst/tinytest/test_utils_anomaly_score.R +++ b/inst/tinytest/test_utils_anomaly_score.R @@ -367,3 +367,21 @@ low_abundance_excluded = MSstatsConvert:::.prepareSpectronautAnomalyInput( missing_run_count = 0.95) expect_true("AFPLAEWQPSDVDQR" %in% low_abundance_excluded$PeptideSequence) expect_false("LowAbundancePeptide" %in% low_abundance_excluded$PeptideSequence) + + +# Test 11: Testing duplicity of quality metrics, applicable considering +# multiple fragments share the same precursor level metrics + +# Data with progressively higher cumulative sums +duplicate_metrics = run_quality_metrics( + base_df_10, + c(rep(0.1, 5), seq(2.0, 4.0, length.out = 5)), # mean_increase + c(rep(0.1, 5), seq(2.0, 4.0, length.out = 5)), # mean_decrease + c(rep(0.1, 5), seq(2.0, 4.0, length.out = 5)) # dispersion_increase +) + +# The last 5 rows (with high values) should have lower mean anomaly scores +# Since 
they are all clumped between 2 and 4, whereas 0.1 is by itself +expect_true(mean(duplicate_metrics$AnomalyScores[6:10]) < mean(duplicate_metrics$AnomalyScores[1:5]), + info = "Rows 6-10 (values clumped 2-4) should have lower + anomaly scores than rows 1-5 (isolated value of 0.1)") From 1b796ca022c37d263b1795c0b78a9fcbfdeb6fcb Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Thu, 2 Apr 2026 14:28:41 -0400 Subject: [PATCH 4/4] fix q-value tests --- inst/tinytest/test_clean_DIANN.R | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/inst/tinytest/test_clean_DIANN.R b/inst/tinytest/test_clean_DIANN.R index 2fb3a2cd..2f2a7ce1 100644 --- a/inst/tinytest/test_clean_DIANN.R +++ b/inst/tinytest/test_clean_DIANN.R @@ -26,13 +26,13 @@ output = MSstatsConvert:::.cleanRawDIANN(input, quantificationColumn = "Fragment .validateOutput(output) # Q-value filtering -expect_qvalue_cutoff <- function(output, col, cutoff, intensity_col = NULL) { +expect_qvalue_cutoff <- function(output, col, cutoff) { expect_equal( sum(output[[col]] > cutoff), - sum(output[[intensity_col]] == 0 & output[[col]] > cutoff), + sum(output[["Intensity"]] == 0 & output[[col]] > cutoff), info = sprintf( "All rows with %s > %s should have %s == 0", - col, cutoff, intensity_col + col, cutoff, "Intensity" ) ) expect_equal( @@ -45,12 +45,12 @@ expect_qvalue_cutoff <- function(output, col, cutoff, intensity_col = NULL) { ) } output <- MSstatsConvert:::.cleanRawDIANN(input, global_qvalue_cutoff = 0.005) -expect_qvalue_cutoff(output, "DetectionQValue", 0.005, "Intensity") +expect_qvalue_cutoff(output, "DetectionQValue", 0.005) output <- MSstatsConvert:::.cleanRawDIANN(input, qvalue_cutoff = 0.00001) -expect_qvalue_cutoff(output, "LibQValue", 0.00001, "Intensity") +expect_qvalue_cutoff(output, "LibQValue", 0.00001) output <- MSstatsConvert:::.cleanRawDIANN(input, pg_qvalue_cutoff = 0.001) -expect_qvalue_cutoff(output, "LibPGQValue", 0.001, "Intensity") +expect_qvalue_cutoff(output, 
"LibPGQValue", 0.001) output <- MSstatsConvert:::.cleanRawDIANN(input, MBR = FALSE, qvalue_cutoff = 0.001) -expect_qvalue_cutoff(output, "GlobalQValue", 0.001, "Intensity") +expect_qvalue_cutoff(output, "GlobalQValue", 0.001) output <- MSstatsConvert:::.cleanRawDIANN(input, MBR = FALSE, pg_qvalue_cutoff = 0.0002) -expect_qvalue_cutoff(output, "GlobalPGQValue", 0.0002, "Intensity") +expect_qvalue_cutoff(output, "GlobalPGQValue", 0.0002)