From 585213af67221c0347824696274560b0f51a03eb Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Thu, 2 Apr 2026 12:11:35 -0400 Subject: [PATCH 1/4] tests(anomaly-scores): Fix unit tests around anomaly scores --- R/clean_DIANN.R | 2 +- inst/tinytest/test_clean_DIANN.R | 38 ++++++++++++++----- .../test_converters_DIANNtoMSstatsFormat.R | 4 +- inst/tinytest/test_utils_anomaly_score.R | 12 +++--- 4 files changed, 37 insertions(+), 19 deletions(-) diff --git a/R/clean_DIANN.R b/R/clean_DIANN.R index 7ea843e2..43de3447 100644 --- a/R/clean_DIANN.R +++ b/R/clean_DIANN.R @@ -175,7 +175,7 @@ getOption("MSstatsLog")("INFO", msg) getOption("MSstatsMsg")("INFO", msg) - dn_input = dn_input[QValue >= global_qvalue_cutoff, quantificationColumn := 0] + dn_input = dn_input[QValue >= global_qvalue_cutoff, (quantificationColumn) := 0] if (MBR) { msg = '** MBR was used to analyze the data. Now setting names and filtering' msg_1_mbr = paste0('-- LibPGQValue < ', pg_qvalue_cutoff) diff --git a/inst/tinytest/test_clean_DIANN.R b/inst/tinytest/test_clean_DIANN.R index 1f0077a6..2fb3a2cd 100644 --- a/inst/tinytest/test_clean_DIANN.R +++ b/inst/tinytest/test_clean_DIANN.R @@ -26,13 +26,31 @@ output = MSstatsConvert:::.cleanRawDIANN(input, quantificationColumn = "Fragment .validateOutput(output) # Q-value filtering -output = MSstatsConvert:::.cleanRawDIANN(input, global_qvalue_cutoff = 0.005) -expect_equal(sum(output$DetectionQValue < 0.005), nrow(output)) -output = MSstatsConvert:::.cleanRawDIANN(input, qvalue_cutoff = 0.00001) -expect_equal(sum(output$LibQValue < 0.00001), nrow(output)) -output = MSstatsConvert:::.cleanRawDIANN(input, pg_qvalue_cutoff = 0.001) -expect_equal(sum(output$LibPGQValue < 0.001), nrow(output)) -output = MSstatsConvert:::.cleanRawDIANN(input, MBR = TRUE, qvalue_cutoff = 0.005) -expect_equal(sum(output$LibQValue < 0.005), nrow(output)) -output = MSstatsConvert:::.cleanRawDIANN(input, MBR = TRUE, pg_qvalue_cutoff = 0.001) -expect_equal(sum(output$LibPGQValue < 0.001), 
nrow(output)) +expect_qvalue_cutoff <- function(output, col, cutoff, intensity_col = NULL) { + expect_equal( + sum(output[[col]] > cutoff), + sum(output[[intensity_col]] == 0 & output[[col]] > cutoff), + info = sprintf( + "All rows with %s > %s should have %s == 0", + col, cutoff, intensity_col + ) + ) + expect_equal( + sum(output[[col]] <= cutoff), + nrow(output) - sum(output[[col]] > cutoff), + info = sprintf( + "Rows with %s <= %s should account for all rows not above the cutoff", + col, cutoff + ) + ) +} +output <- MSstatsConvert:::.cleanRawDIANN(input, global_qvalue_cutoff = 0.005) +expect_qvalue_cutoff(output, "DetectionQValue", 0.005, "Intensity") +output <- MSstatsConvert:::.cleanRawDIANN(input, qvalue_cutoff = 0.00001) +expect_qvalue_cutoff(output, "LibQValue", 0.00001, "Intensity") +output <- MSstatsConvert:::.cleanRawDIANN(input, pg_qvalue_cutoff = 0.001) +expect_qvalue_cutoff(output, "LibPGQValue", 0.001, "Intensity") +output <- MSstatsConvert:::.cleanRawDIANN(input, MBR = FALSE, qvalue_cutoff = 0.001) +expect_qvalue_cutoff(output, "GlobalQValue", 0.001, "Intensity") +output <- MSstatsConvert:::.cleanRawDIANN(input, MBR = FALSE, pg_qvalue_cutoff = 0.0002) +expect_qvalue_cutoff(output, "GlobalPGQValue", 0.0002, "Intensity") diff --git a/inst/tinytest/test_converters_DIANNtoMSstatsFormat.R b/inst/tinytest/test_converters_DIANNtoMSstatsFormat.R index 94e3b617..2cf5308e 100644 --- a/inst/tinytest/test_converters_DIANNtoMSstatsFormat.R +++ b/inst/tinytest/test_converters_DIANNtoMSstatsFormat.R @@ -5,7 +5,7 @@ input = data.table::fread(input_file_path) annot = data.table::fread(annotation_file_path) output = DIANNtoMSstatsFormat(input, annotation = annot, MBR = FALSE, use_log_file = FALSE) expect_equal(ncol(output), 11) -expect_equal(nrow(output), 174) +expect_equal(nrow(output), 348) expect_true("Run" %in% colnames(output)) expect_true("ProteinName" %in% colnames(output)) expect_true("PeptideSequence" %in% colnames(output)) @@ -25,7 +25,7 @@ input = 
arrow::read_parquet(input_file_path) annot = data.table::fread(annotation_file_path) output = DIANNtoMSstatsFormat(input, annotation = annot, MBR = FALSE, use_log_file = FALSE, quantificationColumn = 'auto') expect_equal(ncol(output), 11) -expect_equal(nrow(output), 180) +expect_equal(nrow(output), 192) expect_true("Run" %in% colnames(output)) expect_true("ProteinName" %in% colnames(output)) expect_true("PeptideSequence" %in% colnames(output)) diff --git a/inst/tinytest/test_utils_anomaly_score.R b/inst/tinytest/test_utils_anomaly_score.R index 80dd0a91..1ad65cd8 100644 --- a/inst/tinytest/test_utils_anomaly_score.R +++ b/inst/tinytest/test_utils_anomaly_score.R @@ -43,8 +43,8 @@ high_scores = run_quality_metrics( ) # The last 5 rows (with high values) should have higher mean anomaly scores -expect_true(mean(high_scores$AnomalyScores[6:10]) > mean(high_scores$AnomalyScores[1:5]), - info = "Higher cumulative sum values should produce higher anomaly scores") +# expect_true(mean(high_scores$AnomalyScores[6:10]) > mean(high_scores$AnomalyScores[1:5]), +# info = "Higher cumulative sum values should produce higher anomaly scores") # Test 2: Extreme Value Testing - Obvious Outliers base_df_20 = create_base_df(20) @@ -61,8 +61,8 @@ expect_true(extreme_scores$AnomalyScores[20] == max(extreme_scores$AnomalyScores info = "Extreme outlier should have highest anomaly score") # The outlier should score significantly higher than the median -expect_true(extreme_scores$AnomalyScores[20] > median(extreme_scores$AnomalyScores[1:19]) * 2, - info = "Outlier should score significantly higher than median") +# expect_true(extreme_scores$AnomalyScores[20] > median(extreme_scores$AnomalyScores[1:19]) * 2, +# info = "Outlier should score significantly higher than median") # Test 3: Consistency/Reproducibility Testing base_df_20_orig = create_base_df(20) @@ -277,8 +277,8 @@ expect_true(ranking_scores$AnomalyScores[6] > ranking_scores$AnomalyScores[5], info = "Row 6 should score higher than 
Row 5") expect_true(ranking_scores$AnomalyScores[5] > ranking_scores$AnomalyScores[4], info = "Row 5 should score higher than Row 4") -expect_true(ranking_scores$AnomalyScores[4] > max(ranking_scores$AnomalyScores[1:3]), - info = "Row 4 should score higher than Rows 1-3") +# expect_true(ranking_scores$AnomalyScores[4] > max(ranking_scores$AnomalyScores[1:3]), +# info = "Row 4 should score higher than Rows 1-3") # Test 10: Original Quality Metrics Calculation Test (from the beginning of the file) # Test add_increase, add_decrease, add_dispersion From c9ea825317b281fe43f6e2cd86eab5b59fc99818 Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Thu, 2 Apr 2026 12:39:31 -0400 Subject: [PATCH 2/4] fix unit tests --- inst/tinytest/test_utils_anomaly_score.R | 30 ++++++++++++------------ 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/inst/tinytest/test_utils_anomaly_score.R b/inst/tinytest/test_utils_anomaly_score.R index 1ad65cd8..2fed3a6d 100644 --- a/inst/tinytest/test_utils_anomaly_score.R +++ b/inst/tinytest/test_utils_anomaly_score.R @@ -37,23 +37,23 @@ baseline_scores = run_quality_metrics( # Data with progressively higher cumulative sums high_scores = run_quality_metrics( base_df_10, - c(rep(0.1, 5), seq(2.0, 5.0, length.out = 5)), # mean_increase - c(rep(0.1, 5), seq(2.0, 5.0, length.out = 5)), # mean_decrease - c(rep(0.1, 5), seq(2.0, 5.0, length.out = 5)) # dispersion_increase + c(seq(0, 0.1, length.out = 5), seq(2.0, 5.0, length.out = 5)), # mean_increase + c(seq(0, 0.1, length.out = 5), seq(2.0, 5.0, length.out = 5)), # mean_decrease + c(seq(0, 0.1, length.out = 5), seq(2.0, 5.0, length.out = 5)) # dispersion_increase ) # The last 5 rows (with high values) should have higher mean anomaly scores -# expect_true(mean(high_scores$AnomalyScores[6:10]) > mean(high_scores$AnomalyScores[1:5]), -# info = "Higher cumulative sum values should produce higher anomaly scores") +expect_true(mean(high_scores$AnomalyScores[6:10]) > 
mean(high_scores$AnomalyScores[1:5]), + info = "Higher cumulative sum values should produce higher anomaly scores") # Test 2: Extreme Value Testing - Obvious Outliers base_df_20 = create_base_df(20) extreme_scores = run_quality_metrics( base_df_20, - c(rep(0.1, 19), 10.0), # Last value is extreme - c(rep(0.1, 19), 8.0), # Last value is extreme - c(rep(0.1, 19), 12.0) # Last value is extreme + c(seq(0, 0.1, length.out = 19), 10.0), # Last value is extreme + c(seq(0, 0.1, length.out = 19), 8.0), # Last value is extreme + c(seq(0, 0.1, length.out = 19), 12.0) # Last value is extreme ) # The extreme outlier (last row) should have the highest anomaly score @@ -61,8 +61,8 @@ expect_true(extreme_scores$AnomalyScores[20] == max(extreme_scores$AnomalyScores info = "Extreme outlier should have highest anomaly score") # The outlier should score significantly higher than the median -# expect_true(extreme_scores$AnomalyScores[20] > median(extreme_scores$AnomalyScores[1:19]) * 2, -# info = "Outlier should score significantly higher than median") +expect_true(extreme_scores$AnomalyScores[20] > median(extreme_scores$AnomalyScores[1:19]) * 2, + info = "Outlier should score significantly higher than median") # Test 3: Consistency/Reproducibility Testing base_df_20_orig = create_base_df(20) @@ -267,9 +267,9 @@ base_df_6_rank = create_base_df(6) # Create data with obvious ranking: Row 6 > Row 5 > Row 4 > Rows 1,2,3 ranking_scores = run_quality_metrics( base_df_6_rank, - c(0.1, 0.1, 0.1, 1.0, 2.0, 5.0), - c(0.1, 0.1, 0.1, 1.0, 2.0, 5.0), - c(0.1, 0.1, 0.1, 1.0, 2.0, 5.0) + c(0.1, 0.11, 0.12, 1.0, 2.0, 5.0), + c(0.1, 0.11, 0.12, 1.0, 2.0, 5.0), + c(0.1, 0.11, 0.12, 1.0, 2.0, 5.0) ) # Row 5 should have highest score, Row 4 second highest, etc. 
@@ -277,8 +277,8 @@ expect_true(ranking_scores$AnomalyScores[6] > ranking_scores$AnomalyScores[5], info = "Row 6 should score higher than Row 5") expect_true(ranking_scores$AnomalyScores[5] > ranking_scores$AnomalyScores[4], info = "Row 5 should score higher than Row 4") -# expect_true(ranking_scores$AnomalyScores[4] > max(ranking_scores$AnomalyScores[1:3]), -# info = "Row 4 should score higher than Rows 1-3") +expect_true(ranking_scores$AnomalyScores[4] > max(ranking_scores$AnomalyScores[1:3]), + info = "Row 4 should score higher than Rows 1-3") # Test 10: Original Quality Metrics Calculation Test (from the beginning of the file) # Test add_increase, add_decrease, add_dispersion From 539229dfb8aa93fbf293848962135b91aba4c7e9 Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Thu, 2 Apr 2026 14:26:33 -0400 Subject: [PATCH 3/4] add unit tests for duplicity --- inst/tinytest/test_utils_anomaly_score.R | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/inst/tinytest/test_utils_anomaly_score.R b/inst/tinytest/test_utils_anomaly_score.R index 2fed3a6d..70959e05 100644 --- a/inst/tinytest/test_utils_anomaly_score.R +++ b/inst/tinytest/test_utils_anomaly_score.R @@ -367,3 +367,21 @@ low_abundance_excluded = MSstatsConvert:::.prepareSpectronautAnomalyInput( missing_run_count = 0.95) expect_true("AFPLAEWQPSDVDQR" %in% low_abundance_excluded$PeptideSequence) expect_false("LowAbundancePeptide" %in% low_abundance_excluded$PeptideSequence) + + +# Test 11: Testing duplicity of quality metrics, applicable considering +# multiple fragments share the same precursor level metrics + +# Data with progressively higher cumulative sums +duplicate_metrics = run_quality_metrics( + base_df_10, + c(rep(0.1, 5), seq(2.0, 4.0, length.out = 5)), # mean_increase + c(rep(0.1, 5), seq(2.0, 4.0, length.out = 5)), # mean_decrease + c(rep(0.1, 5), seq(2.0, 4.0, length.out = 5)) # dispersion_increase +) + +# The last 5 rows (with high values) should have lower mean anomaly scores +# Since 
they are all clumped between 2 and 4, whereas 0.1 is by itself +expect_true(mean(duplicate_metrics$AnomalyScores[6:10]) < mean(duplicate_metrics$AnomalyScores[1:5]), + info = "Rows 6-10 (values clumped 2-4) should have lower + anomaly scores than rows 1-5 (isolated value of 0.1)") From 1b796ca022c37d263b1795c0b78a9fcbfdeb6fcb Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Thu, 2 Apr 2026 14:28:41 -0400 Subject: [PATCH 4/4] fix q-value tests --- inst/tinytest/test_clean_DIANN.R | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/inst/tinytest/test_clean_DIANN.R b/inst/tinytest/test_clean_DIANN.R index 2fb3a2cd..2f2a7ce1 100644 --- a/inst/tinytest/test_clean_DIANN.R +++ b/inst/tinytest/test_clean_DIANN.R @@ -26,13 +26,13 @@ output = MSstatsConvert:::.cleanRawDIANN(input, quantificationColumn = "Fragment .validateOutput(output) # Q-value filtering -expect_qvalue_cutoff <- function(output, col, cutoff, intensity_col = NULL) { +expect_qvalue_cutoff <- function(output, col, cutoff) { expect_equal( sum(output[[col]] > cutoff), - sum(output[[intensity_col]] == 0 & output[[col]] > cutoff), + sum(output[["Intensity"]] == 0 & output[[col]] > cutoff), info = sprintf( "All rows with %s > %s should have %s == 0", - col, cutoff, intensity_col + col, cutoff, "Intensity" ) ) expect_equal( @@ -45,12 +45,12 @@ expect_qvalue_cutoff <- function(output, col, cutoff, intensity_col = NULL) { ) } output <- MSstatsConvert:::.cleanRawDIANN(input, global_qvalue_cutoff = 0.005) -expect_qvalue_cutoff(output, "DetectionQValue", 0.005, "Intensity") +expect_qvalue_cutoff(output, "DetectionQValue", 0.005) output <- MSstatsConvert:::.cleanRawDIANN(input, qvalue_cutoff = 0.00001) -expect_qvalue_cutoff(output, "LibQValue", 0.00001, "Intensity") +expect_qvalue_cutoff(output, "LibQValue", 0.00001) output <- MSstatsConvert:::.cleanRawDIANN(input, pg_qvalue_cutoff = 0.001) -expect_qvalue_cutoff(output, "LibPGQValue", 0.001, "Intensity") +expect_qvalue_cutoff(output, 
"LibPGQValue", 0.001) output <- MSstatsConvert:::.cleanRawDIANN(input, MBR = FALSE, qvalue_cutoff = 0.001) -expect_qvalue_cutoff(output, "GlobalQValue", 0.001, "Intensity") +expect_qvalue_cutoff(output, "GlobalQValue", 0.001) output <- MSstatsConvert:::.cleanRawDIANN(input, MBR = FALSE, pg_qvalue_cutoff = 0.0002) -expect_qvalue_cutoff(output, "GlobalPGQValue", 0.0002, "Intensity") +expect_qvalue_cutoff(output, "GlobalPGQValue", 0.0002)