diff --git a/R/clean_DIANN.R b/R/clean_DIANN.R
index 7ea843e2..43de3447 100644
--- a/R/clean_DIANN.R
+++ b/R/clean_DIANN.R
@@ -175,7 +175,7 @@
 
     getOption("MSstatsLog")("INFO", msg)
     getOption("MSstatsMsg")("INFO", msg)
-    dn_input = dn_input[QValue >= global_qvalue_cutoff, quantificationColumn := 0]
+    dn_input = dn_input[QValue >= global_qvalue_cutoff, (quantificationColumn) := 0]
     if (MBR) {
         msg = '** MBR was used to analyze the data. Now setting names and filtering'
         msg_1_mbr = paste0('-- LibPGQValue < ', pg_qvalue_cutoff)
diff --git a/inst/tinytest/test_clean_DIANN.R b/inst/tinytest/test_clean_DIANN.R
index 1f0077a6..2f2a7ce1 100644
--- a/inst/tinytest/test_clean_DIANN.R
+++ b/inst/tinytest/test_clean_DIANN.R
@@ -26,13 +26,31 @@ output = MSstatsConvert:::.cleanRawDIANN(input, quantificationColumn = "Fragment
 .validateOutput(output)
 
 # Q-value filtering
-output = MSstatsConvert:::.cleanRawDIANN(input, global_qvalue_cutoff = 0.005)
-expect_equal(sum(output$DetectionQValue < 0.005), nrow(output))
-output = MSstatsConvert:::.cleanRawDIANN(input, qvalue_cutoff = 0.00001)
-expect_equal(sum(output$LibQValue < 0.00001), nrow(output))
-output = MSstatsConvert:::.cleanRawDIANN(input, pg_qvalue_cutoff = 0.001)
-expect_equal(sum(output$LibPGQValue < 0.001), nrow(output))
-output = MSstatsConvert:::.cleanRawDIANN(input, MBR = TRUE, qvalue_cutoff = 0.005)
-expect_equal(sum(output$LibQValue < 0.005), nrow(output))
-output = MSstatsConvert:::.cleanRawDIANN(input, MBR = TRUE, pg_qvalue_cutoff = 0.001)
-expect_equal(sum(output$LibPGQValue < 0.001), nrow(output))
+expect_qvalue_cutoff <- function(output, col, cutoff) {
+    expect_equal(
+        sum(output[[col]] > cutoff),
+        sum(output[["Intensity"]] == 0 & output[[col]] > cutoff),
+        info = sprintf(
+            "All rows with %s > %s should have %s == 0",
+            col, cutoff, "Intensity"
+        )
+    )
+    expect_equal(
+        sum(output[[col]] <= cutoff),
+        nrow(output) - sum(output[[col]] > cutoff),
+        info = sprintf(
+            "Rows with %s <= %s should account for all rows not above the cutoff",
+            col, cutoff
+        )
+    )
+}
+output <- MSstatsConvert:::.cleanRawDIANN(input, global_qvalue_cutoff = 0.005)
+expect_qvalue_cutoff(output, "DetectionQValue", 0.005)
+output <- MSstatsConvert:::.cleanRawDIANN(input, qvalue_cutoff = 0.00001)
+expect_qvalue_cutoff(output, "LibQValue", 0.00001)
+output <- MSstatsConvert:::.cleanRawDIANN(input, pg_qvalue_cutoff = 0.001)
+expect_qvalue_cutoff(output, "LibPGQValue", 0.001)
+output <- MSstatsConvert:::.cleanRawDIANN(input, MBR = FALSE, qvalue_cutoff = 0.001)
+expect_qvalue_cutoff(output, "GlobalQValue", 0.001)
+output <- MSstatsConvert:::.cleanRawDIANN(input, MBR = FALSE, pg_qvalue_cutoff = 0.0002)
+expect_qvalue_cutoff(output, "GlobalPGQValue", 0.0002)
diff --git a/inst/tinytest/test_converters_DIANNtoMSstatsFormat.R b/inst/tinytest/test_converters_DIANNtoMSstatsFormat.R
index 94e3b617..2cf5308e 100644
--- a/inst/tinytest/test_converters_DIANNtoMSstatsFormat.R
+++ b/inst/tinytest/test_converters_DIANNtoMSstatsFormat.R
@@ -5,7 +5,7 @@ input = data.table::fread(input_file_path)
 annot = data.table::fread(annotation_file_path)
 output = DIANNtoMSstatsFormat(input, annotation = annot, MBR = FALSE, use_log_file = FALSE)
 expect_equal(ncol(output), 11)
-expect_equal(nrow(output), 174)
+expect_equal(nrow(output), 348)
 expect_true("Run" %in% colnames(output))
 expect_true("ProteinName" %in% colnames(output))
 expect_true("PeptideSequence" %in% colnames(output))
@@ -25,7 +25,7 @@ input = arrow::read_parquet(input_file_path)
 annot = data.table::fread(annotation_file_path)
 output = DIANNtoMSstatsFormat(input, annotation = annot, MBR = FALSE, use_log_file = FALSE, quantificationColumn = 'auto')
 expect_equal(ncol(output), 11)
-expect_equal(nrow(output), 180)
+expect_equal(nrow(output), 192)
 expect_true("Run" %in% colnames(output))
 expect_true("ProteinName" %in% colnames(output))
 expect_true("PeptideSequence" %in% colnames(output))
diff --git a/inst/tinytest/test_utils_anomaly_score.R b/inst/tinytest/test_utils_anomaly_score.R
index 80dd0a91..70959e05 100644
--- a/inst/tinytest/test_utils_anomaly_score.R
+++ b/inst/tinytest/test_utils_anomaly_score.R
@@ -37,9 +37,9 @@ baseline_scores = run_quality_metrics(
 # Data with progressively higher cumulative sums
 high_scores = run_quality_metrics(
     base_df_10,
-    c(rep(0.1, 5), seq(2.0, 5.0, length.out = 5)), # mean_increase
-    c(rep(0.1, 5), seq(2.0, 5.0, length.out = 5)), # mean_decrease
-    c(rep(0.1, 5), seq(2.0, 5.0, length.out = 5)) # dispersion_increase
+    c(seq(0, 0.1, length.out = 5), seq(2.0, 5.0, length.out = 5)), # mean_increase
+    c(seq(0, 0.1, length.out = 5), seq(2.0, 5.0, length.out = 5)), # mean_decrease
+    c(seq(0, 0.1, length.out = 5), seq(2.0, 5.0, length.out = 5)) # dispersion_increase
 )
 
 # The last 5 rows (with high values) should have higher mean anomaly scores
@@ -51,9 +51,9 @@ base_df_20 = create_base_df(20)
 
 extreme_scores = run_quality_metrics(
     base_df_20,
-    c(rep(0.1, 19), 10.0), # Last value is extreme
-    c(rep(0.1, 19), 8.0), # Last value is extreme
-    c(rep(0.1, 19), 12.0) # Last value is extreme
+    c(seq(0, 0.1, length.out = 19), 10.0), # Last value is extreme
+    c(seq(0, 0.1, length.out = 19), 8.0), # Last value is extreme
+    c(seq(0, 0.1, length.out = 19), 12.0) # Last value is extreme
 )
 
 # The extreme outlier (last row) should have the highest anomaly score
@@ -267,9 +267,9 @@ base_df_6_rank = create_base_df(6)
 # Create data with obvious ranking: Row 6 > Row 5 > Row 4 > Rows 1,2,3
 ranking_scores = run_quality_metrics(
     base_df_6_rank,
-    c(0.1, 0.1, 0.1, 1.0, 2.0, 5.0),
-    c(0.1, 0.1, 0.1, 1.0, 2.0, 5.0),
-    c(0.1, 0.1, 0.1, 1.0, 2.0, 5.0)
+    c(0.1, 0.11, 0.12, 1.0, 2.0, 5.0),
+    c(0.1, 0.11, 0.12, 1.0, 2.0, 5.0),
+    c(0.1, 0.11, 0.12, 1.0, 2.0, 5.0)
 )
 
 # Row 5 should have highest score, Row 4 second highest, etc.
@@ -367,3 +367,21 @@ low_abundance_excluded = MSstatsConvert:::.prepareSpectronautAnomalyInput(
     missing_run_count = 0.95)
 expect_true("AFPLAEWQPSDVDQR" %in% low_abundance_excluded$PeptideSequence)
 expect_false("LowAbundancePeptide" %in% low_abundance_excluded$PeptideSequence)
+
+
+# Test 11: Testing duplicity of quality metrics, applicable considering
+# multiple fragments share the same precursor level metrics
+
+# Data with progressively higher cumulative sums
+duplicate_metrics = run_quality_metrics(
+    base_df_10,
+    c(rep(0.1, 5), seq(2.0, 4.0, length.out = 5)), # mean_increase
+    c(rep(0.1, 5), seq(2.0, 4.0, length.out = 5)), # mean_decrease
+    c(rep(0.1, 5), seq(2.0, 4.0, length.out = 5)) # dispersion_increase
+)
+
+# The last 5 rows (with high values) should have lower mean anomaly scores
+# Since they are all clumped between 2 and 4, whereas 0.1 is by itself
+expect_true(mean(duplicate_metrics$AnomalyScores[6:10]) < mean(duplicate_metrics$AnomalyScores[1:5]),
+            info = "Rows 6-10 (values clumped 2-4) should have lower
+            anomaly scores than rows 1-5 (isolated value of 0.1)")