diff --git a/DESCRIPTION b/DESCRIPTION index 0cef1b8..95e4147 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: BIGr Title: Breeding Insight Genomics Functions for Polyploid and Diploid Species -Version: 0.6.3 +Version: 0.7.0 Authors@R: c(person(given='Alexander M.', family='Sandercock', email='sandercock.alex@gmail.com', @@ -44,7 +44,7 @@ URL: https://github.com/Breeding-Insight/BIGr BugReports: https://github.com/Breeding-Insight/BIGr/issues Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 Depends: R (>= 4.4.0) biocViews: Imports: diff --git a/NAMESPACE b/NAMESPACE index d47d95d..574981a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -10,6 +10,7 @@ export(dosage2vcf) export(dosage_ratios) export(filterMADC) export(filterVCF) +export(fixMADC) export(flip_dosage) export(get_countsMADC) export(imputation_concordance) @@ -40,6 +41,7 @@ importFrom(reshape2,dcast) importFrom(reshape2,melt) importFrom(stats,cor) importFrom(stats,setNames) +importFrom(tidyr,replace_na) importFrom(utils,packageVersion) importFrom(utils,read.csv) importFrom(utils,read.table) diff --git a/NEWS.md b/NEWS.md index 19d2509..e8f1b1c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +# BIGr 0.7.0 + +- Added a new function fixMADC to format raw MADC files with a user-supplied Chr and Pos file. + # BIGr 0.6.3 - Ignore tags when targets are indels diff --git a/R/fixMADC.R b/R/fixMADC.R new file mode 100644 index 0000000..2437f86 --- /dev/null +++ b/R/fixMADC.R @@ -0,0 +1,213 @@ +#' Fix MADC File Allele IDs +#' +#' Process raw MADC files to format and update the allele IDs with user supplied Chr and Pos information +#' +#' @details +#' This function can process raw MADC files to update the Allele IDs and Clone IDs to the Chr_Pos format with a user supplied file. +#' The output MADC will be the standard fixed allele ID format to support use in madc2vcf and BIGapp functions. 
+#'
+#'@import dplyr
+#'@import stringr
+#'@importFrom tidyr replace_na
+#'@importFrom utils read.csv write.csv
+#'
+#'@param madc.file Path to the MADC file to be filtered
+#'@param marker.file Path to the three column marker ID file.
+#' - The first column is the existing list of unique CloneIDs (obtained from raw MADC CloneID),
+#'where each row is a unique CloneID.
+#' - The second column is the chromosome that the marker is located (ie Chr01). No special characters (*#_-!.) are permitted in the
+#' chromosome name.
+#' - The third column is the numeric position of the marker within the chromosome (ie 1234). No special characters (*#_-!.) are permitted.
+#'@param n.summary.columns (optional) Number of summary columns to remove from MADC file not including the first three. Otherwise, the columns will be automatically detected and removed.
+#'@param output.file Path to save the fixed allele ID MADC file (if NULL, data will not be saved)
+#'
+#'@return data.frame or saved csv file
+#'
+#'@examples
+#' #Example
+#'
+#' #Example MADC
+#' madc_file <- system.file("iris_DArT_MADC.csv", package="BIGr")
+#' marker_file <- system.file("iris_MADC_marker_file.csv", package="BIGr")
+#'
+#' #Fix the raw MADC file IDs to use the user provided Chr_Pos format
+#' fixedMADC_df <- fixMADC(madc.file = madc_file,
+#'                         marker.file = marker_file,
+#'                         n.summary.columns = NULL,
+#'                         output.file = NULL)
+#'
+#'
+#'
+#'@export
+fixMADC <- function(madc.file,
+                    marker.file,
+                    n.summary.columns = NULL,
+                    output.file = NULL) {
+
+  # Raw DArT MADC files begin with 7 placeholder rows whose first column is
+  # blank or "*"; only the first column of those rows is inspected here.
+  first_seven_rows <- read.csv(madc.file, header = FALSE, nrows = 7, colClasses = c(NA, "NULL"))
+
+  # read.csv() imports an all-blank column as logical NA, so NA is mapped back
+  # to "" before testing; otherwise blank filler rows are never recognised.
+  first_column <- as.character(first_seven_rows[, 1])
+  first_column[is.na(first_column)] <- ""
+  check_entries <- all(first_column %in% c("", "*"))
+
+  # Without the filler rows the file is treated as already processed and is
+  # rejected, so no second read path is needed after this point.
+  if (!check_entries) {
+    stop("This MADC file appears to already use fixed allele IDs and cannot be reprocessed with a raw-ID marker file.")
+  }
+
+  # Skip the 7 placeholder rows so that the following row becomes the header
+  filtered_df <- read.csv(madc.file, sep = ',', skip = 7, check.names = FALSE)
+
+  # Both ID columns are rewritten below, so fail early when either is absent
+  missing_cols <- setdiff(c("CloneID", "AlleleID"), colnames(filtered_df))
+  if (length(missing_cols) > 0) {
+    stop("The MADC file is missing required column(s): ", paste(missing_cols, collapse = ", "))
+  }
+
+  # Drop the summary columns: by position when a count is supplied, else by name
+  if (!is.null(n.summary.columns)) {
+    if (n.summary.columns > 0) {
+      # The count excludes the first three columns, so removal starts at column 4
+      cols_to_remove <- 4:(3 + n.summary.columns)
+      filtered_df <- filtered_df[, -cols_to_remove, drop = FALSE]
+    }
+  } else {
+    rm.col <- c("ClusterConsensusSequence",
+                "CallRate", "OneRatioRef", "OneRatioSnp", "FreqHomRef", "FreqHomSnp",
+                "FreqHets", "PICRef", "PICSnp", "AvgPIC", "AvgCountRef", "AvgCountSnp","RatioAvgCountRefAvgCountSnp")
+
+    filtered_df <- filtered_df[, !(colnames(filtered_df) %in% rm.col), drop = FALSE]
+  }
+
+  # Trim stray whitespace so the IDs can be matched against the marker file
+  filtered_df$CloneID <- trimws(as.character(filtered_df$CloneID))
+  filtered_df$AlleleID <- trimws(as.character(filtered_df$AlleleID))
+
+  # Read in the marker file
+  marker_file <- read.csv(marker.file, sep = ',', check.names = FALSE)
+
+  ### Verify marker file is formatted correctly ###
+
+  # Columns are addressed by position, so at least three must be present
+  if (ncol(marker_file) < 3) {
+    stop("The marker file must contain three columns (CloneID, chromosome, position). Please review the marker file.")
+  }
+
+  marker_file[,1] <- trimws(as.character(marker_file[,1]))
+  marker_file[,2] <- trimws(as.character(marker_file[,2]))
+  marker_file[,3] <- trimws(as.character(marker_file[,3]))
+
+  if (any(grepl("[*#_!.\\-]", marker_file[,3]))) {
+    stop("Special characters (*#_-!.) detected in the position column (column 3). Please review the marker file.")
+  }
+
+  if (!all(grepl("^[0-9]+$", marker_file[,3]))) {
+    stop("The position column (column 3) must be numeric. Please review the marker file.")
+  }
+
+  # Build the new Chr_Pos IDs, left-padding the position with 0's to 9 digits
+  marker_file$new_ID <- paste0(
+    marker_file[,2],
+    "_",
+    str_pad(marker_file[,3], width = 9, side = "left", pad = "0")
+  )
+
+  # Verify there are no duplicate IDs in the marker file
+  if (length(unique(marker_file[,1])) != length(marker_file[,1])) {
+    stop("There are duplicate marker IDs in the first column. Please review marker file.")
+  }
+
+  # Verify that no two markers share the same Chr_Pos
+  if (length(unique(marker_file$new_ID)) != length(marker_file$new_ID)) {
+    stop("There are duplicate Chr and Pos information where more than one marker has the same Chr_Pos. Please review the marker file.")
+  }
+
+  # Verify chromosome column (col 2) contains no special characters
+  if (any(grepl("[*#_!.\\-]", marker_file[,2]))) {
+    stop("Special characters (*#_-!.) detected in the chromosome column (column 2). Please review the marker file.")
+  }
+
+  ## Filtering
+
+  # Identify MADC CloneIDs not found in the marker file
+  missing_ids <- setdiff(filtered_df$CloneID, marker_file[,1])
+
+  if (length(missing_ids) > 0) {
+    warning(paste0(
+      length(missing_ids), " CloneID(s) in the MADC file were not found in the marker file and will be removed from the output:\n",
+      paste(missing_ids, collapse = "\n")
+    ))
+
+    # Remove unmatched IDs from MADC file
+    filtered_df <- filtered_df[filtered_df$CloneID %in% marker_file[,1], ]
+  }
+
+  ### Replace old IDs with new IDs
+
+  # Named lookup vector: old CloneID -> new_ID
+  id_lookup <- setNames(marker_file$new_ID, marker_file[,1])
+
+  filtered_df <- filtered_df %>%
+    mutate(
+      # unname() stops the old CloneIDs from lingering as element names
+      CloneID = unname(id_lookup[CloneID]),
+
+      # For AlleleID: extract the |suffix, replace the ID prefix, rejoin
+      AlleleID = {
+        old_id <- str_extract(AlleleID, "^[^|]+")
+        suffix <- str_extract(AlleleID, "\\|.*$")
+        suffix <- replace_na(suffix, "") # Handle cases with no suffix
+        paste0(id_lookup[old_id], suffix)
+      }
+    )
+
+  ### Add the proper numbering suffix to allele IDs for unique IDs.
+  # |Ref -> |Ref_0001
+  # |Alt -> |Alt_0002
+  # |RefMatch -> numbered _0001, _0002, ... within each CloneID
+  # |AltMatch -> numbered _0001, _0002, ... within each CloneID
+
+  filtered_df <- filtered_df %>%
+    mutate(
+      # Text after "|" running to the end of the ID with no "_" (NA otherwise)
+      .suffix_type = str_extract(AlleleID, "(?<=\\|)[^_]+$")
+    ) %>%
+    group_by(CloneID, .suffix_type) %>%
+    mutate(
+      AlleleID = case_when(
+        .suffix_type == "Ref" ~ paste0(str_remove(AlleleID, "\\|Ref$"), "|Ref_0001"),
+        .suffix_type == "Alt" ~ paste0(str_remove(AlleleID, "\\|Alt$"), "|Alt_0002"),
+        .suffix_type == "RefMatch" ~ paste0(
+          str_remove(AlleleID, "\\|RefMatch$"),
+          "|RefMatch_",
+          sprintf("%04d", row_number())
+        ),
+        .suffix_type == "AltMatch" ~ paste0(
+          str_remove(AlleleID, "\\|AltMatch$"),
+          "|AltMatch_",
+          sprintf("%04d", row_number())
+        ),
+        # Any other (or missing) suffix type leaves the AlleleID unchanged
+        TRUE ~ AlleleID
+      )
+    ) %>%
+    ungroup() %>%
+    select(-.suffix_type)
+
+  # Save the output to disk if file name provided; "_fixedID.csv" is appended
+  # to output.file. The data frame itself is only returned when output.file
+  # is NULL.
+  if (!is.null(output.file)) {
+    message("Saving fixed MADC data to file")
+    write.csv(filtered_df, paste0(output.file,"_fixedID.csv"), row.names = FALSE)
+  } else {
+    message("No output file provided. Returning fixed MADC data.")
+    return(filtered_df)
+  }
+
+}
diff --git a/R/utils.R b/R/utils.R index c280ad2..4f31798 100644 --- a/R/utils.R +++ b/R/utils.R @@ -6,7 +6,7 @@ utils::globalVariables(c( "ind", "ref", "row_name", "size", "snp", "CloneID", "Count", "qualifying_sites_count", "MarkerID", "SampleID", "Dosage", - "pos", "alt", "match_key" + "pos", "alt", "match_key", ".suffix_type" )) #' Convert GT format to numeric dosage diff --git a/cran-comments.md b/cran-comments.md index 2515e5b..858617d 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -3,7 +3,3 @@ 0 errors | 0 warnings | 1 note * This is a new release. - -## Updates - -- The maintainer is the same as the previous release, but the email address has been updated.
diff --git a/inst/iris_MADC_marker_file.csv b/inst/iris_MADC_marker_file.csv new file mode 100644 index 0000000..6ec0476 --- /dev/null +++ b/inst/iris_MADC_marker_file.csv @@ -0,0 +1,501 @@ +CloneID,Chr,Pos +Chr1_0010,Chr1,10 +Chr1_0100,Chr1,100 +Chr4_0810,Chr4,810 +Chr4_0820,Chr4,820 +Chr4_0830,Chr4,830 +Chr4_0840,Chr4,840 +Chr4_0850,Chr4,850 +Chr5_0860,Chr5,860 +Chr5_0870,Chr5,870 +Chr5_0880,Chr5,880 +Chr5_0890,Chr5,890 +Chr5_0900,Chr5,900 +Chr1_0110,Chr1,110 +Chr5_0910,Chr5,910 +Chr5_0920,Chr5,920 +Chr5_0930,Chr5,930 +Chr5_0940,Chr5,940 +Chr5_0950,Chr5,950 +Chr5_0960,Chr5,960 +Chr5_0970,Chr5,970 +Chr5_0980,Chr5,980 +Chr5_0990,Chr5,990 +Chr5_1000,Chr5,1000 +Chr1_0120,Chr1,120 +Chr5_1010,Chr5,1010 +Chr5_1020,Chr5,1020 +Chr5_1030,Chr5,1030 +Chr5_1040,Chr5,1040 +Chr5_1050,Chr5,1050 +Chr5_1060,Chr5,1060 +Chr5_1070,Chr5,1070 +Chr5_1080,Chr5,1080 +Chr5_1090,Chr5,1090 +Chr5_1100,Chr5,1100 +Chr1_0130,Chr1,130 +Chr5_1110,Chr5,1110 +Chr5_1120,Chr5,1120 +Chr5_1130,Chr5,1130 +Chr5_1140,Chr5,1140 +Chr5_1150,Chr5,1150 +Chr5_1160,Chr5,1160 +Chr5_1170,Chr5,1170 +Chr5_1180,Chr5,1180 +Chr5_1190,Chr5,1190 +Chr5_1200,Chr5,1200 +Chr1_0140,Chr1,140 +Chr5_1210,Chr5,1210 +Chr5_1220,Chr5,1220 +Chr5_1230,Chr5,1230 +Chr5_1240,Chr5,1240 +Chr5_1250,Chr5,1250 +Chr5_1260,Chr5,1260 +Chr5_1270,Chr5,1270 +Chr5_1280,Chr5,1280 +Chr5_1290,Chr5,1290 +Chr5_1300,Chr5,1300 +Chr1_0150,Chr1,150 +Chr5_1310,Chr5,1310 +Chr5_1320,Chr5,1320 +Chr5_1330,Chr5,1330 +Chr5_1340,Chr5,1340 +Chr5_1350,Chr5,1350 +Chr5_1360,Chr5,1360 +Chr6_1370,Chr6,1370 +Chr6_1380,Chr6,1380 +Chr6_1390,Chr6,1390 +Chr6_1400,Chr6,1400 +Chr1_0160,Chr1,160 +Chr6_1410,Chr6,1410 +Chr6_1420,Chr6,1420 +Chr6_1430,Chr6,1430 +Chr6_1440,Chr6,1440 +Chr6_1450,Chr6,1450 +Chr6_1460,Chr6,1460 +Chr6_1470,Chr6,1470 +Chr6_1480,Chr6,1480 +Chr6_1490,Chr6,1490 +Chr6_1500,Chr6,1500 +Chr1_0170,Chr1,170 +Chr6_1510,Chr6,1510 +Chr6_1520,Chr6,1520 +Chr6_1530,Chr6,1530 +Chr6_1540,Chr6,1540 +Chr6_1550,Chr6,1550 +Chr6_1560,Chr6,1560 +Chr6_1570,Chr6,1570 
+Chr6_1580,Chr6,1580 +Chr6_1590,Chr6,1590 +Chr6_1600,Chr6,1600 +Chr1_0180,Chr1,180 +Chr6_1610,Chr6,1610 +Chr6_1620,Chr6,1620 +Chr6_1630,Chr6,1630 +Chr6_1640,Chr6,1640 +Chr6_1650,Chr6,1650 +Chr6_1660,Chr6,1660 +Chr6_1670,Chr6,1670 +Chr6_1680,Chr6,1680 +Chr6_1690,Chr6,1690 +Chr6_1700,Chr6,1700 +Chr1_0190,Chr1,190 +Chr6_1710,Chr6,1710 +Chr6_1720,Chr6,1720 +Chr6_1730,Chr6,1730 +Chr6_1740,Chr6,1740 +Chr6_1750,Chr6,1750 +Chr6_1760,Chr6,1760 +Chr6_1770,Chr6,1770 +Chr6_1780,Chr6,1780 +Chr6_1790,Chr6,1790 +Chr6_1800,Chr6,1800 +Chr1_0020,Chr1,20 +Chr2_0010,Chr2,10 +Chr6_1810,Chr6,1810 +Chr6_1820,Chr6,1820 +Chr6_1830,Chr6,1830 +Chr6_1840,Chr6,1840 +Chr6_1850,Chr6,1850 +Chr6_1860,Chr6,1860 +Chr6_1870,Chr6,1870 +Chr6_1880,Chr6,1880 +Chr6_1890,Chr6,1890 +Chr6_1900,Chr6,1900 +Chr2_0020,Chr2,20 +Chr6_1910,Chr6,1910 +Chr6_1920,Chr6,1920 +Chr6_1930,Chr6,1930 +Chr6_1940,Chr6,1940 +Chr6_1950,Chr6,1950 +Chr6_1960,Chr6,1960 +Chr6_1970,Chr6,1970 +Chr6_1980,Chr6,1980 +Chr6_1990,Chr6,1990 +Chr6_2000,Chr6,2000 +Chr2_0030,Chr2,30 +Chr6_2010,Chr6,2010 +Chr6_2020,Chr6,2020 +Chr6_2030,Chr6,2030 +Chr7_2040,Chr7,2040 +Chr7_2050,Chr7,2050 +Chr7_2060,Chr7,2060 +Chr7_2070,Chr7,2070 +Chr7_2080,Chr7,2080 +Chr7_2090,Chr7,2090 +Chr7_2100,Chr7,2100 +Chr2_0040,Chr2,40 +Chr7_2110,Chr7,2110 +Chr7_2120,Chr7,2120 +Chr7_2130,Chr7,2130 +Chr7_2140,Chr7,2140 +Chr7_2150,Chr7,2150 +Chr7_2160,Chr7,2160 +Chr7_2170,Chr7,2170 +Chr7_2180,Chr7,2180 +Chr7_2190,Chr7,2190 +Chr7_2200,Chr7,2200 +Chr2_0050,Chr2,50 +Chr7_2210,Chr7,2210 +Chr7_2220,Chr7,2220 +Chr7_2230,Chr7,2230 +Chr7_2240,Chr7,2240 +Chr7_2250,Chr7,2250 +Chr7_2260,Chr7,2260 +Chr7_2270,Chr7,2270 +Chr7_2280,Chr7,2280 +Chr7_2290,Chr7,2290 +Chr7_2300,Chr7,2300 +Chr2_0060,Chr2,60 +Chr7_2310,Chr7,2310 +Chr7_2320,Chr7,2320 +Chr7_2330,Chr7,2330 +Chr7_2340,Chr7,2340 +Chr7_2350,Chr7,2350 +Chr7_2360,Chr7,2360 +Chr7_2370,Chr7,2370 +Chr7_2380,Chr7,2380 +Chr7_2390,Chr7,2390 +Chr7_2400,Chr7,2400 +Chr2_0070,Chr2,70 +Chr7_2410,Chr7,2410 +Chr7_2420,Chr7,2420 +Chr7_2430,Chr7,2430 
+Chr7_2440,Chr7,2440 +Chr7_2450,Chr7,2450 +Chr7_2460,Chr7,2460 +Chr7_2470,Chr7,2470 +Chr7_2480,Chr7,2480 +Chr7_2490,Chr7,2490 +Chr7_2500,Chr7,2500 +Chr2_0080,Chr2,80 +Chr7_2510,Chr7,2510 +Chr7_2520,Chr7,2520 +Chr7_2530,Chr7,2530 +Chr7_2540,Chr7,2540 +Chr7_2550,Chr7,2550 +Chr7_2560,Chr7,2560 +Chr7_2570,Chr7,2570 +Chr7_2580,Chr7,2580 +Chr7_2590,Chr7,2590 +Chr7_2600,Chr7,2600 +Chr2_0090,Chr2,90 +Chr7_2610,Chr7,2610 +Chr7_2620,Chr7,2620 +Chr7_2630,Chr7,2630 +Chr7_2640,Chr7,2640 +Chr7_2650,Chr7,2650 +Chr7_2660,Chr7,2660 +Chr7_2670,Chr7,2670 +Chr7_2680,Chr7,2680 +Chr7_2690,Chr7,2690 +Chr7_2700,Chr7,2700 +Chr2_0100,Chr2,100 +Chr7_2710,Chr7,2710 +Chr7_2720,Chr7,2720 +Chr7_2730,Chr7,2730 +Chr7_2740,Chr7,2740 +Chr7_2750,Chr7,2750 +Chr7_2760,Chr7,2760 +Chr7_2770,Chr7,2770 +Chr7_2780,Chr7,2780 +Chr7_2790,Chr7,2790 +Chr7_2800,Chr7,2800 +Chr1_0030,Chr1,30 +Chr2_0110,Chr2,110 +Chr7_2810,Chr7,2810 +Chr7_2820,Chr7,2820 +Chr7_2830,Chr7,2830 +Chr7_2840,Chr7,2840 +Chr7_2850,Chr7,2850 +Chr7_2860,Chr7,2860 +Chr7_2870,Chr7,2870 +Chr7_2880,Chr7,2880 +Chr8_2890,Chr8,2890 +Chr8_2900,Chr8,2900 +Chr2_0120,Chr2,120 +Chr8_2910,Chr8,2910 +Chr8_2920,Chr8,2920 +Chr8_2930,Chr8,2930 +Chr8_2940,Chr8,2940 +Chr8_2950,Chr8,2950 +Chr8_2960,Chr8,2960 +Chr8_2970,Chr8,2970 +Chr8_2980,Chr8,2980 +Chr8_2990,Chr8,2990 +Chr8_3000,Chr8,3000 +Chr2_0130,Chr2,130 +Chr8_3010,Chr8,3010 +Chr8_3020,Chr8,3020 +Chr8_3030,Chr8,3030 +Chr8_3040,Chr8,3040 +Chr8_3050,Chr8,3050 +Chr8_3060,Chr8,3060 +Chr8_3070,Chr8,3070 +Chr8_3080,Chr8,3080 +Chr8_3090,Chr8,3090 +Chr8_3100,Chr8,3100 +Chr2_0140,Chr2,140 +Chr8_3110,Chr8,3110 +Chr8_3120,Chr8,3120 +Chr8_3130,Chr8,3130 +Chr8_3140,Chr8,3140 +Chr8_3150,Chr8,3150 +Chr8_3160,Chr8,3160 +Chr8_3170,Chr8,3170 +Chr8_3180,Chr8,3180 +Chr8_3190,Chr8,3190 +Chr8_3200,Chr8,3200 +Chr2_0150,Chr2,150 +Chr8_3210,Chr8,3210 +Chr8_3220,Chr8,3220 +Chr8_3230,Chr8,3230 +Chr8_3240,Chr8,3240 +Chr8_3250,Chr8,3250 +Chr8_3260,Chr8,3260 +Chr8_3270,Chr8,3270 +Chr8_3280,Chr8,3280 +Chr8_3290,Chr8,3290 
+Chr8_3300,Chr8,3300 +Chr2_0160,Chr2,160 +Chr8_3310,Chr8,3310 +Chr8_3320,Chr8,3320 +Chr8_3330,Chr8,3330 +Chr8_3340,Chr8,3340 +Chr8_3350,Chr8,3350 +Chr8_3360,Chr8,3360 +Chr8_3370,Chr8,3370 +Chr8_3380,Chr8,3380 +Chr8_3390,Chr8,3390 +Chr8_3400,Chr8,3400 +Chr2_0170,Chr2,170 +Chr8_3410,Chr8,3410 +Chr8_3420,Chr8,3420 +Chr8_3430,Chr8,3430 +Chr8_3440,Chr8,3440 +Chr8_3450,Chr8,3450 +Chr8_3460,Chr8,3460 +Chr8_3470,Chr8,3470 +Chr8_3480,Chr8,3480 +Chr8_3490,Chr8,3490 +Chr8_3500,Chr8,3500 +Chr2_0180,Chr2,180 +Chr8_3510,Chr8,3510 +Chr8_3520,Chr8,3520 +Chr8_3530,Chr8,3530 +Chr8_3540,Chr8,3540 +Chr8_3550,Chr8,3550 +Chr8_3560,Chr8,3560 +Chr8_3570,Chr8,3570 +Chr8_3580,Chr8,3580 +Chr8_3590,Chr8,3590 +Chr8_3600,Chr8,3600 +Chr2_0190,Chr2,190 +Chr8_3610,Chr8,3610 +Chr8_3620,Chr8,3620 +Chr8_3630,Chr8,3630 +Chr8_3640,Chr8,3640 +Chr8_3650,Chr8,3650 +Chr8_3660,Chr8,3660 +Chr8_3670,Chr8,3670 +Chr8_3680,Chr8,3680 +Chr8_3690,Chr8,3690 +Chr8_3700,Chr8,3700 +Chr2_0200,Chr2,200 +Chr8_3710,Chr8,3710 +Chr8_3720,Chr8,3720 +Chr8_3730,Chr8,3730 +Chr8_3740,Chr8,3740 +Chr8_3750,Chr8,3750 +Chr8_3760,Chr8,3760 +Chr8_3770,Chr8,3770 +Chr8_3780,Chr8,3780 +Chr8_3790,Chr8,3790 +Chr8_3800,Chr8,3800 +Chr1_0040,Chr1,40 +Chr2_0210,Chr2,210 +Chr8_3810,Chr8,3810 +Chr8_3820,Chr8,3820 +Chr8_3830,Chr8,3830 +Chr8_3840,Chr8,3840 +Chr8_3850,Chr8,3850 +Chr8_3860,Chr8,3860 +Chr8_3870,Chr8,3870 +Chr8_3880,Chr8,3880 +Chr8_3890,Chr8,3890 +Chr8_3900,Chr8,3900 +Chr3_0220,Chr3,220 +Chr8_3910,Chr8,3910 +Chr8_3920,Chr8,3920 +Chr8_3930,Chr8,3930 +Chr8_3940,Chr8,3940 +Chr8_3950,Chr8,3950 +Chr8_3960,Chr8,3960 +Chr8_3970,Chr8,3970 +Chr8_3980,Chr8,3980 +Chr8_3990,Chr8,3990 +Chr8_4000,Chr8,4000 +Chr3_0230,Chr3,230 +Chr8_4010,Chr8,4010 +Chr8_4020,Chr8,4020 +Chr8_4030,Chr8,4030 +Chr8_4040,Chr8,4040 +Chr8_4050,Chr8,4050 +Chr8_4060,Chr8,4060 +Chr8_4070,Chr8,4070 +Chr8_4080,Chr8,4080 +Chr8_4090,Chr8,4090 +Chr8_4100,Chr8,4100 +Chr3_0240,Chr3,240 +Chr8_4110,Chr8,4110 +Chr8_4120,Chr8,4120 +Chr8_4130,Chr8,4130 +Chr8_4140,Chr8,4140 
+Chr8_4150,Chr8,4150 +Chr8_4160,Chr8,4160 +Chr8_4170,Chr8,4170 +Chr8_4180,Chr8,4180 +Chr8_4190,Chr8,4190 +Chr8_4200,Chr8,4200 +Chr3_0250,Chr3,250 +Chr8_4210,Chr8,4210 +Chr8_4220,Chr8,4220 +Chr8_4230,Chr8,4230 +Chr8_4240,Chr8,4240 +Chr8_4250,Chr8,4250 +Chr8_4260,Chr8,4260 +Chr8_4270,Chr8,4270 +Chr8_4280,Chr8,4280 +Chr8_4290,Chr8,4290 +Chr8_4300,Chr8,4300 +Chr3_0260,Chr3,260 +Chr8_4310,Chr8,4310 +Chr8_4320,Chr8,4320 +Chr8_4330,Chr8,4330 +Chr8_4340,Chr8,4340 +Chr8_4350,Chr8,4350 +Chr8_4360,Chr8,4360 +Chr8_4370,Chr8,4370 +Chr8_4380,Chr8,4380 +Chr8_4390,Chr8,4390 +Chr8_4400,Chr8,4400 +Chr3_0270,Chr3,270 +Chr8_4410,Chr8,4410 +Chr8_4420,Chr8,4420 +Chr8_4430,Chr8,4430 +Chr8_4440,Chr8,4440 +Chr8_4450,Chr8,4450 +Chr8_4460,Chr8,4460 +Chr8_4470,Chr8,4470 +Chr8_4480,Chr8,4480 +Chr8_4490,Chr8,4490 +Chr8_4500,Chr8,4500 +Chr3_0280,Chr3,280 +Chr8_4510,Chr8,4510 +Chr8_4520,Chr8,4520 +Chr9_4530,Chr9,4530 +Chr9_4540,Chr9,4540 +Chr9_4550,Chr9,4550 +Chr9_4560,Chr9,4560 +Chr9_4570,Chr9,4570 +Chr9_4580,Chr9,4580 +Chr9_4590,Chr9,4590 +Chr9_4600,Chr9,4600 +Chr3_0290,Chr3,290 +Chr9_4610,Chr9,4610 +Chr9_4620,Chr9,4620 +Chr9_4630,Chr9,4630 +Chr9_4640,Chr9,4640 +Chr9_4650,Chr9,4650 +Chr9_4660,Chr9,4660 +Chr9_4670,Chr9,4670 +Chr9_4680,Chr9,4680 +Chr9_4690,Chr9,4690 +Chr9_4700,Chr9,4700 +Chr3_0300,Chr3,300 +Chr9_4710,Chr9,4710 +Chr9_4720,Chr9,4720 +Chr9_4730,Chr9,4730 +Chr9_4740,Chr9,4740 +Chr9_4750,Chr9,4750 +Chr9_4760,Chr9,4760 +Chr9_4770,Chr9,4770 +Chr9_4780,Chr9,4780 +Chr9_4790,Chr9,4790 +Chr9_4800,Chr9,4800 +Chr1_0050,Chr1,50 +Chr3_0310,Chr3,310 +Chr9_4810,Chr9,4810 +Chr3_0320,Chr3,320 +Chr3_0330,Chr3,330 +Chr3_0340,Chr3,340 +Chr3_0350,Chr3,350 +Chr3_0360,Chr3,360 +Chr3_0370,Chr3,370 +Chr3_0380,Chr3,380 +Chr3_0390,Chr3,390 +Chr3_0400,Chr3,400 +Chr1_0060,Chr1,60 +Chr3_0410,Chr3,410 +Chr3_0420,Chr3,420 +Chr3_0430,Chr3,430 +Chr3_0440,Chr3,440 +Chr3_0450,Chr3,450 +Chr3_0460,Chr3,460 +Chr3_0470,Chr3,470 +Chr3_0480,Chr3,480 +Chr3_0490,Chr3,490 +Chr3_0500,Chr3,500 +Chr1_0070,Chr1,70 
+Chr3_0510,Chr3,510 +Chr3_0520,Chr3,520 +Chr3_0530,Chr3,530 +Chr3_0540,Chr3,540 +Chr3_0550,Chr3,550 +Chr3_0560,Chr3,560 +Chr3_0570,Chr3,570 +Chr3_0580,Chr3,580 +Chr3_0590,Chr3,590 +Chr3_0600,Chr3,600 +Chr1_0080,Chr1,80 +Chr3_0610,Chr3,610 +Chr4_0620,Chr4,620 +Chr4_0630,Chr4,630 +Chr4_0640,Chr4,640 +Chr4_0650,Chr4,650 +Chr4_0660,Chr4,660 +Chr4_0670,Chr4,670 +Chr4_0680,Chr4,680 +Chr4_0690,Chr4,690 +Chr4_0700,Chr4,700 +Chr1_0090,Chr1,90 +Chr4_0710,Chr4,710 +Chr4_0720,Chr4,720 +Chr4_0730,Chr4,730 +Chr4_0740,Chr4,740 +Chr4_0750,Chr4,750 +Chr4_0760,Chr4,760 +Chr4_0770,Chr4,770 +Chr4_0780,Chr4,780 +Chr4_0790,Chr4,790 +Chr4_0800,Chr4,800 diff --git a/man/fixMADC.Rd b/man/fixMADC.Rd new file mode 100644 index 0000000..986ce7e --- /dev/null +++ b/man/fixMADC.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fixMADC.R +\name{fixMADC} +\alias{fixMADC} +\title{Fix MADC File Allele IDs} +\usage{ +fixMADC(madc.file, marker.file, n.summary.columns = NULL, output.file = NULL) +} +\arguments{ +\item{madc.file}{Path to the MADC file to be filtered} + +\item{marker.file}{Path to the three column marker ID file. +\itemize{ +\item The first column is the existing list of unique CloneIDs (obtained from raw MADC CloneID), +where each row is a unique CloneID. +\item The second column is the chromosome that the marker is located (ie Chr01). No special characters (*#_-!.) are permitted in the +chromosome name. +\item The third column is the numeric position of the marker within the chromosome (ie 1234). No special characters (*#_-!.) are permitted. +}} + +\item{n.summary.columns}{(optional) Number of summary columns to remove from MADC file not including the first three. 
Otherwise, the columns will be automatically detected and removed.} + +\item{output.file}{Path to save the fixed allele ID MADC file (if NULL, data will not be saved)} +} +\value{ +data.frame or saved csv file +} +\description{ +Process raw MADC files to format and update the allele IDs with user supplied Chr and Pos information +} +\details{ +This function can process raw MADC files to update the Allele IDs and Clone IDs to the Chr_Pos format with a user supplied file. +The output MADC will be the standard fixed allele ID format to support use in madc2vcf and BIGapp functions. +} +\examples{ +#Example + +#Example MADC +madc_file <- system.file("iris_DArT_MADC.csv", package="BIGr") +marker_file <- system.file("iris_MADC_marker_file.csv", package="BIGr") + +#Fix the raw MADC file IDs to use the user provided Chr_Pos format +fixedMADC_df <- fixMADC(madc.file = madc_file, + marker.file = marker_file, + n.summary.columns = NULL, + output.file = NULL) + + + +} diff --git a/tests/testthat/test-fixMADC.R b/tests/testthat/test-fixMADC.R new file mode 100644 index 0000000..a51a53b --- /dev/null +++ b/tests/testthat/test-fixMADC.R @@ -0,0 +1,616 @@ +library(testthat) +library(dplyr) +library(stringr) + +# ── Helpers: create temporary test fixtures ────────────────────────────────── + +#' Build a raw (unprocessed) MADC CSV with 7 filler rows of "*" +create_raw_madc <- function(path, clone_ids, allele_ids, sample_names, counts_matrix, + include_summary_cols = TRUE) { + n_rows <- length(clone_ids) + n_samples <- length(sample_names) + + # Summary columns matching the full set the function auto-removes + summary_col_names <- c("ClusterConsensusSequence", "CallRate", "OneRatioRef", + "OneRatioSnp", "FreqHomRef", "FreqHomSnp", "FreqHets", + "PICRef", "PICSnp", "AvgPIC", "AvgCountRef", + "AvgCountSnp", "RatioAvgCountRefAvgCountSnp") + extra_cols <- if (include_summary_cols) length(summary_col_names) else 0 + total_cols <- 3 + extra_cols + n_samples # AlleleID, CloneID, 
AlleleSequence + summary + samples + + # 7 filler rows: col 1 = "*", other cols have placeholder metadata (like real DArT files) + filler <- matrix("*", nrow = 7, ncol = total_cols) + + # Header row — real MADC: AlleleID (col1), CloneID (col2), AlleleSequence (col3) + summary_names <- if (include_summary_cols) summary_col_names else character(0) + header <- c("AlleleID", "CloneID", "AlleleSequence", summary_names, sample_names) + + # Data rows + allele_seqs <- rep("ACTCTCAGGTGGAT", n_rows) + data_rows <- cbind( + AlleleID = allele_ids, + CloneID = clone_ids, + AlleleSequence = allele_seqs, + if (include_summary_cols) { + data.frame( + ClusterConsensusSequence = rep("ACTCTCAGGTGGAT", n_rows), + CallRate = 1.0, OneRatioRef = 1.0, OneRatioSnp = 1.0, + FreqHomRef = 0.5, FreqHomSnp = 0.3, FreqHets = 0.2, + PICRef = 0.4, PICSnp = 0.3, AvgPIC = 0.35, + AvgCountRef = 100, AvgCountSnp = 80, + RatioAvgCountRefAvgCountSnp = 1.25, + stringsAsFactors = FALSE + ) + } else { + NULL + }, + counts_matrix + ) + + # Write filler rows first (no header), then header + data + con <- file(path, open = "wt") + for (i in seq_len(nrow(filler))) { + writeLines(paste(filler[i, ], collapse = ","), con) + } + writeLines(paste(header, collapse = ","), con) + for (i in seq_len(nrow(data_rows))) { + writeLines(paste(data_rows[i, ], collapse = ","), con) + } + close(con) + invisible(path) +} + +#' Build a preprocessed (no filler rows) MADC CSV +create_preprocessed_madc <- function(path, clone_ids, allele_ids, sample_names, counts_matrix, + include_summary_cols = TRUE) { + n_rows <- length(clone_ids) + + df <- data.frame( + AlleleID = allele_ids, + CloneID = clone_ids, + AlleleSequence = rep("ACTCTCAGGTGGAT", n_rows), + stringsAsFactors = FALSE + ) + if (include_summary_cols) { + df$ClusterConsensusSequence <- rep("ACTCTCAGGTGGAT", n_rows) + df$CallRate <- 1.0 + df$OneRatioRef <- 1.0 + df$OneRatioSnp <- 1.0 + df$FreqHomRef <- 0.5 + df$FreqHomSnp <- 0.3 + df$FreqHets <- 0.2 + df$PICRef <- 0.4 + 
df$PICSnp <- 0.3 + df$AvgPIC <- 0.35 + df$AvgCountRef <- 100 + df$AvgCountSnp <- 80 + df$RatioAvgCountRefAvgCountSnp <- 1.25 + } + df <- cbind(df, counts_matrix) + colnames(df)[(ncol(df) - ncol(counts_matrix) + 1):ncol(df)] <- sample_names + + write.csv(df, path, row.names = FALSE) + invisible(path) +} + +#' Build a 3-column marker CSV +create_marker_file <- function(path, clone_ids, chromosomes, positions) { + df <- data.frame(CloneID = clone_ids, Chr = chromosomes, Pos = positions, + stringsAsFactors = FALSE) + write.csv(df, path, row.names = FALSE) + invisible(path) +} + + +# ── Shared fixtures ────────────────────────────────────────────────────────── + +sample_names <- c("SampleA", "SampleB", "SampleC") +clone_ids <- c("marker1", "marker2", "marker3", "marker4") +allele_ids <- paste0(clone_ids, "|Ref") +counts_matrix <- data.frame( + SampleA = c(10, 20, 30, 40), + SampleB = c(15, 25, 35, 45), + SampleC = c(12, 22, 32, 42) +) + + + + + + + + + + + + + + + + + + + + + + + +# ── 1. Basic happy-path with raw MADC (7 filler rows) ─────────────────────── + +test_that("fixMADC correctly processes a raw MADC file and replaces IDs", { + madc_path <- tempfile(fileext = ".csv") + marker_path <- tempfile(fileext = ".csv") + + create_raw_madc(madc_path, clone_ids, allele_ids, sample_names, counts_matrix, + include_summary_cols = TRUE) + create_marker_file(marker_path, clone_ids, + c("Chr01", "Chr01", "Chr02", "Chr02"), + c(100, 200, 300, 400)) + + result <- fixMADC(madc.file = madc_path, + marker.file = marker_path) + + # Should return a data.frame + + expect_s3_class(result, "data.frame") + + + # CloneIDs should now be Chr_Pos format + + expect_equal(unname(result$CloneID), c("Chr01_000000100", "Chr01_000000200", "Chr02_000000300", "Chr02_000000400")) + + # AlleleID should have the new prefix with |Ref_0001 suffix + expect_true(all(grepl("\\|Ref_0001$", result$AlleleID))) + expect_equal(unname(result$AlleleID), c("Chr01_000000100|Ref_0001", "Chr01_000000200|Ref_0001", 
"Chr02_000000300|Ref_0001", "Chr02_000000400|Ref_0001")) + + # Summary columns should be removed + summary_cols <- c("ClusterConsensusSequence", "CallRate", "OneRatioRef", + "OneRatioSnp", "FreqHomRef", "FreqHomSnp", "FreqHets", + "PICRef", "PICSnp", "AvgPIC", "AvgCountRef", + "AvgCountSnp", "RatioAvgCountRefAvgCountSnp") + expect_false(any(summary_cols %in% colnames(result))) + + # Sample columns should still be present + expect_true(all(sample_names %in% colnames(result))) +}) + + +# ── 1b. Marker file in different order than MADC ───────────────────────────── + +test_that("fixMADC correctly maps IDs when marker file order differs from MADC", { + madc_path <- tempfile(fileext = ".csv") + marker_path <- tempfile(fileext = ".csv") + + create_raw_madc(madc_path, clone_ids, allele_ids, sample_names, counts_matrix, + include_summary_cols = TRUE) + + # Marker file in reversed order relative to MADC + create_marker_file(marker_path, + rev(clone_ids), + c("Chr02", "Chr02", "Chr01", "Chr01"), + c(400, 300, 200, 100)) + + result <- fixMADC(madc.file = madc_path, + marker.file = marker_path) + + # marker1 -> Chr01_100, marker2 -> Chr01_200, marker3 -> Chr02_300, marker4 -> Chr02_400 + # (same mapping regardless of marker file row order) + expect_equal(unname(result$CloneID), c("Chr01_000000100", "Chr01_000000200", "Chr02_000000300", "Chr02_000000400")) + expect_equal(unname(result$AlleleID), c("Chr01_000000100|Ref_0001", "Chr01_000000200|Ref_0001", "Chr02_000000300|Ref_0001", "Chr02_000000400|Ref_0001")) +}) + + +# ── 1c. 
Duplicate CloneIDs with different AlleleID suffixes ────────────────── + +test_that("fixMADC correctly handles multiple rows per CloneID (Ref/Alt pairs)", { + madc_path <- tempfile(fileext = ".csv") + marker_path <- tempfile(fileext = ".csv") + + # Two markers, each with a Ref and Alt row = 4 rows total + dup_clone_ids <- c("marker1", "marker1", "marker2", "marker2") + dup_allele_ids <- c("marker1|Ref", "marker1|Alt", "marker2|Ref", "marker2|Alt") + dup_counts <- data.frame( + SampleA = c(10, 5, 20, 8), + SampleB = c(15, 3, 25, 6), + SampleC = c(12, 4, 22, 7) + ) + + create_raw_madc(madc_path, dup_clone_ids, dup_allele_ids, sample_names, dup_counts, + include_summary_cols = TRUE) + create_marker_file(marker_path, + c("marker1", "marker2"), + c("Chr01", "Chr02"), + c(100, 200)) + + result <- fixMADC(madc.file = madc_path, + marker.file = marker_path) + + # All 4 rows should be present + + expect_equal(nrow(result), 4) + + # CloneIDs should both be updated (duplicates are expected) + expect_equal(unname(result$CloneID), c("Chr01_000000100", "Chr01_000000100", "Chr02_000000200", "Chr02_000000200")) + + # AlleleIDs should have updated suffixes: Ref -> Ref_0001, Alt -> Alt_0002 + expect_equal(unname(result$AlleleID), + c("Chr01_000000100|Ref_0001", "Chr01_000000100|Alt_0002", "Chr02_000000200|Ref_0001", "Chr02_000000200|Alt_0002")) + + # Sample data should be intact + expect_equal(result$SampleA, c(10, 5, 20, 8)) +}) + + +# ── 2. 
Preprocessed MADC (no filler rows) triggers error ──────────────────── + +test_that("fixMADC errors when MADC appears preprocessed (no filler rows)", { + madc_path <- tempfile(fileext = ".csv") + marker_path <- tempfile(fileext = ".csv") + + create_preprocessed_madc(madc_path, clone_ids, allele_ids, sample_names, counts_matrix) + create_marker_file(marker_path, clone_ids, + c("Chr01", "Chr01", "Chr02", "Chr02"), + c(100, 200, 300, 400)) + + expect_error( + fixMADC(madc.file = madc_path, marker.file = marker_path), + "already use fixed allele IDs" + ) + +}) + + +# ── 3. Manual n.summary.columns removal ───────────────────────────────────── + +test_that("n.summary.columns removes the correct number of extra columns", { + madc_path <- tempfile(fileext = ".csv") + marker_path <- tempfile(fileext = ".csv") + + create_raw_madc(madc_path, clone_ids, allele_ids, sample_names, counts_matrix, + include_summary_cols = TRUE) + create_marker_file(marker_path, clone_ids, + c("Chr01", "Chr01", "Chr02", "Chr02"), + c(100, 200, 300, 400)) + + # 13 summary columns start at col 4, so columns 4:16 + result <- fixMADC(madc.file = madc_path, + marker.file = marker_path, + n.summary.columns = 13) + + expect_s3_class(result, "data.frame") + expect_false("CallRate" %in% colnames(result)) + expect_false("ClusterConsensusSequence" %in% colnames(result)) + expect_true(all(sample_names %in% colnames(result))) +}) + + +# ── 4. 
# Marker file validation — duplicate CloneIDs ──────────────────────────

test_that("fixMADC errors on duplicate marker IDs in column 1", {
  madc_csv <- tempfile(fileext = ".csv")
  marker_csv <- tempfile(fileext = ".csv")

  create_raw_madc(madc_csv, clone_ids, allele_ids, sample_names, counts_matrix)

  # "marker1" is listed twice in the ID column.
  repeated_ids <- c("marker1", "marker1", "marker2", "marker3")
  create_marker_file(
    marker_csv,
    repeated_ids,
    c("Chr01", "Chr01", "Chr02", "Chr02"),
    c(100, 200, 300, 400)
  )

  expect_error(
    fixMADC(madc.file = madc_csv, marker.file = marker_csv),
    regexp = "duplicate marker IDs"
  )
})


# ── 5. Marker file validation — duplicate Chr_Pos ───────────────────────────

test_that("fixMADC errors on duplicate Chr_Pos combinations", {
  madc_csv <- tempfile(fileext = ".csv")
  marker_csv <- tempfile(fileext = ".csv")

  create_raw_madc(madc_csv, clone_ids, allele_ids, sample_names, counts_matrix)

  # The first two markers both map to Chr01 / 100.
  clashing_positions <- c(100, 100, 300, 400)
  create_marker_file(
    marker_csv,
    clone_ids,
    c("Chr01", "Chr01", "Chr02", "Chr02"),
    clashing_positions
  )

  expect_error(
    fixMADC(madc.file = madc_csv, marker.file = marker_csv),
    regexp = "duplicate Chr and Pos"
  )
})


# ── 6. Marker file validation — special chars in chromosome ─────────────────

test_that("fixMADC errors on special characters in chromosome column", {
  madc_csv <- tempfile(fileext = ".csv")
  marker_csv <- tempfile(fileext = ".csv")

  # Only the first two markers are needed for this check.
  first_two <- 1:2

  create_raw_madc(
    madc_csv, clone_ids[first_two], allele_ids[first_two], sample_names,
    counts_matrix[first_two, ]
  )

  # "Chr_01" carries an underscore in the chromosome name.
  bad_chroms <- c("Chr_01", "Chr02")
  create_marker_file(marker_csv, clone_ids[first_two], bad_chroms, c(100, 200))

  expect_error(
    fixMADC(madc.file = madc_csv, marker.file = marker_csv),
    regexp = "Special characters.*chromosome"
  )
})


# ── 7.
# Marker file validation — special chars in position ───────────────────

test_that("fixMADC errors on special characters in position column", {
  madc_csv <- tempfile(fileext = ".csv")
  marker_csv <- tempfile(fileext = ".csv")

  # Only the first two markers are needed for this check.
  first_two <- 1:2

  create_raw_madc(
    madc_csv, clone_ids[first_two], allele_ids[first_two], sample_names,
    counts_matrix[first_two, ]
  )

  # "1.5" contains a decimal point.
  decimal_positions <- c("1.5", "200")
  create_marker_file(
    marker_csv,
    clone_ids[first_two],
    c("Chr01", "Chr02"),
    decimal_positions
  )

  # Either of the two validation messages is accepted.
  expect_error(
    fixMADC(madc.file = madc_csv, marker.file = marker_csv),
    regexp = "Special characters.*position|position.*must be numeric"
  )
})


# ── 8. Marker file validation — non-numeric position ────────────────────────

test_that("fixMADC errors when position column is not numeric", {
  madc_csv <- tempfile(fileext = ".csv")
  marker_csv <- tempfile(fileext = ".csv")

  first_two <- 1:2

  create_raw_madc(
    madc_csv, clone_ids[first_two], allele_ids[first_two], sample_names,
    counts_matrix[first_two, ]
  )

  # Positions that are plain text rather than numbers.
  text_positions <- c("abc", "def")
  create_marker_file(
    marker_csv,
    clone_ids[first_two],
    c("Chr01", "Chr02"),
    text_positions
  )

  expect_error(
    fixMADC(madc.file = madc_csv, marker.file = marker_csv),
    regexp = "must be numeric"
  )
})


# ── 9. Missing CloneIDs in marker file triggers warning + removal ───────────

test_that("fixMADC warns and removes CloneIDs not in marker file", {
  madc_csv <- tempfile(fileext = ".csv")
  marker_csv <- tempfile(fileext = ".csv")

  create_raw_madc(madc_csv, clone_ids, allele_ids, sample_names, counts_matrix)

  # The marker file lists only the first three of the four CloneIDs.
  listed_ids <- clone_ids[1:3]
  create_marker_file(
    marker_csv,
    listed_ids,
    c("Chr01", "Chr01", "Chr02"),
    c(100, 200, 300)
  )

  expect_warning(
    fixed <- fixMADC(madc.file = madc_csv, marker.file = marker_csv),
    regexp = "not found in the marker file"
  )

  # The unlisted marker's row is dropped, leaving three rows.
  expect_equal(nrow(fixed), 3)
  expect_false("marker4" %in% fixed$CloneID)
})


# ── 10.
# Output file is written when output.file is provided ──────────────────

test_that("fixMADC writes CSV when output.file is specified", {
  madc_csv <- tempfile(fileext = ".csv")
  marker_csv <- tempfile(fileext = ".csv")
  # Used as a prefix: the file is expected at "<prefix>_fixedID.csv".
  output_prefix <- tempfile()

  create_raw_madc(madc_csv, clone_ids, allele_ids, sample_names, counts_matrix)
  create_marker_file(
    marker_csv,
    clone_ids,
    c("Chr01", "Chr01", "Chr02", "Chr02"),
    c(100, 200, 300, 400)
  )

  fixMADC(
    madc.file = madc_csv,
    marker.file = marker_csv,
    output.file = output_prefix
  )

  csv_on_disk <- paste0(output_prefix, "_fixedID.csv")
  expect_true(file.exists(csv_on_disk))

  # Read the file back and check the renamed CloneIDs.
  reloaded <- read.csv(csv_on_disk)
  expected_clone_ids <- c(
    "Chr01_000000100", "Chr01_000000200",
    "Chr02_000000300", "Chr02_000000400"
  )
  expect_equal(reloaded$CloneID, expected_clone_ids)
})


# ── 11. AlleleID with Alt suffix is handled ─────────────────────────────────

test_that("fixMADC correctly handles |Alt suffixes in AlleleID", {
  madc_csv <- tempfile(fileext = ".csv")
  marker_csv <- tempfile(fileext = ".csv")

  # Every row is an Alt allele of its own marker.
  alt_only_ids <- paste0(clone_ids, "|Alt")

  create_raw_madc(madc_csv, clone_ids, alt_only_ids, sample_names, counts_matrix)
  create_marker_file(
    marker_csv,
    clone_ids,
    c("Chr01", "Chr01", "Chr02", "Chr02"),
    c(100, 200, 300, 400)
  )

  fixed <- fixMADC(madc.file = madc_csv, marker.file = marker_csv)

  has_alt_suffix <- grepl("\\|Alt_0002$", fixed$AlleleID)
  expect_true(all(has_alt_suffix))
  expect_equal(unname(fixed$AlleleID[1]), "Chr01_000000100|Alt_0002")
})


# ── 11b.
# AlleleID with |RefMatch and |AltMatch suffixes ──────────────────────

test_that("fixMADC correctly handles |RefMatch and |AltMatch suffixes", {
  madc_csv <- tempfile(fileext = ".csv")
  marker_csv <- tempfile(fileext = ".csv")

  # One Match-type row per marker, alternating RefMatch / AltMatch.
  match_only_ids <- c(
    "marker1|RefMatch", "marker2|AltMatch",
    "marker3|RefMatch", "marker4|AltMatch"
  )

  create_raw_madc(madc_csv, clone_ids, match_only_ids, sample_names, counts_matrix)
  create_marker_file(
    marker_csv,
    clone_ids,
    c("Chr01", "Chr01", "Chr02", "Chr02"),
    c(100, 200, 300, 400)
  )

  fixed <- fixMADC(madc.file = madc_csv, marker.file = marker_csv)

  expected_allele_ids <- c(
    "Chr01_000000100|RefMatch_0001", "Chr01_000000200|AltMatch_0001",
    "Chr02_000000300|RefMatch_0001", "Chr02_000000400|AltMatch_0001"
  )
  expect_equal(unname(fixed$AlleleID), expected_allele_ids)
})


# ── 11c. Duplicate RefMatch/AltMatch suffixes are numbered ──────────────────

test_that("fixMADC appends _0001, _0002 etc. to duplicate RefMatch/AltMatch within a CloneID", {
  madc_csv <- tempfile(fileext = ".csv")
  marker_csv <- tempfile(fileext = ".csv")

  # One marker with Ref, Alt, 2x RefMatch, 3x AltMatch = 7 rows
  single_marker_ids <- rep("marker1", 7)
  stacked_allele_ids <- c(
    "marker1|Ref",
    "marker1|Alt",
    rep("marker1|RefMatch", 2),
    rep("marker1|AltMatch", 3)
  )
  stacked_counts <- data.frame(
    SampleA = c(10, 5, 8, 12, 3, 6, 2),
    SampleB = c(15, 3, 9, 11, 4, 7, 1),
    SampleC = c(12, 4, 7, 13, 2, 5, 3)
  )

  create_raw_madc(madc_csv, single_marker_ids, stacked_allele_ids, sample_names, stacked_counts)
  create_marker_file(marker_csv, "marker1", "Chr01", 100)

  fixed <- fixMADC(madc.file = madc_csv, marker.file = marker_csv)

  expect_equal(nrow(fixed), 7)

  # Expected IDs, in row order:
  #   rows 1-2: Ref always gets _0001, Alt always gets _0002
  #   rows 3-4: duplicate RefMatch rows are numbered _0001, _0002
  #   rows 5-7: duplicate AltMatch rows are numbered _0001, _0002, _0003
  expected_by_row <- c(
    "Chr01_000000100|Ref_0001",
    "Chr01_000000100|Alt_0002",
    "Chr01_000000100|RefMatch_0001",
    "Chr01_000000100|RefMatch_0002",
    "Chr01_000000100|AltMatch_0001",
    "Chr01_000000100|AltMatch_0002",
    "Chr01_000000100|AltMatch_0003"
  )

  # One expectation per row, so a failure points at the offending row.
  for (row_idx in seq_along(expected_by_row)) {
    expect_equal(unname(fixed$AlleleID[row_idx]), expected_by_row[[row_idx]])
  }
})


# ── 11d. All suffix types are numbered even when unique per CloneID ───────────

test_that("fixMADC numbers all suffix types including single RefMatch/AltMatch per CloneID", {
  madc_csv <- tempfile(fileext = ".csv")
  marker_csv <- tempfile(fileext = ".csv")

  # Two markers, each with Ref, Alt, one RefMatch, one AltMatch
  two_marker_ids <- rep(c("marker1", "marker2"), each = 4)
  two_marker_allele_ids <- c(
    "marker1|Ref", "marker1|Alt", "marker1|RefMatch", "marker1|AltMatch",
    "marker2|Ref", "marker2|Alt", "marker2|RefMatch", "marker2|AltMatch"
  )
  two_marker_counts <- data.frame(
    SampleA = c(10, 5, 8, 3, 20, 8, 12, 6),
    SampleB = c(15, 3, 9, 4, 25, 6, 11, 7),
    SampleC = c(12, 4, 7, 2, 22, 7, 13, 5)
  )

  create_raw_madc(madc_csv, two_marker_ids, two_marker_allele_ids, sample_names, two_marker_counts)
  create_marker_file(
    marker_csv,
    c("marker1", "marker2"),
    c("Chr01", "Chr02"),
    c(100, 200)
  )

  fixed <- fixMADC(madc.file = madc_csv, marker.file = marker_csv)

  # All suffixes are always numbered: Ref_0001, Alt_0002, RefMatch_0001, AltMatch_0001
  expected_allele_ids <- c(
    "Chr01_000000100|Ref_0001", "Chr01_000000100|Alt_0002",
    "Chr01_000000100|RefMatch_0001", "Chr01_000000100|AltMatch_0001",
    "Chr02_000000200|Ref_0001", "Chr02_000000200|Alt_0002",
    "Chr02_000000200|RefMatch_0001", "Chr02_000000200|AltMatch_0001"
  )
  expect_equal(unname(fixed$AlleleID), expected_allele_ids)
})


# ── 12.
# Whitespace in marker file is trimmed ────────────────────────────────

test_that("fixMADC trims whitespace in marker file columns", {
  madc_csv <- tempfile(fileext = ".csv")
  marker_csv <- tempfile(fileext = ".csv")

  first_two <- 1:2

  create_raw_madc(
    madc_csv, clone_ids[first_two], allele_ids[first_two], sample_names,
    counts_matrix[first_two, ]
  )

  # Write the marker file by hand so every column carries stray
  # leading/trailing spaces.
  padded_markers <- data.frame(
    CloneID = c(" marker1 ", " marker2"),
    Chr = c("Chr01 ", " Chr02"),
    Pos = c(" 100", "200 "),
    stringsAsFactors = FALSE
  )
  write.csv(padded_markers, marker_csv, row.names = FALSE)

  fixed <- fixMADC(madc.file = madc_csv, marker.file = marker_csv)

  expected_clone_ids <- c("Chr01_000000100", "Chr02_000000200")
  expect_equal(unname(fixed$CloneID), expected_clone_ids)
})


# ── 13. Returns NULL (invisibly) when writing to file ────────────────────────

test_that("fixMADC returns NULL when output.file is provided", {
  madc_csv <- tempfile(fileext = ".csv")
  marker_csv <- tempfile(fileext = ".csv")
  output_prefix <- tempfile()

  create_raw_madc(madc_csv, clone_ids, allele_ids, sample_names, counts_matrix)
  create_marker_file(
    marker_csv,
    clone_ids,
    c("Chr01", "Chr01", "Chr02", "Chr02"),
    c(100, 200, 300, 400)
  )

  returned <- fixMADC(
    madc.file = madc_csv,
    marker.file = marker_csv,
    output.file = output_prefix
  )

  # With output.file supplied, the call is expected to hand back NULL.
  expect_null(returned)
})


# ── 14. Empty marker file (no matching IDs) removes all rows ────────────────

test_that("fixMADC warns and returns empty df when no CloneIDs match", {
  madc_csv <- tempfile(fileext = ".csv")
  marker_csv <- tempfile(fileext = ".csv")

  create_raw_madc(madc_csv, clone_ids, allele_ids, sample_names, counts_matrix)

  # The marker file lists only the IDs "noMatch1" and "noMatch2".
  unrelated_ids <- c("noMatch1", "noMatch2")
  create_marker_file(
    marker_csv,
    unrelated_ids,
    c("Chr01", "Chr02"),
    c(100, 200)
  )

  expect_warning(
    fixed <- fixMADC(madc.file = madc_csv, marker.file = marker_csv),
    regexp = "not found in the marker file"
  )

  expect_equal(nrow(fixed), 0)
})