From 778aefaeedcf121f2243cdbecb1124f5d50af639 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Fri, 3 Oct 2025 15:11:40 -0400 Subject: [PATCH 01/37] indels support for madc2vcf_targets --- NAMESPACE | 1 + R/check_madc_sanity.R | 105 ++++++++ R/madc2vcf_targets.R | 345 +++++++++++++++--------- man/check_madc_sanity.Rd | 43 +++ man/madc2vcf_targets.Rd | 118 ++++++-- tests/testthat/test-check_madc_sanity.R | 10 + 6 files changed, 474 insertions(+), 148 deletions(-) create mode 100644 R/check_madc_sanity.R create mode 100644 man/check_madc_sanity.Rd create mode 100644 tests/testthat/test-check_madc_sanity.R diff --git a/NAMESPACE b/NAMESPACE index d47d95d..443314d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,6 +4,7 @@ export(allele_freq_poly) export(calculate_Het) export(calculate_MAF) export(check_homozygous_trios) +export(check_madc_sanity) export(check_ped) export(check_replicates) export(dosage2vcf) diff --git a/R/check_madc_sanity.R b/R/check_madc_sanity.R new file mode 100644 index 0000000..19d3ff6 --- /dev/null +++ b/R/check_madc_sanity.R @@ -0,0 +1,105 @@ +#' Run basic sanity checks on a MADC-style allele report +#' +#' @description +#' Performs five quick validations on an allele report: +#' 1) **Columns** – required columns are present (`CloneID`, `AlleleID`, `AlleleSequence`); +#' 2) **FixAlleleIDs** – first column’s first up-to-6 rows are not all blank or "*" +#' *and* both `_0001` and `_0002` appear in `AlleleID`; +#' 3) **IUPACcodes** – presence of non-ATCG characters in `AlleleSequence`; +#' 4) **LowerCase** – presence of lowercase a/t/c/g in `AlleleSequence`; +#' 5) **Indels** – reference/alternate allele lengths differ for the same `CloneID`. +#' +#' @param report A `data.frame` with at least the columns +#' `CloneID`, `AlleleID`, and `AlleleSequence`. The first column is also +#' used in the “FixAlleleIDs” check to inspect its first up to six entries. +#' +#' @details +#' - **IUPAC check:** Flags any character outside `ATCG` (case-insensitive), +#' which will include ambiguity codes (`N`, `R`, `Y`, etc.) and symbols like `-`. +#' - **Indels:** Rows are split by `AlleleID` containing `"Ref_0001"` vs `"Alt_0002"`, +#' merged by `CloneID`, and the lengths of `AlleleSequence` are compared. +#' - If required columns are missing, only **Columns** is evaluated (`FALSE`) and +#' `indel_clone_ids` is returned as `NULL`. +#' +#' @return A list with: +#' \describe{ +#' \item{checks}{Named logical vector with entries +#' `Columns`, `FixAlleleIDs`, `IUPACcodes`, `LowerCase`, `Indels`.} +#' \item{indel_clone_ids}{Character vector of `CloneID`s where ref/alt lengths differ. +#' Returns `character(0)` if none, or `NULL` when required columns are missing.} +#' } +#' +#' +#' @export +check_madc_sanity <- function(report) { + + # Initialize + checks <- c(Columns = NA, FixAlleleIDs = NA, IUPACcodes = NA, LowerCase = NA, Indels = NA, ChromPos = NA) + messages <- list(Columns = NA, FixAlleleIDs = NA, IUPACcodes = NA, LowerCase = NA, Indels = NA, ChromPos = NA) + + # Validate required columns + required <- c("CloneID", "AlleleID", "AlleleSequence") + missing_cols <- setdiff(required, names(report)) + checks["Columns"] <- length(missing_cols) == 0 + + if(checks[["Columns"]]){ + # ---- FixAlleleIDs ---- + # Check if first up-to-6 entries in the *first column* are all "" or "*" + n <- nrow(report) + idx <- seq_len(min(6L, n)) + first_col_vals <- report[[1]][idx] + all_blank_or_star <- all(first_col_vals %in% c("", "*"), na.rm = TRUE) + # Also require that both _0001 and _0002 appear in AlleleID + has_0001 <- any(grepl("_0001", report$AlleleID, fixed = TRUE), na.rm = TRUE) + has_0002 <- any(grepl("_0002", report$AlleleID, fixed = TRUE), na.rm = TRUE) + checks["FixAlleleIDs"] <- (!all_blank_or_star) & has_0001 & has_0002 + + # ---- IUPACcodes ---- + iu <- grepl("[^ATCG]", report$AlleleSequence, ignore.case = TRUE) + checks["IUPACcodes"] <- any(iu, na.rm = TRUE) + + # ---- LowerCase ---- + lc <- grepl("[atcg]", report$AlleleSequence) + checks["LowerCase"] <- any(lc, na.rm = TRUE) + + # ---- Indels ---- + refs <- subset(report, grepl("Ref_0001", AlleleID, fixed = TRUE), + select = c(CloneID, AlleleID, AlleleSequence)) + alts <- subset(report, grepl("Alt_0002", AlleleID, fixed = TRUE), + select = c(CloneID, AlleleID, AlleleSequence)) + + merged <- merge(refs, alts, by = "CloneID", suffixes = c("_ref", "_alt"), all = FALSE) + + if (nrow(merged) > 0) { + ref_len <- nchar(merged$AlleleSequence_ref, keepNA = TRUE) + alt_len <- nchar(merged$AlleleSequence_alt, keepNA = TRUE) + cmp_ok <- !is.na(ref_len) & !is.na(alt_len) + indel_mask <- cmp_ok & (ref_len != alt_len) + checks["Indels"] <- any(indel_mask) + indels <- if (any(indel_mask)) merged$CloneID[indel_mask] else character(0) + } else { + checks["Indels"] <- FALSE + indels <- character(0) + } + + # ---- Chrom Pos ---- + pos <- strsplit(report[,2], "_") + checks["ChromPos"] <- all(sapply(pos, length) == 2) + + } else indels <- NULL + + messages[["Columns"]] <- c("Required columns are present\n", + "One or more required columns missing. Verify if your file has columns: CloneID, AlleleID, AlleleSequence\n") + messages[["FixAlleleIDs"]] <- c("Fixed Allele IDs look good\n", + "MADC not processed by BI. Please contact us to assign allele IDs to your MADC according to the specie haplotype dabatase. This guarantee reproducibility between diferent datasets\n") + messages[["IUPACcodes"]] <- c("IUPAC (non-ATCG) codes found in AlleleSequence. This codes are not currently supported\n", + "No IUPAC (non-ATCG) codes found in AlleleSequence\n") + messages[["LowerCase"]] <- c("Lowercase bases found in AlleleSequence\n", + "No lowercase bases found in AlleleSequence\n") + messages[["Indels"]] <- c(paste("Indels found (ref/alt lengths differ) for the CloneIDs:",paste(indels, collapse = " ")), + "No indels found (ref/alt lengths match) for all CloneIDs\n") + messages[["ChromPos"]] <- c("Chromosome and Position format in CloneID look good\n", + "CloneID does not have the expected Chromosome_Position format. Please check your CloneIDs or provide a file with this information\n") + + list(checks = checks, messages = messages, indel_clone_ids = indels) +} diff --git a/R/madc2vcf_targets.R b/R/madc2vcf_targets.R index 1b02c31..fc022f9 100644 --- a/R/madc2vcf_targets.R +++ b/R/madc2vcf_targets.R @@ -1,45 +1,121 @@ #' Format MADC Target Loci Read Counts Into VCF #' -#' This function will extract the read count information from a MADC file target markers and convert to VCF file format. +#' Convert DArTag MADC target read counts to a VCF #' -#' The DArTag MADC file format is not commonly supported through existing tools. This function -#' will extract the read count information from a MADC file for the target markers and convert it to a VCF file format for the -#' genotyping panel target markers only +#' @description +#' Parses a DArTag **MADC** report and writes a **VCF v4.3** containing per-target +#' read counts for the panel’s target loci. This is useful because MADC is not +#' widely supported by general-purpose tools, while VCF is. #' -#' @param madc_file Path to MADC file -#' @param output.file output file name and path -#' @param botloci_file A string specifying the path to the file containing the target IDs designed in the bottom strand. -#' @param get_REF_ALT if TRUE recovers the reference and alternative bases by comparing the sequences. If more than one polymorphism are found for a tag, it is discarded. +#' @details +#' **What this function does** +#' - Runs basic sanity checks on the MADC file (column presence, fixed allele IDs, +#' IUPAC/ambiguous bases, lowercase bases, indels). +#' - Extracts reference and total read counts per sample and target. +#' - Derives `AD` (ref,alt) by subtraction (alt = total − ref). +#' - If `get_REF_ALT = TRUE`, attempts to recover true REF/ALT bases by comparing +#' the Ref/Alt probe sequences; targets with >1 polymorphism are discarded. +#' - Optionally accepts a `markers_info` CSV to supply `CHROM`, `POS`, `REF`, `ALT` +#' (and `Type`, `Indel_pos` when indels are present), bypassing sequence-based +#' inference. +#' +#' **Output VCF layout** +#' - `INFO` fields: +#' * `DP` — total depth across all samples for the locus +#' * `ADS` — total counts across samples in the order `ref,alt` +#' - `FORMAT` fields (per sample): +#' * `DP` — total reads (ref + alt) +#' * `RA` — reads supporting the reference allele +#' * `AD` — `"ref,alt"` counts +#' +#' **Strand handling** +#' If a target ID appears in `botloci_file`, its probe sequences are reverse- +#' complemented prior to base comparison so that REF/ALT are reported in the +#' top-strand genomic orientation. +#' +#' **Sanity check behavior** +#' - If required columns or fixed IDs are missing, the function `stop()`s. +#' - If IUPAC/lowercase/indels are detected and `markers_info` is **not** +#' provided, the function `stop()`s with a diagnostic message explaining what to fix. +#' +#' @param madc_file character. Path to the input MADC CSV file. +#' @param output.file character. Path to the output VCF file to write. +#' @param botloci_file character. Path to a plain-text file listing target IDs +#' designed on the **bottom** strand (one ID per line). Required when +#' `get_REF_ALT = TRUE` and `markers_info` is not provided. +#' @param markers_info character or `NULL`. Optional path to a CSV providing target +#' metadata. Required columns: `BI_markerID, Chr, Pos, Ref, Alt`. If indels are +#' present, also require `Type, Indel_pos`. When supplied, these values populate +#' `#CHROM, POS, REF, ALT` in the VCF directly. +#' @param get_REF_ALT logical (default `FALSE`). If `TRUE`, attempts to infer REF/ALT +#' bases from the Ref/Alt probe sequences in the MADC file (with strand correction +#' using `botloci_file`). Targets with more than one difference between Ref/Alt +#' sequences are removed. +#' +#' @return (Invisibly) returns the path to `output.file`. The side effect is a +#' **VCF v4.3** written to disk containing one row per target and columns for all +#' samples in the MADC file. +#' +#' @section Dependencies: +#' Uses **dplyr**, **tidyr**, **tibble**, **reshape2**, **Biostrings** and base +#' **utils**. Helper functions expected in this package: `check_madc_sanity()`, +#' `get_countsMADC()`, `get_counts()`, and `check_botloci()`. +#' +#' @examples +#' # Example files shipped with the package +#' madc_file <- system.file("example_MADC_FixedAlleleID.csv", package = "BIGr") +#' bot_file <- system.file("example_SNPs_DArTag-probe-design_f180bp.botloci", +#' package = "BIGr") +#' out_vcf <- tempfile(fileext = ".vcf") +#' +#' # Convert MADC to VCF (attempting to recover REF/ALT from probe sequences) +#' \dontrun{ +#' madc2vcf_targets( +#' madc_file = madc_file, +#' output.file = out_vcf, +#' botloci_file = bot_file, +#' get_REF_ALT = TRUE +#' ) +#' } +#' +#' # Clean up (example) +#' unlink(out_vcf) +#' +#' @seealso +#' `check_madc_sanity()`, `get_countsMADC()`, `check_botloci()` #' -#' @return A VCF file v4.3 with the target marker read count information #' @import dplyr #' @import tidyr #' @import tibble -#' @importFrom Rdpack reprompt #' @importFrom reshape2 melt dcast #' @importFrom utils write.table #' @importFrom Biostrings DNAString reverseComplement -#' @return A VCF file v4.3 with the target marker read count information -#' -#' @examples -#' # Load example files -#' madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") -#' bot_file <- system.file("example_SNPs_DArTag-probe-design_f180bp.botloci", package="BIGr") -#' -#' #Temp location (only for example) -#' output_file <- tempfile() -#' -#' # Convert MADC to VCF -#' madc2vcf_targets(madc_file = madc_file, -#' output.file = output_file, -#' get_REF_ALT = TRUE, -#' botloci_file = bot_file) -#' -#' rm(output_file) #' #' @export -madc2vcf_targets <- function(madc_file, output.file, botloci_file, get_REF_ALT = FALSE) { - #Making the VCF (This is highly dependent on snps being in a format where the SNP IDs are the CHR_POS) +madc2vcf_targets <- function(madc_file, + output.file, + botloci_file, + markers_info = NULL, + get_REF_ALT = FALSE) { + + # MADC checks + report <- read.csv(madc_file) + checks <- check_madc_sanity(report) + + messages_results <- mapply(function(check, message) { + if (check) message[1] else message[2] + }, checks$checks, checks$messages) + + if(any(!(checks$checks[c("Columns", "FixAlleleIDs")]))){ + idx <- which(!(checks$checks[c("Columns", "FixAlleleIDs")])) + stop(paste("The MADC file does not pass the sanity checks:\n", + paste(messages_results[c("Columns", "FixAlleleIDs")[idx]], collapse = "\n"))) + } + + if(any(checks$checks[c("IUPACcodes", "LowerCase", "Indels")])){ + idx <- which((checks$checks[c("IUPACcodes", "LowerCase", "Indels")])) + if(is.null(markers_info)) stop(paste(messages_results[c("IUPACcodes", "LowerCase", "Indels")[idx]], collapse = "\n")) + } matrices <- get_countsMADC(madc_file) ref_df <- data.frame(matrices[[1]], check.names = FALSE) @@ -56,18 +132,124 @@ madc2vcf_targets <- function(madc_file, output.file, botloci_file, get_REF_ALT = row.names(ad_df) <- row.names(ref_df) #Obtaining Chr and Pos information from the row_names - new_df <- size_df %>% - rownames_to_column(var = "row_name") %>% - separate(row_name, into = c("CHROM", "POS"), sep = "_") %>% - select(CHROM, POS) + if(is.null(markers_info)){ + new_df <- size_df %>% + rownames_to_column(var = "row_name") %>% + separate(row_name, into = c("CHROM", "POS"), sep = "_") %>% + select(CHROM, POS) + + # Remove leading zeros from the POS column + new_df$POS <- sub("^0+", "", new_df$POS) + + #Get read count sums + new_df$TotalRef <- rowSums(ref_df) + new_df$TotalAlt <- rowSums(alt_df) + new_df$TotalSize <- rowSums(size_df) + + # Get REF and ALT + if(get_REF_ALT){ + if(is.null(botloci_file)) stop("Please provide the botloci file to recover the reference and alternative bases.") + csv <- get_counts(madc_file) + # Keep only the ones that have alt and ref + csv <- csv[which(csv$CloneID %in% rownames(ad_df)),] + + # Get reverse complement the tag is present in botloci + botloci <- read.table(botloci_file, header = FALSE) + + # Check if the botloci file marker IDs match with the MADC file + checked_botloci <- check_botloci(botloci, csv) + botloci <- checked_botloci[[1]] + csv <- checked_botloci[[2]] + + # FIXED: Store original sequences before any transformation + csv$OriginalAlleleSequence <- csv$AlleleSequence + + # Apply reverse complement to sequences for bottom strand markers + idx <- which(csv$CloneID %in% botloci[,1]) + csv$AlleleSequence[idx] <- sapply(csv$AlleleSequence[idx], function(sequence) as.character(reverseComplement(DNAString(sequence)))) + + ref_seq <- csv$AlleleSequence[grep("\\|Ref.*", csv$AlleleID)] + ref_ord <- csv$CloneID[grep("\\|Ref.*", csv$AlleleID)] + alt_seq <- csv$AlleleSequence[grep("\\|Alt.*", csv$AlleleID)] + alt_ord <- csv$CloneID[grep("\\|Alt.*", csv$AlleleID)] + + # FIXED: Get original sequences for SNP calling + orig_ref_seq <- csv$OriginalAlleleSequence[grep("\\|Ref.*", csv$AlleleID)] + orig_alt_seq <- csv$OriginalAlleleSequence[grep("\\|Alt.*", csv$AlleleID)] - # Remove leading zeros from the POS column - new_df$POS <- sub("^0+", "", new_df$POS) + if(all(sort(ref_ord) == sort(alt_ord))){ + # Order sequences consistently + ref_seq <- ref_seq[order(ref_ord)] + alt_seq <- alt_seq[order(alt_ord)] + orig_ref_seq <- orig_ref_seq[order(ref_ord)] + orig_alt_seq <- orig_alt_seq[order(alt_ord)] + ordered_clone_ids <- sort(ref_ord) - #Get read count sums - new_df$TotalRef <- rowSums(ref_df) - new_df$TotalAlt <- rowSums(alt_df) - new_df$TotalSize <- rowSums(size_df) + ref_base <- alt_base <- vector() + for(i in 1:length(orig_ref_seq)){ + # FIXED: Use original sequences for SNP calling + temp_list <- strsplit(c(orig_ref_seq[i], orig_alt_seq[i]), "") + idx_diff <- which(temp_list[[1]] != temp_list[[2]]) + + if(length(idx_diff) > 1) { # If finds more than one polymorphism between Ref and Alt sequences + ref_base[i] <- NA + alt_base[i] <- NA + } else if(length(idx_diff) == 1) { + orig_ref_base <- temp_list[[1]][idx_diff] + orig_alt_base <- temp_list[[2]][idx_diff] + + # FIXED: Apply reverse complement to bases only if marker is in botloci + if(ordered_clone_ids[i] %in% botloci[,1]) { + ref_base[i] <- as.character(reverseComplement(DNAString(orig_ref_base))) + alt_base[i] <- as.character(reverseComplement(DNAString(orig_alt_base))) + } else { + ref_base[i] <- orig_ref_base + alt_base[i] <- orig_alt_base + } + } else { + # No differences found + ref_base[i] <- NA + alt_base[i] <- NA + } + } + } else { + warning("There are missing reference or alternative sequence, the SNP bases could not be recovery.") + ref_base <- "." + alt_base <- "." + } + + } else { + ref_base <- "." + alt_base <- "." + } + } else { + # Verify markers_info file + df <- read.csv(markers_info) + if(checks$checks["Indels"]){ + if(!all(c("BI_markerID","Chr","Pos","Ref","Alt","Type", "Indel_pos") %in% colnames(df))) + stop("The markers_info dataframe must contain the following columns: BI_markerID, CHROM, POS, REF, ALT, Type, Indel_pos") + } + if(!all(c("BI_markerID","Chr","Pos","Ref","Alt") %in% colnames(df))) + stop("The markers_info dataframe must contain the following columns: BI_markerID, CHROM, POS, REF, ALT") + + if(!all(rownames(ad_df)%in% df$BI_markerID)) + warning("Not all MADC CloneID was found in the markers_info file. These markers will be removed.") + + matched <- df[match(rownames(ad_df), df$BI_markerID),] + + new_df <- data.frame( + CHROM = matched$Chr, + POS = matched$Pos + ) + + #Get read count sums + new_df$TotalRef <- rowSums(ref_df) + new_df$TotalAlt <- rowSums(alt_df) + new_df$TotalSize <- rowSums(size_df) + + ref_base <- matched$Ref + alt_base <- matched$Alt + } #Make a header separate from the dataframe vcf_header <- c( @@ -82,83 +264,6 @@ madc2vcf_targets <- function(madc_file, output.file, botloci_file, get_REF_ALT = '##FORMAT=' ) - # Get REF and ALT - if(get_REF_ALT){ - if(is.null(botloci_file)) stop("Please provide the botloci file to recover the reference and alternative bases.") - csv <- get_counts(madc_file) - # Keep only the ones that have alt and ref - csv <- csv[which(csv$CloneID %in% rownames(ad_df)),] - - # Get reverse complement the tag is present in botloci - botloci <- read.table(botloci_file, header = FALSE) - - # Check if the botloci file marker IDs match with the MADC file - checked_botloci <- check_botloci(botloci, csv) - botloci <- checked_botloci[[1]] - csv <- checked_botloci[[2]] - - # FIXED: Store original sequences before any transformation - csv$OriginalAlleleSequence <- csv$AlleleSequence - - # Apply reverse complement to sequences for bottom strand markers - idx <- which(csv$CloneID %in% botloci[,1]) - csv$AlleleSequence[idx] <- sapply(csv$AlleleSequence[idx], function(sequence) as.character(reverseComplement(DNAString(sequence)))) - - ref_seq <- csv$AlleleSequence[grep("\\|Ref.*", csv$AlleleID)] - ref_ord <- csv$CloneID[grep("\\|Ref.*", csv$AlleleID)] - alt_seq <- csv$AlleleSequence[grep("\\|Alt.*", csv$AlleleID)] - alt_ord <- csv$CloneID[grep("\\|Alt.*", csv$AlleleID)] - - # FIXED: Get original sequences for SNP calling - orig_ref_seq <- csv$OriginalAlleleSequence[grep("\\|Ref.*", csv$AlleleID)] - orig_alt_seq <- csv$OriginalAlleleSequence[grep("\\|Alt.*", csv$AlleleID)] - - if(all(sort(ref_ord) == sort(alt_ord))){ - # Order sequences consistently - ref_seq <- ref_seq[order(ref_ord)] - alt_seq <- alt_seq[order(alt_ord)] - orig_ref_seq <- orig_ref_seq[order(ref_ord)] - orig_alt_seq <- orig_alt_seq[order(alt_ord)] - ordered_clone_ids <- sort(ref_ord) - - ref_base <- alt_base <- vector() - for(i in 1:length(orig_ref_seq)){ - # FIXED: Use original sequences for SNP calling - temp_list <- strsplit(c(orig_ref_seq[i], orig_alt_seq[i]), "") - idx_diff <- which(temp_list[[1]] != temp_list[[2]]) - - if(length(idx_diff) > 1) { # If finds more than one polymorphism between Ref and Alt sequences - ref_base[i] <- NA - alt_base[i] <- NA - } else if(length(idx_diff) == 1) { - orig_ref_base <- temp_list[[1]][idx_diff] - orig_alt_base <- temp_list[[2]][idx_diff] - - # FIXED: Apply reverse complement to bases only if marker is in botloci - if(ordered_clone_ids[i] %in% botloci[,1]) { - ref_base[i] <- as.character(reverseComplement(DNAString(orig_ref_base))) - alt_base[i] <- as.character(reverseComplement(DNAString(orig_alt_base))) - } else { - ref_base[i] <- orig_ref_base - alt_base[i] <- orig_alt_base - } - } else { - # No differences found - ref_base[i] <- NA - alt_base[i] <- NA - } - } - } else { - warning("There are missing reference or alternative sequence, the SNP bases could not be recovery.") - ref_base <- "." - alt_base <- "." - } - - } else { - ref_base <- "." - alt_base <- "." - } - #Make the header#Make the VCF df vcf_df <- data.frame( CHROM = new_df$CHROM, @@ -233,12 +338,4 @@ madc2vcf_targets <- function(madc_file, output.file, botloci_file, get_REF_ALT = suppressWarnings( write.table(vcf_df, file = output.file, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE, append = TRUE) ) - #Unload all items from memory - rm(matrices) - rm(ref_df) - rm(alt_df) - rm(size_df) - rm(ad_df) - rm(vcf_df) - rm(geno_df) } diff --git a/man/check_madc_sanity.Rd b/man/check_madc_sanity.Rd new file mode 100644 index 0000000..494145e --- /dev/null +++ b/man/check_madc_sanity.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check_madc_sanity.R +\name{check_madc_sanity} +\alias{check_madc_sanity} +\title{Run basic sanity checks on a MADC-style allele report} +\usage{ +check_madc_sanity(report) +} +\arguments{ +\item{report}{A \code{data.frame} with at least the columns +\code{CloneID}, \code{AlleleID}, and \code{AlleleSequence}. The first column is also +used in the “FixAlleleIDs” check to inspect its first up to six entries.} +} +\value{ +A list with: +\describe{ +\item{checks}{Named logical vector with entries +\code{Columns}, \code{FixAlleleIDs}, \code{IUPACcodes}, \code{LowerCase}, \code{Indels}.} +\item{indel_clone_ids}{Character vector of \code{CloneID}s where ref/alt lengths differ. +Returns \code{character(0)} if none, or \code{NULL} when required columns are missing.} +} +} +\description{ +Performs five quick validations on an allele report: +\enumerate{ +\item \strong{Columns} – required columns are present (\code{CloneID}, \code{AlleleID}, \code{AlleleSequence}); +\item \strong{FixAlleleIDs} – first column’s first up-to-6 rows are not all blank or "*" +\emph{and} both \verb{_0001} and \verb{_0002} appear in \code{AlleleID}; +\item \strong{IUPACcodes} – presence of non-ATCG characters in \code{AlleleSequence}; +\item \strong{LowerCase} – presence of lowercase a/t/c/g in \code{AlleleSequence}; +\item \strong{Indels} – reference/alternate allele lengths differ for the same \code{CloneID}. +} +} +\details{ +\itemize{ +\item \strong{IUPAC check:} Flags any character outside \code{ATCG} (case-insensitive), +which will include ambiguity codes (\code{N}, \code{R}, \code{Y}, etc.) and symbols like \code{-}. +\item \strong{Indels:} Rows are split by \code{AlleleID} containing \code{"Ref_0001"} vs \code{"Alt_0002"}, +merged by \code{CloneID}, and the lengths of \code{AlleleSequence} are compared. +\item If required columns are missing, only \strong{Columns} is evaluated (\code{FALSE}) and +\code{indel_clone_ids} is returned as \code{NULL}. +} +} diff --git a/man/madc2vcf_targets.Rd b/man/madc2vcf_targets.Rd index a790460..fad847c 100644 --- a/man/madc2vcf_targets.Rd +++ b/man/madc2vcf_targets.Rd @@ -4,44 +4,114 @@ \alias{madc2vcf_targets} \title{Format MADC Target Loci Read Counts Into VCF} \usage{ -madc2vcf_targets(madc_file, output.file, botloci_file, get_REF_ALT = FALSE) +madc2vcf_targets( + madc_file, + output.file, + botloci_file, + markers_info = NULL, + get_REF_ALT = FALSE +) } \arguments{ -\item{madc_file}{Path to MADC file} +\item{madc_file}{character. Path to the input MADC CSV file.} -\item{output.file}{output file name and path} +\item{output.file}{character. Path to the output VCF file to write.} -\item{botloci_file}{A string specifying the path to the file containing the target IDs designed in the bottom strand.} +\item{botloci_file}{character. Path to a plain-text file listing target IDs +designed on the \strong{bottom} strand (one ID per line). Required when +\code{get_REF_ALT = TRUE} and \code{markers_info} is not provided.} -\item{get_REF_ALT}{if TRUE recovers the reference and alternative bases by comparing the sequences. If more than one polymorphism are found for a tag, it is discarded.} +\item{markers_info}{character or \code{NULL}. Optional path to a CSV providing target +metadata. Required columns: \verb{BI_markerID, Chr, Pos, Ref, Alt}. If indels are +present, also require \verb{Type, Indel_pos}. When supplied, these values populate +\verb{#CHROM, POS, REF, ALT} in the VCF directly.} + +\item{get_REF_ALT}{logical (default \code{FALSE}). If \code{TRUE}, attempts to infer REF/ALT +bases from the Ref/Alt probe sequences in the MADC file (with strand correction +using \code{botloci_file}). Targets with more than one difference between Ref/Alt +sequences are removed.} } \value{ -A VCF file v4.3 with the target marker read count information - -A VCF file v4.3 with the target marker read count information +(Invisibly) returns the path to \code{output.file}. The side effect is a +\strong{VCF v4.3} written to disk containing one row per target and columns for all +samples in the MADC file. } \description{ -This function will extract the read count information from a MADC file target markers and convert to VCF file format. +Parses a DArTag \strong{MADC} report and writes a \strong{VCF v4.3} containing per-target +read counts for the panel’s target loci. This is useful because MADC is not +widely supported by general-purpose tools, while VCF is. } \details{ -The DArTag MADC file format is not commonly supported through existing tools. This function -will extract the read count information from a MADC file for the target markers and convert it to a VCF file format for the -genotyping panel target markers only +Convert DArTag MADC target read counts to a VCF + +\strong{What this function does} +\itemize{ +\item Runs basic sanity checks on the MADC file (column presence, fixed allele IDs, +IUPAC/ambiguous bases, lowercase bases, indels). +\item Extracts reference and total read counts per sample and target. +\item Derives \code{AD} (ref,alt) by subtraction (alt = total − ref). +\item If \code{get_REF_ALT = TRUE}, attempts to recover true REF/ALT bases by comparing +the Ref/Alt probe sequences; targets with >1 polymorphism are discarded. +\item Optionally accepts a \code{markers_info} CSV to supply \code{CHROM}, \code{POS}, \code{REF}, \code{ALT} +(and \code{Type}, \code{Indel_pos} when indels are present), bypassing sequence-based +inference. } -\examples{ -# Load example files -madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") -bot_file <- system.file("example_SNPs_DArTag-probe-design_f180bp.botloci", package="BIGr") -#Temp location (only for example) -output_file <- tempfile() +\strong{Output VCF layout} +\itemize{ +\item \code{INFO} fields: +\itemize{ +\item \code{DP} — total depth across all samples for the locus +\item \code{ADS} — total counts across samples in the order \verb{ref,alt} +} +\item \code{FORMAT} fields (per sample): +\itemize{ +\item \code{DP} — total reads (ref + alt) +\item \code{RA} — reads supporting the reference allele +\item \code{AD} — \code{"ref,alt"} counts +} +} -# Convert MADC to VCF -madc2vcf_targets(madc_file = madc_file, - output.file = output_file, - get_REF_ALT = TRUE, - botloci_file = bot_file) +\strong{Strand handling} +If a target ID appears in \code{botloci_file}, its probe sequences are reverse- +complemented prior to base comparison so that REF/ALT are reported in the +top-strand genomic orientation. -rm(output_file) +\strong{Sanity check behavior} +\itemize{ +\item If required columns or fixed IDs are missing, the function \code{stop()}s. +\item If IUPAC/lowercase/indels are detected and \code{markers_info} is \strong{not} +provided, the function \code{stop()}s with a diagnostic message explaining what to fix. +} +} +\section{Dependencies}{ +Uses \strong{dplyr}, \strong{tidyr}, \strong{tibble}, \strong{reshape2}, \strong{Biostrings} and base +\strong{utils}. Helper functions expected in this package: \code{check_madc_sanity()}, +\code{get_countsMADC()}, \code{get_counts()}, and \code{check_botloci()}. +} + +\examples{ +# Example files shipped with the package +madc_file <- system.file("example_MADC_FixedAlleleID.csv", package = "BIGr") +bot_file <- system.file("example_SNPs_DArTag-probe-design_f180bp.botloci", + package = "BIGr") +out_vcf <- tempfile(fileext = ".vcf") + +# Convert MADC to VCF (attempting to recover REF/ALT from probe sequences) +\dontrun{ +madc2vcf_targets( + madc_file = madc_file, + output.file = out_vcf, + botloci_file = bot_file, + get_REF_ALT = TRUE +) +} + +# Clean up (example) +unlink(out_vcf) + +} +\seealso{ +\code{check_madc_sanity()}, \code{get_countsMADC()}, \code{check_botloci()} } diff --git a/tests/testthat/test-check_madc_sanity.R b/tests/testthat/test-check_madc_sanity.R new file mode 100644 index 0000000..5053e55 --- /dev/null +++ b/tests/testthat/test-check_madc_sanity.R @@ -0,0 +1,10 @@ +test_that("check madc",{ + madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") + report <- read.csv(madc_file, check.names = FALSE) + + check_madc_sanity(report) + + +}) + + From 1b761b92586c5eca1521b52cb7e745edb7e173dc Mon Sep 17 00:00:00 2001 From: josuechinchilla Date: Tue, 4 Nov 2025 16:06:12 -0500 Subject: [PATCH 02/37] updated check_ped to save corrected dataframe and report --- .DS_Store | Bin 0 -> 8196 bytes BIGr.Rproj | 1 + DESCRIPTION | 2 +- R/check_ped.R | 267 +++++++++++++++++++++++++++++--------------------- 4 files changed, 157 insertions(+), 113 deletions(-) create mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..3b9d8426aa5a73e270c1816e6de3b764b9deb853 GIT binary patch literal 8196 zcmeHMU2GLa6rOKeV3!TB)B*)rduxSaxrVktrJ``TZ6iM|f48MS{M5a77rJ4)Tkh_? zr8d?2qyaVYMdSY)Q6EsFDEjJuVtgP-l$eMy@rf6W;Dd?r%+6k*Eq&Aw*hyy2oHOUl z^ql?9%$+G?4C#W|$XFF)Os2}Irc!g0!pC`?R-{N$P84L%{Nb$QWr;hR9iFBg3*rdG z5r`uYM<9+s9D!RQ0(55cBF}K{i_y4^BM?X6j*I}`9}-kKO+|D<&~WRZCcFX=O19Ce0V?}Tsw(I&v=O>yYtb#ENM*eYHl z$}-tu;myU8H_BX9ou~Ko_Vw>SFraA#-`ZLfdX=Gu}aB>AwwzfD<#(rFA=klK8?Dl*+usyf0H?WQee2h;tX{fi~_wqq&I9abv z^9!vCzQ(R$ZtU-~@;*^TIX!<#^?j=zXnOGBRQtBhJ?9r_3oF!0b#YJ8_U)`=b&uF) zFxY4KIm5REEQFQnW?AXYc|$|wj4xzqy-n7>mJjiTI}AVA zHEg-1KPjzmmF0ea(8yc6MhJ2xiw;>1KO@PxNxNwWUOtS*&Kh0s<5O)hP0J*wmsKsT z&@+0steqnp&3216ftYNCHR-399fU6aRu`tH9sRqGUmc$X|hnmhBSek8T zhuAP1WhdE->`nF|JHtL=U$F1k1@;sBjs4F4U{}~x_BYBg7nN9!6{y21+>a)##dirPV^%O6NhmG4hksZI7aaVp2jmcfs=Rvui#a@jyG@$@8UhYk5BO#KF62%3K#G* zF5wqk#uZ$}Uy>qKNXsQ%x<{&$R!Q~JYH5?SS=uV?k@}?pX;63|l@jgXWY45g;-hVY z5-oc6lTP0}(W)Qax^4T8j$1a-ew@!!>#oX0iHDhe5O>?=J2$Y&vWgXd(~=f7A1^eSJn|bo03JatCQ8$nnXD& z*oMYxRV$+mR6^VJNn#N(Ef1-xR!+Gn#y081vWQ8y(lk7?`G+2X={FCXD}s zGRRnh1XdzRnBIgG;kpg&gzMediwwGO5CbrfC6qgGF^pr#V+`YX5)*ieF#ar_!}E9v zFB8sRBcz|gTX-Aq-~*h&$2f~mLd1WI?{N{oPQh^JWDMia>liL(@~-2#M@Uja{AA_Z zk}DHlRq^|O`^@kEx25l~Q*i|12#5%vye-p~rv6NOAHQoSs6IxO7p^xUXlO#sq=x|F kul>W2`Uz6yHmQhC2uczv|NBG0^dH*s`5&MEcij2^FAa5Kvj6}9 literal 0 HcmV?d00001 diff --git a/BIGr.Rproj b/BIGr.Rproj index 69fafd4..5638e2e 100644 --- a/BIGr.Rproj +++ b/BIGr.Rproj @@ -1,4 +1,5 @@ Version: 1.0 +ProjectId: 0eeaab63-2615-4da7-b10a-927160fc78a3 RestoreWorkspace: No SaveWorkspace: No diff --git a/DESCRIPTION b/DESCRIPTION index 31ad1e1..cba831e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -44,7 +44,7 @@ URL: https://github.com/Breeding-Insight/BIGr BugReports: https://github.com/Breeding-Insight/BIGr/issues Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 Depends: R (>= 4.4.0) biocViews: Imports: diff --git a/R/check_ped.R b/R/check_ped.R index 3f4831b..35b0ba9 100644 --- a/R/check_ped.R +++ b/R/check_ped.R @@ -1,99 +1,142 @@ -#' Evaluate Pedigree File for Accuracy +#' Check a pedigree file for accuracy and report/correct common errors #' -#' Check a pedigree file for accuracy and output suspected errors +#' `check_ped` reads a 3-column pedigree file (tab-separated, columns labeled `id`, `sire`, `dam` in any order) +#' and performs quality checks, optionally correcting or flagging errors. #' -#'check_ped takes a 3-column pedigree tab separated file with columns labeled as id sire dam in any order and checks for: -#'* Ids that appear more than once in the id column -#'* Ids that appear in both sire and dam columns -#'* Direct (e.g. parent is a offspring of his own daughter) and indirect (e.g. a great grandparent is son of its grandchild) dependencies within the pedigree. -#'* Individuals included in the pedigree as sire or dam but not on the id column and reports them back with unknown parents (0). +#' The function checks for: +#' * Exact duplicate rows and removes them (keeping one copy) +#' * IDs that appear more than once with conflicting sire/dam assignments (sets sire/dam to "0") +#' * IDs that appear in both sire and dam columns +#' * Missing parents (IDs referenced as sire/dam but not in `id` column), adds them with sire/dam = "0" +#' * Direct and indirect pedigree dependencies (cycles), such as a parent being its own descendant #' -#'When using check_ped, do a first run to check for repeated ids and parents that appear as sire and dam. -#'Once these errors are cleaned run the function again to check for dependencies as this will provide the most accurate report. +#' After an initial run to clean exact duplicates and repeated IDs, you can run the function again to detect cycles more accurately. #' -#'Note: This function does not change the input file but prints any errors found in the console. +#' The function does **not** overwrite the input file. Instead, it prints findings to the console and optionally saves: +#' * Corrected pedigree as a dataframe in the global environment +#' * A report listing all detected issues +#' +#' @param ped.file Path to the pedigree text file. +#' @param seed Optional seed for reproducibility. +#' @param verbose Logical. If TRUE (default), prints errors and prompts for interactive saving. +#' +#' @return A list of data.frames containing detected issues: +#' * `exact_duplicates`: rows that were exact duplicates +#' * `repeated_ids_diff`: IDs appearing more than once with conflicting sire/dam +#' * `messy_parents`: IDs appearing as both sire and dam +#' * `missing_parents`: parents added to the pedigree with 0 as sire/dam +#' * `dependencies`: detected cycles in the pedigree #' -#' @param ped.file path to pedigree text file. The pedigree file is a -#' 3-column pedigree tab separated file with columns labeled as id sire dam in any order -#' @param seed Optional seed for reproducibility -#' @param verbose Logical. If TRUE, print the errors to the console. -#' @return A list of data.frames of error types, and the output printed to the console #' @examples -#' ##Get list with a dataframe for each error type -#' ped_file <- system.file("check_ped_test.txt", package="BIGr") -#' ped_errors <- check_ped(ped.file = ped_file, -#' seed = 101919) +#' ped_file <- system.file("check_ped_test.txt", package = "BIGr") +#' ped_errors <- check_ped(ped.file = ped_file, seed = 101919) #' -#' ##Access the "messy parents" dataframe result +#' # Access messy parents #' ped_errors$messy_parents #' -#' ##Get list of sample IDs with messy parents error -#' messy_parent_ids <- ped_errors$messy_parents$id -#' print(messy_parent_ids) +#' # IDs with messy parents +#' messy_ids <- ped_errors$messy_parents$id +#' print(messy_ids) +#' #' @import dplyr #' @import janitor #' @importFrom stats setNames #' @importFrom utils read.table #' @export check_ped <- function(ped.file, seed = NULL, verbose = TRUE) { - #### Function to check for hierarchical errors missing parents and repeated ids #### - if(!is.null(seed)){ - set.seed(seed) - } - #### read in data #### - data = utils::read.table(ped.file, header = T) - data <- data %>% + + #### setup #### + if (!is.null(seed)) set.seed(seed) + + # Read and clean data + data <- utils::read.table(ped.file, header = TRUE) %>% janitor::clean_names() %>% mutate( id = as.character(id), sire = as.character(sire), dam = as.character(dam) - ) - #Missing parents dataframe initialize - missing_parents <- data.frame(id = character(), sire = character(), dam = character(), stringsAsFactors = FALSE) + ) + + original_data <- data errors <- list() - # repeated id checks - n_occur <- data.frame(table(data$id)) - repeated_ids = n_occur[n_occur$Freq > 1,] %>% - rename(id = Var1) - # Check for ids that appear as both sire and dam ###This is possible for plants so maybe do not control for this or do not delete these rows just print them - messy_parents <- as.data.frame(intersect(data$sire, data$dam)) %>% - rename(id = 1) %>% - filter(id != 0) - # Missing parents check + missing_parents <- data.frame(id = character(), sire = character(), dam = character(), stringsAsFactors = FALSE) + + #### check 1: exact duplicates #### + exact_duplicates <- data[duplicated(data), ] + if (nrow(exact_duplicates) > 0) { + data <- distinct(data) # remove exact duplicates + } + + #### check 2: repeated IDs with conflicting sire/dam #### + repeated_ids <- data %>% + group_by(id) %>% + filter(n() > 1) %>% + ungroup() + + # Only IDs with actual conflicting sire/dam + conflicting_ids <- repeated_ids %>% + group_by(id) %>% + filter(n_distinct(sire) > 1 | n_distinct(dam) > 1) %>% + ungroup() + + if (nrow(conflicting_ids) > 0) { + # Keep one row per ID, set sire/dam to "0" + data <- data %>% + group_by(id) %>% + summarize( + sire = if(n_distinct(sire) > 1) "0" else first(sire), + dam = if(n_distinct(dam) > 1) "0" else first(dam), + .groups = "drop" + ) + } + + repeated_ids_report <- conflicting_ids + + #### check 3: missing parents #### for (i in 1:nrow(data)) { id <- data$id[i] sire <- data$sire[i] dam <- data$dam[i] + if (sire != "0" && sire != id && !sire %in% data$id) { missing_parents <- rbind(missing_parents, data.frame(id = sire, sire = "0", dam = "0", stringsAsFactors = FALSE)) } if (dam != "0" && dam != id && !dam %in% data$id) { missing_parents <- rbind(missing_parents, data.frame(id = dam, sire = "0", dam = "0", stringsAsFactors = FALSE)) } + if (sire == id || dam == id) { errors <- append(errors, paste("Dependency: Individual", id, "cannot be its own parent")) } } - # Remove duplicates + missing_parents <- distinct(missing_parents) - # Combine original data with missing parents - corrected_data <- bind_rows(data, missing_parents) - # Function to detect cycles in the pedigree graph and identify the nodes involved + if (nrow(missing_parents) > 0) { + data <- bind_rows(data, missing_parents) + } + + #### check 4: messy parents #### + sire_ids <- unique(data$sire[data$sire != "0"]) + dam_ids <- unique(data$dam[data$dam != "0"]) + messy_ids <- intersect(sire_ids, dam_ids) + messy_parents <- data %>% filter(id %in% messy_ids) + + #### check 5: dependencies (cycles) #### detect_all_cycles <- function(data) { - # Create an adjacency list - adj_list <- list() - for (i in 1:nrow(data)) { - adj_list[[data$id[i]]] <- c(data$sire[i], data$dam[i]) - } - # Helper function to perform DFS and detect cycles + adj_list <- lapply(data$id, function(x) { + row <- data[data$id == x, ] + c(row$sire, row$dam) + }) + names(adj_list) <- data$id + dfs <- function(node, visited, rec_stack, path) { visited[node] <- TRUE rec_stack[node] <- TRUE path <- append(path, node) cycles <- list() + for (neighbor in adj_list[[node]]) { - if (neighbor != "0") { + if (neighbor %in% names(adj_list)) { if (!visited[neighbor]) { cycles <- append(cycles, dfs(neighbor, visited, rec_stack, path)) } else if (rec_stack[neighbor]) { @@ -102,14 +145,15 @@ check_ped <- function(ped.file, seed = NULL, verbose = TRUE) { } } } + rec_stack[node] <- FALSE return(cycles) } - # Initialize visited and recursion stack + visited <- stats::setNames(rep(FALSE, length(adj_list)), names(adj_list)) rec_stack <- stats::setNames(rep(FALSE, length(adj_list)), names(adj_list)) all_cycles <- list() - # Check for cycles in the graph and return the nodes involved + for (node in names(adj_list)) { if (!visited[node]) { node_cycles <- dfs(node, visited, rec_stack, character()) @@ -120,75 +164,74 @@ check_ped <- function(ped.file, seed = NULL, verbose = TRUE) { } return(all_cycles) } - # Check for cycles in the corrected pedigree data - cycles <- detect_all_cycles(corrected_data) + + cycles <- detect_all_cycles(data) if (length(cycles) > 0) { - cycle_number <- 1 for (cycle_group in cycles) { cycle_ids <- unique(unlist(cycle_group)) errors <- append(errors, paste("Cycle detected involving nodes:", paste(cycle_ids, collapse = " -> "))) } } - results <- list(missing_parents = missing_parents, dependencies = data.frame(Dependency = unlist(errors)), repeated_ids = repeated_ids, messy_parents = messy_parents) - repeated_ids <- results$repeated_ids - missing_parents <- results$missing_parents - messy_parents <- results$messy_parents - errors <- results$dependencies - # Adding the dataframes as an output list - output.results <- list() - #### Print errors and cycles #### - # Print repeated ids if any - if (nrow(repeated_ids) > 0) { - if (verbose) { - cat("Repeated ids found:\n") - message(repeated_ids) - } - output.results$repeated_ids <- repeated_ids - } else { - if (verbose) { - cat("No repeated ids found.\n") - } - } - #Print parents that appear as male and female - if (nrow(messy_parents) > 0) { - if (verbose) { - cat("Ids found as male and female parent:\n") - message(messy_parents) - } - output.results$messy_parents <- messy_parents + #### compile findings #### + input_ped_report <- list( + exact_duplicates = exact_duplicates, + repeated_ids_diff = repeated_ids_report, + messy_parents = messy_parents, + missing_parents = missing_parents, + dependencies = data.frame(Dependency = unique(unlist(errors))) + ) - } else { - if (verbose) { - cat("No ids found as male and female parent.\n") - } - } - # Print missing parents if any - if (nrow(missing_parents) > 0) { - if (verbose) { - cat("Missing parents found:\n") - message(missing_parents) - } - output.results$missing_parents <- missing_parents + #### file names #### + file_base <- tools::file_path_sans_ext(basename(ped.file)) + corrected_name <- paste0(file_base, "_corrected") + report_name <- paste0(file_base, "_report") - } else { - if (verbose) { - cat("No missing parents found.\n") - } - } - # Print errors if any - if (nrow(errors) > 0) { - if (verbose) { - cat("Dependencies found:\n") - message(unique(errors$Dependency)) + #### output #### + if (verbose) { + cat("\n=== Pedigree Quality Check Report ===\n") + + if (nrow(exact_duplicates) > 0) { + cat("\nExact duplicate rows detected and removed (only one copy kept):\n") + print(exact_duplicates) + } else cat("\nNo exact duplicate rows found.\n") + + if (nrow(repeated_ids_report) > 0) { + cat("\nRepeated IDs with conflicting sire/dam (sire/dam set to 0 in corrected pedigree):\n") + print(repeated_ids_report) + } else cat("\nNo conflicting repeated IDs found.\n") + + if (nrow(messy_parents) > 0) { + cat("\nIDs found as both sire and dam:\n") + print(messy_parents) + } else cat("\nNo IDs found as both sire and dam.\n") + + if (nrow(missing_parents) > 0) { + cat("\nMissing parents were added to the pedigree with 0 as sire/dam:\n") + print(missing_parents) + } else cat("\nNo missing parents found.\n") + + if (nrow(input_ped_report$dependencies) > 0) { + cat("\nDependencies detected:\n") + print(input_ped_report$dependencies) + } else cat("\nNo dependencies detected.\n") + + #### interactive save #### + cat(paste0("\nDo you want to save the corrected pedigree as dataframe `", corrected_name, "`? (y/n): ")) + ans <- tolower(trimws(readline())) + if (ans == "y") { + assign(corrected_name, data, envir = .GlobalEnv) + assign("input_ped_report", input_ped_report, envir = .GlobalEnv) + cat(paste0("Saved corrected pedigree as `", corrected_name, "` and report as `input_ped_report`.\n")) + } else { + cat("No corrected pedigree was saved.\n") } - output.results$dependencies <- data.frame(Dependency = unlist(errors)) } else { - if (verbose) { - cat("No dependencies found.\n") - } + # Silent automatic mode + assign(corrected_name, data, envir = .GlobalEnv) + assign(report_name, input_ped_report, envir = .GlobalEnv) } - return(results) + invisible(input_ped_report) } From 743043a0753f35def5d52c441d077f07800adb1d Mon Sep 17 00:00:00 2001 From: josuechinchilla Date: Tue, 4 Nov 2025 16:23:14 -0500 Subject: [PATCH 03/37] reorganized report and fixed language --- R/check_ped.R | 21 +++++++++--------- man/check_ped.Rd | 56 ++++++++++++++++++++++++++++-------------------- 2 files changed, 44 insertions(+), 33 deletions(-) diff --git a/R/check_ped.R b/R/check_ped.R index 35b0ba9..a2277bc 100644 --- a/R/check_ped.R +++ b/R/check_ped.R @@ -169,7 +169,7 @@ check_ped <- function(ped.file, seed = NULL, verbose = TRUE) { if (length(cycles) > 0) { for (cycle_group in cycles) { cycle_ids <- unique(unlist(cycle_group)) - errors <- append(errors, paste("Cycle detected involving nodes:", paste(cycle_ids, collapse = " -> "))) + errors <- append(errors, paste("Cycle detected involving IDs:", paste(cycle_ids, collapse = " -> "))) } } @@ -192,24 +192,25 @@ check_ped <- function(ped.file, seed = NULL, verbose = TRUE) { cat("\n=== Pedigree Quality Check Report ===\n") if (nrow(exact_duplicates) > 0) { - cat("\nExact duplicate rows detected and removed (only one copy kept):\n") + cat("\n Exact duplicate trios detected (only one copy will be kept in corrected pedigree):\n") print(exact_duplicates) - } else cat("\nNo exact duplicate rows found.\n") + } else cat("\nNo exact duplicate trios found.\n") if (nrow(repeated_ids_report) > 0) { - cat("\nRepeated IDs with conflicting sire/dam (sire/dam set to 0 in corrected pedigree):\n") + cat("\nConflicting trios detected (sire/dam set to 0 in corrected pedigree):\n") print(repeated_ids_report) - } else cat("\nNo conflicting repeated IDs found.\n") + } else cat("\nNo conflicting repeated trios found.\n") + + if (nrow(missing_parents) > 0) { + cat("\n Parents missing as IDs found in the pedigree (will be added as founders in corrected pedigree):\n") + print(missing_parents) + } else cat("\nNo missing parents found.\n") if (nrow(messy_parents) > 0) { - cat("\nIDs found as both sire and dam:\n") + cat("\n IDs found as both sire and dam (is selfing or hermaphrodytism possible?):\n") print(messy_parents) } else cat("\nNo IDs found as both sire and dam.\n") - if (nrow(missing_parents) > 0) { - cat("\nMissing parents were added to the pedigree with 0 as sire/dam:\n") - print(missing_parents) - } else cat("\nNo missing parents found.\n") if (nrow(input_ped_report$dependencies) > 0) { cat("\nDependencies detected:\n") diff --git a/man/check_ped.Rd b/man/check_ped.Rd index 693bfe0..ea63de7 100644 --- a/man/check_ped.Rd +++ b/man/check_ped.Rd @@ -2,48 +2,58 @@ % Please edit documentation in R/check_ped.R \name{check_ped} \alias{check_ped} -\title{Evaluate Pedigree File for Accuracy} +\title{Check a pedigree file for accuracy and report/correct common errors} \usage{ check_ped(ped.file, seed = NULL, verbose = TRUE) } \arguments{ -\item{ped.file}{path to pedigree text file. The pedigree file is a -3-column pedigree tab separated file with columns labeled as id sire dam in any order} +\item{ped.file}{Path to the pedigree text file.} -\item{seed}{Optional seed for reproducibility} +\item{seed}{Optional seed for reproducibility.} -\item{verbose}{Logical. If TRUE, print the errors to the console.} +\item{verbose}{Logical. If TRUE (default), prints errors and prompts for interactive saving.} } \value{ -A list of data.frames of error types, and the output printed to the console +A list of data.frames containing detected issues: +\itemize{ +\item \code{exact_duplicates}: rows that were exact duplicates +\item \code{repeated_ids_diff}: IDs appearing more than once with conflicting sire/dam +\item \code{messy_parents}: IDs appearing as both sire and dam +\item \code{missing_parents}: parents added to the pedigree with 0 as sire/dam +\item \code{dependencies}: detected cycles in the pedigree +} } \description{ -Check a pedigree file for accuracy and output suspected errors +\code{check_ped} reads a 3-column pedigree file (tab-separated, columns labeled \code{id}, \code{sire}, \code{dam} in any order) +and performs quality checks, optionally correcting or flagging errors. } \details{ -check_ped takes a 3-column pedigree tab separated file with columns labeled as id sire dam in any order and checks for: +The function checks for: \itemize{ -\item Ids that appear more than once in the id column -\item Ids that appear in both sire and dam columns -\item Direct (e.g. parent is a offspring of his own daughter) and indirect (e.g. a great grandparent is son of its grandchild) dependencies within the pedigree. -\item Individuals included in the pedigree as sire or dam but not on the id column and reports them back with unknown parents (0). +\item Exact duplicate rows and removes them (keeping one copy) +\item IDs that appear more than once with conflicting sire/dam assignments (sets sire/dam to "0") +\item IDs that appear in both sire and dam columns +\item Missing parents (IDs referenced as sire/dam but not in \code{id} column), adds them with sire/dam = "0" +\item Direct and indirect pedigree dependencies (cycles), such as a parent being its own descendant } -When using check_ped, do a first run to check for repeated ids and parents that appear as sire and dam. -Once these errors are cleaned run the function again to check for dependencies as this will provide the most accurate report. +After an initial run to clean exact duplicates and repeated IDs, you can run the function again to detect cycles more accurately. -Note: This function does not change the input file but prints any errors found in the console. +The function does \strong{not} overwrite the input file. Instead, it prints findings to the console and optionally saves: +\itemize{ +\item Corrected pedigree as a dataframe in the global environment +\item A report listing all detected issues +} } \examples{ -##Get list with a dataframe for each error type -ped_file <- system.file("check_ped_test.txt", package="BIGr") -ped_errors <- check_ped(ped.file = ped_file, - seed = 101919) +ped_file <- system.file("check_ped_test.txt", package = "BIGr") +ped_errors <- check_ped(ped.file = ped_file, seed = 101919) -##Access the "messy parents" dataframe result +# Access messy parents ped_errors$messy_parents -##Get list of sample IDs with messy parents error -messy_parent_ids <- ped_errors$messy_parents$id -print(messy_parent_ids) +# IDs with messy parents +messy_ids <- ped_errors$messy_parents$id +print(messy_ids) + } From 0b97b46434afcd7f441cb2b9705df2a9323232bf Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Fri, 14 Nov 2025 12:59:33 -0500 Subject: [PATCH 04/37] bugfix - if hapDB padding is not matching report --- R/madc2vcf_all.R | 16 ++++++++++++++-- R/utils.R | 1 + 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/R/madc2vcf_all.R b/R/madc2vcf_all.R index 329ac5a..125d540 100644 --- a/R/madc2vcf_all.R +++ b/R/madc2vcf_all.R @@ -135,7 +135,7 @@ madc2vcf_all <- function(madc = NULL, loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, alignment_score_thr=40, verbose = TRUE){ if(!is.null(hap_seq)){ - hap_seq <- get_ref_alt_hap_seq(hap_seq) + hap_seq <- get_ref_alt_hap_seq(hap_seq, botloci) } nsamples <- ncol(report) - 3 @@ -376,7 +376,8 @@ compare <- function(one_tag, botloci, alignment_score_thr = 40){ #' @param hap_seq haplotype db #' #' @noRd -get_ref_alt_hap_seq <- function(hap_seq){ +get_ref_alt_hap_seq <- function(hap_seq, botloci){ + headers <- hap_seq$V1[grep(">",hap_seq$V1)] headers <- gsub(">", "", headers) @@ -394,6 +395,17 @@ get_ref_alt_hap_seq <- function(hap_seq){ seqs <- sapply(seqs, function(x) paste0(x, collapse = "")) hap_seq <- data.frame(AlleleID = headers, AlleleSequence = seqs) + + # Check padding + hap_cloneID <- sapply(strsplit(hap_seq$AlleleID, "[|]"), function(x) x[1]) + botloci_cloneID <- botloci$V1 + + pad_hap <- unique(nchar(sub(".*_", "", hap_cloneID))) + pad_botloci <- unique(nchar(sub(".*_", "", botloci_cloneID))) + + if(length(pad_hap) > 1) stop("Check marker IDs in haplotype DB file. They should have the same padding.") + if(pad_hap != pad_botloci) stop("Check marker IDs padding in haplotype DB file. They should match the botloci file.") + return(hap_seq) } diff --git a/R/utils.R b/R/utils.R index c280ad2..2399560 100644 --- a/R/utils.R +++ b/R/utils.R @@ -56,6 +56,7 @@ check_botloci <- function(botloci, report, verbose=TRUE){ report$CloneID <- paste0(sub("_(.*)", "", report$CloneID), "_", sprintf(paste0("%0", pad_botloci, "d"), as.integer(sub(".*_", "", report$CloneID))) ) + report$AlleleID <- paste0(report$CloneID, "|", sapply(strsplit(report$AlleleID, "[|]"), "[[",2)) } else { botloci$V1 <- paste0(sub("_(.*)", "", botloci$V1), "_", sprintf(paste0("%0", pad_madc, "d"), as.integer(sub(".*_", "", botloci$V1))) From 82279af97ef82a077a0dcdfd264f961b9d2abf2c Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Thu, 26 Feb 2026 15:12:09 -0500 Subject: [PATCH 05/37] added option to print plot or list to imputation_concordance --- R/imputation_concordance.R | 155 ++++++++++++++++++++++++++----------- 1 file changed, 108 insertions(+), 47 deletions(-) diff --git a/R/imputation_concordance.R b/R/imputation_concordance.R index 6ab2eba..ac501bb 100644 --- a/R/imputation_concordance.R +++ b/R/imputation_concordance.R @@ -1,89 +1,121 @@ #' Calculate Concordance between Imputed and Reference Genotypes #' -#' This function calculates the concordance between imputed and reference genotypes. It assumes that samples are rows and markers are columns. -#' It is recommended to use allele dosages (0, 1, 2) but will work with other formats. Missing data in reference or imputed genotypes -#' will not be considered for concordance if the `missing_code` argument is used. If a specific subset of markers should be excluded, -#' it can be provided using the `snps_2_exclude` argument. +#' This function calculates the concordance between imputed and reference +#' genotypes. It assumes that samples are rows and markers are columns. +#' Allele dosages (0, 1, 2) are recommended but other numeric formats are supported. +#' Missing data in either dataset can be excluded from the concordance calculation +#' using the `missing_code` argument. Specific markers can be excluded using +#' the `snps_2_exclude` argument. #' -#' @param reference_genos A data frame containing reference genotype data, with rows as samples and columns as markers. Dosage format (0, 1, 2) is recommended. -#' @param imputed_genos A data frame containing imputed genotype data, with rows as samples and columns as markers. Dosage format (0, 1, 2) is recommended. -#' @param missing_code An optional value to specify missing data. If provided, loci with this value in either dataset will be excluded from the concordance calculation. -#' @param snps_2_exclude An optional vector of marker IDs to exclude from the concordance calculation. -#' @param verbose A logical value indicating whether to print a summary of the concordance results. Default is FALSE. +#' @param reference_genos A data frame containing reference genotype data, +#' with rows as samples and columns as markers. Must include a column named `ID`. #' -#' @return A list with two elements: -#' \itemize{ -#' \item \code{result_df}: A data frame with sample IDs and their concordance percentages. -#' \item \code{summary_concordance}: A summary of concordance percentages, including minimum, maximum, mean, and quartiles. -#' } +#' @param imputed_genos A data frame containing imputed genotype data, +#' with rows as samples and columns as markers. Must include a column named `ID`. #' -#' @details The function identifies common samples and markers between the reference and imputed genotype datasets. It calculates the percentage of matching genotypes for each sample, excluding missing data and specified markers. The concordance is reported as a percentage for each sample, along with a summary of the overall concordance distribution. +#' @param missing_code Optional value specifying missing data. If provided, +#' loci with this value in either dataset will be excluded from the concordance calculation. #' -#' @import dplyr +#' @param snps_2_exclude Optional vector of marker IDs to exclude from the concordance calculation. #' -#' @examples +#' @param verbose Logical. If `TRUE`, prints summary statistics (minimum, quartiles, +#' median, mean, maximum) of concordance percentages. #' -#' # Example Input variables -#' ignore_file <- system.file("imputation_ignore.txt", package="BIGr") -#' ref_file <- system.file("imputation_reference.txt", package="BIGr") -#' test_file <- system.file("imputation_test.txt", package="BIGr") +#' @param plot Logical. If `TRUE`, produces a bar plot of concordance percentage +#' by sample. #' -#' # Import files -#' snps = read.table(ignore_file, header = TRUE) -#' ref = read.table(ref_file, header = TRUE) -#' test = read.table(test_file, header = TRUE) +#' @param print_result Logical. If `TRUE` (default), prints the concordance +#' results data frame to the console. If `FALSE`, results are returned invisibly. #' -#' #Calculations -#' result <- imputation_concordance(reference_genos = ref, -#' imputed_genos = test, -#' snps_2_exclude = snps, -#' missing_code = 5, -#' verbose = FALSE) +#' @return A data frame with: +#' \itemize{ +#' \item \code{ID}: Sample identifiers shared between the datasets. +#' \item \code{Concordance}: Percentage of matching genotypes per sample. +#' } +#' If \code{print_result = FALSE}, the data frame is returned invisibly. #' +#' @details +#' The function: +#' \enumerate{ +#' \item Identifies common samples and markers between the datasets. +#' \item Optionally excludes specified SNPs. +#' \item Removes loci with missing data (if \code{missing_code} is provided). +#' \item Computes per-sample concordance as the percentage of matching genotypes. +#' } #' +#' When \code{plot = TRUE}, a bar plot showing concordance percentage per sample +#' is generated using \pkg{ggplot2}. #' -#' @export +#' @import dplyr +#' @import ggplot2 +#' +#' @examples +#' result <- imputation_concordance( +#' reference_genos = ref, +#' imputed_genos = test, +#' snps_2_exclude = snps, +#' missing_code = 5, +#' verbose = TRUE, +#' plot = TRUE +#' ) #' +#' @export imputation_concordance <- function(reference_genos, imputed_genos, missing_code = NULL, snps_2_exclude = NULL, - verbose = FALSE) { + verbose = FALSE, + plot = FALSE, + print_result = TRUE) { # Find common IDs common_ids <- intersect(imputed_genos$ID, reference_genos$ID) - imputed_genos <- imputed_genos %>% filter(ID %in% common_ids) %>% arrange(ID) - reference_genos <- reference_genos %>% filter(ID %in% common_ids) %>% arrange(ID) + imputed_genos <- imputed_genos %>% + filter(ID %in% common_ids) %>% + arrange(ID) + + reference_genos <- reference_genos %>% + filter(ID %in% common_ids) %>% + arrange(ID) - # Find common SNPs, excluding those in snps_2_exclude if provided + # Find common SNPs common_snps <- setdiff( intersect(colnames(imputed_genos), colnames(reference_genos)), as.vector(unlist(snps_2_exclude)) ) - # Subset and convert to matrices for faster computation + # Remove ID column if present + common_snps <- setdiff(common_snps, "ID") + + # Convert to matrices imputed_matrix <- as.matrix(imputed_genos[, common_snps]) reference_matrix <- as.matrix(reference_genos[, common_snps]) - # Identify valid SNPs that are not missing in either dataset + # Identify valid SNPs if (!is.null(missing_code)) { - valid_snps <- (imputed_matrix != missing_code) & (reference_matrix != missing_code) + valid_snps <- (imputed_matrix != missing_code) & + (reference_matrix != missing_code) } else { - valid_snps <- matrix(TRUE, nrow = nrow(imputed_matrix), ncol = ncol(imputed_matrix)) + valid_snps <- matrix(TRUE, + nrow = nrow(imputed_matrix), + ncol = ncol(imputed_matrix)) } - # Compute concordance (row-wise percentage of matching SNPs) + # Compute concordance matches <- (imputed_matrix == reference_matrix) & valid_snps - percentage_match <- rowSums(matches, na.rm = TRUE) / rowSums(valid_snps, na.rm = TRUE) + percentage_match <- rowSums(matches, na.rm = TRUE) / + rowSums(valid_snps, na.rm = TRUE) + + percentage_match[is.nan(percentage_match)] <- NA - # Create output data frame + # Output data frame (original structure preserved) result_df <- data.frame( - ID = common_ids, + ID = imputed_genos$ID, Concordance = paste0(round(percentage_match * 100, 2), "%") ) - # Print mean concordance + # Summary statistics summary_concordance <- summary(percentage_match, na.rm = TRUE) * 100 names(summary_concordance) <- c("Min", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max") @@ -94,6 +126,35 @@ imputation_concordance <- function(reference_genos, } } - return(result_df) -} + # Print results to console (NEW OPTION) + if (print_result) { + print(result_df) + } + + # Optional plot + if (plot) { + + plot_df <- data.frame( + ID = imputed_genos$ID, + Concordance = percentage_match * 100 + ) + + concordance_plot <- ggplot(plot_df, + aes(x = reorder(ID, Concordance), + y = Concordance)) + + geom_bar(stat = "identity") + + labs(title = "Imputation Concordance by Sample", + x = "Sample ID", + y = "Concordance (%)") + + theme_minimal() + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + print(concordance_plot) + } + + if (print_result) { + return(result_df) + } else { + invisible(result_df) + } +} From 6b81982330c37127c9c0c6125cc62f6063d5fad3 Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Tue, 3 Mar 2026 09:05:31 -0500 Subject: [PATCH 06/37] ignore DS_STore --- .DS_Store | Bin 8196 -> 8196 bytes .gitignore | 1 + 2 files changed, 1 insertion(+) diff --git a/.DS_Store b/.DS_Store index 3b9d8426aa5a73e270c1816e6de3b764b9deb853..55d74ff50dbdf39668d23a3bcd564e8fac996bea 100644 GIT binary patch literal 8196 zcmeHMTWl0n7(U;$z!^HwDHIswvWr!Tu)+eRf`DvqXc1`BZRrJM*~XSj@EgECgS0yM9MvbUZ5`&@-J}6!=#s{Oucz-ZH`QU%%%o5tdlg5b7N#^|L|Ih!Q zIp;gyKeK0+F@}~>VHINrV~o=!q+UbSMT+Pb*QBC^mTHoqc;@D)buu?>Tb@73J2FHF zLaI(ikzf>s4q?F+d;o$?}oe`c;5Zs-{j|c7y zDJi2mLLfq5G6M4K-klrqhI96X=l5jt6DU>HQ>ISi(|8TPHND>*N&8v9kcR4qw7~E6Y<71O@|Mwd``p44c(ao_K@Qqk+Vnm1Yk2JtmXmI!Mbg)+$xt+ zwN`$LNG3n8EJdAOqs=gSdUy6|0~vi*ndaO%>|3tm?KF!X$&6m>(b_uluG3XCMSox3 zwM#>em(lC;Svy~{v%cAE+iB|#lWetKzUbO^m+M(#n1O!ZJmibMNnQPeo?G-=gTXSo zUh>Rtlg6^jW*d#@^;>jH5m-NASx`^RoVRev)yq~kuHV$MW$TW!wR+twtzMhkSGGJW zXPdnvR>9vtkoEGWQ?Q(Ug9NVQTen$eQLRHl?Qyb0W)Nk}Hx_Wcf%lhk*1?kH-!8{m zY{XUlWxl&W6WQ(-L}WqwFE*~?e3!7d7EP+9SzBsctLjd|`I1e_D>>{MSVEJy?x!Bmw`G6R#xlo{SX6bdcon3}O z+h93mi&#xtC&{gU$T@GQY(Yb6nQS(kvQGuA086zYl?+`tTf|ngHnxrJWkoj1j__%H00mPp6*Z{G9L$4(C0K^#xDG3@0c~i<7Ib45c4H4RC}0Q< zhH(&O+>J5Zi~Ddt9>Jq{43Fa(Jd5XW3@_q!yn#3I7CykI_zdUpHGaa+_(g#-U8zwt zWv&uaRx6FlTBS+3UP&n(O1ILd^vjtml{6X*R}u;(OAVPu;S)hQaq%>Y-E?!uR&Cp5 zlSb!eio}^+KWFZ|xX7k;o0`wlPO`K@%nJ!0aC}^51RM`|Fp;0e{3{kLT%^T%DGAI& z6VtMiuKF~Ql>DslUUH=t)2C3HNcEbfq)w$2k?Qh9ET$`znNm%zifMWk<((SDu1%1N zP+5P2rs>s`fwJ$$MWnJ&X=$ZlL_Ym(%s$7yV&Ab}*&oF0nV3y{UWz1|u#tGY16@d? z2ff7Qy~rUS@Yy9c7vbSf+(nGOhd6x%NAVyY!ozqHPvPl+*T--i&*KHWf>&`8uiBY^6*bXyB8u=0~&;#$R7J5JYLxGPV45=L_R}qs6<(QOY Zq4JM^2#DVQ0`|Z5%Qkxd3k)yo@+aYeOHcp+ literal 8196 zcmeHMU2GLa6rOKeV3!TB)B*)rduxSaxrVktrJ``TZ6iM|f48MS{M5a77rJ4)Tkh_? zr8d?2qyaVYMdSY)Q6EsFDEjJuVtgP-l$eMy@rf6W;Dd?r%+6k*Eq&Aw*hyy2oHOUl z^ql?9%$+G?4C#W|$XFF)Os2}Irc!g0!pC`?R-{N$P84L%{Nb$QWr;hR9iFBg3*rdG z5r`uYM<9+s9D!RQ0(55cBF}K{i_y4^BM?X6j*I}`9}-kKO+|D<&~WRZCcFX=O19Ce0V?}Tsw(I&v=O>yYtb#ENM*eYHl z$}-tu;myU8H_BX9ou~Ko_Vw>SFraA#-`ZLfdX=Gu}aB>AwwzfD<#(rFA=klK8?Dl*+usyf0H?WQee2h;tX{fi~_wqq&I9abv z^9!vCzQ(R$ZtU-~@;*^TIX!<#^?j=zXnOGBRQtBhJ?9r_3oF!0b#YJ8_U)`=b&uF) zFxY4KIm5REEQFQnW?AXYc|$|wj4xzqy-n7>mJjiTI}AVA zHEg-1KPjzmmF0ea(8yc6MhJ2xiw;>1KO@PxNxNwWUOtS*&Kh0s<5O)hP0J*wmsKsT z&@+0steqnp&3216ftYNCHR-399fU6aRu`tH9sRqGUmc$X|hnmhBSek8T zhuAP1WhdE->`nF|JHtL=U$F1k1@;sBjs4F4U{}~x_BYBg7nN9!6{y21+>a)##dirPV^%O6NhmG4hksZI7aaVp2jmcfs=Rvui#a@jyG@$@8UhYk5BO#KF62%3K#G* zF5wqk#uZ$}Uy>qKNXsQ%x<{&$R!Q~JYH5?SS=uV?k@}?pX;63|l@jgXWY45g;-hVY z5-oc6lTP0}(W)Qax^4T8j$1a-ew@!!>#oX0iHDhe5O>?=J2$Y&vWgXd(~=f7A1^eSJn|bo03JatCQ8$nnXD& z*oMYxRV$+mR6^VJNn#N(Ef1-xR!+Gn#y081vWQ8y(lk7?`G+2X={FCXD}s zGRRnh1XdzRnBIgG;kpg&gzMediwwGO5CbrfC6qgGF^pr#V+`YX5)*ieF#ar_!}E9v zFB8sRBcz|gTX-Aq-~*h&$2f~mLd1WI?{N{oPQh^JWDMia>liL(@~-2#M@Uja{AA_Z zk}DHlRq^|O`^@kEx25l~Q*i|12#5%vye-p~rv6NOAHQoSs6IxO7p^xUXlO#sq=x|F kul>W2`Uz6yHmQhC2uczv|NBG0^dH*s`5&MEcij2^FAa5Kvj6}9 diff --git a/.gitignore b/.gitignore index b64a99f..d3ffaad 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ .RData .Ruserdata revdep/ +.DS_Store From 8205e4ec01d2de3ca53eb909e24e70cccc07e032 Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Tue, 3 Mar 2026 10:24:31 -0500 Subject: [PATCH 07/37] added option to print pre-filtering depth and genotyping rate --- R/filterVCF.R | 292 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 247 insertions(+), 45 deletions(-) diff --git a/R/filterVCF.R b/R/filterVCF.R index a54e32e..6170697 100644 --- a/R/filterVCF.R +++ b/R/filterVCF.R @@ -26,42 +26,84 @@ #' @examples #' ## Use file paths for each file on the local system #' -#' #Temp location (only for example) -#' output_file <- tempfile() #' -#' filterVCF(vcf.file = system.file("iris_DArT_VCF.vcf.gz", package = "BIGr"), -#' filter.OD = 0.5, -#' filter.MAF = 0.05, -#' ploidy = 2, -#' output.file = output_file) -#' -#' # Removing the output for the example -#' rm(output_file) +#' #filterVCF(vcf.file = "example_dart_Dosage_Report.csv", +#' # filter.OD = 0.5, +#' # ploidy = 2, +#' # output.file = "name_for_vcf") #' #' ##The function will output the filtered VCF to the current working directory #' #' @export filterVCF <- function(vcf.file, - filter.OD = NULL, - filter.BIAS.min = NULL, - filter.BIAS.max = NULL, - filter.DP = NULL, - filter.MPP = NULL, - filter.PMC = NULL, - filter.MAF = NULL, - filter.SAMPLE.miss = NULL, - filter.SNP.miss = NULL, - ploidy, - output.file = NULL) { + quality.rates = F, + filter.OD = NULL, + filter.BIAS.min = NULL, + filter.BIAS.max = NULL, + filter.DP = NULL, + filter.MPP = NULL, + filter.PMC = NULL, + filter.MAF = NULL, + filter.SAMPLE.miss = NULL, + filter.SNP.miss = NULL, + ploidy, + output.file = NULL) { #Should allow for any INFO field to be entered to be filtered - # Import VCF (can be .vcf or .vcf.gz) - if (!inherits(vcf.file, "vcfR")) { - vcf <- read.vcfR(vcf.file, verbose = FALSE) + + + # Read VCF (can be .vcf or .vcf.gz) + + if (class(vcf.file) != "vcfR") { + vcf <- read.vcfR(vcf.file) } else { vcf <- vcf.file - #rm(vcf.file) + } + + # Keep original VCF for pre‑filter statistics + vcf_orig <- vcf + + + # pre‑filtering quality rates + + if (quality.rates) { + gt_orig <- extract.gt(vcf_orig, element = "GT", as.numeric = FALSE) + + dfmt <- strsplit(vcf_orig@gt[1, "FORMAT"], ":")[[1]] + if ("DP" %in% dfmt) { + dp_orig <- extract.gt(vcf_orig, element = "DP", as.numeric = TRUE) + } else { + dp_orig <- matrix(NA_real_, nrow = nrow(gt_orig), ncol = ncol(gt_orig), + dimnames = dimnames(gt_orig)) + } + + # Per‑marker + mean_depth_marker <- rowMeans(dp_orig, na.rm = TRUE) + genotype_present <- !is.na(gt_orig) + genotyping_rate_marker <- rowMeans(genotype_present) + markers_df <- data.frame( + marker = vcf_orig@fix[, "ID"], + mean_depth = round(mean_depth_marker,2), + genotyping_rate = round(genotyping_rate_marker,2), + stringsAsFactors = FALSE + ) + + # Per‑sample + mean_depth_sample <- colMeans(dp_orig, na.rm = TRUE) + genotyping_rate_sample <- colMeans(genotype_present) + samples_df <- data.frame( + sample = colnames(gt_orig), + mean_depth = round(mean_depth_sample,2), + genotyping_rate = round(genotyping_rate_sample,2), + stringsAsFactors = FALSE + ) + + base_name <- if (!is.null(output.file)) output.file else "pre_filter" + write.csv(markers_df, paste0(base_name, "_marker_stats.csv"), + row.names = FALSE, quote = FALSE) + write.csv(samples_df, paste0(base_name, "_sample_stats.csv"), + row.names = FALSE, quote = FALSE) } #Update header based on user filtering parameters @@ -102,7 +144,7 @@ filterVCF <- function(vcf.file, # Extract the DP values if ("DP" %in% format_fields && !is.null(filter.DP)) { - message("Filtering by DP\n") + cat("Filtering by DP\n") dp <- extract.gt(vcf, element = "DP", as.numeric = TRUE) # Identify cells to modify based on the DP threshold threshold <- as.numeric(filter.DP) @@ -116,7 +158,7 @@ filterVCF <- function(vcf.file, #Filter if the MPP field is present if ("MPP" %in% format_fields && !is.null(filter.MPP)) { - message("Filtering by MPP\n") + cat("Filtering by MPP\n") # Extract the MPP values mpp <- extract.gt(vcf, element = "MPP", as.numeric = TRUE) # Identify cells to modify based on the DP threshold @@ -156,13 +198,13 @@ filterVCF <- function(vcf.file, # Filtering by OD if ("OD" %in% info_ids && !is.null(filter.OD)) { info <- vcf@fix[, "INFO"] #Need to get after each filter.. - message("Filtering by OD\n") + cat("Filtering by OD\n") od_values <- extract_info_value(info, "OD") # Ensure no NA values before filtering if (!all(is.na(od_values))) { vcf <- vcf[od_values < as.numeric(filter.OD), ] } else { - warning("No valid OD values found.\n") + cat("No valid OD values found.\n") } } @@ -171,26 +213,26 @@ filterVCF <- function(vcf.file, # Filtering by BIAS if ("BIAS" %in% info_ids && !is.null(filter.BIAS.min) && !is.null(filter.BIAS.max)) { info <- vcf@fix[, "INFO"] #Need to get after each filter.. - message("Filtering by BIAS\n") + cat("Filtering by BIAS\n") bias_values <- extract_info_value(info, "BIAS") # Ensure no NA values before filtering if (!all(is.na(bias_values))) { vcf <- vcf[bias_values > as.numeric(filter.BIAS.min) & bias_values < as.numeric(filter.BIAS.max), ] } else { - warning("No valid BIAS values found.\n") + cat("No valid BIAS values found.\n") } } # Filtering by PMC if ("PMC" %in% info_ids && !is.null(filter.PMC)) { info <- vcf@fix[, "INFO"] #Need to get after each filter.. - message("Filtering by PMC\n") + cat("Filtering by PMC\n") pmc_values <- extract_info_value(info, "PMC") # Ensure no NA values before filtering if (!all(is.na(pmc_values))) { vcf <- vcf[pmc_values < as.numeric(filter.PMC), ] } else { - warning("No valid PMC values found.\n") + cat("No valid PMC values found.\n") } } @@ -200,14 +242,14 @@ filterVCF <- function(vcf.file, gt_matrix <- extract.gt(vcf, element = "GT", as.numeric = FALSE)#as.matrix(vcfR2genlight(vcf)) if (!is.null(filter.SNP.miss)) { - message("Filtering by SNP missing data\n") + cat("Filtering by SNP missing data\n") snp_missing_data <- rowMeans(is.na(gt_matrix)) vcf <- vcf[snp_missing_data < as.numeric(filter.SNP.miss), ] gt_matrix <- extract.gt(vcf, element = "GT", as.numeric = FALSE) } if (!is.null(filter.SAMPLE.miss)) { - message("Filtering by Sample missing data\n") + cat("Filtering by Sample missing data\n") # Calculate the proportion of missing data for each sample sample_missing_data <- colMeans(is.na(gt_matrix)) # Identify samples to keep based on the missing data threshold @@ -222,30 +264,112 @@ filterVCF <- function(vcf.file, rm(gt_matrix) } + ##Convert GT to dosage + #gt_matrix <- extract.gt(vcf, element = "GT", as.numeric = FALSE)#as.matrix(vcfR2genlight(vcf)) + + # Function to determine the ploidy level from a genotype string + #determine_ploidy <- function(gt) { + # if (is.na(gt)) { + # return(NA) + # } + # return(length(strsplit(gt, "[|/]")[[1]])) + #} + + # Function to find a non-NA example genotype to determine ploidy + #find_example_gt <- function(matrix) { + # for (i in seq_len(nrow(matrix))) { + # for (j in seq_len(ncol(matrix))) { + # if (!is.na(matrix[i, j])) { + # return(matrix[i, j]) + # } + # } + # } + # return(NA) # Return NA if no non-NA genotype is found + #} + + # Find a non-NA example genotype + #example_gt <- find_example_gt(gt_matrix) + + # Determine the ploidy level + #if (!is.na(example_gt)) { + # ploidy <- determine_ploidy(example_gt) + #} else { + # stop("No non-NA genotype found to determine ploidy.") + #} + + # Generate lookup table for genotypes to dosage conversion + #generate_lookup_table <- function(ploidy) { + # possible_alleles <- 0:ploidy + # genotypes <- expand.grid(rep(list(possible_alleles), ploidy)) + # genotypes <- apply(genotypes, 1, function(x) paste(x, collapse = "/")) + # dosage_values <- rowSums(expand.grid(rep(list(possible_alleles), ploidy))) + # lookup_table <- setNames(dosage_values, genotypes) + # return(lookup_table) + #} + + # Generate the lookup table + #lookup_table <- generate_lookup_table(ploidy) + + # Function to convert genotype to dosage using the lookup table + #genotype_to_dosage <- function(gt, lookup_table) { + # if (is.na(gt)) { + # return(NA) + # } + # return(lookup_table[[gt]]) + #} + + # Function to convert genotype matrix to dosage matrix using vectorized operations + #convert_genotypes_to_dosage <- function(gt_matrix, lookup_table) { + # unique_gts <- unique(gt_matrix) + # gt_to_dosage <- setNames(rep(NA, length(unique_gts)), unique_gts) + # valid_gts <- unique_gts[unique_gts %in% names(lookup_table)] + # gt_to_dosage[valid_gts] <- lookup_table[valid_gts] + # dosage_matrix <- gt_to_dosage[gt_matrix] + #colnames(dosage_matrix) <- colnames(gt_matrix) + #row.names(dosage_matrix) <- row.names(gt_matrix) + # return(matrix(as.numeric(dosage_matrix), nrow = nrow(gt_matrix), ncol = ncol(gt_matrix))) + #} + + # Convert the genotype matrix to dosage matrix + #dosage_matrix <- convert_genotypes_to_dosage(gt_matrix, lookup_table) + ##MAF filter + #Compare my lengthy process to estimate MAF with vcfR::maf() function + #The BIGr::calculate_MAF(dosage_matrix, ploidy) is the exact same as the vcfR::maf() calculations + #The step where I extract UD and calculate MAF is different... + #if ("UD" %in% format_fields) { + # maf_df <- BIGr::calculate_MAF(extract.gt(vcf, element = "UD", as.numeric = TRUE), ploidy = ploidy) + #} else { + #convert genotypes to dosage and filter + # maf_df <- BIGr::calculate_MAF(dosage_matrix, ploidy) + #} + #Need to confirm that vcfR::maf will work with any ploidy...if not, use my code if (!is.null(filter.MAF)) { - message("Filtering by MAF\n") + cat("Filtering by MAF\n") maf_df <- data.frame(vcfR::maf(vcf, element = 2)) vcf <- vcf[maf_df$Frequency > as.numeric(filter.MAF), ] } ### Export the modified VCF file (this exports as a .vcf.gz, so make sure to have the name end in .vcf.gz) - message("Exporting VCF\n") - if (!inherits(vcf.file, "vcfR")) { - if (!is.null(output.file)) { - output_name <- paste0(output.file, ".vcf.gz") + cat("Exporting VCF\n") + if (!class(vcf.file) == "vcfR"){ + if (!is.null(output.file)){ + output_name <- paste0(output.file,".vcf.gz") vcfR::write.vcf(vcf, file = output_name) - } else { + }else{ return(vcf) } - } else { - if (!is.null(output.file)) { - output_name <- paste0(output.file, "_filtered.vcf.gz") + }else{ + if (!is.null(output.file)){ + output_name <- paste0(output.file,"_filtered.vcf.gz") vcfR::write.vcf(vcf, file = output_name) - } else { + }else{ return(vcf) } } + #Message that includes the output vcf stats + print(vcf) + #Message samples_removed <- starting_samples - (ncol(vcf@gt)-1) SNPs_removed <- starting_snps - nrow(vcf) @@ -253,3 +377,81 @@ filterVCF <- function(vcf.file, message("SNPs removed due to filtering: ",SNPs_removed) message("Complete!") } +#This is not reliable, so no longer use this shortcut to get dosage matrix +#test2 <- vcfR2genlight(vcf) + + +#####Testing custom VCF reading function###### +# Open the gzipped VCF file +#con <- gzfile("/Users/ams866/Desktop/output.vcf", "rt") + +# Read in the entire file +#lines <- readLines(con) +#close(con) +# Read in the entire file +#lines <- readLines("/Users/ams866/Desktop/output.vcf") +# Filter out lines that start with ## +#filtered_lines <- lines[!grepl("^##", lines)] +# Create a temporary file to write the filtered lines +#temp_file <- tempfile() +#writeLines(filtered_lines, temp_file) +# Read in the filtered data using read.table or read.csv +#vcf_data <- read.table(temp_file, header = TRUE, sep = "\t", comment.char = "", check.names = FALSE) +# Clean up the temporary file +#unlink(temp_file) + +##Extract INFO column and Filter SNPs by those values +#Update the filtering options by the items present in the INFO column? + +# Load required library +#library(dplyr) + +# Split INFO column into key-value pairs +#vcf_data_parsed <- vcf_data %>% +# mutate(INFO_PARSED = strsplit(INFO, ";")) %>% +# unnest(INFO_PARSED) %>% +# separate(INFO_PARSED, into = c("KEY", "VALUE"), sep = "=") %>% +# spread(KEY, VALUE) + +#Filter by DP +#filtered_vcf_data <- vcf_data_parsed %>% +# filter(as.numeric(DP) > 10) + +# View the filtered dataframe +#print(filtered_vcf_data) + +##Extracting and filtering by FORMAT column +# Identify the columns that are not sample columns +#non_sample_cols <- c("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT") +# Identify the sample columns +#sample_cols <- setdiff(names(vcf_data), non_sample_cols) +# Extract FORMAT keys +#format_keys <- strsplit(as.character(vcf_data$FORMAT[1]), ":")[[1]] +# Split SAMPLE columns based on FORMAT +#vcf_data_samples <- vcf_data %>% +# mutate(across(all_of(sample_cols), ~strsplit(as.character(.), ":"))) %>% +# mutate(across(all_of(sample_cols), ~map(., ~setNames(as.list(.), format_keys)))) %>% +# unnest_wider(all_of(sample_cols), names_sep = "_") + +# View the parsed dataframe +#print(head(vcf_data_samples)) + +# Create separate dataframes for each FORMAT variable +#format_dfs <- lapply(format_keys, function(format_key) { +# vcf_data_samples %>% +# select(ID, ends_with(paste0("_", format_key))) %>% +# column_to_rownames("ID") +#}) + +# Assign names to the list elements +#names(format_dfs) <- format_keys + +# Access the separate dataframes +#gt_df <- format_dfs$GT # Genotype dataframe +#ad_df <- format_dfs$AD # Allelic depths dataframe + +#*I think the above method is okay if you only need to filter at the INFO level, +#*But I think if you want to filter for FORMAT, that vcfR is probably best, +#*Will need to explore further if I can easily just filter for MPP by checking if it is above a +#*threshold, and then converting the GT and UD values to NA if so... +#*If that is efficient and works, then I will just use this custom VCF method... From 31248e3d443c9196533a0ef4f71b5201e3965784 Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Wed, 4 Mar 2026 15:19:19 -0500 Subject: [PATCH 08/37] added calculation for Ho --- .DS_Store | Bin 8196 -> 8196 bytes R/filterVCF.R | 44 +++++++++++++++++++++++++++++++++++++------- 2 files changed, 37 insertions(+), 7 deletions(-) diff --git a/.DS_Store b/.DS_Store index 55d74ff50dbdf39668d23a3bcd564e8fac996bea..dd2adc3630aaafc6fd7700efa91c96db437ecab5 100644 GIT binary patch delta 239 zcmZp1XmOYj%eZ4>+9h2n*3b`e!t4%G9bQDYs3?|POkY?*xy3`Ptt3_c7A3`Gpp48069 z7#2=W5T47JF!`UbFOxSmL!OA35n+gxSTp12&HKbW88@>@d}rDGPeg_UUG9tk^@%=r delta 270 zcmZp1XmOYj%eZr6+os;DR3V9@}tBnoJOmq~?Oe`kP5|C!>p1e&!p0Q{0H33~F zp2*2QBGoeS0s^JQsX?hZi6xn3sV<2nsl^eQ$@zJ~nN_Kq=LjxlRNz#0FkpZH4hC5U zBL)`+ABF^mB8F;)UWOSAt0%_`&t;65{8!kQiN}YqnU6)xh%nPqteNrk=DlK`jGNgd PzOiinBO*(JE;mL1vIt69 diff --git a/R/filterVCF.R b/R/filterVCF.R index 6170697..550468c 100644 --- a/R/filterVCF.R +++ b/R/filterVCF.R @@ -68,37 +68,67 @@ filterVCF <- function(vcf.file, # pre‑filtering quality rates if (quality.rates) { + ## Extract genotypes, depth and DP matrix gt_orig <- extract.gt(vcf_orig, element = "GT", as.numeric = FALSE) dfmt <- strsplit(vcf_orig@gt[1, "FORMAT"], ":")[[1]] if ("DP" %in% dfmt) { dp_orig <- extract.gt(vcf_orig, element = "DP", as.numeric = TRUE) } else { - dp_orig <- matrix(NA_real_, nrow = nrow(gt_orig), ncol = ncol(gt_orig), + dp_orig <- matrix(NA_real_, + nrow = nrow(gt_orig), ncol = ncol(gt_orig), dimnames = dimnames(gt_orig)) } - # Per‑marker + + # 1. Observed heterozygosity (per‑marker & per‑sample) + + # Helper: TRUE if a genotype is heterozygous (any two different + # alleles, excluding missing "./.") + is_het <- function(g) { + if (is.na(g) || g == "./.") return(FALSE) + alleles <- strsplit(g, split = "[/|]")[[1]] + return(length(unique(alleles)) > 1) + } + #matrix of heterozygous calls + het_mat <- apply(gt_orig, c(1, 2), is_het) + + #Observed heterozygosity per marker and per sample + obs_het_marker <- rowMeans(het_mat, na.rm = TRUE) + obs_het_sample <- colMeans(het_mat, na.rm = TRUE) + + + #Per‑marker stats + mean_depth_marker <- rowMeans(dp_orig, na.rm = TRUE) genotype_present <- !is.na(gt_orig) genotyping_rate_marker <- rowMeans(genotype_present) + markers_df <- data.frame( marker = vcf_orig@fix[, "ID"], - mean_depth = round(mean_depth_marker,2), - genotyping_rate = round(genotyping_rate_marker,2), + mean_depth = round(mean_depth_marker, 2), + genotyping_rate = round(genotyping_rate_marker, 2), + obs_het = round(obs_het_marker, 2), stringsAsFactors = FALSE ) - # Per‑sample + + #Per‑sample stats + mean_depth_sample <- colMeans(dp_orig, na.rm = TRUE) genotyping_rate_sample <- colMeans(genotype_present) + samples_df <- data.frame( sample = colnames(gt_orig), - mean_depth = round(mean_depth_sample,2), - genotyping_rate = round(genotyping_rate_sample,2), + mean_depth = round(mean_depth_sample, 2), + genotyping_rate = round(genotyping_rate_sample, 2), + obs_het = round(obs_het_sample, 2), stringsAsFactors = FALSE ) + + #Write CSV + base_name <- if (!is.null(output.file)) output.file else "pre_filter" write.csv(markers_df, paste0(base_name, "_marker_stats.csv"), row.names = FALSE, quote = FALSE) From 757b01c69e989096c00c81144bf74704db5de413 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Fri, 13 Mar 2026 10:11:38 -0400 Subject: [PATCH 09/37] up version --- DESCRIPTION | 6 +++--- NEWS.md | 7 +++++++ R/madc2vcf_all.R | 20 ++++++++++++++++++++ 3 files changed, 30 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 31ad1e1..9a18709 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: BIGr Title: Breeding Insight Genomics Functions for Polyploid and Diploid Species -Version: 0.6.2 +Version: 0.6.3 Authors@R: c(person(given='Alexander M.', family='Sandercock', email='sandercock.alex@gmail.com', @@ -23,7 +23,7 @@ Authors@R: c(person(given='Alexander M.', person(given='Dongyan', family='Zhao', role='ctb'), - person('Cornell', 'University', + person('University', "of Florida", role=c('cph'), comment = "Breeding Insight")) Maintainer: Alexander M. Sandercock @@ -44,7 +44,7 @@ URL: https://github.com/Breeding-Insight/BIGr BugReports: https://github.com/Breeding-Insight/BIGr/issues Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 Depends: R (>= 4.4.0) biocViews: Imports: diff --git a/NEWS.md b/NEWS.md index b089e67..1a0d64d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,10 @@ +# BIGr 0.6.3 + +- New function to check MADC file: check_madc_sanity. By now, it checks presence of required columns, if fixed allele IDs were assigned, presence of IUPAC codes, lower case sequences bases, indels, and chromosome and position information. +- Add new arguments markers_info for users to add CSV file with marker information such as CHROM, POS, Marker Type and position of indels. For BI species, these information is at https://github.com/Breeding-Insight/BIGapp-PanelHub +- Check inputs for madc2vcf_all +- Update affiliation on DESCRIPTION + # BIGr 0.6.2 - Fixed the doi and name list in the CITATION file diff --git a/R/madc2vcf_all.R b/R/madc2vcf_all.R index 125d540..1413c8e 100644 --- a/R/madc2vcf_all.R +++ b/R/madc2vcf_all.R @@ -62,6 +62,26 @@ madc2vcf_all <- function(madc = NULL, out_vcf = NULL, verbose = TRUE){ + # Input checks + if(!is.null(madc) & !file.exists(madc)) stop("MADC file not found. Please provide a valid path.") + if(!is.null(botloci_file) & !file.exists(botloci_file)) stop("Botloci file not found. Please provide a valid path.") + if(!is.null(hap_seq_file) & !file.exists(hap_seq_file)) stop("Haplotype sequence file not found. Please provide a valid path.") + + ## n.cores as integer + if(!is.numeric(n.cores) | n.cores < 1) stop("n.cores should be a positive integer.") + + ## alignment_score_thr, multiallelic_SNP_dp_thr, multiallelic_SNP_sample_thr as numeric + if(!is.numeric(alignment_score_thr)) stop("alignment_score_thr should be numeric.") + if(!is.numeric(multiallelic_SNP_dp_thr)) stop("multiallelic_SNP_dp_thr should be numeric.") + if(!is.numeric(multiallelic_SNP_sample_thr)) stop("multiallelic_SNP_sample_thr should be numeric.") + + ## out_vcf as string + if(!is.null(out_vcf) & !is.character(out_vcf)) stop("out_vcf should be a string specifying the output file name.") + + ## rm_multiallelic_SNP and verbose as logical + if(!is.logical(rm_multiallelic_SNP)) stop("rm_multiallelic_SNP should be logical.") + if(!is.logical(verbose)) stop("verbose should be logical.") + bigr_meta <- paste0('##BIGrCommandLine.madc2vcf_all=%") +importFrom(dplyr,across) +importFrom(dplyr,group_by) +importFrom(dplyr,mutate) +importFrom(dplyr,select) +importFrom(dplyr,summarise) +importFrom(dplyr,where) importFrom(pwalign,nucleotideSubstitutionMatrix) importFrom(pwalign,pairwiseAlignment) importFrom(readr,read_csv) diff --git a/NEWS.md b/NEWS.md index 1a0d64d..928265a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,11 @@ +# BIGr 0.6.4 + +- Add function `vmsg` to organize messages printed on the console +- Add metadata to VCF header from madc2vcf_targets +- Add argument `madc_object` to `get_countsMADC` to avoid reading the MADC file twice and to get directly the MADC fixed padding output from `check_botloci` +- Organize messages from `madc2vcf_targets` checks +- Add argument `collapse_matches_counts` and `verbose` to `madc2vcf_targets` function + # BIGr 0.6.3 - New function to check MADC file: check_madc_sanity. By now, it checks presence of required columns, if fixed allele IDs were assigned, presence of IUPAC codes, lower case sequences bases, indels, and chromosome and position information. diff --git a/R/check_madc_sanity.R b/R/check_madc_sanity.R index 19d3ff6..1455606 100644 --- a/R/check_madc_sanity.R +++ b/R/check_madc_sanity.R @@ -88,18 +88,18 @@ check_madc_sanity <- function(report) { } else indels <- NULL - messages[["Columns"]] <- c("Required columns are present\n", - "One or more required columns missing. Verify if your file has columns: CloneID, AlleleID, AlleleSequence\n") - messages[["FixAlleleIDs"]] <- c("Fixed Allele IDs look good\n", - "MADC not processed by BI. Please contact us to assign allele IDs to your MADC according to the specie haplotype dabatase. This guarantee reproducibility between diferent datasets\n") - messages[["IUPACcodes"]] <- c("IUPAC (non-ATCG) codes found in AlleleSequence. This codes are not currently supported\n", - "No IUPAC (non-ATCG) codes found in AlleleSequence\n") - messages[["LowerCase"]] <- c("Lowercase bases found in AlleleSequence\n", - "No lowercase bases found in AlleleSequence\n") + messages[["Columns"]] <- c("Required columns are present", + "One or more required columns missing. Verify if your file has columns: CloneID, AlleleID, AlleleSequence") + messages[["FixAlleleIDs"]] <- c("Fixed Allele IDs look good", + "MADC not processed by BI. Please contact us to assign allele IDs to your MADC according to the specie haplotype dabatase. This guarantee reproducibility between diferent datasets") + messages[["IUPACcodes"]] <- c("IUPAC (non-ATCG) codes found in AlleleSequence. This codes are not currently supported", + "No IUPAC (non-ATCG) codes found in AlleleSequence") + messages[["LowerCase"]] <- c("Lowercase bases found in AlleleSequence", + "No lowercase bases found in AlleleSequence") messages[["Indels"]] <- c(paste("Indels found (ref/alt lengths differ) for the CloneIDs:",paste(indels, collapse = " ")), - "No indels found (ref/alt lengths match) for all CloneIDs\n") - messages[["ChromPos"]] <- c("Chromosome and Position format in CloneID look good\n", - "CloneID does not have the expected Chromosome_Position format. Please check your CloneIDs or provide a file with this information\n") + "No indels found (ref/alt lengths match) for all CloneIDs") + messages[["ChromPos"]] <- c("Chromosome and Position format in CloneID look good", + "CloneID does not have the expected Chromosome_Position format. Please check your CloneIDs or provide a file with this information") list(checks = checks, messages = messages, indel_clone_ids = indels) } diff --git a/R/get_countsMADC.R b/R/get_countsMADC.R index 8be1cda..2123be1 100644 --- a/R/get_countsMADC.R +++ b/R/get_countsMADC.R @@ -1,13 +1,38 @@ #' Obtain Read Counts from MADC File #' -#' This function takes the MADC file as input and retrieves the ref and alt counts for each sample, -#' and converts them to ref, alt, and size(total count) matrices for dosage calling tools. At the moment, -#' only the read counts for the Ref and Alt target loci are obtained while the additional loci are ignored. +#' Reads a DArTag MADC report and returns reference and total read count matrices +#' per marker and sample. Only `Ref` and `Alt` target loci are retained; +#' `|AltMatch` / `|RefMatch` rows are either discarded or collapsed depending on +#' `collapse_matches_counts`. #' +#' @details +#' Either `madc_file` or `madc_object` must be provided (not both `NULL`). +#' When `madc_object` is supplied it is passed directly to `get_counts()`, +#' skipping file I/O. The function constructs: +#' - `ref_matrix` — per-sample reference allele counts. +#' - `size_matrix` — per-sample total counts (ref + alt). +#' +#' Markers whose `CloneID` appears only in the `Ref` or only in the `Alt` rows +#' are removed with a warning. A summary of the proportion of zero-count +#' data points (missing data) is reported via `vmsg()`. +#' +#' @param madc_file character or `NULL`. Path to the input MADC CSV file. +#' At least one of `madc_file` or `madc_object` must be provided. +#' @param madc_object data frame or `NULL`. A pre-read MADC data frame +#' (e.g., as returned by `check_botloci()`). When supplied, file reading is +#' skipped. At least one of `madc_file` or `madc_object` must be provided. +#' @param collapse_matches_counts logical. If `TRUE`, counts for `|AltMatch` +#' and `|RefMatch` rows are summed into their corresponding `|Ref` and `|Alt` +#' rows. If `FALSE` (default), `|AltMatch` and `|RefMatch` rows are discarded. +#' @param verbose logical. Whether to print progress messages. Default is `TRUE`. +#' +#' @return A named list with two numeric matrices, both with markers as rows +#' and samples as columns: +#' \describe{ +#' \item{`ref_matrix`}{Reference allele read counts.} +#' \item{`size_matrix`}{Total read counts (reference + alternative).} +#' } #' -#' @param madc_file Path to MADC file -#' @import dplyr -#' @return A list of read count matrices for reference, alternate, and total read count values #' @examples #' # Get the path to the MADC file #' madc_path <- system.file("iris_DArT_MADC.csv", package = "BIGr") @@ -15,17 +40,35 @@ #' # Extract the read count matrices #' counts_matrices <- get_countsMADC(madc_path) #' -#' # Access the reference, alternate, and size matrices -#' -#' # ref_matrix <- counts_matrices$ref_matrix -#' # alt_matrix <- counts_matrices$alt_matrix +#' # Access the reference and size matrices +#' # ref_matrix <- counts_matrices$ref_matrix #' # size_matrix <- counts_matrices$size_matrix #' #' rm(counts_matrices) +#' +#' @seealso [get_counts()], [check_madc_sanity()] +#' +#' @import dplyr #' @export -get_countsMADC <- function(madc_file) { +get_countsMADC <- function(madc_file = NULL, madc_object = NULL, collapse_matches_counts = FALSE, verbose = TRUE) { + + # Add check inputs + if(is.null(madc_file) && is.null(madc_object)) stop("Please provide either madc_file or madc_object.") + if(!is.null(madc_file) && !file.exists(madc_file)) stop("MADC file not found. Please provide a valid path.") + if(!is.null(madc_object) && !is.data.frame(madc_object)) stop("madc_object must be a data frame.") + + vmsg(paste0("Extracting read counts from ", ifelse(!is.null(madc_file), paste0("MADC file: ", madc_file), "madc_object")), verbose = verbose, level = 0, type = ">>") + vmsg(ifelse(collapse_matches_counts, + "|AltMatch and |RefMatch counts will be collapsed into their respective |Ref and |Alt alleles.", + "|AltMatch and |RefMatch rows will be discarded (collapse_matches_counts = FALSE)."), + verbose = verbose, level = 1, type = ">>") + # This function takes the MADC file as input and generates a Ref and Alt counts dataframe as output - update_df <- get_counts(madc_file) + if (is.null(madc_object)) { + update_df <- get_counts(madc_file = madc_file, collapse_matches_counts = collapse_matches_counts, verbose = verbose) + } else { + update_df <- get_counts(madc_object = madc_object, collapse_matches_counts = collapse_matches_counts, verbose = verbose) + } # Filter rows where 'AlleleID' ends with 'Ref' ref_df <- subset(update_df, grepl("Ref$", AlleleID)) @@ -43,9 +86,16 @@ get_countsMADC <- function(madc_file) { #Retain only the rows in common if they are not identical and provide warning if (same == FALSE) { - warning("Mismatch between Ref and Alt Markers. Markers without a Ref or Alt match removed.") # Find the common CloneIDs between the two dataframes + all_mks <- unique(c(rownames(ref_df), rownames(alt_df))) common_ids <- intersect(rownames(ref_df), rownames(alt_df)) + n_singles <- length(all_mks) - length(common_ids) + + vmsg(paste("There are", n_singles,"Ref tags without corresponding Alt tags, or vice versa"), verbose = verbose, level = 2, type = ">>") + vmsg("Only the markers with both Ref and Alt tags will be retained for the conversion", verbose = verbose, level = 1, type = ">>") + + warning(paste("There are", n_singles,"Ref tags without corresponding Alt tags, or vice versa. Only the markers with both Ref and Alt tags will be retained for the conversion")) + # Subset both dataframes to retain only the common rows ref_df <- ref_df[common_ids, ] alt_df <- alt_df[common_ids, ] @@ -77,7 +127,7 @@ get_countsMADC <- function(madc_file) { # Print the result ratio_missing_data <- count_zeros / length(size_matrix) - message("Ratio of missing data =", ratio_missing_data, "\n") + vmsg(paste0("Percentage of missing data (datapoints with 0 total count): ", round(ratio_missing_data * 100, 2), "%"), verbose = verbose, level = 2, type = ">>") # Return the ref and alt matrices as a list matrices_list <- list(ref_matrix = ref_matrix, size_matrix = size_matrix) @@ -86,41 +136,109 @@ get_countsMADC <- function(madc_file) { } -get_counts <- function(madc_file) { +#' Read and Pre-process a MADC File +#' +#' Reads a DArTag MADC CSV file (or accepts a pre-read data frame), detects the +#' file format, and returns a filtered data frame containing only `Ref` and `Alt` +#' haplotype rows ready for count-matrix construction. +#' +#' @details +#' **Input**: either `madc_file` (path to CSV) or `madc_object` (pre-read data +#' frame) must be supplied; at least one is required. +#' +#' **Format detection** (applied to file or object alike): the first seven rows +#' of the first column are inspected: +#' - **Standard format**: all entries are blank or `"*"` — the first 7 rows are +#' treated as DArT placeholder rows and skipped. +#' - **Fixed-allele-ID format**: no filler rows — data are used as-is. +#' +#' **`|AltMatch` / `|RefMatch` handling** (controlled by `collapse_matches_counts`): +#' - `FALSE` (default): these rows are simply discarded. +#' - `TRUE`: their counts are summed into the corresponding `|Ref` or `|Alt` +#' row for the same `CloneID`. +#' +#' In all cases, trailing suffixes on `AlleleID` (e.g., `|Ref_001`, `|Alt_002`) +#' are stripped to the canonical `|Ref` / `|Alt` form. +#' +#' @param madc_file character or `NULL`. Path to the input MADC CSV file. +#' At least one of `madc_file` or `madc_object` must be provided. +#' @param madc_object data frame or `NULL`. A pre-read MADC data frame +#' (e.g., from `check_botloci()`). When supplied, file reading is skipped. +#' At least one of `madc_file` or `madc_object` must be provided. +#' @param collapse_matches_counts logical. If `TRUE`, counts for `|AltMatch` +#' and `|RefMatch` rows are summed into their corresponding `|Ref` and `|Alt` +#' rows. If `FALSE` (default), those rows are discarded. +#' @param verbose logical. Whether to print progress messages. Default is `TRUE`. +#' +#' @return A data frame with one row per `Ref` or `Alt` allele entry, retaining +#' all original columns (`AlleleID`, `CloneID`, `AlleleSequence`, sample +#' count columns, etc.). +#' +#' @importFrom dplyr mutate group_by summarise across where select +#' @importFrom dplyr %>% +#' +#' @keywords internal +get_counts <- function(madc_file = NULL, madc_object = NULL, collapse_matches_counts = FALSE, verbose = TRUE) { + + # Add check inputs + if(is.null(madc_file) && is.null(madc_object)) stop("Please provide either madc_file or madc_object.") + if(!is.null(madc_file) && !file.exists(madc_file)) stop("MADC file not found. Please provide a valid path.") + if(!is.null(madc_object) && !is.data.frame(madc_object)) stop("madc_object must be a data frame.") + # Read the MADC file + + if(!is.null(madc_file)){ #Read only the first column for the first seven rows first_seven_rows <- read.csv(madc_file, header = FALSE, nrows = 7, colClasses = c(NA, "NULL")) #Check if all entries in the first column are either blank or "*" check_entries <- all(first_seven_rows[, 1] %in% c("", "*")) + } else { + check_entries <- all(madc_object[1:min(7L, nrow(madc_object)), 1] %in% c("", "*")) + } + #Check if the MADC file has the filler rows or is processed from updated fixed allele ID pipeline if (check_entries) { #Note: This assumes that the first 7 rows are placeholder info from DArT processing - #Read the madc file - madc_df <- read.csv(madc_file, sep = ',', skip = 7, check.names = FALSE) - - #Retain only the Ref and Alt haplotypes - filtered_df <- madc_df[!grepl("\\|AltMatch|\\|RefMatch", madc_df$AlleleID), ] - - #Remove extra text after Ref and Alt (_001 or _002) - filtered_df$AlleleID <- sub("\\|Ref.*", "|Ref", filtered_df$AlleleID) - filtered_df$AlleleID <- sub("\\|Alt.*", "|Alt", filtered_df$AlleleID) - + vmsg("Detected raw MADC format with 7-row header. Reading file while skipping the first 7 rows.", verbose = verbose, level = 1, type = ">>") + if(!is.null(madc_file)){ + madc_df <- read.csv(madc_file, sep = ',', skip = 7, check.names = FALSE) + } else { + madc_df <- madc_object[-(1:7), ] + } } else { #Read the madc file - madc_df <- read.csv(madc_file, sep = ',', check.names = FALSE) - - # Retain only the Ref and Alt haplotypes - filtered_df <- madc_df[!grepl("\\|AltMatch|\\|RefMatch", madc_df$AlleleID), ] + vmsg("Detected fixed allele IDs MADC format", verbose = verbose, level = 1, type = ">>") + if(!is.null(madc_file)){ + madc_df <- read.csv(madc_file, sep = ',', check.names = FALSE) + } else { + madc_df <- madc_object + } + } - #Remove extra text after Ref and Alt (_001 or _002) - filtered_df$AlleleID <- sub("\\|Ref.*", "|Ref", filtered_df$AlleleID) - filtered_df$AlleleID <- sub("\\|Alt.*", "|Alt", filtered_df$AlleleID) + if(collapse_matches_counts){ + filtered_df <- madc_df[order(madc_df$AlleleID),] %>% + mutate(Type = ifelse(grepl("Alt", AlleleID), "Alt", "Ref")) %>% + group_by(CloneID, Type) %>% + summarise( + AlleleID = paste0(unique(CloneID), "|", unique(Type)), + AlleleSequence = first(AlleleSequence), + across(where(is.numeric), sum), + .groups = "drop" + ) %>% + select(AlleleID, CloneID, AlleleSequence, everything(), -Type) + } else { + #Retain only the Ref and Alt haplotypes + filtered_df <- madc_df[!grepl("\\|AltMatch|\\|RefMatch", madc_df$AlleleID), ] } + #Remove extra text after Ref and Alt (_001 or _002) + filtered_df$AlleleID <- sub("\\|Ref.*", "|Ref", filtered_df$AlleleID) + filtered_df$AlleleID <- sub("\\|Alt.*", "|Alt", filtered_df$AlleleID) + return(filtered_df) } diff --git a/R/madc2vcf_all.R b/R/madc2vcf_all.R index 1413c8e..13227d3 100644 --- a/R/madc2vcf_all.R +++ b/R/madc2vcf_all.R @@ -60,8 +60,11 @@ madc2vcf_all <- function(madc = NULL, multiallelic_SNP_sample_thr = 0, alignment_score_thr = 40, out_vcf = NULL, + markers_info = NULL, verbose = TRUE){ + vmsg("Checking inputs", verbose = verbose, level = 0, type = ">>") + # Input checks if(!is.null(madc) & !file.exists(madc)) stop("MADC file not found. Please provide a valid path.") if(!is.null(botloci_file) & !file.exists(botloci_file)) stop("Botloci file not found. Please provide a valid path.") @@ -85,8 +88,8 @@ madc2vcf_all <- function(madc = NULL, bigr_meta <- paste0('##BIGrCommandLine.madc2vcf_all=') - if(!is.null(madc)) report <- read.csv(madc, check.names = FALSE) else stop("Please provide a MADC file") + report <- read.csv(madc, check.names = FALSE) + checks <- check_madc_sanity(report) + + messages_results <- mapply(function(check, message) { + if (check) message[1] else message[2] + }, checks$checks, checks$messages) + + if(any(!(checks$checks[c("Columns", "FixAlleleIDs")]))){ + idx <- which(!(checks$checks[c("Columns", "FixAlleleIDs")])) + stop(paste("The MADC file does not pass the sanity checks:\n", + paste(messages_results[c("Columns", "FixAlleleIDs")[idx]], collapse = "\n"))) + } + + if(any(checks$checks[c("IUPACcodes", "LowerCase", "Indels")])){ + idx <- which((checks$checks[c("IUPACcodes", "LowerCase", "Indels")])) + if(is.null(markers_info)) stop(paste(messages_results[c("IUPACcodes", "LowerCase", "Indels")[idx]], collapse = "\n")) + } + if(!is.null(botloci_file)) botloci <- read.csv(botloci_file, header = F) else stop("Please provide a botloci file") if(!is.null(hap_seq_file)) hap_seq <- read.table(hap_seq_file, header = F) else hap_seq <- NULL diff --git a/R/madc2vcf_targets.R b/R/madc2vcf_targets.R index fc022f9..92bb8e6 100644 --- a/R/madc2vcf_targets.R +++ b/R/madc2vcf_targets.R @@ -44,13 +44,15 @@ #' designed on the **bottom** strand (one ID per line). Required when #' `get_REF_ALT = TRUE` and `markers_info` is not provided. #' @param markers_info character or `NULL`. Optional path to a CSV providing target -#' metadata. Required columns: `BI_markerID, Chr, Pos, Ref, Alt`. If indels are -#' present, also require `Type, Indel_pos`. When supplied, these values populate -#' `#CHROM, POS, REF, ALT` in the VCF directly. +#' metadata. Required columns: `CloneID, Chr, Pos, Ref, Alt`. This file is required in +#' case your MADC CloneID column doesn't have the format CHR_POS. If indels are +#' present, columns `Type, Indel_pos` are also required. #' @param get_REF_ALT logical (default `FALSE`). If `TRUE`, attempts to infer REF/ALT #' bases from the Ref/Alt probe sequences in the MADC file (with strand correction #' using `botloci_file`). Targets with more than one difference between Ref/Alt #' sequences are removed. +#' @param collapse_matches_counts logical (default `FALSE`). If `TRUE`, counts for targets with identical `CHROM_POS` are summed together. This is useful when the MADC file contains multiple rows per target (e.g., due to multiple alleles or technical replicates) and you want to aggregate them into a single entry per unique target. +#' @param verbose logical (default `FALSE`). If `TRUE`, prints detailed messages about #' #' @return (Invisibly) returns the path to `output.file`. The side effect is a #' **VCF v4.3** written to disk containing one row per target and columns for all @@ -96,7 +98,31 @@ madc2vcf_targets <- function(madc_file, output.file, botloci_file, markers_info = NULL, - get_REF_ALT = FALSE) { + get_REF_ALT = FALSE, + collapse_matches_counts = FALSE, + verbose = FALSE) { + + vmsg("Checking inputs", verbose = verbose, level = 0, type = ">>") + + # Input checks + if(!file.exists(madc_file)) stop("The MADC file does not exist.") + if(!is.character(output.file)) stop("output.file must be a character string.") + if(get_REF_ALT && is.null(botloci_file)) stop("Please provide the botloci file to recover the reference and alternative bases.") + if(get_REF_ALT && !file.exists(botloci_file)) stop("The botloci file does not exist.") + if(!is.null(markers_info) && !file.exists(markers_info)) stop("The markers_info file does not exist.") + if(!is.null(markers_info) && !is.character(markers_info)) stop("markers_info must be a character string or NULL.") + if(!is.logical(get_REF_ALT)) stop("get_REF_ALT must be a logical value (TRUE or FALSE).") + if(!is.logical(verbose)) stop("verbose must be a logical value (TRUE or FALSE).") + + # Create a VCF header line with metadata about the command and its parameters + bigr_meta <- paste0('##BIGrCommandLine.madc2vcf_targets=') # MADC checks report <- read.csv(madc_file) @@ -106,6 +132,9 @@ madc2vcf_targets <- function(madc_file, if (check) message[1] else message[2] }, checks$checks, checks$messages) + for(i in 1:length(messages_results)) + vmsg(messages_results[i], verbose = verbose, level = 1, type = ">>") + if(any(!(checks$checks[c("Columns", "FixAlleleIDs")]))){ idx <- which(!(checks$checks[c("Columns", "FixAlleleIDs")])) stop(paste("The MADC file does not pass the sanity checks:\n", @@ -117,7 +146,19 @@ madc2vcf_targets <- function(madc_file, if(is.null(markers_info)) stop(paste(messages_results[c("IUPACcodes", "LowerCase", "Indels")[idx]], collapse = "\n")) } - matrices <- get_countsMADC(madc_file) + # Check marker names compatibility between MADC and botloci + if(!is.null(botloci_file)){ + botloci <- read.csv(botloci_file, header = F) + checked_botloci <- check_botloci(botloci, report) + botloci <- checked_botloci[[1]] + report <- checked_botloci[[2]] + } + + vmsg("Input checks done", verbose = verbose, level = 1, type = ">>") + + vmsg("Extracting depth information", verbose = verbose, level = 0, type = ">>") + + matrices <- get_countsMADC(madc_object = report, collapse_matches_counts = collapse_matches_counts) ref_df <- data.frame(matrices[[1]], check.names = FALSE) alt_df <- data.frame(matrices[[2]]-matrices[[1]], check.names = FALSE) size_df <- data.frame(matrices[[2]], check.names = FALSE) @@ -148,8 +189,7 @@ madc2vcf_targets <- function(madc_file, # Get REF and ALT if(get_REF_ALT){ - if(is.null(botloci_file)) stop("Please provide the botloci file to recover the reference and alternative bases.") - csv <- get_counts(madc_file) + csv <- get_counts(madc_object = report, collapse_matches_counts = collapse_matches_counts) # Keep only the ones that have alt and ref csv <- csv[which(csv$CloneID %in% rownames(ad_df)),] @@ -261,7 +301,8 @@ madc2vcf_targets <- function(madc_file, '##INFO=', '##FORMAT=', '##FORMAT=', - '##FORMAT=' + '##FORMAT=', + bigr_meta ) #Make the header#Make the VCF df diff --git a/R/utils.R b/R/utils.R index 2399560..92fac46 100644 --- a/R/utils.R +++ b/R/utils.R @@ -43,7 +43,7 @@ convert_to_dosage <- function(gt) { #' @noRd check_botloci <- function(botloci, report, verbose=TRUE){ if(!any(botloci$V1 %in% report$CloneID)) { - if(verbose) message("None of the botloci markers could be found in the MADC file. Checking padding match...\n") + vmsg("No botloci markers found in MADC file. Checking for padding mismatch...", verbose = verbose, level = 1, type = ">>") pad_madc <- unique(nchar(sub(".*_", "", report$CloneID))) pad_botloci <- unique(nchar(sub(".*_", "", botloci$V1))) @@ -51,7 +51,7 @@ check_botloci <- function(botloci, report, verbose=TRUE){ if(length(pad_madc) > 1 | length(pad_botloci) > 1) stop("Check marker IDs in both MADC and botloci files. They should be the same.") if(pad_madc != pad_botloci) { - if(verbose) message("Padding between MADC and botloci files do not match. Markers ID modified to match longest padding.\n") + vmsg("Padding between MADC and botloci files do not match. Markers ID modified to match longest padding.", verbose = verbose, level = 1, type = ">>") if (pad_madc < pad_botloci) { report$CloneID <- paste0(sub("_(.*)", "", report$CloneID), "_", sprintf(paste0("%0", pad_botloci, "d"), as.integer(sub(".*_", "", report$CloneID))) @@ -69,3 +69,46 @@ check_botloci <- function(botloci, report, verbose=TRUE){ } return(list(botloci, report)) } + +##' Verbose Message Utility +##' +##' Prints a formatted verbose message with timestamp, indentation, and type label, if verbose is TRUE. +##' +##' @param text Character string, the message to print (supports sprintf formatting). +##' @param verbose Logical. If TRUE, prints the message; if FALSE, suppresses output. +##' @param level Integer, indentation level (0=header, 1=main step, 2=detail, 3=sub-detail). +##' @param type Character string, message type (e.g., "INFO", "WARN", "ERROR"). Only shown for level 0. +##' @param ... Additional arguments passed to sprintf for formatting. +##' +##' @details Use the verbose argument to control message output. Typically, pass the function's verbose parameter to vmsg. +##' +##' @return No return value, called for side effects. +##' @export +vmsg <- function(text, verbose = FALSE, level = 1, type = ">>", ...) { + if (!verbose) return(invisible()) + # Format timestamp + timestamp <- format(Sys.time(), "[%H:%M:%S]") + + # Create indentation based on level + indent <- switch(as.character(level), + "0" = "", # Section headers + "1" = " ∙ ", # Main steps (medium bullet) + "2" = " - ", # Details + "3" = " > ", # Sub-details + paste0(paste(rep(" ", level), collapse = ""), "• ") # Fallback for level > 3 + ) + + # Format type label (only show for level 0) + type_label <- if (level == 0) sprintf("%-1s ", type) else "" + + # Format message text + dots <- list(...) + if(length(dots) == 0) { + msg_text <- text + } else { + msg_text <- sprintf(text, ...) + } + # Combine everything + formatted_msg <- sprintf("%s %s%s%s", timestamp, type_label, indent, msg_text) + message(formatted_msg) +} diff --git a/man/filterVCF.Rd b/man/filterVCF.Rd index 676ef7f..39d7264 100644 --- a/man/filterVCF.Rd +++ b/man/filterVCF.Rd @@ -6,6 +6,7 @@ \usage{ filterVCF( vcf.file, + quality.rates = F, filter.OD = NULL, filter.BIAS.min = NULL, filter.BIAS.max = NULL, @@ -58,17 +59,11 @@ The VCF format is v4.3 \examples{ ## Use file paths for each file on the local system -#Temp location (only for example) -output_file <- tempfile() -filterVCF(vcf.file = system.file("iris_DArT_VCF.vcf.gz", package = "BIGr"), - filter.OD = 0.5, - filter.MAF = 0.05, - ploidy = 2, - output.file = output_file) - -# Removing the output for the example -rm(output_file) +#filterVCF(vcf.file = "example_dart_Dosage_Report.csv", + # filter.OD = 0.5, + # ploidy = 2, + # output.file = "name_for_vcf") ##The function will output the filtered VCF to the current working directory diff --git a/man/get_counts.Rd b/man/get_counts.Rd new file mode 100644 index 0000000..1879e07 --- /dev/null +++ b/man/get_counts.Rd @@ -0,0 +1,60 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_countsMADC.R +\name{get_counts} +\alias{get_counts} +\title{Read and Pre-process a MADC File} +\usage{ +get_counts( + madc_file = NULL, + madc_object = NULL, + collapse_matches_counts = FALSE, + verbose = TRUE +) +} +\arguments{ +\item{madc_file}{character or \code{NULL}. Path to the input MADC CSV file. +At least one of \code{madc_file} or \code{madc_object} must be provided.} + +\item{madc_object}{data frame or \code{NULL}. A pre-read MADC data frame +(e.g., from \code{check_botloci()}). When supplied, file reading is skipped. +At least one of \code{madc_file} or \code{madc_object} must be provided.} + +\item{collapse_matches_counts}{logical. If \code{TRUE}, counts for \verb{|AltMatch} +and \verb{|RefMatch} rows are summed into their corresponding \verb{|Ref} and \verb{|Alt} +rows. If \code{FALSE} (default), those rows are discarded.} + +\item{verbose}{logical. Whether to print progress messages. Default is \code{TRUE}.} +} +\value{ +A data frame with one row per \code{Ref} or \code{Alt} allele entry, retaining +all original columns (\code{AlleleID}, \code{CloneID}, \code{AlleleSequence}, sample +count columns, etc.). +} +\description{ +Reads a DArTag MADC CSV file (or accepts a pre-read data frame), detects the +file format, and returns a filtered data frame containing only \code{Ref} and \code{Alt} +haplotype rows ready for count-matrix construction. +} +\details{ +\strong{Input}: either \code{madc_file} (path to CSV) or \code{madc_object} (pre-read data +frame) must be supplied; at least one is required. + +\strong{Format detection} (applied to file or object alike): the first seven rows +of the first column are inspected: +\itemize{ +\item \strong{Standard format}: all entries are blank or \code{"*"} — the first 7 rows are +treated as DArT placeholder rows and skipped. +\item \strong{Fixed-allele-ID format}: no filler rows — data are used as-is. +} + +\strong{\verb{|AltMatch} / \verb{|RefMatch} handling} (controlled by \code{collapse_matches_counts}): +\itemize{ +\item \code{FALSE} (default): these rows are simply discarded. +\item \code{TRUE}: their counts are summed into the corresponding \verb{|Ref} or \verb{|Alt} +row for the same \code{CloneID}. +} + +In all cases, trailing suffixes on \code{AlleleID} (e.g., \verb{|Ref_001}, \verb{|Alt_002}) +are stripped to the canonical \verb{|Ref} / \verb{|Alt} form. +} +\keyword{internal} diff --git a/man/get_countsMADC.Rd b/man/get_countsMADC.Rd index 66c5708..28fca1e 100644 --- a/man/get_countsMADC.Rd +++ b/man/get_countsMADC.Rd @@ -4,18 +4,53 @@ \alias{get_countsMADC} \title{Obtain Read Counts from MADC File} \usage{ -get_countsMADC(madc_file) +get_countsMADC( + madc_file = NULL, + madc_object = NULL, + collapse_matches_counts = FALSE, + verbose = TRUE +) } \arguments{ -\item{madc_file}{Path to MADC file} +\item{madc_file}{character or \code{NULL}. Path to the input MADC CSV file. +At least one of \code{madc_file} or \code{madc_object} must be provided.} + +\item{madc_object}{data frame or \code{NULL}. A pre-read MADC data frame +(e.g., as returned by \code{check_botloci()}). When supplied, file reading is +skipped. At least one of \code{madc_file} or \code{madc_object} must be provided.} + +\item{collapse_matches_counts}{logical. If \code{TRUE}, counts for \verb{|AltMatch} +and \verb{|RefMatch} rows are summed into their corresponding \verb{|Ref} and \verb{|Alt} +rows. If \code{FALSE} (default), \verb{|AltMatch} and \verb{|RefMatch} rows are discarded.} + +\item{verbose}{logical. Whether to print progress messages. Default is \code{TRUE}.} } \value{ -A list of read count matrices for reference, alternate, and total read count values +A named list with two numeric matrices, both with markers as rows +and samples as columns: +\describe{ +\item{\code{ref_matrix}}{Reference allele read counts.} +\item{\code{size_matrix}}{Total read counts (reference + alternative).} +} } \description{ -This function takes the MADC file as input and retrieves the ref and alt counts for each sample, -and converts them to ref, alt, and size(total count) matrices for dosage calling tools. At the moment, -only the read counts for the Ref and Alt target loci are obtained while the additional loci are ignored. +Reads a DArTag MADC report and returns reference and total read count matrices +per marker and sample. Only \code{Ref} and \code{Alt} target loci are retained; +\verb{|AltMatch} / \verb{|RefMatch} rows are either discarded or collapsed depending on +\code{collapse_matches_counts}. +} +\details{ +Either \code{madc_file} or \code{madc_object} must be provided (not both \code{NULL}). +When \code{madc_object} is supplied it is passed directly to \code{get_counts()}, +skipping file I/O. The function constructs: +\itemize{ +\item \code{ref_matrix} — per-sample reference allele counts. +\item \code{size_matrix} — per-sample total counts (ref + alt). +} + +Markers whose \code{CloneID} appears only in the \code{Ref} or only in the \code{Alt} rows +are removed with a warning. A summary of the proportion of zero-count +data points (missing data) is reported via \code{vmsg()}. } \examples{ # Get the path to the MADC file @@ -24,11 +59,13 @@ madc_path <- system.file("iris_DArT_MADC.csv", package = "BIGr") # Extract the read count matrices counts_matrices <- get_countsMADC(madc_path) -# Access the reference, alternate, and size matrices - -# ref_matrix <- counts_matrices$ref_matrix -# alt_matrix <- counts_matrices$alt_matrix +# Access the reference and size matrices +# ref_matrix <- counts_matrices$ref_matrix # size_matrix <- counts_matrices$size_matrix rm(counts_matrices) + +} +\seealso{ +\code{\link[=get_counts]{get_counts()}}, \code{\link[=check_madc_sanity]{check_madc_sanity()}} } diff --git a/man/imputation_concordance.Rd b/man/imputation_concordance.Rd index 0c06134..dc1cc65 100644 --- a/man/imputation_concordance.Rd +++ b/man/imputation_concordance.Rd @@ -9,55 +9,68 @@ imputation_concordance( imputed_genos, missing_code = NULL, snps_2_exclude = NULL, - verbose = FALSE + verbose = FALSE, + plot = FALSE, + print_result = TRUE ) } \arguments{ -\item{reference_genos}{A data frame containing reference genotype data, with rows as samples and columns as markers. Dosage format (0, 1, 2) is recommended.} +\item{reference_genos}{A data frame containing reference genotype data, +with rows as samples and columns as markers. Must include a column named \code{ID}.} -\item{imputed_genos}{A data frame containing imputed genotype data, with rows as samples and columns as markers. Dosage format (0, 1, 2) is recommended.} +\item{imputed_genos}{A data frame containing imputed genotype data, +with rows as samples and columns as markers. Must include a column named \code{ID}.} -\item{missing_code}{An optional value to specify missing data. If provided, loci with this value in either dataset will be excluded from the concordance calculation.} +\item{missing_code}{Optional value specifying missing data. If provided, +loci with this value in either dataset will be excluded from the concordance calculation.} -\item{snps_2_exclude}{An optional vector of marker IDs to exclude from the concordance calculation.} +\item{snps_2_exclude}{Optional vector of marker IDs to exclude from the concordance calculation.} -\item{verbose}{A logical value indicating whether to print a summary of the concordance results. Default is FALSE.} +\item{verbose}{Logical. If \code{TRUE}, prints summary statistics (minimum, quartiles, +median, mean, maximum) of concordance percentages.} + +\item{plot}{Logical. If \code{TRUE}, produces a bar plot of concordance percentage +by sample.} + +\item{print_result}{Logical. If \code{TRUE} (default), prints the concordance +results data frame to the console. If \code{FALSE}, results are returned invisibly.} } \value{ -A list with two elements: +A data frame with: \itemize{ -\item \code{result_df}: A data frame with sample IDs and their concordance percentages. -\item \code{summary_concordance}: A summary of concordance percentages, including minimum, maximum, mean, and quartiles. +\item \code{ID}: Sample identifiers shared between the datasets. +\item \code{Concordance}: Percentage of matching genotypes per sample. } +If \code{print_result = FALSE}, the data frame is returned invisibly. } \description{ -This function calculates the concordance between imputed and reference genotypes. It assumes that samples are rows and markers are columns. -It is recommended to use allele dosages (0, 1, 2) but will work with other formats. Missing data in reference or imputed genotypes -will not be considered for concordance if the \code{missing_code} argument is used. If a specific subset of markers should be excluded, -it can be provided using the \code{snps_2_exclude} argument. +This function calculates the concordance between imputed and reference +genotypes. It assumes that samples are rows and markers are columns. +Allele dosages (0, 1, 2) are recommended but other numeric formats are supported. +Missing data in either dataset can be excluded from the concordance calculation +using the \code{missing_code} argument. Specific markers can be excluded using +the \code{snps_2_exclude} argument. } \details{ -The function identifies common samples and markers between the reference and imputed genotype datasets. It calculates the percentage of matching genotypes for each sample, excluding missing data and specified markers. The concordance is reported as a percentage for each sample, along with a summary of the overall concordance distribution. +The function: +\enumerate{ +\item Identifies common samples and markers between the datasets. +\item Optionally excludes specified SNPs. +\item Removes loci with missing data (if \code{missing_code} is provided). +\item Computes per-sample concordance as the percentage of matching genotypes. } -\examples{ - -# Example Input variables -ignore_file <- system.file("imputation_ignore.txt", package="BIGr") -ref_file <- system.file("imputation_reference.txt", package="BIGr") -test_file <- system.file("imputation_test.txt", package="BIGr") - -# Import files -snps = read.table(ignore_file, header = TRUE) -ref = read.table(ref_file, header = TRUE) -test = read.table(test_file, header = TRUE) - -#Calculations -result <- imputation_concordance(reference_genos = ref, - imputed_genos = test, - snps_2_exclude = snps, - missing_code = 5, - verbose = FALSE) - +When \code{plot = TRUE}, a bar plot showing concordance percentage per sample +is generated using \pkg{ggplot2}. +} +\examples{ +result <- imputation_concordance( + reference_genos = ref, + imputed_genos = test, + snps_2_exclude = snps, + missing_code = 5, + verbose = TRUE, + plot = TRUE +) } diff --git a/man/madc2vcf_all.Rd b/man/madc2vcf_all.Rd index 6fe7f11..58cfcb3 100644 --- a/man/madc2vcf_all.Rd +++ b/man/madc2vcf_all.Rd @@ -14,6 +14,7 @@ madc2vcf_all( multiallelic_SNP_sample_thr = 0, alignment_score_thr = 40, out_vcf = NULL, + markers_info = NULL, verbose = TRUE ) } diff --git a/man/madc2vcf_targets.Rd b/man/madc2vcf_targets.Rd index fad847c..5d62c87 100644 --- a/man/madc2vcf_targets.Rd +++ b/man/madc2vcf_targets.Rd @@ -9,7 +9,9 @@ madc2vcf_targets( output.file, botloci_file, markers_info = NULL, - get_REF_ALT = FALSE + get_REF_ALT = FALSE, + collapse_matches_counts = FALSE, + verbose = FALSE ) } \arguments{ @@ -22,14 +24,18 @@ designed on the \strong{bottom} strand (one ID per line). Required when \code{get_REF_ALT = TRUE} and \code{markers_info} is not provided.} \item{markers_info}{character or \code{NULL}. Optional path to a CSV providing target -metadata. Required columns: \verb{BI_markerID, Chr, Pos, Ref, Alt}. If indels are -present, also require \verb{Type, Indel_pos}. When supplied, these values populate -\verb{#CHROM, POS, REF, ALT} in the VCF directly.} +metadata. Required columns: \verb{CloneID, Chr, Pos, Ref, Alt}. This file is required in +case your MADC CloneID column doesn't have the format CHR_POS. If indels are +present, columns \verb{Type, Indel_pos} are also required.} \item{get_REF_ALT}{logical (default \code{FALSE}). If \code{TRUE}, attempts to infer REF/ALT bases from the Ref/Alt probe sequences in the MADC file (with strand correction using \code{botloci_file}). Targets with more than one difference between Ref/Alt sequences are removed.} + +\item{collapse_matches_counts}{logical (default \code{FALSE}). If \code{TRUE}, counts for targets with identical \code{CHROM_POS} are summed together. This is useful when the MADC file contains multiple rows per target (e.g., due to multiple alleles or technical replicates) and you want to aggregate them into a single entry per unique target.} + +\item{verbose}{logical (default \code{FALSE}). If \code{TRUE}, prints detailed messages about} } \value{ (Invisibly) returns the path to \code{output.file}. The side effect is a diff --git a/man/vmsg.Rd b/man/vmsg.Rd new file mode 100644 index 0000000..abcc768 --- /dev/null +++ b/man/vmsg.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{vmsg} +\alias{vmsg} +\title{Verbose Message Utility} +\usage{ +vmsg(text, verbose = FALSE, level = 1, type = ">>", ...) +} +\arguments{ +\item{text}{Character string, the message to print (supports sprintf formatting).} + +\item{verbose}{Logical. If TRUE, prints the message; if FALSE, suppresses output.} + +\item{level}{Integer, indentation level (0=header, 1=main step, 2=detail, 3=sub-detail).} + +\item{type}{Character string, message type (e.g., "INFO", "WARN", "ERROR"). Only shown for level 0.} + +\item{...}{Additional arguments passed to sprintf for formatting.} +} +\value{ +No return value, called for side effects. +} +\description{ +Prints a formatted verbose message with timestamp, indentation, and type label, if verbose is TRUE. +} +\details{ +Use the verbose argument to control message output. Typically, pass the function's verbose parameter to vmsg. +} From 9afb26545efa2bdabcef702e0673f7b7e50a7961 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Fri, 13 Mar 2026 21:31:26 -0400 Subject: [PATCH 12/37] messages ok --- R/madc2vcf_targets.R | 64 +++++++++++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 15 deletions(-) diff --git a/R/madc2vcf_targets.R b/R/madc2vcf_targets.R index 92bb8e6..d595ae5 100644 --- a/R/madc2vcf_targets.R +++ b/R/madc2vcf_targets.R @@ -100,7 +100,7 @@ madc2vcf_targets <- function(madc_file, markers_info = NULL, get_REF_ALT = FALSE, collapse_matches_counts = FALSE, - verbose = FALSE) { + verbose = TRUE) { vmsg("Checking inputs", verbose = verbose, level = 0, type = ">>") @@ -132,7 +132,7 @@ madc2vcf_targets <- function(madc_file, if (check) message[1] else message[2] }, checks$checks, checks$messages) - for(i in 1:length(messages_results)) + for(i in seq_along(messages_results)) vmsg(messages_results[i], verbose = verbose, level = 1, type = ">>") if(any(!(checks$checks[c("Columns", "FixAlleleIDs")]))){ @@ -143,7 +143,9 @@ madc2vcf_targets <- function(madc_file, if(any(checks$checks[c("IUPACcodes", "LowerCase", "Indels")])){ idx <- which((checks$checks[c("IUPACcodes", "LowerCase", "Indels")])) - if(is.null(markers_info)) stop(paste(messages_results[c("IUPACcodes", "LowerCase", "Indels")[idx]], collapse = "\n")) + if(is.null(markers_info)) stop("Please provide a markers_info file to proceed. The MADC file does not pass the sanity checks:\n", + paste(messages_results[c("IUPACcodes", "LowerCase", "Indels")[idx]], collapse = "\n")) + else vmsg("MADC file has some issues (IUPAC codes, lowercase bases, indels), but a markers_info file is provided, so proceeding with VCF generation.", verbose = verbose, level = 1, type = ">>") } # Check marker names compatibility between MADC and botloci @@ -158,7 +160,7 @@ madc2vcf_targets <- function(madc_file, vmsg("Extracting depth information", verbose = verbose, level = 0, type = ">>") - matrices <- get_countsMADC(madc_object = report, collapse_matches_counts = collapse_matches_counts) + matrices <- get_countsMADC(madc_object = report, collapse_matches_counts = collapse_matches_counts, verbose = verbose) ref_df <- data.frame(matrices[[1]], check.names = FALSE) alt_df <- data.frame(matrices[[2]]-matrices[[1]], check.names = FALSE) size_df <- data.frame(matrices[[2]], check.names = FALSE) @@ -172,8 +174,11 @@ madc2vcf_targets <- function(madc_file, ) row.names(ad_df) <- row.names(ref_df) + vmsg("Depth information extracted", verbose = verbose, level = 1, type = ">>") + #Obtaining Chr and Pos information from the row_names if(is.null(markers_info)){ + vmsg("No markers_info file provided. Attempting to recover CHROM and POS from CloneID...", verbose = verbose, level = 0, type = ">>") new_df <- size_df %>% rownames_to_column(var = "row_name") %>% separate(row_name, into = c("CHROM", "POS"), sep = "_") %>% @@ -187,9 +192,11 @@ madc2vcf_targets <- function(madc_file, new_df$TotalAlt <- rowSums(alt_df) new_df$TotalSize <- rowSums(size_df) + vmsg("CHROM and POS recovered from CloneID", verbose = verbose, level = 1, type = ">>") # Get REF and ALT if(get_REF_ALT){ - csv <- get_counts(madc_object = report, collapse_matches_counts = collapse_matches_counts) + vmsg("get_REF_ALT = TRUE. Attempting to recover REF and ALT bases from probe sequences...", verbose = verbose, level = 0, type = ">>") + csv <- get_counts(madc_object = report, collapse_matches_counts = collapse_matches_counts, verbose = FALSE) # Keep only the ones that have alt and ref csv <- csv[which(csv$CloneID %in% rownames(ad_df)),] @@ -225,8 +232,9 @@ madc2vcf_targets <- function(madc_file, orig_alt_seq <- orig_alt_seq[order(alt_ord)] ordered_clone_ids <- sort(ref_ord) + more_poly <- no_diff <- 0 ref_base <- alt_base <- vector() - for(i in 1:length(orig_ref_seq)){ + for(i in seq_along(orig_ref_seq)){ # FIXED: Use original sequences for SNP calling temp_list <- strsplit(c(orig_ref_seq[i], orig_alt_seq[i]), "") idx_diff <- which(temp_list[[1]] != temp_list[[2]]) @@ -234,6 +242,7 @@ madc2vcf_targets <- function(madc_file, if(length(idx_diff) > 1) { # If finds more than one polymorphism between Ref and Alt sequences ref_base[i] <- NA alt_base[i] <- NA + more_poly <- more_poly + 1 } else if(length(idx_diff) == 1) { orig_ref_base <- temp_list[[1]][idx_diff] orig_alt_base <- temp_list[[2]][idx_diff] @@ -250,32 +259,50 @@ madc2vcf_targets <- function(madc_file, # No differences found ref_base[i] <- NA alt_base[i] <- NA + no_diff <- no_diff + 1 } } + if(more_poly > 0) vmsg(paste(more_poly, "markers removed because more than one polymorphism was found between Ref and Alt sequences"), verbose = verbose, level = 2, type = ">>") + if(no_diff > 0) vmsg(paste(no_diff, "markers removed because no differences were found between Ref and Alt sequences"), verbose = verbose, level = 2, type = ">>") + } else { - warning("There are missing reference or alternative sequence, the SNP bases could not be recovery.") ref_base <- "." alt_base <- "." + vmsg(paste("REF and ALT bases could not be recovered because of missing reference or alternative sequences"), verbose = verbose, level = 1, type = ">>") } - } else { ref_base <- "." alt_base <- "." + vmsg(paste("REF and ALT bases not recovered because get_REF_ALT = FALSE"), verbose = verbose, level = 1, type = ">>") } } else { + vmsg("markers_info file provided. Using CHROM, POS, REF and ALT from the file.", verbose = verbose, level = 0, type = ">>") # Verify markers_info file df <- read.csv(markers_info) + + # Accept either CloneID or BI_markerID as the marker ID column + if ("BI_markerID" %in% colnames(df)) { + id_col <- "BI_markerID" + } else if ("CloneID" %in% colnames(df)) { + id_col <- "CloneID" + } else { + stop("The markers_info file must contain a marker ID column named either 'CloneID' or 'BI_markerID'.") + } + if(checks$checks["Indels"]){ - if(!all(c("BI_markerID","Chr","Pos","Ref","Alt","Type", "Indel_pos") %in% colnames(df))) - stop("The markers_info dataframe must contain the following columns: BI_markerID, CHROM, POS, REF, ALT, Type, Indel_pos") + vmsg("Indels detected in MADC file. Checking for required columns in markers_info...", verbose = verbose, level = 1, type = ">>") + if(!all(c(id_col,"Chr","Pos","Ref","Alt","Type", "Indel_pos") %in% colnames(df))) + stop(paste0("The markers_info dataframe must contain the following columns: ", id_col, ", Chr, Pos, Ref, Alt, Type, Indel_pos")) } - if(!all(c("BI_markerID","Chr","Pos","Ref","Alt") %in% colnames(df))) - stop("The markers_info dataframe must contain the following columns: BI_markerID, CHROM, POS, REF, ALT") + if(!all(c(id_col,"Chr","Pos","Ref","Alt") %in% colnames(df))) + stop(paste0("The markers_info dataframe must contain the following columns: ", id_col, ", Chr, Pos, Ref, Alt")) - if(!all(rownames(ad_df)%in% df$BI_markerID)) + if(!all(rownames(ad_df) %in% df[[id_col]])){ + miss_CloneIDs <- rownames(ad_df)[!rownames(ad_df) %in% df[[id_col]]] + vmsg(paste("Not all MADC CloneID was found in the markers_info file. These markers will be removed:", paste(miss_CloneIDs, collapse = " ")), verbose = verbose, level = 2, type = ">>") warning("Not all MADC CloneID was found in the markers_info file. These markers will be removed.") - - matched <- df[match(rownames(ad_df), df$BI_markerID),] + } + matched <- df[match(rownames(ad_df), df[[id_col]]),] new_df <- data.frame( CHROM = matched$Chr, @@ -291,6 +318,9 @@ madc2vcf_targets <- function(madc_file, alt_base <- matched$Alt } + vmsg("CHROM, POS, REF and ALT columns prepared", verbose = verbose, level = 1, type = ">>") + + vmsg("Preparing VCF dataframe", verbose = verbose, level = 0, type = ">>") #Make a header separate from the dataframe vcf_header <- c( "##fileformat=VCFv4.3", @@ -351,6 +381,7 @@ madc2vcf_targets <- function(madc_file, rownames(combined_wide) <- combined_wide$Row combined_wide$Row <- NULL colnames(combined_wide) <- colnames(matrices[[1]]) + vmsg("Sample columns formatted for VCF", verbose = verbose, level = 1, type = ">>") return(as.matrix(combined_wide)) } @@ -360,6 +391,7 @@ madc2vcf_targets <- function(madc_file, #Combine the dataframes together vcf_df <- cbind(vcf_df,geno_df) + vmsg("VCF dataframe prepared", verbose = verbose, level = 1, type = ">>") # Add # to the CHROM column name colnames(vcf_df)[1] <- "#CHROM" @@ -368,6 +400,7 @@ madc2vcf_targets <- function(madc_file, vcf_df <- vcf_df[order(vcf_df[,1],as.numeric(as.character(vcf_df[,2]))),] if(sum(is.na(vcf_df$REF)) >1) { + vmsg(paste(sum(is.na(vcf_df$REF)), "markers removed because of presence of more than one polymorphism between ref and alt sequences."), verbose = verbose, level = 1, type = ">>") warning(paste("Markers removed because of presence of more than one polymorphism between ref and alt sequences:",sum(is.na(vcf_df$REF)))) vcf_df <- vcf_df[-which(is.na(vcf_df$REF)),] } @@ -379,4 +412,5 @@ madc2vcf_targets <- function(madc_file, suppressWarnings( write.table(vcf_df, file = output.file, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE, append = TRUE) ) + vmsg(paste("VCF file written to", output.file), verbose = verbose, level = 0, type = ">>") } From c31118d8a7336f31b580cadabecd31c80985578b Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Wed, 25 Mar 2026 15:29:49 -0400 Subject: [PATCH 13/37] targets okay --- R/check_madc_sanity.R | 205 +++++++-- R/get_countsMADC.R | 2 + R/madc2vcf_targets.R | 362 ++++++++------- R/utils.R | 52 +-- man/madc2vcf_targets.Rd | 22 +- tests/testthat/test-check_madc_sanity.R | 67 ++- tests/testthat/test-madc2vcf_targets.R | 588 +++++++++++++++++++++++- 7 files changed, 1018 insertions(+), 280 deletions(-) diff --git a/R/check_madc_sanity.R b/R/check_madc_sanity.R index 1455606..301cd74 100644 --- a/R/check_madc_sanity.R +++ b/R/check_madc_sanity.R @@ -1,41 +1,80 @@ #' Run basic sanity checks on a MADC-style allele report #' #' @description -#' Performs five quick validations on an allele report: -#' 1) **Columns** – required columns are present (`CloneID`, `AlleleID`, `AlleleSequence`); -#' 2) **FixAlleleIDs** – first column’s first up-to-6 rows are not all blank or "*" +#' Performs eight quick validations on an allele report: +#' 1) **Columns** - required columns are present (`CloneID`, `AlleleID`, `AlleleSequence`); +#' 2) **FixAlleleIDs** - first column's first up-to-6 rows are not all blank or `"*"` #' *and* both `_0001` and `_0002` appear in `AlleleID`; -#' 3) **IUPACcodes** – presence of non-ATCG characters in `AlleleSequence`; -#' 4) **LowerCase** – presence of lowercase a/t/c/g in `AlleleSequence`; -#' 5) **Indels** – reference/alternate allele lengths differ for the same `CloneID`. +#' 3) **IUPACcodes** - presence of non-ATCG characters in `AlleleSequence`; +#' 4) **LowerCase** - presence of lowercase a/t/c/g in `AlleleSequence`; +#' 5) **Indels** - reference/alternate allele lengths differ for the same `CloneID`, +#' or a `"-"` character is present in `AlleleSequence`; +#' 6) **ChromPos** - all `CloneID` values follow the `Chr_Position` format +#' (prefix matches `"chr"` case-insensitively, suffix is a positive integer); +#' 7) **allNAcol** - at least one column contains only `NA` values; +#' 8) **allNArow** - at least one row contains only `NA` values. #' #' @param report A `data.frame` with at least the columns #' `CloneID`, `AlleleID`, and `AlleleSequence`. The first column is also -#' used in the “FixAlleleIDs” check to inspect its first up to six entries. +#' used in the `FixAlleleIDs` check to inspect its first up to six entries. +#' If `FixAlleleIDs` is `FALSE` (raw DArT format), the first 7 rows are +#' treated as header filler and skipped before further checks are run. #' #' @details -#' - **IUPAC check:** Flags any character outside `ATCG` (case-insensitive), -#' which will include ambiguity codes (`N`, `R`, `Y`, etc.) and symbols like `-`. -#' - **Indels:** Rows are split by `AlleleID` containing `"Ref_0001"` vs `"Alt_0002"`, -#' merged by `CloneID`, and the lengths of `AlleleSequence` are compared. -#' - If required columns are missing, only **Columns** is evaluated (`FALSE`) and +#' - **FixAlleleIDs:** When the check fails (raw DArT format), row 7 is used as +#' the column header and the first 7 rows are dropped before subsequent checks. +#' - **IUPAC check:** Flags any character outside `A`, `T`, `C`, `G` and `"-"` +#' (case-insensitive), which includes ambiguity codes (`N`, `R`, `Y`, etc.). +#' - **Indels:** Rows are split by `AlleleID` containing `"Ref_0001"` vs +#' `"Alt_0002"`, merged by `CloneID`, and the lengths of `AlleleSequence` +#' are compared. A `"-"` anywhere in `AlleleSequence` is also treated as +#' evidence of an indel. +#' - **ChromPos:** Each `CloneID` is split on `"_"` into exactly two parts; the +#' first part must match `"Chr"` (case-insensitive) and the second must be a +#' positive integer. Returns `FALSE` when any `CloneID` is `NA`. +#' - **allNAcol / allNArow:** Detected via `apply()` over columns/rows +#' respectively; useful for flagging empty or corrupt entries. +#' - If required columns are missing (`Columns = FALSE`), only `Columns` and +#' `FixAlleleIDs` are evaluated; all other checks remain `NA` and #' `indel_clone_ids` is returned as `NULL`. #' -#' @return A list with: +#' @return A named list with three elements: #' \describe{ -#' \item{checks}{Named logical vector with entries -#' `Columns`, `FixAlleleIDs`, `IUPACcodes`, `LowerCase`, `Indels`.} -#' \item{indel_clone_ids}{Character vector of `CloneID`s where ref/alt lengths differ. -#' Returns `character(0)` if none, or `NULL` when required columns are missing.} +#' \item{checks}{Named logical vector with eight entries: +#' `Columns`, `FixAlleleIDs`, `IUPACcodes`, `LowerCase`, `Indels`, +#' `ChromPos`, `allNAcol`, `allNArow`. +#' `TRUE` means the condition was detected (or passed for `Columns` and +#' `FixAlleleIDs`); `NA` means the check was skipped.} +#' \item{messages}{Named list of length-2 character vectors, one per check. +#' Element `[[1]]` is the message when the check is `TRUE`, element `[[2]]` +#' when it is `FALSE`. Indexed by the same names as `checks`.} +#' \item{indel_clone_ids}{Character vector of `CloneID`s where ref/alt +#' lengths differ. Returns `character(0)` if none are found, or `NULL` +#' when required columns are missing.} #' } #' -#' #' @export check_madc_sanity <- function(report) { # Initialize - checks <- c(Columns = NA, FixAlleleIDs = NA, IUPACcodes = NA, LowerCase = NA, Indels = NA, ChromPos = NA) - messages <- list(Columns = NA, FixAlleleIDs = NA, IUPACcodes = NA, LowerCase = NA, Indels = NA, ChromPos = NA) + checks <- c(Columns = NA, FixAlleleIDs = NA, IUPACcodes = NA, LowerCase = NA, Indels = NA, ChromPos = NA, allNAcol = NA, allNArow = NA) + messages <- list(Columns = NA, FixAlleleIDs = NA, IUPACcodes = NA, LowerCase = NA, Indels = NA, ChromPos = NA, allNAcol = NA, allNArow = NA) + + # ---- FixAlleleIDs ---- + # Check if first up-to-6 entries in the *first column* are all "" or "*" + n <- nrow(report) + idx <- seq_len(min(6L, n)) + first_col_vals <- report[[1]][idx] + all_blank_or_star <- all(first_col_vals %in% c("", "*"), na.rm = TRUE) + # Also require that both _0001 and _0002 appear in AlleleID + has_0001 <- any(grepl("_0001", report$AlleleID, fixed = TRUE), na.rm = TRUE) + has_0002 <- any(grepl("_0002", report$AlleleID, fixed = TRUE), na.rm = TRUE) + checks["FixAlleleIDs"] <- (!all_blank_or_star) & has_0001 & has_0002 + + if(!checks["FixAlleleIDs"]){ + colnames(report) <- report[7,] + report <- report[-c(1:7),] + } # Validate required columns required <- c("CloneID", "AlleleID", "AlleleSequence") @@ -43,19 +82,8 @@ check_madc_sanity <- function(report) { checks["Columns"] <- length(missing_cols) == 0 if(checks[["Columns"]]){ - # ---- FixAlleleIDs ---- - # Check if first up-to-6 entries in the *first column* are all "" or "*" - n <- nrow(report) - idx <- seq_len(min(6L, n)) - first_col_vals <- report[[1]][idx] - all_blank_or_star <- all(first_col_vals %in% c("", "*"), na.rm = TRUE) - # Also require that both _0001 and _0002 appear in AlleleID - has_0001 <- any(grepl("_0001", report$AlleleID, fixed = TRUE), na.rm = TRUE) - has_0002 <- any(grepl("_0002", report$AlleleID, fixed = TRUE), na.rm = TRUE) - checks["FixAlleleIDs"] <- (!all_blank_or_star) & has_0001 & has_0002 - # ---- IUPACcodes ---- - iu <- grepl("[^ATCG]", report$AlleleSequence, ignore.case = TRUE) + iu <- grepl("[^ATCG-]", report$AlleleSequence, ignore.case = TRUE) checks["IUPACcodes"] <- any(iu, na.rm = TRUE) # ---- LowerCase ---- @@ -75,31 +103,126 @@ check_madc_sanity <- function(report) { alt_len <- nchar(merged$AlleleSequence_alt, keepNA = TRUE) cmp_ok <- !is.na(ref_len) & !is.na(alt_len) indel_mask <- cmp_ok & (ref_len != alt_len) - checks["Indels"] <- any(indel_mask) + checks["Indels"] <- any(indel_mask) | any(grepl("-", report$AlleleSequence)) indels <- if (any(indel_mask)) merged$CloneID[indel_mask] else character(0) } else { checks["Indels"] <- FALSE indels <- character(0) } + # --- All NA ---- + checks["allNArow"] <- any(apply(report, 1, function(x) all(is.na(x)))) + checks["allNAcol"] <- any(apply(report, 2, function(x) all(is.na(x)))) + # ---- Chrom Pos ---- - pos <- strsplit(report[,2], "_") - checks["ChromPos"] <- all(sapply(pos, length) == 2) + if(!any(is.na(report$CloneID))) { + pos <- strsplit(report$CloneID, "_") + format <- all(sapply(pos, length) == 2) + first <- all(grepl("Chr", sapply(pos, "[", 1), ignore.case = TRUE)) + second <- suppressWarnings(all(sapply(pos, function(x) as.numeric(x[2])) > 0)) + checks["ChromPos"] <- all(format, first, second) + } else checks["ChromPos"] <- FALSE } else indels <- NULL messages[["Columns"]] <- c("Required columns are present", - "One or more required columns missing. Verify if your file has columns: CloneID, AlleleID, AlleleSequence") + "One or more required columns missing. Verify if your file has columns: CloneID, AlleleID, AlleleSequence") messages[["FixAlleleIDs"]] <- c("Fixed Allele IDs look good", - "MADC not processed by BI. Please contact us to assign allele IDs to your MADC according to the specie haplotype dabatase. This guarantee reproducibility between diferent datasets") + "MADC not processed by HapApp. Please, run the MADC through HapApp to fix Allele IDs before using it in BIGr/BIGapp.") messages[["IUPACcodes"]] <- c("IUPAC (non-ATCG) codes found in AlleleSequence. This codes are not currently supported", - "No IUPAC (non-ATCG) codes found in AlleleSequence") + "No IUPAC (non-ATCG) codes found in AlleleSequence") messages[["LowerCase"]] <- c("Lowercase bases found in AlleleSequence", - "No lowercase bases found in AlleleSequence") + "No lowercase bases found in AlleleSequence") messages[["Indels"]] <- c(paste("Indels found (ref/alt lengths differ) for the CloneIDs:",paste(indels, collapse = " ")), - "No indels found (ref/alt lengths match) for all CloneIDs") + "No indels found (ref/alt lengths match) for all CloneIDs") messages[["ChromPos"]] <- c("Chromosome and Position format in CloneID look good", - "CloneID does not have the expected Chromosome_Position format. Please check your CloneIDs or provide a file with this information") + "CloneID does not have the expected Chromosome_Position format. Please check your CloneIDs or provide a file with this information") + messages[["allNArow"]] <- c("One or more rows contain all NA values.", + "No rows with all NA values") + messages[["allNAcol"]] <- c("One or more columns contain all NA values.", + "No columns with all NA values") list(checks = checks, messages = messages, indel_clone_ids = indels) } + +#' Check and Adjust Botloci and MADC Marker Compatibility +#' +#' This internal function checks the compatibility between botloci and MADC markers. It ensures that the marker IDs in the botloci file match those in the MADC file. If discrepancies are found, such as mismatched padding, the function attempts to adjust the IDs to ensure compatibility. +#' +#' @param botloci A data frame containing the botloci markers. +#' @param report A data frame containing the MADC markers. +#' @param ChromPos logical value indicating whether the CloneID in the MADC file contains chromosome and position information in the format "Chr_Pos". Default is TRUE +#' @param mi_df A data frame containing marker information with columns CloneID, Chr, and Pos. Required if `ChromPos` is FALSE. +#' @param verbose A logical value indicating whether to print detailed messages about the adjustments. Default is TRUE. Required if `ChromPos` is FALSE. +#' +#' @return A list containing the adjusted botloci and MADC data frames. +#' +#' @details +#' The function checks if the marker IDs in the botloci file are present in the MADC file. If no matches are found, it examines the padding (number of digits) in the marker IDs and adjusts them to match the longest padding. If the IDs still do not match after adjustment, an error is raised. This function is intended for internal use and helps ensure that the botloci and MADC files are compatible for downstream analysis. +#' +#' @keywords internal +#' @noRd +check_botloci <- function(botloci, report, ChromPos=TRUE, mi_df = NULL, verbose=TRUE){ + + # Check inputs + if(!ChromPos) { + if(is.null(mi_df)) stop("When MADC CloneID don't follow the format Chr_Pos, a marker_info file with CloneID, Chr and Pos columns must be provided.") + # if exists, it must contain CloneID or BI_markerID that matches the report$CloneID, and Chr and Pos columns + if(!any(mi_df$CloneID %in% report$CloneID) & !any(mi_df$BI_markerID %in% report$CloneID)) { + stop("None of the MADC CloneID could be found in the markers_info CloneID or BI_markerID. Please make sure they match.") + } else { + use_col <- if(any(mi_df$CloneID %in% report$CloneID)) "CloneID" else "BI_markerID" + vmsg(paste("Using", use_col, "column in marker_info to match MADC CloneID"), verbose = verbose, level = 1, type = ">>") + } + if(is.null(mi_df$Chr) | is.null(mi_df$Pos)) stop("When MADC CloneID don't follow the format Chr_Pos, Chr and Pos columns must be provided in the markers_info file.") + } + + if(!any(botloci$V1 %in% report$CloneID)) { # First check if any botloci markers are found in MADC file. If not, check for padding mismatch. + vmsg("No botloci markers found in MADC file. Checking for padding mismatch...", verbose = verbose, level = 1, type = ">>") + + pad_madc <- unique(nchar(sub(".*_", "", report$CloneID))) + pad_botloci <- unique(nchar(sub(".*_", "", botloci$V1))) + + if(length(pad_madc) > 1 | length(pad_botloci) > 1) stop("Check marker IDs in both MADC and botloci files. They should be the same.") + + if(pad_madc != pad_botloci) { + vmsg("Padding between MADC and botloci files do not match. Markers ID modified to match longest padding.", verbose = verbose, level = 1, type = ">>") + if (pad_madc < pad_botloci) { + report$CloneID <- paste0(sub("_(.*)", "", report$CloneID), "_", + sprintf(paste0("%0", pad_botloci, "d"), as.integer(sub(".*_", "", report$CloneID))) + ) + report$AlleleID <- paste0(report$CloneID, "|", sapply(strsplit(report$AlleleID, "[|]"), "[[",2)) + } else { + botloci$V1 <- paste0(sub("_(.*)", "", botloci$V1), "_", + sprintf(paste0("%0", pad_madc, "d"), as.integer(sub(".*_", "", botloci$V1))) + ) + if(!any(botloci$V1 %in% report$CloneID)) stop("After matching padding, botloci markers still not found in MADC file. Check marker IDs.\n") + } + } else if (!(is.null(mi_df$Chr) | is.null(mi_df$Pos))){ + vmsg("It is not a padding mismatch issue.", verbose = verbose, level = 1, type = ">>") + vmsg("Checking if jointing provided Chromosome and Position information in marker_file solve the issue", verbose = verbose, level = 1, type = ">>") + if(!any(mi_df$CloneID %in% report$CloneID) & !any(mi_df$BI_markerID %in% report$CloneID)) { + stop("None of the MADC CloneID could be found in the markers_info CloneID or BI_markerID. Please make sure they match.") + } else { + use_col <- if(any(mi_df$CloneID %in% report$CloneID)) "CloneID" else "BI_markerID" + vmsg(paste("Using", use_col, "column in marker_info to match MADC CloneID"), verbose = verbose, level = 1, type = ">>") + } + mk_info_CloneID <- paste0(mi_df$Chr, "_", sprintf(paste0("%0",pad_botloci, "d"), as.integer(mi_df$Pos))) + + if(!any(botloci$V1 %in% mk_info_CloneID)){ + vmsg("It is not a padding mismatch issue.", verbose = verbose, level = 1, type = ">>") + vmsg("Chromosome and Position information in marker_file don't solve the issue.", verbose = verbose, level = 1, type = ">>") + stop("Check marker IDs in both MADC and botloci files. They should be the same.") + } else { + vmsg("Chromosome and Position information in marker_file solve the issue.", verbose = verbose, level = 1, type = ">>") + vmsg("Using this information to modify MADC CloneIDs to match botloci markers.", verbose = verbose, level = 1, type = ">>") + report$CloneID <- mk_info_CloneID[match(report$CloneID, mi_df[[use_col]])] + } + } else { + vmsg("It is not a padding mismatch issue.", verbose = verbose, level = 1, type = ">>") + vmsg("Chromosome and Position information in marker_file not provided.", verbose = verbose, level = 1, type = ">>") + stop("Check marker IDs in both MADC and botloci files. They should be the same.") + } + } + return(list(botloci, report)) +} diff --git a/R/get_countsMADC.R b/R/get_countsMADC.R index 2123be1..8cce70b 100644 --- a/R/get_countsMADC.R +++ b/R/get_countsMADC.R @@ -69,6 +69,8 @@ get_countsMADC <- function(madc_file = NULL, madc_object = NULL, collapse_matche } else { update_df <- get_counts(madc_object = madc_object, collapse_matches_counts = collapse_matches_counts, verbose = verbose) } + # Ensure plain data.frame so row.names<- does not trigger tibble deprecation warning + update_df <- as.data.frame(update_df) # Filter rows where 'AlleleID' ends with 'Ref' ref_df <- subset(update_df, grepl("Ref$", AlleleID)) diff --git a/R/madc2vcf_targets.R b/R/madc2vcf_targets.R index d595ae5..c1c4c71 100644 --- a/R/madc2vcf_targets.R +++ b/R/madc2vcf_targets.R @@ -16,8 +16,7 @@ #' - If `get_REF_ALT = TRUE`, attempts to recover true REF/ALT bases by comparing #' the Ref/Alt probe sequences; targets with >1 polymorphism are discarded. #' - Optionally accepts a `markers_info` CSV to supply `CHROM`, `POS`, `REF`, `ALT` -#' (and `Type`, `Indel_pos` when indels are present), bypassing sequence-based -#' inference. +#' bypassing sequence-based inference. #' #' **Output VCF layout** #' - `INFO` fields: @@ -40,19 +39,20 @@ #' #' @param madc_file character. Path to the input MADC CSV file. #' @param output.file character. Path to the output VCF file to write. -#' @param botloci_file character. Path to a plain-text file listing target IDs -#' designed on the **bottom** strand (one ID per line). Required when -#' `get_REF_ALT = TRUE` and `markers_info` is not provided. +#' @param botloci_file character or `NULL` (default `NULL`). Path to a plain-text +#' file listing target IDs designed on the **bottom** strand (one ID per line). +#' Required only when `get_REF_ALT = TRUE` and `markers_info` does not supply +#' `Ref` and `Alt` columns. #' @param markers_info character or `NULL`. Optional path to a CSV providing target -#' metadata. Required columns: `CloneID, Chr, Pos, Ref, Alt`. This file is required in -#' case your MADC CloneID column doesn't have the format CHR_POS. If indels are -#' present, columns `Type, Indel_pos` are also required. +#' metadata. Minimum required columns: `CloneID` (or `BI_markerID`), `Chr`, `Pos`. +#' When `get_REF_ALT = TRUE`, also requires `Ref` and `Alt` (replaces probe-sequence +#' inference). `Type` and `Indel_pos` are never required by this function. #' @param get_REF_ALT logical (default `FALSE`). If `TRUE`, attempts to infer REF/ALT #' bases from the Ref/Alt probe sequences in the MADC file (with strand correction #' using `botloci_file`). Targets with more than one difference between Ref/Alt #' sequences are removed. #' @param collapse_matches_counts logical (default `FALSE`). If `TRUE`, counts for targets with identical `CHROM_POS` are summed together. This is useful when the MADC file contains multiple rows per target (e.g., due to multiple alleles or technical replicates) and you want to aggregate them into a single entry per unique target. -#' @param verbose logical (default `FALSE`). If `TRUE`, prints detailed messages about +#' @param verbose logical (default `TRUE`). If `TRUE`, prints detailed progress messages about each processing step. #' #' @return (Invisibly) returns the path to `output.file`. The side effect is a #' **VCF v4.3** written to disk containing one row per target and columns for all @@ -96,7 +96,7 @@ #' @export madc2vcf_targets <- function(madc_file, output.file, - botloci_file, + botloci_file = NULL, markers_info = NULL, get_REF_ALT = FALSE, collapse_matches_counts = FALSE, @@ -105,19 +105,17 @@ madc2vcf_targets <- function(madc_file, vmsg("Checking inputs", verbose = verbose, level = 0, type = ">>") # Input checks - if(!file.exists(madc_file)) stop("The MADC file does not exist.") + if(!(file.exists(madc_file) | url_exists(madc_file))) stop("The MADC file does not exist.") if(!is.character(output.file)) stop("output.file must be a character string.") - if(get_REF_ALT && is.null(botloci_file)) stop("Please provide the botloci file to recover the reference and alternative bases.") - if(get_REF_ALT && !file.exists(botloci_file)) stop("The botloci file does not exist.") - if(!is.null(markers_info) && !file.exists(markers_info)) stop("The markers_info file does not exist.") if(!is.null(markers_info) && !is.character(markers_info)) stop("markers_info must be a character string or NULL.") + if(!is.null(markers_info) && !file.exists(markers_info) && !url_exists(markers_info)) stop("The markers_info file does not exist.") if(!is.logical(get_REF_ALT)) stop("get_REF_ALT must be a logical value (TRUE or FALSE).") if(!is.logical(verbose)) stop("verbose must be a logical value (TRUE or FALSE).") # Create a VCF header line with metadata about the command and its parameters bigr_meta <- paste0('##BIGrCommandLine.madc2vcf_targets= 0) stop(paste("The MADC file does not pass the sanity checks:\n", paste(messages_results[c("Columns", "FixAlleleIDs")[idx]], collapse = "\n"))) } - if(any(checks$checks[c("IUPACcodes", "LowerCase", "Indels")])){ - idx <- which((checks$checks[c("IUPACcodes", "LowerCase", "Indels")])) - if(is.null(markers_info)) stop("Please provide a markers_info file to proceed. The MADC file does not pass the sanity checks:\n", - paste(messages_results[c("IUPACcodes", "LowerCase", "Indels")[idx]], collapse = "\n")) - else vmsg("MADC file has some issues (IUPAC codes, lowercase bases, indels), but a markers_info file is provided, so proceeding with VCF generation.", verbose = verbose, level = 1, type = ">>") + if(any(checks$checks[c("IUPACcodes", "Indels")]) && get_REF_ALT){ + idx <- which((checks$checks[c("IUPACcodes", "Indels")])) + if(is.null(markers_info)) stop(paste("Please provide a markers_info file to proceed. The MADC file does not pass the sanity checks:", + paste(messages_results[c("IUPACcodes", "Indels")[idx]], collapse = "\n"))) + else vmsg("MADC file has some issues (IUPAC codes, indels), but a markers_info file is provided, so proceeding with VCF generation.", verbose = verbose, level = 1, type = ">>") } - # Check marker names compatibility between MADC and botloci - if(!is.null(botloci_file)){ - botloci <- read.csv(botloci_file, header = F) - checked_botloci <- check_botloci(botloci, report) - botloci <- checked_botloci[[1]] - report <- checked_botloci[[2]] + if(checks$checks["LowerCase"]){ + vmsg("MADC Allele Sequences presented lower case characters. They were converted to upper case.", verbose = verbose, level = 1) + report$AlleleSequence <- toupper(report$AlleleSequence) } - vmsg("Input checks done", verbose = verbose, level = 1, type = ">>") + # ---- Validate botloci and pre-process CloneIDs based on get_REF_ALT logic ---- + mi_df <- NULL # markers_info data frame (loaded once, reused below) + mi_has_ref_alt <- FALSE # TRUE when markers_info provides Ref and Alt columns + botloci <- NULL # botloci data frame (set when needed) + + # Check whether markers_info is present and contains Ref + Alt columns + if(!is.null(markers_info)) { + mi_df <- read.csv(markers_info) + mi_has_ref_alt <- all(c("Ref", "Alt") %in% colnames(mi_df)) + } + + if(!isTRUE(checks$checks["ChromPos"])) { + if(is.null(markers_info)){ + stop("CloneID column does not follow the 'Chr_Pos'. ", + "Please provide a markers_info file with at least 'CloneID'/'BI_markerID', ", + "'Chr', and 'Pos' columns.") + } else { + + if(!all(c("Chr", "Pos") %in% colnames(mi_df))) + stop("CloneID column does not follow the 'Chr_Pos' format. ", + "markers_info must contain at least 'Chr' and 'Pos' columns to remap marker IDs.") + + } + } + + if(get_REF_ALT) { + + if(mi_has_ref_alt) { + # markers_info supplies REF and ALT — no botloci required + vmsg("markers_info contains Ref and Alt columns. REF and ALT will be taken from markers_info.", + verbose = verbose, level = 1, type = ">>") + + } else { + if(checks$checks["Indels"]) + stop("Indels detected in MADC file. Since get_REF_ALT = TRUE, a markers_info file with REF/ALT information is required.") + + # REF/ALT must be extracted from probe sequences — botloci is required + if(is.null(botloci_file) || (!file.exists(botloci_file) && !url_exists(botloci_file))) + stop("get_REF_ALT = TRUE but no markers_info file with Ref and Alt columns was provided neither a botloci_file to extrat REF/ALT from probe sequences. Please provide one of the these files or set get_REF_ALT to FALSE.") + + # Validate that CloneIDs match botloci marker names (after any remapping) + botloci <- read.table(botloci_file, header = FALSE) + checked_botloci <- check_botloci(botloci, report, ChromPos = checks$checks["ChromPos"], mi_df = mi_df, verbose = verbose) + botloci <- checked_botloci[[1]] + report <- checked_botloci[[2]] + + } + } + + vmsg("Input checks done!", verbose = verbose, level = 1, type = ">>") vmsg("Extracting depth information", verbose = verbose, level = 0, type = ">>") @@ -176,79 +221,133 @@ madc2vcf_targets <- function(madc_file, vmsg("Depth information extracted", verbose = verbose, level = 1, type = ">>") - #Obtaining Chr and Pos information from the row_names - if(is.null(markers_info)){ - vmsg("No markers_info file provided. Attempting to recover CHROM and POS from CloneID...", verbose = verbose, level = 0, type = ">>") - new_df <- size_df %>% - rownames_to_column(var = "row_name") %>% - separate(row_name, into = c("CHROM", "POS"), sep = "_") %>% - select(CHROM, POS) + if(get_REF_ALT && mi_has_ref_alt) { + vmsg("Using markers_info for CHROM, POS, REF and ALT.", verbose = verbose, level = 0, type = ">>") - # Remove leading zeros from the POS column - new_df$POS <- sub("^0+", "", new_df$POS) + if(is.null(mi_df)) mi_df <- read.csv(markers_info) + id_col <- if ("BI_markerID" %in% colnames(mi_df)) "BI_markerID" else + if ("CloneID" %in% colnames(mi_df)) "CloneID" else + stop("The markers_info file must contain a marker ID column named either 'CloneID' or 'BI_markerID'.") - #Get read count sums - new_df$TotalRef <- rowSums(ref_df) - new_df$TotalAlt <- rowSums(alt_df) + if(checks$checks["Indels"]) + vmsg("Indels detected in MADC file. Since Ref and Alt are provided in markers_info, Type and Indel_pos are not required.", + verbose = verbose, level = 1, type = ">>") + + if(!all(c(id_col, "Chr", "Pos", "Ref", "Alt") %in% colnames(mi_df))) + stop(paste0("The markers_info dataframe must contain the following columns: ", + id_col, ", Chr, Pos, Ref, Alt")) + + if(!all(rownames(ad_df) %in% mi_df[[id_col]])) { + miss_CloneIDs <- rownames(ad_df)[!rownames(ad_df) %in% mi_df[[id_col]]] + if(length(miss_CloneIDs) == nrow(ad_df)) stop("None of the MADC CloneID could be found in the markers_info CloneID or BI_markerID. Please make sure they match.") + vmsg(paste("Not all MADC CloneID was found in the markers_info file. These markers will be removed:", + paste(miss_CloneIDs, collapse = " ")), verbose = verbose, level = 2, type = ">>") + warning("Not all MADC CloneID was found in the markers_info file. These markers will be removed.") + } + matched <- mi_df[match(rownames(ad_df), mi_df[[id_col]]), ] + + new_df <- data.frame(CHROM = matched$Chr, POS = matched$Pos) + new_df$TotalRef <- rowSums(ref_df) + new_df$TotalAlt <- rowSums(alt_df) new_df$TotalSize <- rowSums(size_df) - vmsg("CHROM and POS recovered from CloneID", verbose = verbose, level = 1, type = ">>") - # Get REF and ALT - if(get_REF_ALT){ - vmsg("get_REF_ALT = TRUE. Attempting to recover REF and ALT bases from probe sequences...", verbose = verbose, level = 0, type = ">>") - csv <- get_counts(madc_object = report, collapse_matches_counts = collapse_matches_counts, verbose = FALSE) - # Keep only the ones that have alt and ref - csv <- csv[which(csv$CloneID %in% rownames(ad_df)),] + ref_base <- matched$Ref + alt_base <- matched$Alt - # Get reverse complement the tag is present in botloci - botloci <- read.table(botloci_file, header = FALSE) + } else if(!is.null(markers_info) && !get_REF_ALT) { + vmsg("markers_info file provided. Using CHROM and POS from the file.", verbose = verbose, level = 0, type = ">>") - # Check if the botloci file marker IDs match with the MADC file - checked_botloci <- check_botloci(botloci, csv) - botloci <- checked_botloci[[1]] - csv <- checked_botloci[[2]] + if(is.null(mi_df)) mi_df <- read.csv(markers_info) + id_col <- if ("BI_markerID" %in% colnames(mi_df)) "BI_markerID" else + if ("CloneID" %in% colnames(mi_df)) "CloneID" else + stop("The markers_info file must contain a marker ID column named either 'CloneID' or 'BI_markerID'.") - # FIXED: Store original sequences before any transformation - csv$OriginalAlleleSequence <- csv$AlleleSequence + if(checks$checks["Indels"]) + vmsg("Indels detected in MADC file. Since get_REF_ALT = FALSE, Type and Indel_pos are not required in markers_info.", + verbose = verbose, level = 1, type = ">>") - # Apply reverse complement to sequences for bottom strand markers - idx <- which(csv$CloneID %in% botloci[,1]) - csv$AlleleSequence[idx] <- sapply(csv$AlleleSequence[idx], function(sequence) as.character(reverseComplement(DNAString(sequence)))) + if(!all(c(id_col, "Chr", "Pos") %in% colnames(mi_df))) + stop(paste0("The markers_info dataframe must contain the following columns: ", id_col, ", Chr, Pos")) - ref_seq <- csv$AlleleSequence[grep("\\|Ref.*", csv$AlleleID)] - ref_ord <- csv$CloneID[grep("\\|Ref.*", csv$AlleleID)] - alt_seq <- csv$AlleleSequence[grep("\\|Alt.*", csv$AlleleID)] - alt_ord <- csv$CloneID[grep("\\|Alt.*", csv$AlleleID)] + if(!all(rownames(ad_df) %in% mi_df[[id_col]])) { + miss_CloneIDs <- rownames(ad_df)[!rownames(ad_df) %in% mi_df[[id_col]]] + vmsg(paste("Not all MADC CloneID was found in the markers_info file. These markers will be removed:", + paste(miss_CloneIDs, collapse = " ")), verbose = verbose, level = 2, type = ">>") + warning("Not all MADC CloneID was found in the markers_info file. These markers will be removed.") + } + matched <- mi_df[match(rownames(ad_df), mi_df[[id_col]]), ] - # FIXED: Get original sequences for SNP calling - orig_ref_seq <- csv$OriginalAlleleSequence[grep("\\|Ref.*", csv$AlleleID)] - orig_alt_seq <- csv$OriginalAlleleSequence[grep("\\|Alt.*", csv$AlleleID)] + new_df <- data.frame(CHROM = matched$Chr, POS = matched$Pos) + new_df$TotalRef <- rowSums(ref_df) + new_df$TotalAlt <- rowSums(alt_df) + new_df$TotalSize <- rowSums(size_df) - if(all(sort(ref_ord) == sort(alt_ord))){ - # Order sequences consistently - ref_seq <- ref_seq[order(ref_ord)] - alt_seq <- alt_seq[order(alt_ord)] - orig_ref_seq <- orig_ref_seq[order(ref_ord)] - orig_alt_seq <- orig_alt_seq[order(alt_ord)] - ordered_clone_ids <- sort(ref_ord) + ref_base <- "." + alt_base <- "." + vmsg("REF and ALT not recovered (get_REF_ALT = FALSE).", verbose = verbose, level = 1, type = ">>") + + } else { + vmsg(ifelse(get_REF_ALT, + "Recovering CHROM and POS from CloneID for probe-sequence REF/ALT extraction...", + "No markers_info file provided. Recovering CHROM and POS from CloneID..."), + verbose = verbose, level = 0, type = ">>") + + # Split on the last underscore to handle chromosome names containing underscores + # (e.g. Chr_01_000123456). When ChromPos was FALSE, check_botloci already + # remapped CloneIDs to Chr_PaddedPos, so this split is always valid. + new_df <- size_df %>% + rownames_to_column(var = "row_name") %>% + separate(row_name, into = c("CHROM", "POS"), sep = "_(?=[^_]*$)") %>% + select(CHROM, POS) + new_df$POS <- sub("^0+", "", new_df$POS) + vmsg("CHROM and POS recovered from CloneID.", verbose = verbose, level = 1, type = ">>") + + new_df$TotalRef <- rowSums(ref_df) + new_df$TotalAlt <- rowSums(alt_df) + new_df$TotalSize <- rowSums(size_df) + + if(get_REF_ALT) { + vmsg("get_REF_ALT = TRUE. Attempting to recover REF and ALT bases from probe sequences...", + verbose = verbose, level = 0, type = ">>") + + csv <- get_counts(madc_object = report, collapse_matches_counts = collapse_matches_counts, verbose = FALSE) + csv <- csv[which(csv$CloneID %in% rownames(ad_df)), ] + + ref_ord <- csv$CloneID[grep("\\|Ref.*", csv$AlleleID)] + alt_ord <- csv$CloneID[grep("\\|Alt.*", csv$AlleleID)] + orig_ref_seq <- csv$AlleleSequence[grep("\\|Ref.*", csv$AlleleID)] + orig_alt_seq <- csv$AlleleSequence[grep("\\|Alt.*", csv$AlleleID)] + + if(all(sort(ref_ord) == sort(alt_ord))) { + # Key sequences by CloneID, then reorder to MADC row order so that + # loop index i always corresponds to rownames(size_df)[i]. + ref_seq_by_id <- setNames(orig_ref_seq, ref_ord) + alt_seq_by_id <- setNames(orig_alt_seq, alt_ord) + madc_ids <- rownames(size_df) + orig_ref_seq <- ref_seq_by_id[madc_ids] + orig_alt_seq <- alt_seq_by_id[madc_ids] more_poly <- no_diff <- 0 - ref_base <- alt_base <- vector() - for(i in seq_along(orig_ref_seq)){ - # FIXED: Use original sequences for SNP calling + ref_base <- alt_base <- rep(NA_character_, length(madc_ids)) + names(ref_base) <- names(alt_base) <- madc_ids + for(i in seq_along(madc_ids)) { + if(is.na(orig_ref_seq[i]) || is.na(orig_alt_seq[i])) next temp_list <- strsplit(c(orig_ref_seq[i], orig_alt_seq[i]), "") - idx_diff <- which(temp_list[[1]] != temp_list[[2]]) - - if(length(idx_diff) > 1) { # If finds more than one polymorphism between Ref and Alt sequences + if(length(temp_list[[1]]) != length(temp_list[[2]])) + stop(paste0("Marker '", madc_ids[i], "' has Ref and Alt probe sequences of different lengths ", + "(", length(temp_list[[1]]), " vs ", length(temp_list[[2]]), "). ", + "This should not happen for SNP markers. ", + "If this is an indel, please provide a markers_info file with Ref and Alt columns.")) + idx_diff <- which(temp_list[[1]] != temp_list[[2]]) + + if(length(idx_diff) > 1) { ref_base[i] <- NA alt_base[i] <- NA more_poly <- more_poly + 1 } else if(length(idx_diff) == 1) { orig_ref_base <- temp_list[[1]][idx_diff] orig_alt_base <- temp_list[[2]][idx_diff] - - # FIXED: Apply reverse complement to bases only if marker is in botloci - if(ordered_clone_ids[i] %in% botloci[,1]) { + if(madc_ids[i] %in% botloci[, 1]) { ref_base[i] <- as.character(reverseComplement(DNAString(orig_ref_base))) alt_base[i] <- as.character(reverseComplement(DNAString(orig_alt_base))) } else { @@ -256,66 +355,27 @@ madc2vcf_targets <- function(madc_file, alt_base[i] <- orig_alt_base } } else { - # No differences found ref_base[i] <- NA alt_base[i] <- NA no_diff <- no_diff + 1 } } - if(more_poly > 0) vmsg(paste(more_poly, "markers removed because more than one polymorphism was found between Ref and Alt sequences"), verbose = verbose, level = 2, type = ">>") - if(no_diff > 0) vmsg(paste(no_diff, "markers removed because no differences were found between Ref and Alt sequences"), verbose = verbose, level = 2, type = ">>") + if(more_poly > 0) vmsg(paste(more_poly, "markers removed: more than one polymorphism between Ref and Alt sequences."), verbose = verbose, level = 2, type = ">>") + if(no_diff > 0) vmsg(paste(no_diff, "markers removed: no differences found between Ref and Alt sequences."), verbose = verbose, level = 2, type = ">>") } else { ref_base <- "." alt_base <- "." - vmsg(paste("REF and ALT bases could not be recovered because of missing reference or alternative sequences"), verbose = verbose, level = 1, type = ">>") + vmsg("REF and ALT bases could not be recovered: missing reference or alternative sequences.", + verbose = verbose, level = 1, type = ">>") } + } else { + # ── get_REF_ALT = FALSE, no markers_info ───────────────────────── ref_base <- "." alt_base <- "." - vmsg(paste("REF and ALT bases not recovered because get_REF_ALT = FALSE"), verbose = verbose, level = 1, type = ">>") - } - } else { - vmsg("markers_info file provided. Using CHROM, POS, REF and ALT from the file.", verbose = verbose, level = 0, type = ">>") - # Verify markers_info file - df <- read.csv(markers_info) - - # Accept either CloneID or BI_markerID as the marker ID column - if ("BI_markerID" %in% colnames(df)) { - id_col <- "BI_markerID" - } else if ("CloneID" %in% colnames(df)) { - id_col <- "CloneID" - } else { - stop("The markers_info file must contain a marker ID column named either 'CloneID' or 'BI_markerID'.") - } - - if(checks$checks["Indels"]){ - vmsg("Indels detected in MADC file. Checking for required columns in markers_info...", verbose = verbose, level = 1, type = ">>") - if(!all(c(id_col,"Chr","Pos","Ref","Alt","Type", "Indel_pos") %in% colnames(df))) - stop(paste0("The markers_info dataframe must contain the following columns: ", id_col, ", Chr, Pos, Ref, Alt, Type, Indel_pos")) + vmsg("REF and ALT not recovered (get_REF_ALT = FALSE).", verbose = verbose, level = 1, type = ">>") } - if(!all(c(id_col,"Chr","Pos","Ref","Alt") %in% colnames(df))) - stop(paste0("The markers_info dataframe must contain the following columns: ", id_col, ", Chr, Pos, Ref, Alt")) - - if(!all(rownames(ad_df) %in% df[[id_col]])){ - miss_CloneIDs <- rownames(ad_df)[!rownames(ad_df) %in% df[[id_col]]] - vmsg(paste("Not all MADC CloneID was found in the markers_info file. These markers will be removed:", paste(miss_CloneIDs, collapse = " ")), verbose = verbose, level = 2, type = ">>") - warning("Not all MADC CloneID was found in the markers_info file. These markers will be removed.") - } - matched <- df[match(rownames(ad_df), df[[id_col]]),] - - new_df <- data.frame( - CHROM = matched$Chr, - POS = matched$Pos - ) - - #Get read count sums - new_df$TotalRef <- rowSums(ref_df) - new_df$TotalAlt <- rowSums(alt_df) - new_df$TotalSize <- rowSums(size_df) - - ref_base <- matched$Ref - alt_base <- matched$Alt } vmsg("CHROM, POS, REF and ALT columns prepared", verbose = verbose, level = 1, type = ">>") @@ -356,38 +416,20 @@ madc2vcf_targets <- function(madc_file, vcf_df$FORMAT <- paste("DP","RA","AD",sep=":") #Combine info from the matrices to form the VCF information for each sample - # Combine the matrices into a single matrix with elements separated by ":" - make_vcf_format <- function(..., separator = ":") { - matrices <- list(...) - n <- length(matrices) - - # Convert matrices to long form - long_forms <- lapply(matrices, function(mat) { - suppressMessages(reshape2::melt(mat, varnames = c("Row", "Col"), value.name = "Value")) - }) - - # Concatenate the elements - combined_long <- long_forms[[1]] - combined_long$Combined <- combined_long$Value - - for (i in 2:n) { - combined_long$Combined <- paste(combined_long$Combined, long_forms[[i]]$Value, sep = separator) - } + m_size <- melt(as.matrix(size_df), varnames = c("Row", "Col"), value.name = "Value") + m_ref <- melt(as.matrix(ref_df), varnames = c("Row", "Col"), value.name = "Value") + m_ad <- melt(as.matrix(ad_df), varnames = c("Row", "Col"), value.name = "Value") - # Convert back to wide form - combined_wide <- suppressMessages(reshape2::dcast(combined_long, Row ~ Col, value.var = "Combined")) + combined_long <- m_size + combined_long$Combined <- paste(m_size$Value, m_ref$Value, m_ad$Value, sep = ":") - # Restore row and column names - rownames(combined_wide) <- combined_wide$Row - combined_wide$Row <- NULL - colnames(combined_wide) <- colnames(matrices[[1]]) - vmsg("Sample columns formatted for VCF", verbose = verbose, level = 1, type = ">>") + combined_wide <- suppressMessages(dcast(combined_long, Row ~ Col, value.var = "Combined")) + rownames(combined_wide) <- combined_wide$Row + combined_wide$Row <- NULL + colnames(combined_wide) <- colnames(size_df) - return(as.matrix(combined_wide)) - } - - # Combine the matrices - geno_df <- make_vcf_format(as.matrix(size_df), as.matrix(ref_df), as.matrix(ad_df)) + geno_df <- as.matrix(combined_wide) + vmsg("Sample columns formatted for VCF", verbose = verbose, level = 1, type = ">>") #Combine the dataframes together vcf_df <- cbind(vcf_df,geno_df) @@ -399,7 +441,15 @@ madc2vcf_targets <- function(madc_file, # Sort vcf_df <- vcf_df[order(vcf_df[,1],as.numeric(as.character(vcf_df[,2]))),] - if(sum(is.na(vcf_df$REF)) >1) { + # Remove markers with NA CHROM/POS (unmatched in markers_info, Case 3) + na_coord <- is.na(vcf_df[, 1]) | is.na(vcf_df$POS) + if(any(na_coord)) { + vmsg(paste(sum(na_coord), "markers removed: no matching entry found in markers_info."), verbose = verbose, level = 1, type = ">>") + warning(paste(sum(na_coord), "markers removed: no matching entry found in markers_info.")) + vcf_df <- vcf_df[!na_coord, ] + } + + if(sum(is.na(vcf_df$REF)) > 0) { vmsg(paste(sum(is.na(vcf_df$REF)), "markers removed because of presence of more than one polymorphism between ref and alt sequences."), verbose = verbose, level = 1, type = ">>") warning(paste("Markers removed because of presence of more than one polymorphism between ref and alt sequences:",sum(is.na(vcf_df$REF)))) vcf_df <- vcf_df[-which(is.na(vcf_df$REF)),] diff --git a/R/utils.R b/R/utils.R index 92fac46..a270570 100644 --- a/R/utils.R +++ b/R/utils.R @@ -26,50 +26,6 @@ convert_to_dosage <- function(gt) { }) } -#' Check and Adjust Botloci and MADC Marker Compatibility -#' -#' This internal function checks the compatibility between botloci and MADC markers. It ensures that the marker IDs in the botloci file match those in the MADC file. If discrepancies are found, such as mismatched padding, the function attempts to adjust the IDs to ensure compatibility. -#' -#' @param botloci A data frame containing the botloci markers. -#' @param report A data frame containing the MADC markers. -#' @param verbose A logical value indicating whether to print detailed messages about the adjustments. Default is TRUE. -#' -#' @return A list containing the adjusted botloci and MADC data frames. -#' -#' @details -#' The function checks if the marker IDs in the botloci file are present in the MADC file. If no matches are found, it examines the padding (number of digits) in the marker IDs and adjusts them to match the longest padding. If the IDs still do not match after adjustment, an error is raised. This function is intended for internal use and helps ensure that the botloci and MADC files are compatible for downstream analysis. -#' -#' @keywords internal -#' @noRd -check_botloci <- function(botloci, report, verbose=TRUE){ - if(!any(botloci$V1 %in% report$CloneID)) { - vmsg("No botloci markers found in MADC file. Checking for padding mismatch...", verbose = verbose, level = 1, type = ">>") - - pad_madc <- unique(nchar(sub(".*_", "", report$CloneID))) - pad_botloci <- unique(nchar(sub(".*_", "", botloci$V1))) - - if(length(pad_madc) > 1 | length(pad_botloci) > 1) stop("Check marker IDs in both MADC and botloci files. They should be the same.") - - if(pad_madc != pad_botloci) { - vmsg("Padding between MADC and botloci files do not match. Markers ID modified to match longest padding.", verbose = verbose, level = 1, type = ">>") - if (pad_madc < pad_botloci) { - report$CloneID <- paste0(sub("_(.*)", "", report$CloneID), "_", - sprintf(paste0("%0", pad_botloci, "d"), as.integer(sub(".*_", "", report$CloneID))) - ) - report$AlleleID <- paste0(report$CloneID, "|", sapply(strsplit(report$AlleleID, "[|]"), "[[",2)) - } else { - botloci$V1 <- paste0(sub("_(.*)", "", botloci$V1), "_", - sprintf(paste0("%0", pad_madc, "d"), as.integer(sub(".*_", "", botloci$V1))) - ) - if(!any(botloci$V1 %in% report$CloneID)) stop("After matching padding, botloci markers still not found in MADC file. Check marker IDs.\n") - } - } else { - stop("Check marker IDs in both MADC and botloci files. They should be the same.") - } - } - return(list(botloci, report)) -} - ##' Verbose Message Utility ##' ##' Prints a formatted verbose message with timestamp, indentation, and type label, if verbose is TRUE. @@ -112,3 +68,11 @@ vmsg <- function(text, verbose = FALSE, level = 1, type = ">>", ...) { formatted_msg <- sprintf("%s %s%s%s", timestamp, type_label, indent, msg_text) message(formatted_msg) } + +url_exists <- function(u) { + tryCatch({ + con <- url(u, open = "rb") + close(con) + TRUE + }, error = function(e) FALSE, warning = function(w) FALSE) +} diff --git a/man/madc2vcf_targets.Rd b/man/madc2vcf_targets.Rd index 5d62c87..1da55d8 100644 --- a/man/madc2vcf_targets.Rd +++ b/man/madc2vcf_targets.Rd @@ -7,11 +7,11 @@ madc2vcf_targets( madc_file, output.file, - botloci_file, + botloci_file = NULL, markers_info = NULL, get_REF_ALT = FALSE, collapse_matches_counts = FALSE, - verbose = FALSE + verbose = TRUE ) } \arguments{ @@ -19,14 +19,15 @@ madc2vcf_targets( \item{output.file}{character. Path to the output VCF file to write.} -\item{botloci_file}{character. Path to a plain-text file listing target IDs -designed on the \strong{bottom} strand (one ID per line). Required when -\code{get_REF_ALT = TRUE} and \code{markers_info} is not provided.} +\item{botloci_file}{character or \code{NULL} (default \code{NULL}). Path to a plain-text +file listing target IDs designed on the \strong{bottom} strand (one ID per line). +Required only when \code{get_REF_ALT = TRUE} and \code{markers_info} does not supply +\code{Ref} and \code{Alt} columns.} \item{markers_info}{character or \code{NULL}. Optional path to a CSV providing target -metadata. Required columns: \verb{CloneID, Chr, Pos, Ref, Alt}. This file is required in -case your MADC CloneID column doesn't have the format CHR_POS. If indels are -present, columns \verb{Type, Indel_pos} are also required.} +metadata. Minimum required columns: \code{CloneID} (or \code{BI_markerID}), \code{Chr}, \code{Pos}. +When \code{get_REF_ALT = TRUE}, also requires \code{Ref} and \code{Alt} (replaces probe-sequence +inference). \code{Type} and \code{Indel_pos} are never required by this function.} \item{get_REF_ALT}{logical (default \code{FALSE}). If \code{TRUE}, attempts to infer REF/ALT bases from the Ref/Alt probe sequences in the MADC file (with strand correction @@ -35,7 +36,7 @@ sequences are removed.} \item{collapse_matches_counts}{logical (default \code{FALSE}). If \code{TRUE}, counts for targets with identical \code{CHROM_POS} are summed together. This is useful when the MADC file contains multiple rows per target (e.g., due to multiple alleles or technical replicates) and you want to aggregate them into a single entry per unique target.} -\item{verbose}{logical (default \code{FALSE}). If \code{TRUE}, prints detailed messages about} +\item{verbose}{logical (default \code{TRUE}). If \code{TRUE}, prints detailed progress messages about each processing step.} } \value{ (Invisibly) returns the path to \code{output.file}. The side effect is a @@ -59,8 +60,7 @@ IUPAC/ambiguous bases, lowercase bases, indels). \item If \code{get_REF_ALT = TRUE}, attempts to recover true REF/ALT bases by comparing the Ref/Alt probe sequences; targets with >1 polymorphism are discarded. \item Optionally accepts a \code{markers_info} CSV to supply \code{CHROM}, \code{POS}, \code{REF}, \code{ALT} -(and \code{Type}, \code{Indel_pos} when indels are present), bypassing sequence-based -inference. +bypassing sequence-based inference. } \strong{Output VCF layout} diff --git a/tests/testthat/test-check_madc_sanity.R b/tests/testthat/test-check_madc_sanity.R index 5053e55..91e5493 100644 --- a/tests/testthat/test-check_madc_sanity.R +++ b/tests/testthat/test-check_madc_sanity.R @@ -1,10 +1,71 @@ test_that("check madc",{ - madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") - report <- read.csv(madc_file, check.names = FALSE) - check_madc_sanity(report) + github_path <- "https://raw.githubusercontent.com/Breeding-Insight/BIGapp-PanelHub/refs/heads/long_seq/test_madcs/" + names <- c("Columns", "FixAlleleIDs", "IUPACcodes", "LowerCase", "Indels", "ChromPos", "allNAcol", "allNArow") + # raw madc + report <- read.csv(paste0(github_path,"/alfalfa_madc_raw.csv")) + res <- check_madc_sanity(report) + exp <- c(TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, FALSE) + names(exp) <- names + expect_equal(res$checks, exp) + + # test lower case + report <- read.csv(paste0(github_path,"/alfalfa_lowercase.csv")) + + res <- check_madc_sanity(report) + exp <- c(TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE) + names(exp) <- names + expect_equal(res$checks, exp) + + # test IUPAC + report <- read.csv(paste0(github_path,"/alfalfa_IUPAC.csv")) + + res <- check_madc_sanity(report) + exp <- c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE) + names(exp) <- names + expect_equal(res$checks, exp) + + # clean alfalfa madc (fixed allele IDs, Chr_Pos CloneIDs, no issues) + report <- read.csv(paste0(github_path,"/alfalfa_madc.csv")) + + res <- check_madc_sanity(report) + exp <- c(TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE) + names(exp) <- names + expect_equal(res$checks, exp) + + # potato indel madc (ChromPos FALSE because IDs are not Chr_Pos) + report <- read.csv(paste0(github_path,"/potato_indel_madc.csv")) + + res <- check_madc_sanity(report) + exp <- c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE) + names(exp) <- names + expect_equal(res$checks, exp) + + # potato indel IUPAC (IUPAC codes + lowercase + indels) + report <- read.csv(paste0(github_path,"/potato_indel_IUPAC.csv")) + + res <- check_madc_sanity(report) + exp <- c(TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE) + names(exp) <- names + expect_equal(res$checks, exp) + + # potato indel lowercase (lowercase + indels) + report <- read.csv(paste0(github_path,"/potato_indel_lowercase.csv")) + + res <- check_madc_sanity(report) + exp <- c(TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE) + names(exp) <- names + expect_equal(res$checks, exp) + + # potato more indels madc ChromPosFALSE + report <- read.csv(paste0(github_path,"/potato_more_indels_madc_ChromPosFALSE.csv")) + + res <- check_madc_sanity(report) + exp <- c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE) + names(exp) <- names + expect_equal(res$checks, exp) }) diff --git a/tests/testthat/test-madc2vcf_targets.R b/tests/testthat/test-madc2vcf_targets.R index 524b687..5e21c7c 100644 --- a/tests/testthat/test-madc2vcf_targets.R +++ b/tests/testthat/test-madc2vcf_targets.R @@ -69,33 +69,10 @@ test_that("bottom strand markers have correct REF/ALT", { # Get results from both functions suppressWarnings( madc2vcf_targets(madc_file = madc_file, output.file = temp_targets, - get_REF_ALT = TRUE, botloci_file = bot_file) - ) - - suppressWarnings( - madc2vcf_all(madc = madc_file, botloci_file = bot_file, - hap_seq_file = NULL, out_vcf = temp_all, verbose = FALSE) + get_REF_ALT = TRUE, botloci_file = bot_file) ) vcf_targets <- read.vcfR(temp_targets, verbose = FALSE) - vcf_all <- read.vcfR(temp_all, verbose = FALSE) - - # Find common markers between both outputs - common_markers <- intersect(vcf_targets@fix[,"ID"], vcf_all@fix[,"ID"]) - - if(length(common_markers) > 0) { - # Compare REF/ALT for common markers - targets_subset <- vcf_targets@fix[vcf_targets@fix[,"ID"] %in% common_markers,] - all_subset <- vcf_all@fix[vcf_all@fix[,"ID"] %in% common_markers,] - - # Sort both by ID for comparison - targets_subset <- targets_subset[order(targets_subset[,"ID"]),] - all_subset <- all_subset[order(all_subset[,"ID"]),] - - # Check that REF/ALT match between the two functions - expect_equal(targets_subset[,"REF"], all_subset[,"REF"]) - expect_equal(targets_subset[,"ALT"], all_subset[,"ALT"]) - } # Validate that all REF/ALT are valid nucleotides expect_true(all(vcf_targets@fix[,"REF"] %in% c("A", "T", "G", "C", "."))) @@ -107,5 +84,566 @@ test_that("bottom strand markers have correct REF/ALT", { expect_true(all(vcf_targets@fix[valid_snps,"REF"] != vcf_targets@fix[valid_snps,"ALT"])) } - rm(vcf_targets, vcf_all, temp_targets, temp_all) + rm(vcf_targets, temp_targets) +}) + + +# ======================================================================= +# Using Breeding-Insight/BIGapp-PanelHub test files +# ======================================================================= + +test_that("simu alfalfa",{ + + github_path <- "https://raw.githubusercontent.com/Breeding-Insight/BIGapp-PanelHub/refs/heads/long_seq/" + + # External alfalfa test files + alfalfa_madc <- paste0(github_path, "test_madcs/alfalfa_madc.csv") + alfalfa_madc_wrongID <- paste0(github_path, "test_madcs/alfalfa_madc_wrongID.csv") + alfalfa_madc_raw <- paste0(github_path, "test_madcs/alfalfa_madc_raw.csv") # raw DArT format (7-row header) + alfalfa_iupac <- paste0(github_path, "test_madcs/alfalfa_IUPAC.csv") + alfalfa_lowercase <- paste0(github_path, "test_madcs/alfalfa_lowercase.csv") + alfalfa_botloci <- paste0(github_path, "alfalfa/20201030-BI-Alfalfa_SNPs_DArTag-probe-design_f180bp.botloci") # botloci for alfalfa + alfalfa_markers_info <- paste0(github_path, "alfalfa/20201030-BI-Alfalfa_SNPs_DArTag-probe-design_snpID_lut.csv") # markers_info: CloneID/BI_markerID, Chr, Pos, Ref, Alt + alfalfa_markers_info_ChromPos <- paste0(github_path, "test_madcs/alfalfa_marker_info_ChromPos.csv") # markers_info: CloneID/BI_markerID, Chr, Pos + + + # External potato test files + potato_indel_madc <- paste0(github_path, "test_madcs/potato_indel_madc.csv") + potato_indel_iupac <- paste0(github_path, "test_madcs/potato_indel_IUPAC.csv") + potato_indel_lowercase <- paste0(github_path, "test_madcs/potato_indel_lowercase.csv") + potato_more_indels_chrompos_false <- paste0(github_path, "test_madcs/potato_more_indels_madc_ChromPosFALSE.csv") + potato_botloci <- paste0(github_path, "potato/potato_dartag_v2_3915markers_rm7dupTags_6traitMarkers_f150bp_ref_alt.botloci") + potato_markers_info <- paste0(github_path, "potato/potato_dartag_v2_3915markers_rm7dupTags_6traitMarkers_rm1dup_snpID_lut.csv") # CloneID/BI_markerID, Chr, Pos, Ref, Alt + potato_markers_info_ChromPos <- paste0(github_path, "test_madcs/potato_marker_info_chrompos.csv") # markers_info: CloneID/BI_markerID, Chr, Pos + + skip_if_offline("raw.githubusercontent.com") + + test_that("ALFALFA — clean fixed allele ID MADC", { + out <- tempfile(fileext = ".vcf") + expect_no_error( + madc2vcf_targets(madc_file = alfalfa_madc, + output.file = out, + get_REF_ALT = FALSE, + verbose = FALSE) + ) + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + expect_true(all(is.na(vcf@fix[, "REF"]))) + expect_true(all(is.na(vcf@fix[, "ALT"]))) + DP <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(DP[1,]), 4139) + expect_equal(sum(DP[,5]), 42869) + unlink(out) + + expect_no_error( + madc2vcf_targets(madc_file = alfalfa_madc, + output.file = out, + get_REF_ALT = FALSE, + collapse_matches_counts = TRUE, + verbose = FALSE) + ) + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + expect_true(all(is.na(vcf@fix[, "REF"]))) + expect_true(all(is.na(vcf@fix[, "ALT"]))) + DP <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(DP[1,]), 4534) + expect_equal(sum(DP[,5]), 56547) + + # Test error when get_REF_ALT = TRUE but no markers_info or botloci provided to extract REF/ALT + expect_error( + madc2vcf_targets(madc_file = alfalfa_madc, + output.file = out, + get_REF_ALT = TRUE, + verbose = FALSE), + regexp = "get_REF_ALT = TRUE but no markers_info file with Ref and Alt columns was provided neither a botloci_file to extrat REF/ALT from probe sequences. Please provide one of the these files or set get_REF_ALT to FALSE." + ) + + # Test that it works when get_REF_ALT = TRUE and botloci is provided (REF/ALT recovered from probe sequences) + madc2vcf_targets(madc_file = alfalfa_madc, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + + # Test that it also works when markers_info is provided together with botloci (should give same result as above but just to confirm no interference between the two) + madc2vcf_targets(madc_file = alfalfa_madc, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + markers_info = alfalfa_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + + }) + + test_that("ALFALFA — clean fixed allele ID MADC wrong CloneID", { + out <- tempfile(fileext = ".vcf") + # Test error when botloci provided but no matching CloneID between botloci and MADC (even after trying to fix potential padding mismatch with ChromPos info in markers_info) + expect_error( + madc2vcf_targets(madc_file = alfalfa_madc_wrongID, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + verbose = FALSE), + regexp = "Check marker IDs in both MADC and botloci files. They should be the same." + ) + + # Test error when no matching CloneID between markers_info and MADC to fix the botloci mismatch issue (even if botloci file is not used, the function should still check that the provided markers_info can match with MADC CloneIDs to be able to use the ChromPos info to fix potential padding mismatch) + expect_error( + madc2vcf_targets(madc_file = alfalfa_madc_wrongID, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + markers_info = alfalfa_markers_info, + verbose = FALSE), + "None of the MADC CloneID could be found in the markers_info CloneID or BI_markerID. Please make sure they match." + ) + + # Test that it works when the function can find a matching ID in markers_info to fix the botloci mismatch issue + # (even if botloci file is not used, the function should still be able to use the ChromPos info in markers_info to + # fix potential padding mismatch and find matching IDs between MADC and botloci) + madc2vcf_targets(madc_file = alfalfa_madc_wrongID, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + markers_info = alfalfa_markers_info_ChromPos, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + }) + + test_that("alfalfa lower case fixed MADC", { + out <- tempfile(fileext = ".vcf") + madc2vcf_targets(madc_file = alfalfa_lowercase, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + + madc2vcf_targets(madc_file = alfalfa_lowercase, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + markers_info = alfalfa_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + + madc2vcf_targets(madc_file = alfalfa_lowercase, + output.file = out, + get_REF_ALT = FALSE, + botloci_file = alfalfa_botloci, + markers_info = alfalfa_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + }) + + test_that("alfalfa IUPAC code", { + out <- tempfile(fileext = ".vcf") + expect_error( + madc2vcf_targets(madc_file = alfalfa_iupac, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + verbose = FALSE) + ) + + madc2vcf_targets(madc_file = alfalfa_iupac, + output.file = out, + get_REF_ALT = TRUE, + markers_info = alfalfa_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_targets(madc_file = alfalfa_iupac, + output.file = out, + get_REF_ALT = TRUE, + markers_info = alfalfa_markers_info, + collapse_matches_counts = TRUE, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 4534) + expect_equal(sum(dp[,5]), 56547) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_targets(madc_file = alfalfa_iupac, + output.file = out, + get_REF_ALT = FALSE, + botloci_file = alfalfa_botloci, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + + }) + + test_that("potato indel madc chrompos=FALSE", { + out <- tempfile(fileext = ".vcf") + expect_error( + madc2vcf_targets(madc_file = potato_indel_madc, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = potato_botloci, + verbose = FALSE) + ) + + madc2vcf_targets(madc_file = potato_indel_madc, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41656) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_targets(madc_file = potato_indel_madc, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info, + collapse_matches_counts = TRUE, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 5163) + expect_equal(sum(dp[,5]), 58927) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + expect_error( + madc2vcf_targets(madc_file = potato_indel_madc, + output.file = out, + get_REF_ALT = FALSE, + botloci_file = potato_botloci, + verbose = FALSE) + ) + + madc2vcf_targets(madc_file = potato_indel_madc, + output.file = out, + get_REF_ALT = FALSE, + markers_info = potato_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41656) + }) + + test_that("potato indel chromposFALSE", { + out <- tempfile(fileext = ".vcf") + expect_error( + madc2vcf_targets(madc_file = potato_more_indels_chrompos_false, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = potato_botloci, + verbose = FALSE) + ) + + madc2vcf_targets(madc_file = potato_more_indels_chrompos_false, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_targets(madc_file = potato_more_indels_chrompos_false, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info, + collapse_matches_counts = TRUE, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 6301) + expect_equal(sum(dp[,5]), 53613) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + expect_error( + madc2vcf_targets(madc_file = potato_more_indels_chrompos_false, + output.file = out, + get_REF_ALT = FALSE, + botloci_file = potato_botloci, + verbose = FALSE) + ) + + madc2vcf_targets(madc_file = potato_more_indels_chrompos_false, + output.file = out, + get_REF_ALT = FALSE, + markers_info = potato_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + }) + + test_that("potato lowercase", { + out <- tempfile(fileext = ".vcf") + madc2vcf_targets(madc_file = potato_indel_lowercase, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + expect_error( + madc2vcf_targets(madc_file = potato_indel_lowercase, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info_ChromPos, + botloci_file = potato_botloci, + verbose = FALSE) + ) + + madc2vcf_targets(madc_file = potato_indel_lowercase, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info, + botloci_file = potato_botloci, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_targets(madc_file = potato_indel_lowercase, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info, + collapse_matches_counts = TRUE, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 6301) + expect_equal(sum(dp[,5]), 53613) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_targets(madc_file = potato_indel_lowercase, + output.file = out, + get_REF_ALT = FALSE, + markers_info = potato_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + }) + + + test_that("potato IUPAC", { + out <- tempfile(fileext = ".vcf") + madc2vcf_targets(madc_file = potato_indel_iupac, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_targets(madc_file = potato_indel_iupac, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info, + collapse_matches_counts = TRUE, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 6301) + expect_equal(sum(dp[,5]), 53613) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_targets(madc_file = potato_indel_iupac, + output.file = out, + get_REF_ALT = FALSE, + markers_info = potato_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + }) + + test_that("alfalfa raw MADC format (7-row header)", { + out <- tempfile(fileext = ".vcf") + # get_REF_ALT = FALSE: same counts as alfalfa_madc + expect_error( + madc2vcf_targets(madc_file = alfalfa_madc_raw, + output.file = out, + get_REF_ALT = FALSE, + verbose = FALSE) + ) + }) }) From 5d54f0de55a89258da46b220ae740d2e346b5e44 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Fri, 27 Mar 2026 02:26:48 -0400 Subject: [PATCH 14/37] targets ok --- R/check_madc_sanity.R | 122 +++++++++++++++++------ R/madc2vcf_all.R | 37 ++++++- R/madc2vcf_targets.R | 125 +++++++++++++++++++----- man/check_madc_sanity.Rd | 58 +++++++---- tests/testthat/test-check_madc_sanity.R | 18 ++-- tests/testthat/test-madc2vcf_targets.R | 75 +++++++++++++- 6 files changed, 348 insertions(+), 87 deletions(-) diff --git a/R/check_madc_sanity.R b/R/check_madc_sanity.R index 301cd74..759fce3 100644 --- a/R/check_madc_sanity.R +++ b/R/check_madc_sanity.R @@ -1,7 +1,7 @@ #' Run basic sanity checks on a MADC-style allele report #' #' @description -#' Performs eight quick validations on an allele report: +#' Performs nine quick validations on an allele report: #' 1) **Columns** - required columns are present (`CloneID`, `AlleleID`, `AlleleSequence`); #' 2) **FixAlleleIDs** - first column's first up-to-6 rows are not all blank or `"*"` #' *and* both `_0001` and `_0002` appear in `AlleleID`; @@ -11,8 +11,9 @@ #' or a `"-"` character is present in `AlleleSequence`; #' 6) **ChromPos** - all `CloneID` values follow the `Chr_Position` format #' (prefix matches `"chr"` case-insensitively, suffix is a positive integer); -#' 7) **allNAcol** - at least one column contains only `NA` values; -#' 8) **allNArow** - at least one row contains only `NA` values. +#' 7) **allNAcol** - at least one column contains only `NA` or empty values; +#' 8) **allNArow** - at least one row contains only `NA` or empty values; +#' 9) **RefAltSeqs** - every `CloneID` has at least one `Ref` and one `Alt` allele row. #' #' @param report A `data.frame` with at least the columns #' `CloneID`, `AlleleID`, and `AlleleSequence`. The first column is also @@ -21,44 +22,64 @@ #' treated as header filler and skipped before further checks are run. #' #' @details -#' - **FixAlleleIDs:** When the check fails (raw DArT format), row 7 is used as -#' the column header and the first 7 rows are dropped before subsequent checks. +#' - **FixAlleleIDs:** When the first six rows of the first column are all blank +#' or `"*"` (raw DArT format), row 7 is promoted to column headers and the +#' first 7 rows are dropped before subsequent checks are run. The check is +#' `TRUE` when the file has already been processed by HapApp (fixed IDs with +#' `_0001`/`_0002` suffixes). #' - **IUPAC check:** Flags any character outside `A`, `T`, `C`, `G` and `"-"` #' (case-insensitive), which includes ambiguity codes (`N`, `R`, `Y`, etc.). #' - **Indels:** Rows are split by `AlleleID` containing `"Ref_0001"` vs -#' `"Alt_0002"`, merged by `CloneID`, and the lengths of `AlleleSequence` -#' are compared. A `"-"` anywhere in `AlleleSequence` is also treated as -#' evidence of an indel. +#' `"Alt_0002"`, merged by `CloneID`, and flagged as indels if either (a) the +#' lengths of `AlleleSequence` differ, (b) the sequences have the same length +#' but more than one character differs between them (complex indel / local +#' rearrangement), or (c) a `"-"` character is present anywhere in +#' `AlleleSequence`. #' - **ChromPos:** Each `CloneID` is split on `"_"` into exactly two parts; the #' first part must match `"Chr"` (case-insensitive) and the second must be a #' positive integer. Returns `FALSE` when any `CloneID` is `NA`. #' - **allNAcol / allNArow:** Detected via `apply()` over columns/rows -#' respectively; useful for flagging empty or corrupt entries. +#' respectively; a cell is treated as missing when it is `NA` or an empty +#' string (`""`). Useful for flagging empty or corrupt entries. +#' - **RefAltSeqs:** For each unique `CloneID`, checks whether at least one row +#' with a `Ref` (`|Ref_` when `FixAlleleIDs = TRUE`, `|Ref$` otherwise) and +#' one row with an `Alt` (`|Alt_` / `|Alt$`) allele exist. `CloneID`s that +#' lack a `Ref` row are stored in `missRef`; those lacking an `Alt` row in +#' `missAlt`. The check is `TRUE` when both sets are empty. #' - If required columns are missing (`Columns = FALSE`), only `Columns` and #' `FixAlleleIDs` are evaluated; all other checks remain `NA` and -#' `indel_clone_ids` is returned as `NULL`. +#' `indel_clone_ids`, `missRef`, and `missAlt` are returned as `NULL`. #' -#' @return A named list with three elements: +#' @return A named list with five elements: #' \describe{ -#' \item{checks}{Named logical vector with eight entries: +#' \item{checks}{Named logical vector with nine entries: #' `Columns`, `FixAlleleIDs`, `IUPACcodes`, `LowerCase`, `Indels`, -#' `ChromPos`, `allNAcol`, `allNArow`. -#' `TRUE` means the condition was detected (or passed for `Columns` and -#' `FixAlleleIDs`); `NA` means the check was skipped.} +#' `ChromPos`, `allNAcol`, `allNArow`, `RefAltSeqs`. +#' `TRUE` means the condition was detected (or passed for `Columns`, +#' `FixAlleleIDs`, `ChromPos`, and `RefAltSeqs`); `NA` means the check +#' was skipped.} #' \item{messages}{Named list of length-2 character vectors, one per check. #' Element `[[1]]` is the message when the check is `TRUE`, element `[[2]]` #' when it is `FALSE`. Indexed by the same names as `checks`.} #' \item{indel_clone_ids}{Character vector of `CloneID`s where ref/alt #' lengths differ. Returns `character(0)` if none are found, or `NULL` #' when required columns are missing.} +#' \item{missRef}{Character vector of `CloneID`s that have no `Ref` allele +#' row. Returns `character(0)` if all `CloneID`s have a `Ref` row, or +#' `NULL` when required columns are missing.} +#' \item{missAlt}{Character vector of `CloneID`s that have no `Alt` allele +#' row. Returns `character(0)` if all `CloneID`s have an `Alt` row, or +#' `NULL` when required columns are missing.} #' } #' #' @export check_madc_sanity <- function(report) { # Initialize - checks <- c(Columns = NA, FixAlleleIDs = NA, IUPACcodes = NA, LowerCase = NA, Indels = NA, ChromPos = NA, allNAcol = NA, allNArow = NA) - messages <- list(Columns = NA, FixAlleleIDs = NA, IUPACcodes = NA, LowerCase = NA, Indels = NA, ChromPos = NA, allNAcol = NA, allNArow = NA) + checks <- c(Columns = NA, FixAlleleIDs = NA, IUPACcodes = NA, LowerCase = NA, Indels = NA, + ChromPos = NA, allNAcol = NA, allNArow = NA, RefAltSeqs = NA, OtherAlleles = NA) + messages <- list(Columns = NA, FixAlleleIDs = NA, IUPACcodes = NA, LowerCase = NA, Indels = NA, + ChromPos = NA, allNAcol = NA, allNArow = NA, RefAltSeqs = NA, OtherAlleles = NA) # ---- FixAlleleIDs ---- # Check if first up-to-6 entries in the *first column* are all "" or "*" @@ -67,14 +88,13 @@ check_madc_sanity <- function(report) { first_col_vals <- report[[1]][idx] all_blank_or_star <- all(first_col_vals %in% c("", "*"), na.rm = TRUE) # Also require that both _0001 and _0002 appear in AlleleID - has_0001 <- any(grepl("_0001", report$AlleleID, fixed = TRUE), na.rm = TRUE) - has_0002 <- any(grepl("_0002", report$AlleleID, fixed = TRUE), na.rm = TRUE) - checks["FixAlleleIDs"] <- (!all_blank_or_star) & has_0001 & has_0002 - - if(!checks["FixAlleleIDs"]){ + if(all_blank_or_star) { colnames(report) <- report[7,] report <- report[-c(1:7),] } + has_0001 <- any(grepl("_0001", report$AlleleID, fixed = TRUE), na.rm = TRUE) + has_0002 <- any(grepl("_0002", report$AlleleID, fixed = TRUE), na.rm = TRUE) + checks["FixAlleleIDs"] <- (!all_blank_or_star) & has_0001 & has_0002 # Validate required columns required <- c("CloneID", "AlleleID", "AlleleSequence") @@ -102,47 +122,89 @@ check_madc_sanity <- function(report) { ref_len <- nchar(merged$AlleleSequence_ref, keepNA = TRUE) alt_len <- nchar(merged$AlleleSequence_alt, keepNA = TRUE) cmp_ok <- !is.na(ref_len) & !is.na(alt_len) + + # Classic indel: different lengths indel_mask <- cmp_ok & (ref_len != alt_len) + + # Complex indel: same length but >1 character difference between sequences + same_len <- cmp_ok & (ref_len == alt_len) + if (any(same_len)) { + n_diffs <- mapply(function(r, a) { + r_chars <- strsplit(r, "")[[1]] + a_chars <- strsplit(a, "")[[1]] + sum(r_chars != a_chars) + }, merged$AlleleSequence_ref[same_len], merged$AlleleSequence_alt[same_len]) + indel_mask[same_len] <- n_diffs > 1 + } + checks["Indels"] <- any(indel_mask) | any(grepl("-", report$AlleleSequence)) indels <- if (any(indel_mask)) merged$CloneID[indel_mask] else character(0) + } else { checks["Indels"] <- FALSE indels <- character(0) } # --- All NA ---- - checks["allNArow"] <- any(apply(report, 1, function(x) all(is.na(x)))) - checks["allNAcol"] <- any(apply(report, 2, function(x) all(is.na(x)))) + checks["allNArow"] <- any(apply(report, 1, function(x) all(is.na(x) | x == ""))) + checks["allNAcol"] <- any(apply(report, 2, function(x) all(is.na(x)) | x == "")) # ---- Chrom Pos ---- if(!any(is.na(report$CloneID))) { pos <- strsplit(report$CloneID, "_") format <- all(sapply(pos, length) == 2) - first <- all(grepl("Chr", sapply(pos, "[", 1), ignore.case = TRUE)) + first <- all(grepl("^[A-Za-z]", sapply(pos, "[", 1))) second <- suppressWarnings(all(sapply(pos, function(x) as.numeric(x[2])) > 0)) checks["ChromPos"] <- all(format, first, second) } else checks["ChromPos"] <- FALSE - } else indels <- NULL + # ---- RefAltSeqs ---- + all_clones <- unique(report$CloneID) + if (isTRUE(checks["FixAlleleIDs"])) { + has_ref <- unique(report$CloneID[grepl("\\|Ref_", report$AlleleID)]) + has_alt <- unique(report$CloneID[grepl("\\|Alt_", report$AlleleID)]) + } else { + has_ref <- unique(report$CloneID[grepl("\\|Ref$", report$AlleleID)]) + has_alt <- unique(report$CloneID[grepl("\\|Alt$", report$AlleleID)]) + } + missRef <- setdiff(all_clones, has_ref) + missAlt <- setdiff(all_clones, has_alt) + checks["RefAltSeqs"] <- length(missRef) == 0 & length(missAlt) == 0 + + # ---- OtherAlleles ---- + checks["OtherAlleles"] <- any(grepl("[|]Other", report$AlleleID)) + + } else { + indels <- NULL + missRef <- NULL + missAlt <- NULL + } messages[["Columns"]] <- c("Required columns are present", "One or more required columns missing. Verify if your file has columns: CloneID, AlleleID, AlleleSequence") messages[["FixAlleleIDs"]] <- c("Fixed Allele IDs look good", - "MADC not processed by HapApp. Please, run the MADC through HapApp to fix Allele IDs before using it in BIGr/BIGapp.") + "MADC not processed by HapApp.") messages[["IUPACcodes"]] <- c("IUPAC (non-ATCG) codes found in AlleleSequence. This codes are not currently supported", "No IUPAC (non-ATCG) codes found in AlleleSequence") messages[["LowerCase"]] <- c("Lowercase bases found in AlleleSequence", "No lowercase bases found in AlleleSequence") - messages[["Indels"]] <- c(paste("Indels found (ref/alt lengths differ) for the CloneIDs:",paste(indels, collapse = " ")), - "No indels found (ref/alt lengths match) for all CloneIDs") + messages[["Indels"]] <- c(paste("Indels found (ref/alt lengths differ or >1 mismatch between same-length sequences) for the CloneIDs:",paste(indels, collapse = " ")), + "No indels found (ref/alt lengths match and at most 1 mismatch) for all CloneIDs") messages[["ChromPos"]] <- c("Chromosome and Position format in CloneID look good", "CloneID does not have the expected Chromosome_Position format. Please check your CloneIDs or provide a file with this information") messages[["allNArow"]] <- c("One or more rows contain all NA values.", "No rows with all NA values") messages[["allNAcol"]] <- c("One or more columns contain all NA values.", "No columns with all NA values") + messages[["RefAltSeqs"]] <- c("All CloneIDs have both Ref and Alt alleles", + paste0("Some CloneIDs are missing Ref and/or Alt alleles. ", + "Missing Ref: ", paste(missRef, collapse = " "), ". ", + "Missing Alt: ", paste(missAlt, collapse = " "), ".")) + messages[["OtherAlleles"]] <- c("Alleles other than Ref and Alt were found in AlleleID.", + "No alleles other than Ref and Alt found in AlleleID.") - list(checks = checks, messages = messages, indel_clone_ids = indels) + list(checks = checks, messages = messages, indel_clone_ids = indels, + missRef = missRef, missAlt = missAlt) } #' Check and Adjust Botloci and MADC Marker Compatibility diff --git a/R/madc2vcf_all.R b/R/madc2vcf_all.R index 13227d3..22edcc6 100644 --- a/R/madc2vcf_all.R +++ b/R/madc2vcf_all.R @@ -98,7 +98,7 @@ madc2vcf_all <- function(madc = NULL, "out_vcf= ", out_vcf, ', ', "verbose= ", verbose,')">') - report <- read.csv(madc, check.names = FALSE) + report <- read.csv(madc, check.names = FALSE) checks <- check_madc_sanity(report) messages_results <- mapply(function(check, message) { @@ -111,9 +111,38 @@ madc2vcf_all <- function(madc = NULL, paste(messages_results[c("Columns", "FixAlleleIDs")[idx]], collapse = "\n"))) } - if(any(checks$checks[c("IUPACcodes", "LowerCase", "Indels")])){ - idx <- which((checks$checks[c("IUPACcodes", "LowerCase", "Indels")])) - if(is.null(markers_info)) stop(paste(messages_results[c("IUPACcodes", "LowerCase", "Indels")[idx]], collapse = "\n")) + if(any(checks$checks[c("IUPACcodes")])){ + idx <- which((checks$checks[c("IUPACcodes")])) + stop(paste(messages_results[c("IUPACcodes")[idx]], collapse = "\n")) + } + + if(any(!checks$checks[c("ChromPos")])){ + if(is.null(markers_info)) { + stop(paste(messages_results[c("ChromPos")], collapse = "\n")) + } else { + mi_df <- read.csv(markers_info) + if(!all(c("Chr", "Pos") %in% colnames(mi_df))) + stop("ChromPos check failed: CloneID values do not follow the Chr_Position format. ", + "The markers_info file must contain 'Chr' and 'Pos' columns to supply CHROM and POS.") + } + } + + if(any(checks$checks[c("Indels")])){ + idx <- which((checks$checks[c("Indels")])) + if(is.null(markers_info)) { + stop(paste(messages_results[c("Indels")[idx]], collapse = "\n")) + } else { + mi_df <- read.csv(markers_info) + if(checks$checks["Indels"] && + !all(c("Ref", "Alt", "Indel_pos") %in% colnames(mi_df))) + stop("Indels detected in MADC file. ", + "The markers_info file must contain 'Ref', 'Alt', and 'Indel_pos' columns.") + } + } + + if(checks$checks["LowerCase"]){ + vmsg("MADC Allele Sequences presented lower case characters. They were converted to upper case.", verbose = verbose, level = 1) + report$AlleleSequence <- toupper(report$AlleleSequence) } if(!is.null(botloci_file)) botloci <- read.csv(botloci_file, header = F) else stop("Please provide a botloci file") diff --git a/R/madc2vcf_targets.R b/R/madc2vcf_targets.R index c1c4c71..1416044 100644 --- a/R/madc2vcf_targets.R +++ b/R/madc2vcf_targets.R @@ -9,14 +9,17 @@ #' #' @details #' **What this function does** -#' - Runs basic sanity checks on the MADC file (column presence, fixed allele IDs, -#' IUPAC/ambiguous bases, lowercase bases, indels). +#' - Runs basic sanity checks on the MADC file via `check_madc_sanity()` (column +#' presence, fixed allele IDs, IUPAC/ambiguous bases, lowercase bases, indels, +#' chromosome/position format, all-NA rows/columns, Ref/Alt sequence presence). #' - Extracts reference and total read counts per sample and target. #' - Derives `AD` (ref,alt) by subtraction (alt = total − ref). -#' - If `get_REF_ALT = TRUE`, attempts to recover true REF/ALT bases by comparing -#' the Ref/Alt probe sequences; targets with >1 polymorphism are discarded. -#' - Optionally accepts a `markers_info` CSV to supply `CHROM`, `POS`, `REF`, `ALT` -#' bypassing sequence-based inference. +#' - If `get_REF_ALT = TRUE`, recovers REF/ALT bases either from `markers_info` +#' (when `Ref`/`Alt` columns are present) or by comparing the Ref/Alt probe +#' sequences in the MADC file (with strand correction via `botloci_file`). +#' Targets with >1 polymorphism between sequences are discarded. +#' - Optionally accepts a `markers_info` CSV to supply `CHROM`, `POS`, `REF`, +#' `ALT`, bypassing sequence-based inference. #' #' **Output VCF layout** #' - `INFO` fields: @@ -32,27 +35,58 @@ #' complemented prior to base comparison so that REF/ALT are reported in the #' top-strand genomic orientation. #' -#' **Sanity check behavior** -#' - If required columns or fixed IDs are missing, the function `stop()`s. -#' - If IUPAC/lowercase/indels are detected and `markers_info` is **not** -#' provided, the function `stop()`s with a diagnostic message explaining what to fix. +#' **Sanity check behaviour and requirements** +#' +#' The function always stops if required columns (`CloneID`, `AlleleID`, +#' `AlleleSequence`) are missing. +#' +#' For the remaining checks the required inputs depend on the combination of +#' check result and `get_REF_ALT`: +#' +#' | Check | Status | `get_REF_ALT` | Required | +#' |---|---|---|---| +#' | **IUPAC codes** | detected | `TRUE` | `markers_info` with `Ref`/`Alt` | +#' | | detected | `FALSE` | — | +#' | | not detected | `TRUE` | `botloci_file` **or** `markers_info` with `Ref`/`Alt` | +#' | | not detected | `FALSE` | — | +#' | **Indels** | detected | `TRUE` | `markers_info` with `Ref`/`Alt` | +#' | | detected | `FALSE` | — | +#' | | not detected | `TRUE` | `botloci_file` **or** `markers_info` with `Ref`/`Alt` | +#' | | not detected | `FALSE` | — | +#' | **ChromPos** | valid | `TRUE` | `botloci_file` **or** `markers_info` with `Ref`/`Alt` | +#' | | valid | `FALSE` | — | +#' | | invalid | `TRUE` | `markers_info` with `Chr`/`Pos`/`Ref`/`Alt` **or** `markers_info` with `Chr`/`Pos` + `botloci_file` | +#' | | invalid | `FALSE` | `markers_info` with `Chr`/`Pos` | +#' | **FixAlleleIDs** | fixed | `TRUE` | `botloci_file` **or** `markers_info` with `Ref`/`Alt` | +#' | | fixed | `FALSE` | — | +#' | | not fixed | `TRUE` | `markers_info` with `Ref`/`Alt` | +#' | | not fixed | `FALSE` | — | #' #' @param madc_file character. Path to the input MADC CSV file. #' @param output.file character. Path to the output VCF file to write. #' @param botloci_file character or `NULL` (default `NULL`). Path to a plain-text #' file listing target IDs designed on the **bottom** strand (one ID per line). -#' Required only when `get_REF_ALT = TRUE` and `markers_info` does not supply -#' `Ref` and `Alt` columns. +#' Used for strand-correcting probe sequences when `get_REF_ALT = TRUE` and +#' `markers_info` does not supply `Ref` and `Alt` columns. Also required when +#' `ChromPos` is invalid and `markers_info` does not provide `Ref`/`Alt`. #' @param markers_info character or `NULL`. Optional path to a CSV providing target -#' metadata. Minimum required columns: `CloneID` (or `BI_markerID`), `Chr`, `Pos`. -#' When `get_REF_ALT = TRUE`, also requires `Ref` and `Alt` (replaces probe-sequence -#' inference). `Type` and `Indel_pos` are never required by this function. -#' @param get_REF_ALT logical (default `FALSE`). If `TRUE`, attempts to infer REF/ALT -#' bases from the Ref/Alt probe sequences in the MADC file (with strand correction -#' using `botloci_file`). Targets with more than one difference between Ref/Alt -#' sequences are removed. -#' @param collapse_matches_counts logical (default `FALSE`). If `TRUE`, counts for targets with identical `CHROM_POS` are summed together. This is useful when the MADC file contains multiple rows per target (e.g., due to multiple alleles or technical replicates) and you want to aggregate them into a single entry per unique target. -#' @param verbose logical (default `TRUE`). If `TRUE`, prints detailed progress messages about each processing step. +#' metadata. Accepted columns: +#' - `CloneID` or `BI_markerID` (required as marker identifier); +#' - `Chr`, `Pos` — required when `CloneID` does not follow the `Chr_Pos` format; +#' - `Ref`, `Alt` — required when `get_REF_ALT = TRUE` and probe-sequence +#' inference is not possible (IUPAC codes, indels, or unfixed allele IDs). +#' @param get_REF_ALT logical (default `FALSE`). If `TRUE`, attempts to recover +#' REF/ALT bases. The source is chosen automatically: `markers_info` `Ref`/`Alt` +#' columns take priority; otherwise probe sequences from the MADC are compared +#' (with `botloci_file` for strand correction). Targets with more than one +#' difference between Ref/Alt sequences are removed. When `FALSE`, REF and ALT +#' are set to `"."` in the output VCF. +#' @param collapse_matches_counts logical (default `FALSE`). If `TRUE`, counts for +#' `|AltMatch` and `|RefMatch` rows are summed into their corresponding `|Ref` +#' and `|Alt` rows before building the matrices. Useful when the MADC contains +#' multiple allele rows per target that should be aggregated. +#' @param verbose logical (default `TRUE`). If `TRUE`, prints detailed progress +#' messages about each processing step. #' #' @return (Invisibly) returns the path to `output.file`. The side effect is a #' **VCF v4.3** written to disk containing one row per target and columns for all @@ -133,18 +167,18 @@ madc2vcf_targets <- function(madc_file, for(i in seq_along(messages_results)) vmsg(messages_results[i], verbose = verbose, level = 1, type = ">>") - if(any(!(checks$checks[c("Columns", "FixAlleleIDs")]))){ - idx <- which(!(checks$checks[c("Columns", "FixAlleleIDs")])) + if(any(!(checks$checks[c("Columns")]))){ + idx <- which(!(checks$checks[c("Columns")])) if(length(idx) > 0) stop(paste("The MADC file does not pass the sanity checks:\n", - paste(messages_results[c("Columns", "FixAlleleIDs")[idx]], collapse = "\n"))) + paste(messages_results[c("Columns")[idx]], collapse = "\n"))) } if(any(checks$checks[c("IUPACcodes", "Indels")]) && get_REF_ALT){ idx <- which((checks$checks[c("IUPACcodes", "Indels")])) if(is.null(markers_info)) stop(paste("Please provide a markers_info file to proceed. The MADC file does not pass the sanity checks:", paste(messages_results[c("IUPACcodes", "Indels")[idx]], collapse = "\n"))) - else vmsg("MADC file has some issues (IUPAC codes, indels), but a markers_info file is provided, so proceeding with VCF generation.", verbose = verbose, level = 1, type = ">>") + else vmsg("MADC file has IUPAC codes and/or indels, but a markers_info file is provided, so proceeding with VCF generation.", verbose = verbose, level = 1, type = ">>") } if(checks$checks["LowerCase"]){ @@ -163,6 +197,35 @@ madc2vcf_targets <- function(madc_file, mi_has_ref_alt <- all(c("Ref", "Alt") %in% colnames(mi_df)) } + if(!checks$checks["FixAlleleIDs"]){ + vmsg("MADC file has not been processed by HapApp.", verbose = verbose, level = 1) + if(get_REF_ALT){ + if(!mi_has_ref_alt) stop("MADC file has not been processed by HapApp. BIGr only provide results if get_REF_ALT is set to FALSE or if is TRUE but a marker_info with REF and ALT information is provided.") + } + # The check points to FALSE if the 6 initial rows exist or if there are no fixed allele ID (aka _0001, _0002, etc) + n <- nrow(report) + idx <- seq_len(min(6L, n)) + first_col_vals <- report[[1]][idx] + all_blank_or_star <- all(first_col_vals %in% c("", "*"), na.rm = TRUE) + # Also require that both _0001 and _0002 appear in AlleleID + if(all_blank_or_star) { + colnames(report) <- report[7,] + report <- report[-c(1:7),] + } + } + + if(checks$checks["allNArow"]){ + idx <- apply(report, 1, function(x) all(is.na(x) | x == "")) + report <- report[!idx, ] + vmsg("MADC contains rows with all NA values. Rows %s will be removed.", verbose = verbose, level = 1, type = ">>", paste(which(idx), collapse = ", ")) + } + + if(checks$checks["allNAcol"]){ + idx <- apply(report, 2, function(x) all(is.na(x) | x == "")) + report <- report[, !idx] + vmsg("MADC contains columns with all NA values. Columns %s will be removed.", verbose = verbose, level = 1, type = ">>", paste0(which(idx), collapse = ",")) + } + if(!isTRUE(checks$checks["ChromPos"])) { if(is.null(markers_info)){ stop("CloneID column does not follow the 'Chr_Pos'. ", @@ -201,11 +264,21 @@ madc2vcf_targets <- function(madc_file, } } + # Throw message if OtherAlleles are present + if(checks$checks["OtherAlleles"]) { + vmsg("AlleleID contains alleles other than Ref and Alt. These will be ignored in the VCF output. Use function madc2vcf_all to include them.", verbose = verbose, level = 1, type = ">>") + } + + # Make sure counts are numeric + count_cols <- setdiff(colnames(report), c("CloneID", "AlleleID", "AlleleSequence")) + report[count_cols] <- lapply(report[count_cols], function(x) as.numeric(as.character(x))) + vmsg("Input checks done!", verbose = verbose, level = 1, type = ">>") vmsg("Extracting depth information", verbose = verbose, level = 0, type = ">>") matrices <- get_countsMADC(madc_object = report, collapse_matches_counts = collapse_matches_counts, verbose = verbose) + ref_df <- data.frame(matrices[[1]], check.names = FALSE) alt_df <- data.frame(matrices[[2]]-matrices[[1]], check.names = FALSE) size_df <- data.frame(matrices[[2]], check.names = FALSE) @@ -230,7 +303,7 @@ madc2vcf_targets <- function(madc_file, stop("The markers_info file must contain a marker ID column named either 'CloneID' or 'BI_markerID'.") if(checks$checks["Indels"]) - vmsg("Indels detected in MADC file. Since Ref and Alt are provided in markers_info, Type and Indel_pos are not required.", + vmsg("Indels detected in MADC file. But it is okay because Ref and Alt are provided in markers_info.", verbose = verbose, level = 1, type = ">>") if(!all(c(id_col, "Chr", "Pos", "Ref", "Alt") %in% colnames(mi_df))) diff --git a/man/check_madc_sanity.Rd b/man/check_madc_sanity.Rd index 494145e..5328a4f 100644 --- a/man/check_madc_sanity.Rd +++ b/man/check_madc_sanity.Rd @@ -9,35 +9,59 @@ check_madc_sanity(report) \arguments{ \item{report}{A \code{data.frame} with at least the columns \code{CloneID}, \code{AlleleID}, and \code{AlleleSequence}. The first column is also -used in the “FixAlleleIDs” check to inspect its first up to six entries.} +used in the \code{FixAlleleIDs} check to inspect its first up to six entries. +If \code{FixAlleleIDs} is \code{FALSE} (raw DArT format), the first 7 rows are +treated as header filler and skipped before further checks are run.} } \value{ -A list with: +A named list with three elements: \describe{ -\item{checks}{Named logical vector with entries -\code{Columns}, \code{FixAlleleIDs}, \code{IUPACcodes}, \code{LowerCase}, \code{Indels}.} -\item{indel_clone_ids}{Character vector of \code{CloneID}s where ref/alt lengths differ. -Returns \code{character(0)} if none, or \code{NULL} when required columns are missing.} +\item{checks}{Named logical vector with eight entries: +\code{Columns}, \code{FixAlleleIDs}, \code{IUPACcodes}, \code{LowerCase}, \code{Indels}, +\code{ChromPos}, \code{allNAcol}, \code{allNArow}. +\code{TRUE} means the condition was detected (or passed for \code{Columns} and +\code{FixAlleleIDs}); \code{NA} means the check was skipped.} +\item{messages}{Named list of length-2 character vectors, one per check. +Element \verb{[[1]]} is the message when the check is \code{TRUE}, element \verb{[[2]]} +when it is \code{FALSE}. Indexed by the same names as \code{checks}.} +\item{indel_clone_ids}{Character vector of \code{CloneID}s where ref/alt +lengths differ. Returns \code{character(0)} if none are found, or \code{NULL} +when required columns are missing.} } } \description{ -Performs five quick validations on an allele report: +Performs eight quick validations on an allele report: \enumerate{ -\item \strong{Columns} – required columns are present (\code{CloneID}, \code{AlleleID}, \code{AlleleSequence}); -\item \strong{FixAlleleIDs} – first column’s first up-to-6 rows are not all blank or "*" +\item \strong{Columns} - required columns are present (\code{CloneID}, \code{AlleleID}, \code{AlleleSequence}); +\item \strong{FixAlleleIDs} - first column's first up-to-6 rows are not all blank or \code{"*"} \emph{and} both \verb{_0001} and \verb{_0002} appear in \code{AlleleID}; -\item \strong{IUPACcodes} – presence of non-ATCG characters in \code{AlleleSequence}; -\item \strong{LowerCase} – presence of lowercase a/t/c/g in \code{AlleleSequence}; -\item \strong{Indels} – reference/alternate allele lengths differ for the same \code{CloneID}. +\item \strong{IUPACcodes} - presence of non-ATCG characters in \code{AlleleSequence}; +\item \strong{LowerCase} - presence of lowercase a/t/c/g in \code{AlleleSequence}; +\item \strong{Indels} - reference/alternate allele lengths differ for the same \code{CloneID}, +or a \code{"-"} character is present in \code{AlleleSequence}; +\item \strong{ChromPos} - all \code{CloneID} values follow the \code{Chr_Position} format +(prefix matches \code{"chr"} case-insensitively, suffix is a positive integer); +\item \strong{allNAcol} - at least one column contains only \code{NA} values; +\item \strong{allNArow} - at least one row contains only \code{NA} values. } } \details{ \itemize{ -\item \strong{IUPAC check:} Flags any character outside \code{ATCG} (case-insensitive), -which will include ambiguity codes (\code{N}, \code{R}, \code{Y}, etc.) and symbols like \code{-}. -\item \strong{Indels:} Rows are split by \code{AlleleID} containing \code{"Ref_0001"} vs \code{"Alt_0002"}, -merged by \code{CloneID}, and the lengths of \code{AlleleSequence} are compared. -\item If required columns are missing, only \strong{Columns} is evaluated (\code{FALSE}) and +\item \strong{FixAlleleIDs:} When the check fails (raw DArT format), row 7 is used as +the column header and the first 7 rows are dropped before subsequent checks. +\item \strong{IUPAC check:} Flags any character outside \code{A}, \code{T}, \code{C}, \code{G} and \code{"-"} +(case-insensitive), which includes ambiguity codes (\code{N}, \code{R}, \code{Y}, etc.). +\item \strong{Indels:} Rows are split by \code{AlleleID} containing \code{"Ref_0001"} vs +\code{"Alt_0002"}, merged by \code{CloneID}, and the lengths of \code{AlleleSequence} +are compared. A \code{"-"} anywhere in \code{AlleleSequence} is also treated as +evidence of an indel. +\item \strong{ChromPos:} Each \code{CloneID} is split on \code{"_"} into exactly two parts; the +first part must match \code{"Chr"} (case-insensitive) and the second must be a +positive integer. Returns \code{FALSE} when any \code{CloneID} is \code{NA}. +\item \strong{allNAcol / allNArow:} Detected via \code{apply()} over columns/rows +respectively; useful for flagging empty or corrupt entries. +\item If required columns are missing (\code{Columns = FALSE}), only \code{Columns} and +\code{FixAlleleIDs} are evaluated; all other checks remain \code{NA} and \code{indel_clone_ids} is returned as \code{NULL}. } } diff --git a/tests/testthat/test-check_madc_sanity.R b/tests/testthat/test-check_madc_sanity.R index 91e5493..0fbe774 100644 --- a/tests/testthat/test-check_madc_sanity.R +++ b/tests/testthat/test-check_madc_sanity.R @@ -1,13 +1,13 @@ test_that("check madc",{ github_path <- "https://raw.githubusercontent.com/Breeding-Insight/BIGapp-PanelHub/refs/heads/long_seq/test_madcs/" - names <- c("Columns", "FixAlleleIDs", "IUPACcodes", "LowerCase", "Indels", "ChromPos", "allNAcol", "allNArow") + names <- c("Columns", "FixAlleleIDs", "IUPACcodes", "LowerCase", "Indels", "ChromPos", "allNAcol", "allNArow", "RefAltSeqs") # raw madc report <- read.csv(paste0(github_path,"/alfalfa_madc_raw.csv")) res <- check_madc_sanity(report) - exp <- c(TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, FALSE) + exp <- c(TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, FALSE, TRUE) names(exp) <- names expect_equal(res$checks, exp) @@ -15,7 +15,7 @@ test_that("check madc",{ report <- read.csv(paste0(github_path,"/alfalfa_lowercase.csv")) res <- check_madc_sanity(report) - exp <- c(TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE) + exp <- c(TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE, TRUE) names(exp) <- names expect_equal(res$checks, exp) @@ -23,7 +23,7 @@ test_that("check madc",{ report <- read.csv(paste0(github_path,"/alfalfa_IUPAC.csv")) res <- check_madc_sanity(report) - exp <- c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE) + exp <- c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE,TRUE) names(exp) <- names expect_equal(res$checks, exp) @@ -31,7 +31,7 @@ test_that("check madc",{ report <- read.csv(paste0(github_path,"/alfalfa_madc.csv")) res <- check_madc_sanity(report) - exp <- c(TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE) + exp <- c(TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE,TRUE) names(exp) <- names expect_equal(res$checks, exp) @@ -39,7 +39,7 @@ test_that("check madc",{ report <- read.csv(paste0(github_path,"/potato_indel_madc.csv")) res <- check_madc_sanity(report) - exp <- c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE) + exp <- c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE,TRUE) names(exp) <- names expect_equal(res$checks, exp) @@ -47,7 +47,7 @@ test_that("check madc",{ report <- read.csv(paste0(github_path,"/potato_indel_IUPAC.csv")) res <- check_madc_sanity(report) - exp <- c(TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE) + exp <- c(TRUE, TRUE, TRUE, FALSE, TRUE, FALSE, FALSE, FALSE,TRUE) names(exp) <- names expect_equal(res$checks, exp) @@ -55,7 +55,7 @@ test_that("check madc",{ report <- read.csv(paste0(github_path,"/potato_indel_lowercase.csv")) res <- check_madc_sanity(report) - exp <- c(TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE) + exp <- c(TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE,TRUE) names(exp) <- names expect_equal(res$checks, exp) @@ -63,7 +63,7 @@ test_that("check madc",{ report <- read.csv(paste0(github_path,"/potato_more_indels_madc_ChromPosFALSE.csv")) res <- check_madc_sanity(report) - exp <- c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE) + exp <- c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE,TRUE) names(exp) <- names expect_equal(res$checks, exp) }) diff --git a/tests/testthat/test-madc2vcf_targets.R b/tests/testthat/test-madc2vcf_targets.R index 5e21c7c..ce2ab8e 100644 --- a/tests/testthat/test-madc2vcf_targets.R +++ b/tests/testthat/test-madc2vcf_targets.R @@ -642,8 +642,81 @@ test_that("simu alfalfa",{ expect_error( madc2vcf_targets(madc_file = alfalfa_madc_raw, output.file = out, - get_REF_ALT = FALSE, + get_REF_ALT = TRUE, + verbose = FALSE) + ) + + expect_error( + madc2vcf_targets(madc_file = alfalfa_madc_raw, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, verbose = FALSE) ) + + madc2vcf_targets(madc_file = alfalfa_madc_raw, + output.file = out, + get_REF_ALT = FALSE, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + + madc2vcf_targets(madc_file = alfalfa_madc_raw, + output.file = out, + get_REF_ALT = FALSE, + collapse_matches_counts = TRUE, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + expect_true(all(is.na(vcf@fix[, "REF"]))) + expect_true(all(is.na(vcf@fix[, "ALT"]))) + DP <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(DP[1,]), 4534) + expect_equal(sum(DP[,5]), 56547) + + madc2vcf_targets(madc_file = alfalfa_madc_raw, + output.file = out, + get_REF_ALT = TRUE, + markers_info = alfalfa_markers_info, + collapse_matches_counts = FALSE, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + + madc2vcf_targets(madc_file = alfalfa_madc_raw, + output.file = out, + get_REF_ALT = TRUE, + markers_info = alfalfa_markers_info, + collapse_matches_counts = TRUE, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(DP[1,]), 4534) + expect_equal(sum(DP[,5]), 56547) }) }) From ee509819f866fc241d8002177e2bc2999cc4e6d9 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Fri, 27 Mar 2026 09:02:21 -0400 Subject: [PATCH 15/37] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- NEWS.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/NEWS.md b/NEWS.md index 1a0d64d..84f1c79 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,9 +1,9 @@ # BIGr 0.6.3 -- New function to check MADC file: check_madc_sanity. By now, it checks presence of required columns, if fixed allele IDs were assigned, presence of IUPAC codes, lower case sequences bases, indels, and chromosome and position information. -- Add new arguments markers_info for users to add CSV file with marker information such as CHROM, POS, Marker Type and position of indels. For BI species, these information is at https://github.com/Breeding-Insight/BIGapp-PanelHub -- Check inputs for madc2vcf_all -- Update affiliation on DESCRIPTION +- New function to check MADC files: `check_madc_sanity`. Currently, it checks for the presence of required columns, whether fixed allele IDs were assigned, the presence of IUPAC codes, lowercase sequence bases, indels, and chromosome and position information. +- Added new argument `markers_info`, which allows users to provide a CSV file with marker information such as CHROM, POS, marker type, and position of indels. For BI species, this information is available from [PanelHub](https://github.com/Breeding-Insight/BIGapp-PanelHub). +- Checked inputs for `madc2vcf_all`. +- Updated affiliation in `DESCRIPTION`. # BIGr 0.6.2 From d3a40610f6a862d187086f220d70581dc6a1b0f1 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Fri, 27 Mar 2026 09:03:19 -0400 Subject: [PATCH 16/37] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- R/check_madc_sanity.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/check_madc_sanity.R b/R/check_madc_sanity.R index 19d3ff6..692f81b 100644 --- a/R/check_madc_sanity.R +++ b/R/check_madc_sanity.R @@ -91,8 +91,8 @@ check_madc_sanity <- function(report) { messages[["Columns"]] <- c("Required columns are present\n", "One or more required columns missing. Verify if your file has columns: CloneID, AlleleID, AlleleSequence\n") messages[["FixAlleleIDs"]] <- c("Fixed Allele IDs look good\n", - "MADC not processed by BI. Please contact us to assign allele IDs to your MADC according to the specie haplotype dabatase. This guarantee reproducibility between diferent datasets\n") - messages[["IUPACcodes"]] <- c("IUPAC (non-ATCG) codes found in AlleleSequence. This codes are not currently supported\n", + "MADC was not processed by BI. Please contact us to assign allele IDs to your MADC using the species haplotype database. This guarantees reproducibility across different datasets.\n") + messages[["IUPACcodes"]] <- c("IUPAC (non-ATCG) codes found in AlleleSequence. These codes are not currently supported\n", "No IUPAC (non-ATCG) codes found in AlleleSequence\n") messages[["LowerCase"]] <- c("Lowercase bases found in AlleleSequence\n", "No lowercase bases found in AlleleSequence\n") From f765c7cc8bbb79d6a78e48683aa9b1dcdc08802d Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Fri, 27 Mar 2026 09:05:24 -0400 Subject: [PATCH 17/37] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- R/filterVCF.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/filterVCF.R b/R/filterVCF.R index 550468c..14f94c0 100644 --- a/R/filterVCF.R +++ b/R/filterVCF.R @@ -36,7 +36,7 @@ #' #' @export filterVCF <- function(vcf.file, - quality.rates = F, + quality.rates = FALSE, filter.OD = NULL, filter.BIAS.min = NULL, filter.BIAS.max = NULL, From 87bb1fc75464759629057450e77fa925eae1f21a Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Fri, 27 Mar 2026 09:08:14 -0400 Subject: [PATCH 18/37] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- R/check_ped.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/check_ped.R b/R/check_ped.R index a2277bc..0ebbd5a 100644 --- a/R/check_ped.R +++ b/R/check_ped.R @@ -93,7 +93,7 @@ check_ped <- function(ped.file, seed = NULL, verbose = TRUE) { repeated_ids_report <- conflicting_ids #### check 3: missing parents #### - for (i in 1:nrow(data)) { + for (i in seq_len(nrow(data))) { id <- data$id[i] sire <- data$sire[i] dam <- data$dam[i] From 6059c108f31dfb2e041aba9ac221625c372b86d8 Mon Sep 17 00:00:00 2001 From: Cris Taniguti Date: Fri, 27 Mar 2026 10:33:10 -0400 Subject: [PATCH 19/37] Update R/madc2vcf_targets.R Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- R/madc2vcf_targets.R | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/R/madc2vcf_targets.R b/R/madc2vcf_targets.R index 1416044..c7fd269 100644 --- a/R/madc2vcf_targets.R +++ b/R/madc2vcf_targets.R @@ -523,8 +523,21 @@ madc2vcf_targets <- function(madc_file, } if(sum(is.na(vcf_df$REF)) > 0) { - vmsg(paste(sum(is.na(vcf_df$REF)), "markers removed because of presence of more than one polymorphism between ref and alt sequences."), verbose = verbose, level = 1, type = ">>") - warning(paste("Markers removed because of presence of more than one polymorphism between ref and alt sequences:",sum(is.na(vcf_df$REF)))) + vmsg( + paste( + sum(is.na(vcf_df$REF)), + "markers removed because REF could not be reliably determined (e.g., multiple polymorphisms or no difference between probe sequences)." + ), + verbose = verbose, + level = 1, + type = ">>" + ) + warning( + paste( + "Markers removed because REF could not be reliably determined (e.g., multiple polymorphisms or no difference between probe sequences):", + sum(is.na(vcf_df$REF)) + ) + ) vcf_df <- vcf_df[-which(is.na(vcf_df$REF)),] } From b09b0c1faba04150d51acd4716e53fa1afeff094 Mon Sep 17 00:00:00 2001 From: Cris Taniguti Date: Fri, 27 Mar 2026 10:38:45 -0400 Subject: [PATCH 20/37] Update R/check_madc_sanity.R Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- R/check_madc_sanity.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/check_madc_sanity.R b/R/check_madc_sanity.R index 759fce3..472c9ab 100644 --- a/R/check_madc_sanity.R +++ b/R/check_madc_sanity.R @@ -137,7 +137,7 @@ check_madc_sanity <- function(report) { indel_mask[same_len] <- n_diffs > 1 } - checks["Indels"] <- any(indel_mask) | any(grepl("-", report$AlleleSequence)) + checks["Indels"] <- any(indel_mask) | any(grepl("-", report$AlleleSequence), na.rm = TRUE) indels <- if (any(indel_mask)) merged$CloneID[indel_mask] else character(0) } else { From 409dbd33ff80e49b3e9dee39a2a16fe92cfb7728 Mon Sep 17 00:00:00 2001 From: Cris Taniguti Date: Fri, 27 Mar 2026 10:40:18 -0400 Subject: [PATCH 21/37] Update R/get_countsMADC.R Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- R/get_countsMADC.R | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/R/get_countsMADC.R b/R/get_countsMADC.R index 8cce70b..3854559 100644 --- a/R/get_countsMADC.R +++ b/R/get_countsMADC.R @@ -223,7 +223,14 @@ get_counts <- function(madc_file = NULL, madc_object = NULL, collapse_matches_co if(collapse_matches_counts){ filtered_df <- madc_df[order(madc_df$AlleleID),] %>% - mutate(Type = ifelse(grepl("Alt", AlleleID), "Alt", "Ref")) %>% + # Keep only Ref/Alt alleles and their Match variants; drop other allele types + dplyr::filter(grepl("\\|(Ref|Alt)(Match)?\\b", AlleleID)) %>% + mutate( + Type = dplyr::case_when( + grepl("\\|Alt(Match)?\\b", AlleleID) ~ "Alt", + grepl("\\|Ref(Match)?\\b", AlleleID) ~ "Ref" + ) + ) %>% group_by(CloneID, Type) %>% summarise( AlleleID = paste0(unique(CloneID), "|", unique(Type)), From e6fce1953e1a1f3dc94a9c920efed49c3f62fdb7 Mon Sep 17 00:00:00 2001 From: Cris Taniguti Date: Fri, 27 Mar 2026 10:43:21 -0400 Subject: [PATCH 22/37] Update R/get_countsMADC.R Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- R/get_countsMADC.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/get_countsMADC.R b/R/get_countsMADC.R index 3854559..58a2496 100644 --- a/R/get_countsMADC.R +++ b/R/get_countsMADC.R @@ -235,7 +235,7 @@ get_counts <- function(madc_file = NULL, madc_object = NULL, collapse_matches_co summarise( AlleleID = paste0(unique(CloneID), "|", unique(Type)), AlleleSequence = first(AlleleSequence), - across(where(is.numeric), sum), + across(where(is.numeric), ~ sum(.x, na.rm = TRUE)), .groups = "drop" ) %>% select(AlleleID, CloneID, AlleleSequence, everything(), -Type) From 669ac4eb77ace06ded03a69fbfab29cc1013c680 Mon Sep 17 00:00:00 2001 From: Cris Taniguti Date: Fri, 27 Mar 2026 10:49:14 -0400 Subject: [PATCH 23/37] Update R/check_madc_sanity.R Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- R/check_madc_sanity.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/check_madc_sanity.R b/R/check_madc_sanity.R index 472c9ab..87df09b 100644 --- a/R/check_madc_sanity.R +++ b/R/check_madc_sanity.R @@ -147,7 +147,7 @@ check_madc_sanity <- function(report) { # --- All NA ---- checks["allNArow"] <- any(apply(report, 1, function(x) all(is.na(x) | x == ""))) - checks["allNAcol"] <- any(apply(report, 2, function(x) all(is.na(x)) | x == "")) + checks["allNAcol"] <- any(apply(report, 2, function(x) all(is.na(x) | x == ""))) # ---- Chrom Pos ---- if(!any(is.na(report$CloneID))) { From bbfbee226439300609186c21b0fb047c54c8a33d Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Fri, 27 Mar 2026 10:50:24 -0400 Subject: [PATCH 24/37] fix tests --- tests/testthat/test-check_madc_sanity.R | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/testthat/test-check_madc_sanity.R b/tests/testthat/test-check_madc_sanity.R index 0fbe774..a185997 100644 --- a/tests/testthat/test-check_madc_sanity.R +++ b/tests/testthat/test-check_madc_sanity.R @@ -1,13 +1,13 @@ test_that("check madc",{ github_path <- "https://raw.githubusercontent.com/Breeding-Insight/BIGapp-PanelHub/refs/heads/long_seq/test_madcs/" - names <- c("Columns", "FixAlleleIDs", "IUPACcodes", "LowerCase", "Indels", "ChromPos", "allNAcol", "allNArow", "RefAltSeqs") + names <- c("Columns", "FixAlleleIDs", "IUPACcodes", "LowerCase", "Indels", "ChromPos", "allNAcol", "allNArow", "RefAltSeqs", "OtherAlleles") # raw madc report <- read.csv(paste0(github_path,"/alfalfa_madc_raw.csv")) res <- check_madc_sanity(report) - exp <- c(TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, FALSE, TRUE) + exp <- c(TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, FALSE, TRUE, FALSE) names(exp) <- names expect_equal(res$checks, exp) @@ -15,7 +15,7 @@ test_that("check madc",{ report <- read.csv(paste0(github_path,"/alfalfa_lowercase.csv")) res <- check_madc_sanity(report) - exp <- c(TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE, TRUE) + exp <- c(TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE, TRUE, FALSE) names(exp) <- names expect_equal(res$checks, exp) @@ -23,7 +23,7 @@ test_that("check madc",{ report <- read.csv(paste0(github_path,"/alfalfa_IUPAC.csv")) res <- check_madc_sanity(report) - exp <- c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE,TRUE) + exp <- c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE,TRUE, FALSE) names(exp) <- names expect_equal(res$checks, exp) @@ -31,7 +31,7 @@ test_that("check madc",{ report <- read.csv(paste0(github_path,"/alfalfa_madc.csv")) res <- check_madc_sanity(report) - exp <- c(TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE,TRUE) + exp <- c(TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE,TRUE, FALSE) names(exp) <- names expect_equal(res$checks, exp) @@ -39,7 +39,7 @@ test_that("check madc",{ report <- read.csv(paste0(github_path,"/potato_indel_madc.csv")) res <- check_madc_sanity(report) - exp <- c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE,TRUE) + exp <- c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE,TRUE, FALSE) names(exp) <- names expect_equal(res$checks, exp) @@ -47,7 +47,7 @@ test_that("check madc",{ report <- read.csv(paste0(github_path,"/potato_indel_IUPAC.csv")) res <- check_madc_sanity(report) - exp <- c(TRUE, TRUE, TRUE, FALSE, TRUE, FALSE, FALSE, FALSE,TRUE) + exp <- c(TRUE, TRUE, TRUE, FALSE, TRUE, FALSE, FALSE, FALSE,TRUE, FALSE) names(exp) <- names expect_equal(res$checks, exp) @@ -55,7 +55,7 @@ test_that("check madc",{ report <- read.csv(paste0(github_path,"/potato_indel_lowercase.csv")) res <- check_madc_sanity(report) - exp <- c(TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE,TRUE) + exp <- c(TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE,TRUE, FALSE) names(exp) <- names expect_equal(res$checks, exp) @@ -63,7 +63,7 @@ test_that("check madc",{ report <- read.csv(paste0(github_path,"/potato_more_indels_madc_ChromPosFALSE.csv")) res <- check_madc_sanity(report) - exp <- c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE,TRUE) + exp <- c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE,TRUE, FALSE) names(exp) <- names expect_equal(res$checks, exp) }) From 55ee61a91afafbb429011346f93c77a906334ef3 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Fri, 27 Mar 2026 17:02:56 -0400 Subject: [PATCH 25/37] madc2vcf_all indels support okay --- NAMESPACE | 1 + R/madc2vcf_all.R | 152 +++++-- R/utils.R | 14 + man/check_madc_sanity.Rd | 49 ++- man/filterVCF.Rd | 2 +- man/madc2vcf_all.Rd | 8 +- man/madc2vcf_targets.Rd | 77 +++- tests/testthat/test-madc2vcf_all.R | 634 +++++++++++++++++++++++++++++ 8 files changed, 859 insertions(+), 78 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 43c0559..6ecaef4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -21,6 +21,7 @@ export(merge_MADCs) export(solve_composition_poly) export(thinSNP) export(updog2vcf) +export(url_exists) export(vmsg) import(dplyr) import(ggplot2) diff --git a/R/madc2vcf_all.R b/R/madc2vcf_all.R index 3cab307..08300f6 100644 --- a/R/madc2vcf_all.R +++ b/R/madc2vcf_all.R @@ -2,8 +2,8 @@ #' #' This function processes a MADC file to generate a VCF file containing both target and off-target SNPs. It includes options for filtering multiallelic SNPs and parallel processing to improve performance. #' -#' @param madc A string specifying the path to the MADC file. -#' @param botloci_file A string specifying the path to the file containing the target IDs designed in the bottom strand. +#' @param madc Required. A string specifying the path or URL to the MADC file. +#' @param botloci_file Required. A string specifying the path or URL to the file containing the target IDs designed in the bottom strand. #' @param hap_seq_file A string specifying the path to the haplotype database fasta file. #' @param rm_multiallelic_SNP A logical value. If TRUE, SNPs with more than one alternative base are removed. If FALSE, the thresholds specified by `multiallelic_SNP_dp_thr` and `multiallelic_SNP_sample_thr` are used to filter low-frequency SNP alleles. Default is FALSE. #' @param multiallelic_SNP_dp_thr A numeric value specifying the minimum depth by tag threshold for filtering low-frequency SNP alleles when `rm_multiallelic_SNP` is FALSE. Default is 0. @@ -51,8 +51,8 @@ #' @import vcfR #' #' @export -madc2vcf_all <- function(madc = NULL, - botloci_file = NULL, +madc2vcf_all <- function(madc, + botloci_file, hap_seq_file = NULL, n.cores = 1, rm_multiallelic_SNP = FALSE, @@ -66,9 +66,9 @@ madc2vcf_all <- function(madc = NULL, vmsg("Checking inputs", verbose = verbose, level = 0, type = ">>") # Input checks - if(!is.null(madc) & !file.exists(madc)) stop("MADC file not found. Please provide a valid path.") - if(!is.null(botloci_file) & !file.exists(botloci_file)) stop("Botloci file not found. Please provide a valid path.") - if(!is.null(hap_seq_file) & !file.exists(hap_seq_file)) stop("Haplotype sequence file not found. Please provide a valid path.") + if(!(file.exists(madc) | url_exists(madc))) stop("MADC file not found. Please provide a valid path or URL.") + if(!(file.exists(botloci_file) | url_exists(botloci_file))) stop("Botloci file not found. Please provide a valid path or URL.") + if(!is.null(hap_seq_file) & !(file.exists(hap_seq_file) | url_exists(hap_seq_file))) stop("Haplotype sequence file not found. Please provide a valid path or URL.") ## n.cores as integer if(!is.numeric(n.cores) | n.cores < 1) stop("n.cores should be a positive integer.") @@ -105,6 +105,9 @@ madc2vcf_all <- function(madc = NULL, if (check) message[1] else message[2] }, checks$checks, checks$messages) + for(i in seq_along(messages_results)) + vmsg(messages_results[i], verbose = verbose, level = 1, type = ">>") + if(any(!(checks$checks[c("Columns", "FixAlleleIDs")]))){ idx <- which(!(checks$checks[c("Columns", "FixAlleleIDs")])) stop(paste("The MADC file does not pass the sanity checks:\n", @@ -116,11 +119,31 @@ madc2vcf_all <- function(madc = NULL, stop(paste(messages_results[c("IUPACcodes")[idx]], collapse = "\n")) } + # Check whether markers_info is present and contains Ref + Alt columns + if(!is.null(markers_info)) { + mi_df <- read.csv(markers_info) + # Standardize marker ID column to CloneID + if(!"CloneID" %in% colnames(mi_df) && "BI_markerID" %in% colnames(mi_df)) { + colnames(mi_df)[colnames(mi_df) == "BI_markerID"] <- "CloneID" + vmsg("markers_info: 'BI_markerID' column renamed to 'CloneID' for internal use.", verbose = verbose, level = 1) + } else if(!"CloneID" %in% colnames(mi_df) && !"BI_markerID" %in% colnames(mi_df)) { + stop("markers_info must contain a marker ID column named either 'CloneID' or 'BI_markerID'.") + } + # Validate CloneID values + if(any(is.na(mi_df$CloneID) | mi_df$CloneID == "")) + stop("markers_info CloneID column contains empty or NA values. Please check your markers_info file.") + if(!any(mi_df$CloneID %in% report$CloneID)) + stop("None of the markers_info CloneID values match the MADC CloneID column. Please make sure they use the same marker IDs.") + n_match <- sum(mi_df$CloneID %in% report$CloneID) + n_total <- length(unique(report$CloneID)) + if(n_match < n_total) + vmsg("%s of %s MADC CloneIDs found in markers_info. Unmatched markers will be removed.", verbose = verbose, level = 1, n_match, n_total) + } else mi_df <- NULL + if(any(!checks$checks[c("ChromPos")])){ if(is.null(markers_info)) { stop(paste(messages_results[c("ChromPos")], collapse = "\n")) } else { - mi_df <- read.csv(markers_info) if(!all(c("Chr", "Pos") %in% colnames(mi_df))) stop("ChromPos check failed: CloneID values do not follow the Chr_Position format. ", "The markers_info file must contain 'Chr' and 'Pos' columns to supply CHROM and POS.") @@ -130,13 +153,20 @@ madc2vcf_all <- function(madc = NULL, if(any(checks$checks[c("Indels")])){ idx <- which((checks$checks[c("Indels")])) if(is.null(markers_info)) { - stop(paste(messages_results[c("Indels")[idx]], collapse = "\n")) + vmsg("The MADC file contains indels and markers_info file is not provided. Tags with indels as targets will be flagged with warnings and removed from the analysis. Provide markers_info with REF/ALT/Indel_pos if you want to include the targets indels.",verbose = verbose, level = 1, type = ">>>") } else { - mi_df <- read.csv(markers_info) if(checks$checks["Indels"] && !all(c("Ref", "Alt", "Indel_pos") %in% colnames(mi_df))) stop("Indels detected in MADC file. ", "The markers_info file must contain 'Ref', 'Alt', and 'Indel_pos' columns.") + if(!"Type" %in% colnames(mi_df)) { + mi_df$Type <- ifelse(nchar(as.character(mi_df$Ref)) > 1 | nchar(as.character(mi_df$Alt)) > 1, + "Indel", "SNP") + vmsg("markers_info: 'Type' column not found. Derived from Ref/Alt lengths (%s SNPs, %s Indels).", + verbose = verbose, level = 1, + sum(mi_df$Type == "SNP"), sum(mi_df$Type == "Indel")) + } + vmsg("The MADC file contains indels and markers_info file was provided with all required columns. Target indels will be exported, but no off-targets are extracted from these tags due to higher likelihood of pairwise alignment errors.",verbose = verbose, level = 1, type = ">>>") } } @@ -145,21 +175,34 @@ madc2vcf_all <- function(madc = NULL, report$AlleleSequence <- toupper(report$AlleleSequence) } - if(!is.null(botloci_file)) botloci <- read.csv(botloci_file, header = F) else stop("Please provide a botloci file") + if(!checks$checks["RefAltSeqs"] && is.null(hap_seq_file)){ + vmsg("Not all Ref sequences have a corresponding Alt or vice-verse. Provide hap_seq_file for this function to recover the missing tags or tags with missing pairs will be discarded.", verbose = verbose, level = 1) + } + + botloci <- read.csv(botloci_file, header = F) if(!is.null(hap_seq_file)) hap_seq <- read.table(hap_seq_file, header = F) else hap_seq <- NULL # Check marker names compatibility between MADC and botloci - checked_botloci <- check_botloci(botloci, report) + checked_botloci <- check_botloci(botloci, report, ChromPos = checks$checks["ChromPos"], mi_df = mi_df, verbose = verbose) botloci <- checked_botloci[[1]] report <- checked_botloci[[2]] + vmsg("Input checks done!", verbose = verbose, level = 1, type = ">>") + + vmsg("Starting conversion...", verbose = verbose, level = 0, type = ">>") + my_results_csv <- loop_though_dartag_report(report, botloci, hap_seq, n.cores=n.cores, alignment_score_thr = alignment_score_thr, + mi_df = mi_df, verbose = verbose) + vmsg("All information gathered!", verbose = verbose, level = 1, type = ">>") + + vmsg("Adding information to a VCF body...", verbose = verbose, level = 0, type = ">>") + vcf_body <- create_VCF_body(csv = my_results_csv, n.cores = n.cores, rm_multiallelic_SNP = rm_multiallelic_SNP, @@ -201,9 +244,9 @@ madc2vcf_all <- function(madc = NULL, #' @import parallel #' #' @noRd -loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, alignment_score_thr=40, verbose = TRUE){ +loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, alignment_score_thr=40, checks = NULL, mi_df = NULL, verbose = TRUE){ - if(!is.null(hap_seq)){ + if(!is.null(hap_seq) & (is.null(checks) | !isTRUE(checks$checks["RefAltSeqs"]))){ hap_seq <- get_ref_alt_hap_seq(hap_seq, botloci) } @@ -217,7 +260,7 @@ loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, align clust <- makeCluster(n.cores) #clusterExport(clust, c("hap_seq","add_ref_alt", "nsamples")) - add_ref_alt_results <- parLapply(clust, by_cloneID, function(x) add_ref_alt(x, hap_seq, nsamples, verbose = verbose)) + add_ref_alt_results <- parLapply(clust, by_cloneID, function(x) add_ref_alt(x, hap_seq, nsamples, verbose = FALSE)) stopCluster(clust) ref_index <- sapply(add_ref_alt_results, "[[",2) @@ -229,19 +272,25 @@ loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, align updated_by_cloneID <- lapply(add_ref_alt_results, "[[",1) - if(verbose){ - cat("The Ref_0001 sequence had to be added for:", sum(ref_index==1),"tags\n") - cat("The Alt_0002 sequence had to be added for:", sum(alt_index==1),"tags\n") - cat("Tags discarded due to lack of Ref_0001 sequence:", sum(ref_index==-1),"tags\n") - cat("Tags discarded due to lack of Alt_0002 sequence:", sum(alt_index==-1),"tags\n") + if(!is.null(hap_seq)){ + vmsg("The haplotype database was provided and used to recover missing Ref_0001 and Alt_0002 sequences.", verbose = verbose, level = 1) + } else { + vmsg("The haplotype database was not provided. Tags with missing Ref_0001 or Alt_0002 sequences were flagged with warnings and removed from the analysis.", verbose = verbose, level = 1) } + vmsg("The Ref_0001 sequence were added for: %s tags", verbose = verbose, level = 2, type = ">>>", sum(ref_index==1)) + vmsg("The Alt_0002 sequence were added for: %s tags", verbose = verbose, level = 2, type = ">>>", sum(alt_index==1)) + vmsg("Tags discarded due to lack of Ref_0001 sequence: %s tags", verbose = verbose, level = 2, type = ">>>", sum(ref_index==-1)) + vmsg("Tags discarded due to lack of Alt_0002 sequence: %s tags", verbose = verbose, level = 2, type = ">>>", sum(alt_index==-1)) + vmsg("Pairwise alignments of sequences to recover SNP position, reference and alternative bases...", verbose = verbose, level = 1) clust <- makeCluster(n.cores) #clusterExport(clust, c("botloci", "compare", "nucleotideSubstitutionMatrix", "pairwiseAlignment", "DNAString", "reverseComplement")) - #clusterExport(clust, c("botloci", "alignment_score_thr")) - compare_results <- parLapply(clust, updated_by_cloneID, function(x) compare(x, botloci, alignment_score_thr)) + #clusterExport(clust, c("botloci", "alignment_score_thr", "mi_df")) + compare_results <- parLapply(clust, updated_by_cloneID, function(x) compare(x, botloci, alignment_score_thr, mi_df, verbose = FALSE)) stopCluster(clust) + vmsg("Pairwise alignments concluded.", verbose = verbose, level = 1) + my_results_csv <- lapply(compare_results, "[[", 1) my_results_csv <- do.call(rbind, my_results_csv) @@ -252,11 +301,9 @@ loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, align rm_indels <- sapply(compare_results, "[[", 4) rm_indels <- unlist(rm_indels) - if(verbose){ - cat("Number of tags removed because of low alignment score:", length(rm_score),"tags\n") - cat("Number of tags removed because of N in the alternative sequence:", length(rm_N),"tags\n") - cat("Number of tags removed because of indels as targets (not yet supported):", length(rm_indels),"tags\n") - } + vmsg("Number of tags removed because of low alignment score: %s tags", verbose = verbose, level = 2, type = ">>>", length(rm_score)) + vmsg("Number of tags removed because of N in the alternative sequence: %s tags", verbose = verbose, level = 2, type = ">>>", length(rm_N)) + vmsg("Number of tags removed because of indels as targets (not yet supported): %s tags", verbose = verbose, level = 2, type = ">>>", length(rm_indels)) rownames(my_results_csv) <- NULL return(my_results_csv) @@ -351,10 +398,44 @@ add_ref_alt <- function(one_tag, hap_seq, nsamples, verbose = TRUE) { #' @importFrom pwalign pairwiseAlignment nucleotideSubstitutionMatrix #' #' @noRd -compare <- function(one_tag, botloci, alignment_score_thr = 40){ +compare <- function(one_tag, botloci, alignment_score_thr = 40, mi_df= NULL, verbose = FALSE){ + # for(i in 1507:length(updated_by_cloneID)){ + # one_tag <- updated_by_cloneID[[i]] + cloneID <- one_tag$CloneID[1] + isBotLoci <- cloneID %in% botloci[,1] - # If marker is present in the botloc list, get the reverse complement sequence + if(!is.null(mi_df)) { + one_mi_df <- mi_df[which(mi_df$CloneID %in% cloneID), ] + # Handle duplicate CloneIDs in markers_info + if(nrow(one_mi_df) > 1) { + key_cols <- intersect(c("CloneID", "Chr", "Pos", "Ref", "Alt", "Type"), colnames(one_mi_df)) + if(nrow(unique(one_mi_df[, key_cols])) == 1) { + warning("Duplicate CloneID '", cloneID, "' found in markers_info with identical key columns. Keeping the first entry.") + one_mi_df <- one_mi_df[1, ] + } else { + stop("Duplicate CloneID '", cloneID, "' found in markers_info with differing values in key columns (CloneID, Chr, Pos, Ref, Alt, Type). Please resolve the conflict in your markers_info file.") + } + } + isIndel <- tolower(one_mi_df$Type) == "indel" + } else { + isIndel <- FALSE + } + + if(isIndel){ + update_tag <- one_tag[grep("Ref_0001$",one_tag$AlleleID),] + update_tag[,2:5] <- c(one_mi_df$Chr, one_mi_df$Pos, one_mi_df$Ref, one_mi_df$Alt) + update_tag_temp <- one_tag[grep("Alt_0002$",one_tag$AlleleID),] + update_tag_temp[,2:5] <- c(one_mi_df$Chr, one_mi_df$Pos, one_mi_df$Ref, one_mi_df$Alt) + update_tag <- rbind(update_tag, update_tag_temp) + + return(list(update_tag = update_tag, + rm_score = NULL, + rm_N = NULL, + rm_indels = NULL)) + } + + # If marker is present in the botloci list, get the reverse complement sequence if(isBotLoci) one_tag$AlleleSequence <- sapply(one_tag$AlleleSequence, function(x) as.character(reverseComplement(DNAString(x)))) chr <- sapply(strsplit(cloneID, "_"), function(x) x[-length(x)]) @@ -368,7 +449,10 @@ compare <- function(one_tag, botloci, alignment_score_thr = 40){ sigma <- nucleotideSubstitutionMatrix(match = 1, mismatch = -0.5, baseOnly = FALSE) # baseOnly = FALSE avoid breaking when N is present (they will be filtered after)) align <- pairwiseAlignment(ref_seq, alt_seq, - substitutionMatrix = sigma,gapOpening=-1.4, gapExtension=-0.1, type = "global") + substitutionMatrix = sigma, + gapOpening=-1.4, + gapExtension=-0.1, + type = "global") # The score is a bit different from the python script despite same weights if(align@score > alignment_score_thr){ # if score for the target sequence is smaller than the threshold, the tag will be discarted @@ -529,9 +613,7 @@ create_VCF_body <- function(csv, vcf_tag_list1 <- lapply(vcf_tag_list, "[[", 1) rm_mks <- sapply(vcf_tag_list, "[[" ,2) - if(verbose){ - print(paste("SNP removed because presented more than one allele:", sum(rm_mks))) - } + vmsg("SNP removed because presented more than one allele: %s", verbose = verbose, level = 2, type = ">>>",sum(rm_mks)) for(i in seq_along(vcf_tag_list1)) { if(is.vector(vcf_tag_list1[[i]])) { @@ -555,9 +637,7 @@ create_VCF_body <- function(csv, if(length(which(duplicated(vcf_body[,3]))) > 0){ repeated <- vcf_body[which(duplicated(vcf_body[,3])), 4] - if(verbose){ - print(paste("Different primers pair capture same SNP positions in", length(repeated), "locations. The repeated were discarded.")) - } + vmsg("Different primers pair capture same SNP positions in %s locations. The repeated were discarded.", verbose = verbose, level = 2, length(repeated)) repeated_tab <- vcf_body[which(vcf_body[,4] %in% repeated),] vcf_body_new <- vcf_body[-which(vcf_body[,4] %in% repeated),] diff --git a/R/utils.R b/R/utils.R index a270570..cf3cc6a 100644 --- a/R/utils.R +++ b/R/utils.R @@ -69,6 +69,20 @@ vmsg <- function(text, verbose = FALSE, level = 1, type = ">>", ...) { message(formatted_msg) } + +#' Check Whether a URL Is Accessible +#' +#' Attempts to open a connection to the given URL and returns `TRUE` if +#' successful, `FALSE` otherwise. Errors and warnings are both treated as +#' inaccessible. +#' +#' @param u character. The URL to test. +#' +#' @return A single logical: `TRUE` if the URL can be opened, `FALSE` if not. +#' +#' @keywords internal +#' @noRd +#' @export url_exists <- function(u) { tryCatch({ con <- url(u, open = "rb") diff --git a/man/check_madc_sanity.Rd b/man/check_madc_sanity.Rd index 5328a4f..0398625 100644 --- a/man/check_madc_sanity.Rd +++ b/man/check_madc_sanity.Rd @@ -14,23 +14,30 @@ If \code{FixAlleleIDs} is \code{FALSE} (raw DArT format), the first 7 rows are treated as header filler and skipped before further checks are run.} } \value{ -A named list with three elements: +A named list with five elements: \describe{ -\item{checks}{Named logical vector with eight entries: +\item{checks}{Named logical vector with nine entries: \code{Columns}, \code{FixAlleleIDs}, \code{IUPACcodes}, \code{LowerCase}, \code{Indels}, -\code{ChromPos}, \code{allNAcol}, \code{allNArow}. -\code{TRUE} means the condition was detected (or passed for \code{Columns} and -\code{FixAlleleIDs}); \code{NA} means the check was skipped.} +\code{ChromPos}, \code{allNAcol}, \code{allNArow}, \code{RefAltSeqs}. +\code{TRUE} means the condition was detected (or passed for \code{Columns}, +\code{FixAlleleIDs}, \code{ChromPos}, and \code{RefAltSeqs}); \code{NA} means the check +was skipped.} \item{messages}{Named list of length-2 character vectors, one per check. Element \verb{[[1]]} is the message when the check is \code{TRUE}, element \verb{[[2]]} when it is \code{FALSE}. Indexed by the same names as \code{checks}.} \item{indel_clone_ids}{Character vector of \code{CloneID}s where ref/alt lengths differ. Returns \code{character(0)} if none are found, or \code{NULL} when required columns are missing.} +\item{missRef}{Character vector of \code{CloneID}s that have no \code{Ref} allele +row. Returns \code{character(0)} if all \code{CloneID}s have a \code{Ref} row, or +\code{NULL} when required columns are missing.} +\item{missAlt}{Character vector of \code{CloneID}s that have no \code{Alt} allele +row. Returns \code{character(0)} if all \code{CloneID}s have an \code{Alt} row, or +\code{NULL} when required columns are missing.} } } \description{ -Performs eight quick validations on an allele report: +Performs nine quick validations on an allele report: \enumerate{ \item \strong{Columns} - required columns are present (\code{CloneID}, \code{AlleleID}, \code{AlleleSequence}); \item \strong{FixAlleleIDs} - first column's first up-to-6 rows are not all blank or \code{"*"} @@ -41,27 +48,39 @@ Performs eight quick validations on an allele report: or a \code{"-"} character is present in \code{AlleleSequence}; \item \strong{ChromPos} - all \code{CloneID} values follow the \code{Chr_Position} format (prefix matches \code{"chr"} case-insensitively, suffix is a positive integer); -\item \strong{allNAcol} - at least one column contains only \code{NA} values; -\item \strong{allNArow} - at least one row contains only \code{NA} values. +\item \strong{allNAcol} - at least one column contains only \code{NA} or empty values; +\item \strong{allNArow} - at least one row contains only \code{NA} or empty values; +\item \strong{RefAltSeqs} - every \code{CloneID} has at least one \code{Ref} and one \code{Alt} allele row. } } \details{ \itemize{ -\item \strong{FixAlleleIDs:} When the check fails (raw DArT format), row 7 is used as -the column header and the first 7 rows are dropped before subsequent checks. +\item \strong{FixAlleleIDs:} When the first six rows of the first column are all blank +or \code{"*"} (raw DArT format), row 7 is promoted to column headers and the +first 7 rows are dropped before subsequent checks are run. The check is +\code{TRUE} when the file has already been processed by HapApp (fixed IDs with +\verb{_0001}/\verb{_0002} suffixes). \item \strong{IUPAC check:} Flags any character outside \code{A}, \code{T}, \code{C}, \code{G} and \code{"-"} (case-insensitive), which includes ambiguity codes (\code{N}, \code{R}, \code{Y}, etc.). \item \strong{Indels:} Rows are split by \code{AlleleID} containing \code{"Ref_0001"} vs -\code{"Alt_0002"}, merged by \code{CloneID}, and the lengths of \code{AlleleSequence} -are compared. A \code{"-"} anywhere in \code{AlleleSequence} is also treated as -evidence of an indel. +\code{"Alt_0002"}, merged by \code{CloneID}, and flagged as indels if either (a) the +lengths of \code{AlleleSequence} differ, (b) the sequences have the same length +but more than one character differs between them (complex indel / local +rearrangement), or (c) a \code{"-"} character is present anywhere in +\code{AlleleSequence}. \item \strong{ChromPos:} Each \code{CloneID} is split on \code{"_"} into exactly two parts; the first part must match \code{"Chr"} (case-insensitive) and the second must be a positive integer. Returns \code{FALSE} when any \code{CloneID} is \code{NA}. \item \strong{allNAcol / allNArow:} Detected via \code{apply()} over columns/rows -respectively; useful for flagging empty or corrupt entries. +respectively; a cell is treated as missing when it is \code{NA} or an empty +string (\code{""}). Useful for flagging empty or corrupt entries. +\item \strong{RefAltSeqs:} For each unique \code{CloneID}, checks whether at least one row +with a \code{Ref} (\verb{|Ref_} when \code{FixAlleleIDs = TRUE}, \verb{|Ref$} otherwise) and +one row with an \code{Alt} (\verb{|Alt_} / \verb{|Alt$}) allele exist. \code{CloneID}s that +lack a \code{Ref} row are stored in \code{missRef}; those lacking an \code{Alt} row in +\code{missAlt}. The check is \code{TRUE} when both sets are empty. \item If required columns are missing (\code{Columns = FALSE}), only \code{Columns} and \code{FixAlleleIDs} are evaluated; all other checks remain \code{NA} and -\code{indel_clone_ids} is returned as \code{NULL}. +\code{indel_clone_ids}, \code{missRef}, and \code{missAlt} are returned as \code{NULL}. } } diff --git a/man/filterVCF.Rd b/man/filterVCF.Rd index 39d7264..2a3ab62 100644 --- a/man/filterVCF.Rd +++ b/man/filterVCF.Rd @@ -6,7 +6,7 @@ \usage{ filterVCF( vcf.file, - quality.rates = F, + quality.rates = FALSE, filter.OD = NULL, filter.BIAS.min = NULL, filter.BIAS.max = NULL, diff --git a/man/madc2vcf_all.Rd b/man/madc2vcf_all.Rd index 58cfcb3..ac2de3f 100644 --- a/man/madc2vcf_all.Rd +++ b/man/madc2vcf_all.Rd @@ -5,8 +5,8 @@ \title{Converts MADC file to VCF recovering target and off-target SNPs} \usage{ madc2vcf_all( - madc = NULL, - botloci_file = NULL, + madc, + botloci_file, hap_seq_file = NULL, n.cores = 1, rm_multiallelic_SNP = FALSE, @@ -19,9 +19,9 @@ madc2vcf_all( ) } \arguments{ -\item{madc}{A string specifying the path to the MADC file.} +\item{madc}{Required. A string specifying the path or URL to the MADC file.} -\item{botloci_file}{A string specifying the path to the file containing the target IDs designed in the bottom strand.} +\item{botloci_file}{Required. A string specifying the path or URL to the file containing the target IDs designed in the bottom strand.} \item{hap_seq_file}{A string specifying the path to the haplotype database fasta file.} diff --git a/man/madc2vcf_targets.Rd b/man/madc2vcf_targets.Rd index 1da55d8..30363a6 100644 --- a/man/madc2vcf_targets.Rd +++ b/man/madc2vcf_targets.Rd @@ -21,22 +21,33 @@ madc2vcf_targets( \item{botloci_file}{character or \code{NULL} (default \code{NULL}). Path to a plain-text file listing target IDs designed on the \strong{bottom} strand (one ID per line). -Required only when \code{get_REF_ALT = TRUE} and \code{markers_info} does not supply -\code{Ref} and \code{Alt} columns.} +Used for strand-correcting probe sequences when \code{get_REF_ALT = TRUE} and +\code{markers_info} does not supply \code{Ref} and \code{Alt} columns. Also required when +\code{ChromPos} is invalid and \code{markers_info} does not provide \code{Ref}/\code{Alt}.} \item{markers_info}{character or \code{NULL}. Optional path to a CSV providing target -metadata. Minimum required columns: \code{CloneID} (or \code{BI_markerID}), \code{Chr}, \code{Pos}. -When \code{get_REF_ALT = TRUE}, also requires \code{Ref} and \code{Alt} (replaces probe-sequence -inference). \code{Type} and \code{Indel_pos} are never required by this function.} +metadata. Accepted columns: +\itemize{ +\item \code{CloneID} or \code{BI_markerID} (required as marker identifier); +\item \code{Chr}, \code{Pos} — required when \code{CloneID} does not follow the \code{Chr_Pos} format; +\item \code{Ref}, \code{Alt} — required when \code{get_REF_ALT = TRUE} and probe-sequence +inference is not possible (IUPAC codes, indels, or unfixed allele IDs). +}} -\item{get_REF_ALT}{logical (default \code{FALSE}). If \code{TRUE}, attempts to infer REF/ALT -bases from the Ref/Alt probe sequences in the MADC file (with strand correction -using \code{botloci_file}). Targets with more than one difference between Ref/Alt -sequences are removed.} +\item{get_REF_ALT}{logical (default \code{FALSE}). If \code{TRUE}, attempts to recover +REF/ALT bases. The source is chosen automatically: \code{markers_info} \code{Ref}/\code{Alt} +columns take priority; otherwise probe sequences from the MADC are compared +(with \code{botloci_file} for strand correction). Targets with more than one +difference between Ref/Alt sequences are removed. When \code{FALSE}, REF and ALT +are set to \code{"."} in the output VCF.} -\item{collapse_matches_counts}{logical (default \code{FALSE}). If \code{TRUE}, counts for targets with identical \code{CHROM_POS} are summed together. This is useful when the MADC file contains multiple rows per target (e.g., due to multiple alleles or technical replicates) and you want to aggregate them into a single entry per unique target.} +\item{collapse_matches_counts}{logical (default \code{FALSE}). If \code{TRUE}, counts for +\verb{|AltMatch} and \verb{|RefMatch} rows are summed into their corresponding \verb{|Ref} +and \verb{|Alt} rows before building the matrices. Useful when the MADC contains +multiple allele rows per target that should be aggregated.} -\item{verbose}{logical (default \code{TRUE}). If \code{TRUE}, prints detailed progress messages about each processing step.} +\item{verbose}{logical (default \code{TRUE}). If \code{TRUE}, prints detailed progress +messages about each processing step.} } \value{ (Invisibly) returns the path to \code{output.file}. The side effect is a @@ -53,14 +64,17 @@ Convert DArTag MADC target read counts to a VCF \strong{What this function does} \itemize{ -\item Runs basic sanity checks on the MADC file (column presence, fixed allele IDs, -IUPAC/ambiguous bases, lowercase bases, indels). +\item Runs basic sanity checks on the MADC file via \code{check_madc_sanity()} (column +presence, fixed allele IDs, IUPAC/ambiguous bases, lowercase bases, indels, +chromosome/position format, all-NA rows/columns, Ref/Alt sequence presence). \item Extracts reference and total read counts per sample and target. \item Derives \code{AD} (ref,alt) by subtraction (alt = total − ref). -\item If \code{get_REF_ALT = TRUE}, attempts to recover true REF/ALT bases by comparing -the Ref/Alt probe sequences; targets with >1 polymorphism are discarded. -\item Optionally accepts a \code{markers_info} CSV to supply \code{CHROM}, \code{POS}, \code{REF}, \code{ALT} -bypassing sequence-based inference. +\item If \code{get_REF_ALT = TRUE}, recovers REF/ALT bases either from \code{markers_info} +(when \code{Ref}/\code{Alt} columns are present) or by comparing the Ref/Alt probe +sequences in the MADC file (with strand correction via \code{botloci_file}). +Targets with >1 polymorphism between sequences are discarded. +\item Optionally accepts a \code{markers_info} CSV to supply \code{CHROM}, \code{POS}, \code{REF}, +\code{ALT}, bypassing sequence-based inference. } \strong{Output VCF layout} @@ -83,11 +97,30 @@ If a target ID appears in \code{botloci_file}, its probe sequences are reverse- complemented prior to base comparison so that REF/ALT are reported in the top-strand genomic orientation. -\strong{Sanity check behavior} -\itemize{ -\item If required columns or fixed IDs are missing, the function \code{stop()}s. -\item If IUPAC/lowercase/indels are detected and \code{markers_info} is \strong{not} -provided, the function \code{stop()}s with a diagnostic message explaining what to fix. +\strong{Sanity check behaviour and requirements} + +The function always stops if required columns (\code{CloneID}, \code{AlleleID}, +\code{AlleleSequence}) are missing. + +For the remaining checks the required inputs depend on the combination of +check result and \code{get_REF_ALT}:\tabular{llll}{ + Check \tab Status \tab \code{get_REF_ALT} \tab Required \cr + \strong{IUPAC codes} \tab detected \tab \code{TRUE} \tab \code{markers_info} with \code{Ref}/\code{Alt} \cr + \tab detected \tab \code{FALSE} \tab — \cr + \tab not detected \tab \code{TRUE} \tab \code{botloci_file} \strong{or} \code{markers_info} with \code{Ref}/\code{Alt} \cr + \tab not detected \tab \code{FALSE} \tab — \cr + \strong{Indels} \tab detected \tab \code{TRUE} \tab \code{markers_info} with \code{Ref}/\code{Alt} \cr + \tab detected \tab \code{FALSE} \tab — \cr + \tab not detected \tab \code{TRUE} \tab \code{botloci_file} \strong{or} \code{markers_info} with \code{Ref}/\code{Alt} \cr + \tab not detected \tab \code{FALSE} \tab — \cr + \strong{ChromPos} \tab valid \tab \code{TRUE} \tab \code{botloci_file} \strong{or} \code{markers_info} with \code{Ref}/\code{Alt} \cr + \tab valid \tab \code{FALSE} \tab — \cr + \tab invalid \tab \code{TRUE} \tab \code{markers_info} with \code{Chr}/\code{Pos}/\code{Ref}/\code{Alt} \strong{or} \code{markers_info} with \code{Chr}/\code{Pos} + \code{botloci_file} \cr + \tab invalid \tab \code{FALSE} \tab \code{markers_info} with \code{Chr}/\code{Pos} \cr + \strong{FixAlleleIDs} \tab fixed \tab \code{TRUE} \tab \code{botloci_file} \strong{or} \code{markers_info} with \code{Ref}/\code{Alt} \cr + \tab fixed \tab \code{FALSE} \tab — \cr + \tab not fixed \tab \code{TRUE} \tab \code{markers_info} with \code{Ref}/\code{Alt} \cr + \tab not fixed \tab \code{FALSE} \tab — \cr } } \section{Dependencies}{ diff --git a/tests/testthat/test-madc2vcf_all.R b/tests/testthat/test-madc2vcf_all.R index f2095c5..7904035 100644 --- a/tests/testthat/test-madc2vcf_all.R +++ b/tests/testthat/test-madc2vcf_all.R @@ -65,3 +65,637 @@ test_that("test madc offtargets",{ rm(vcf) }) + +# ======================================================================= +# Using Breeding-Insight/BIGapp-PanelHub test files +# ======================================================================= + +test_that("simu alfalfa",{ + + github_path <- "https://raw.githubusercontent.com/Breeding-Insight/BIGapp-PanelHub/refs/heads/long_seq/" + + # External alfalfa test files + alfalfa_madc <- paste0(github_path, "test_madcs/alfalfa_madc.csv") + alfalfa_madc_wrongID <- paste0(github_path, "test_madcs/alfalfa_madc_wrongID.csv") + alfalfa_madc_raw <- paste0(github_path, "test_madcs/alfalfa_madc_raw.csv") # raw DArT format (7-row header) + alfalfa_iupac <- paste0(github_path, "test_madcs/alfalfa_IUPAC.csv") + alfalfa_lowercase <- paste0(github_path, "test_madcs/alfalfa_lowercase.csv") + alfalfa_botloci <- paste0(github_path, "alfalfa/20201030-BI-Alfalfa_SNPs_DArTag-probe-design_f180bp.botloci") # botloci for alfalfa + alfalfa_markers_info <- paste0(github_path, "alfalfa/20201030-BI-Alfalfa_SNPs_DArTag-probe-design_snpID_lut.csv") # markers_info: CloneID/BI_markerID, Chr, Pos, Ref, Alt + alfalfa_markers_info_ChromPos <- paste0(github_path, "test_madcs/alfalfa_marker_info_ChromPos.csv") # markers_info: CloneID/BI_markerID, Chr, Pos + alfalfa_microhapDB <- paste0(github_path, "alfalfa/alfalfa_allele_db_v001.fa") + + # External potato test files + potato_indel_madc <- paste0(github_path, "test_madcs/potato_indel_madc.csv") + potato_indel_iupac <- paste0(github_path, "test_madcs/potato_indel_IUPAC.csv") + potato_indel_lowercase <- paste0(github_path, "test_madcs/potato_indel_lowercase.csv") + potato_more_indels_chrompos_false <- paste0(github_path, "test_madcs/potato_more_indels_madc_ChromPosFALSE.csv") + potato_botloci <- paste0(github_path, "potato/potato_dartag_v2_3915markers_rm7dupTags_6traitMarkers_f150bp_ref_alt.botloci") + potato_markers_info <- paste0(github_path, "potato/potato_dartag_v2_3915markers_rm7dupTags_6traitMarkers_rm1dup_snpID_lut.csv") # CloneID/BI_markerID, Chr, Pos, Ref, Alt + potato_markers_info_ChromPos <- paste0(github_path, "test_madcs/potato_marker_info_chrompos.csv") # markers_info: CloneID/BI_markerID, Chr, Pos + potato_microhapDB <- paste0(github_path, "potato/potato_allele_db_v001.fa") + + skip_if_offline("raw.githubusercontent.com") + + test_that("ALFALFA — clean fixed allele ID MADC", { + out <- tempfile(fileext = ".vcf") + expect_no_error( + madc2vcf_all(madc = alfalfa_madc, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 2, + rm_multiallelic_SNP = TRUE, + multiallelic_SNP_sample_thr = 0, + multiallelic_SNP_dp_thr = 0, + alignment_score_thr = 40, + out_vcf = out, + verbose = TRUE) + ) + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + expect_true(all(!is.na(vcf@fix[, "REF"]))) + expect_true(all(!is.na(vcf@fix[, "ALT"]))) + DP <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(DP[1,]), 4534) + expect_equal(sum(DP[,5]), 233482) + unlink(out) + + expect_no_error( + madc2vcf_all(madc = alfalfa_madc, + botloci_file = alfalfa_botloci, + hap_seq_file = NULL, + n.cores = 1, + out_vcf = out, + verbose = FALSE) + ) + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + expect_true(all(is.na(vcf@fix[, "REF"]))) + expect_true(all(is.na(vcf@fix[, "ALT"]))) + DP <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(DP[1,]), 4534) + expect_equal(sum(DP[,5]), 56547) + + # Test error when botloci_file is NULL + expect_error( + madc2vcf_all(madc = alfalfa_madc, + botloci_file = NULL, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + out_vcf = out, + verbose = FALSE) + ) + + # Test that it works when hap_seq_file is provided (REF/ALT recovered from probe sequences) + madc2vcf_all(madc = alfalfa_madc, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + + # Test that it also works when markers_info is provided together with botloci + madc2vcf_all(madc = alfalfa_madc, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + markers_info = alfalfa_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + + }) + + test_that("ALFALFA — clean fixed allele ID MADC wrong CloneID", { + out <- tempfile(fileext = ".vcf") + # Test error when botloci provided but no matching CloneID between botloci and MADC + expect_error( + madc2vcf_all(madc = alfalfa_madc_wrongID, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + out_vcf = out, + verbose = FALSE), + regexp = "Check marker IDs in both MADC and botloci files. They should be the same." + ) + + # Test error when markers_info does not match MADC CloneIDs + expect_error( + madc2vcf_all(madc = alfalfa_madc_wrongID, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + markers_info = alfalfa_markers_info, + out_vcf = out, + verbose = FALSE) + ) + + # Test error when markers_info_ChromPos is provided but IDs still don't match botloci + expect_error( + madc2vcf_all(madc = alfalfa_madc_wrongID, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + markers_info = alfalfa_markers_info_ChromPos, + out_vcf = out, + verbose = FALSE) + ) + }) + + test_that("alfalfa lower case fixed MADC", { + out <- tempfile(fileext = ".vcf") + madc2vcf_all(madc = alfalfa_lowercase, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + + madc2vcf_all(madc = alfalfa_lowercase, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + markers_info = alfalfa_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + + madc2vcf_all(madc = alfalfa_lowercase, + botloci_file = alfalfa_botloci, + hap_seq_file = NULL, + n.cores = 1, + markers_info = alfalfa_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + }) + + test_that("alfalfa IUPAC code", { + out <- tempfile(fileext = ".vcf") + # IUPAC codes cause a stop in madc2vcf_all + expect_error( + madc2vcf_all(madc = alfalfa_iupac, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + out_vcf = out, + verbose = FALSE) + ) + + madc2vcf_all(madc = alfalfa_iupac, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + markers_info = alfalfa_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_all(madc = alfalfa_iupac, + botloci_file = alfalfa_botloci, + hap_seq_file = NULL, + n.cores = 1, + markers_info = alfalfa_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 4534) + expect_equal(sum(dp[,5]), 56547) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_all(madc = alfalfa_iupac, + botloci_file = alfalfa_botloci, + hap_seq_file = NULL, + n.cores = 1, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + + }) + + test_that("potato indel madc chrompos=FALSE", { + out <- tempfile(fileext = ".vcf") + # Indels detected, no markers_info with Ref/Alt/Indel_pos -> error + expect_error( + madc2vcf_all(madc = potato_indel_madc, + botloci_file = potato_botloci, + hap_seq_file = potato_microhapDB, + n.cores = 1, + out_vcf = out, + verbose = FALSE) + ) + + madc2vcf_all(madc = potato_indel_madc, + botloci_file = potato_botloci, + hap_seq_file = potato_microhapDB, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41656) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_all(madc = potato_indel_madc, + botloci_file = potato_botloci, + hap_seq_file = NULL, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 5163) + expect_equal(sum(dp[,5]), 58927) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + # ChromPos=FALSE and no markers_info -> error + expect_error( + madc2vcf_all(madc = potato_indel_madc, + botloci_file = potato_botloci, + hap_seq_file = NULL, + n.cores = 1, + out_vcf = out, + verbose = FALSE) + ) + + madc2vcf_all(madc = potato_indel_madc, + botloci_file = potato_botloci, + hap_seq_file = NULL, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41656) + }) + + test_that("potato indel chromposFALSE", { + out <- tempfile(fileext = ".vcf") + # Indels detected, no markers_info with Ref/Alt/Indel_pos -> error + expect_error( + madc2vcf_all(madc = potato_more_indels_chrompos_false, + botloci_file = potato_botloci, + hap_seq_file = potato_microhapDB, + n.cores = 1, + out_vcf = out, + verbose = FALSE) + ) + + madc2vcf_all(madc = potato_more_indels_chrompos_false, + botloci_file = potato_botloci, + hap_seq_file = potato_microhapDB, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_all(madc = potato_more_indels_chrompos_false, + botloci_file = potato_botloci, + hap_seq_file = NULL, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 6301) + expect_equal(sum(dp[,5]), 53613) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + # ChromPos=FALSE and no markers_info -> error + expect_error( + madc2vcf_all(madc = potato_more_indels_chrompos_false, + botloci_file = potato_botloci, + hap_seq_file = NULL, + n.cores = 1, + out_vcf = out, + verbose = FALSE) + ) + + madc2vcf_all(madc = potato_more_indels_chrompos_false, + botloci_file = potato_botloci, + hap_seq_file = NULL, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + }) + + test_that("potato lowercase", { + out <- tempfile(fileext = ".vcf") + madc2vcf_all(madc = potato_indel_lowercase, + botloci_file = potato_botloci, + hap_seq_file = potato_microhapDB, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + # markers_info without Ref/Alt/Indel_pos while indels present -> error + expect_error( + madc2vcf_all(madc = potato_indel_lowercase, + botloci_file = potato_botloci, + hap_seq_file = potato_microhapDB, + n.cores = 1, + markers_info = potato_markers_info_ChromPos, + out_vcf = out, + verbose = FALSE) + ) + + madc2vcf_all(madc = potato_indel_lowercase, + botloci_file = potato_botloci, + hap_seq_file = potato_microhapDB, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_all(madc = potato_indel_lowercase, + botloci_file = potato_botloci, + hap_seq_file = NULL, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 6301) + expect_equal(sum(dp[,5]), 53613) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_all(madc = potato_indel_lowercase, + botloci_file = potato_botloci, + hap_seq_file = NULL, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + }) + + + test_that("potato IUPAC", { + out <- tempfile(fileext = ".vcf") + madc2vcf_all(madc = potato_indel_iupac, + botloci_file = potato_botloci, + hap_seq_file = potato_microhapDB, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_all(madc = potato_indel_iupac, + botloci_file = potato_botloci, + hap_seq_file = NULL, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 6301) + expect_equal(sum(dp[,5]), 53613) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_all(madc = potato_indel_iupac, + botloci_file = potato_botloci, + hap_seq_file = NULL, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + }) + + test_that("alfalfa raw MADC format (7-row header)", { + out <- tempfile(fileext = ".vcf") + # Raw format fails FixAlleleIDs check -> madc2vcf_all stops + expect_error( + madc2vcf_all(madc = alfalfa_madc_raw, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + out_vcf = out, + verbose = FALSE) + ) + + expect_error( + madc2vcf_all(madc = alfalfa_madc_raw, + botloci_file = alfalfa_botloci, + hap_seq_file = NULL, + n.cores = 1, + out_vcf = out, + verbose = FALSE) + ) + + expect_error( + madc2vcf_all(madc = alfalfa_madc_raw, + botloci_file = alfalfa_botloci, + hap_seq_file = NULL, + n.cores = 1, + markers_info = alfalfa_markers_info, + out_vcf = out, + verbose = FALSE) + ) + + expect_error( + madc2vcf_all(madc = alfalfa_madc_raw, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + markers_info = alfalfa_markers_info, + out_vcf = out, + verbose = FALSE) + ) + }) +}) + From bf5ff4c48889f0ff1153d0b1c0ea0f8f71f6e502 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Tue, 31 Mar 2026 09:56:21 -0400 Subject: [PATCH 26/37] madc2vcf_all support indel --- DESCRIPTION | 2 +- NEWS.md | 41 +++ R/check_madc_sanity.R | 44 +-- R/madc2vcf_all.R | 121 +++++--- R/madc2vcf_targets.R | 1 + tests/testthat/test-madc2vcf_all.R | 412 ++++++++----------------- tests/testthat/test-madc2vcf_targets.R | 35 ++- 7 files changed, 292 insertions(+), 364 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 9a18709..bf782ad 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: BIGr Title: Breeding Insight Genomics Functions for Polyploid and Diploid Species -Version: 0.6.3 +Version: 0.6.5 Authors@R: c(person(given='Alexander M.', family='Sandercock', email='sandercock.alex@gmail.com', diff --git a/NEWS.md b/NEWS.md index fce76f4..8037725 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,44 @@ +# BIGr 0.6.5 + +# Updates on madc2vcf functions +Details: + +- both functions targets and all (targets + off-targets) markers now have `check_madc_sanity` function implemented. It tests: + - [Columns] If MADC has the expected columns + - [allNArow | allNAcol] Presence of columns and rows with all NA (happens often when people open the MADC in excel before loading in R) + - [IUPACcodes] Presence of IUPAC codes on AlleleSequence + - [LowerCase] Presence of lower case bases on AlleleSequence + - [Indels] Presence of Indels + - [ChromPos] If CloneID follows the format Chr_Pos + - [RefAltSeqs] If all Ref Allele has corresponding Alt and vice-versa + - [OtherAlleles] If "Other" exists in the MADC AlleleID + +- Better messages if `verbose = TRUE` in `madc2vcf_all` +- `madc2vcf_all` support for Indels - markers_info with Indels position is required; only the target indel is extracted, off-targets are ignored for the tag +- `madc2vcf_targets` doesn’t run if: + - MADC Column names are not correct + - Ignore Other alleles - but inform the user if they exist or not and direct them to `madc2vcf_all` in case they want to extract them as well +- See the table for madc2vcf_targets requirements accordingly to MADC content: + +  | check status | get_REF_ALT | Requires +-- | -- | -- | -- +IUPAC | TRUE | TRUE | markers_info REF/ALT +  | TRUE | FALSE | - +  | FALSE | TRUE | botloci or markers_info REF/ALT +  | FALSE | FALSE | - +Indels | TRUE | TRUE | markers_info REF/ALT +  | TRUE | FALSE | - +  | FALSE | TRUE | botloci or markers_info REF/ALT +  | FALSE | FALSE | - +ChromPos | TRUE | TRUE | botloci or markers_info REF/ALT +  | TRUE | FALSE | - +  | FALSE | TRUE | markers_info CHR/POS/REF/ALT or markers_info CHR/POS/ + botloci +  | FALSE | FALSE | markers_info CHR/POS +FixAlleleIDs | TRUE | TRUE | botloci or markers_info REF/ALT +  | TRUE | FALSE | - +  | FALSE | TRUE | markers_info REF/ALT +  | FALSE | FALSE | - + # BIGr 0.6.4 - Add function `vmsg` to organize messages printed on the console diff --git a/R/check_madc_sanity.R b/R/check_madc_sanity.R index 87df09b..c36a92e 100644 --- a/R/check_madc_sanity.R +++ b/R/check_madc_sanity.R @@ -132,7 +132,7 @@ check_madc_sanity <- function(report) { n_diffs <- mapply(function(r, a) { r_chars <- strsplit(r, "")[[1]] a_chars <- strsplit(a, "")[[1]] - sum(r_chars != a_chars) + sum(toupper(r_chars) != toupper(a_chars)) }, merged$AlleleSequence_ref[same_len], merged$AlleleSequence_alt[same_len]) indel_mask[same_len] <- n_diffs > 1 } @@ -183,8 +183,8 @@ check_madc_sanity <- function(report) { messages[["Columns"]] <- c("Required columns are present", "One or more required columns missing. Verify if your file has columns: CloneID, AlleleID, AlleleSequence") messages[["FixAlleleIDs"]] <- c("Fixed Allele IDs look good", - "MADC not processed by HapApp.") - messages[["IUPACcodes"]] <- c("IUPAC (non-ATCG) codes found in AlleleSequence. This codes are not currently supported", + "MADC not processed by HapApp") + messages[["IUPACcodes"]] <- c("IUPAC (non-ATCG) codes found in AlleleSequence. This codes are not currently supported by BIGr/BIGapp. Run HapApp to replace them", "No IUPAC (non-ATCG) codes found in AlleleSequence") messages[["LowerCase"]] <- c("Lowercase bases found in AlleleSequence", "No lowercase bases found in AlleleSequence") @@ -192,16 +192,16 @@ check_madc_sanity <- function(report) { "No indels found (ref/alt lengths match and at most 1 mismatch) for all CloneIDs") messages[["ChromPos"]] <- c("Chromosome and Position format in CloneID look good", "CloneID does not have the expected Chromosome_Position format. Please check your CloneIDs or provide a file with this information") - messages[["allNArow"]] <- c("One or more rows contain all NA values.", + messages[["allNArow"]] <- c("One or more rows contain all NA values", "No rows with all NA values") - messages[["allNAcol"]] <- c("One or more columns contain all NA values.", + messages[["allNAcol"]] <- c("One or more columns contain all NA values", "No columns with all NA values") messages[["RefAltSeqs"]] <- c("All CloneIDs have both Ref and Alt alleles", - paste0("Some CloneIDs are missing Ref and/or Alt alleles. ", + paste0("Some CloneIDs are missing Ref and/or Alt alleles ", "Missing Ref: ", paste(missRef, collapse = " "), ". ", "Missing Alt: ", paste(missAlt, collapse = " "), ".")) - messages[["OtherAlleles"]] <- c("Alleles other than Ref and Alt were found in AlleleID.", - "No alleles other than Ref and Alt found in AlleleID.") + messages[["OtherAlleles"]] <- c("Alleles other than Ref and Alt were found in AlleleID", + "No alleles other than Ref and Alt found in AlleleID") list(checks = checks, messages = messages, indel_clone_ids = indels, missRef = missRef, missAlt = missAlt) @@ -248,12 +248,17 @@ check_botloci <- function(botloci, report, ChromPos=TRUE, mi_df = NULL, verbose= if(length(pad_madc) > 1 | length(pad_botloci) > 1) stop("Check marker IDs in both MADC and botloci files. They should be the same.") if(pad_madc != pad_botloci) { - vmsg("Padding between MADC and botloci files do not match. Markers ID modified to match longest padding.", verbose = verbose, level = 1, type = ">>") + vmsg("Padding between MADC and botloci files do not match. Markers ID modified to match longest padding.", verbose = verbose, level = 2, type = ">>") if (pad_madc < pad_botloci) { report$CloneID <- paste0(sub("_(.*)", "", report$CloneID), "_", sprintf(paste0("%0", pad_botloci, "d"), as.integer(sub(".*_", "", report$CloneID))) ) report$AlleleID <- paste0(report$CloneID, "|", sapply(strsplit(report$AlleleID, "[|]"), "[[",2)) + if(!is.null(mi_df)) { + mi_df$CloneID <- paste0(sub("_(.*)", "", mi_df$CloneID), "_", + sprintf(paste0("%0", pad_botloci, "d"), as.integer(sub(".*_", "", mi_df$CloneID))) + ) + } } else { botloci$V1 <- paste0(sub("_(.*)", "", botloci$V1), "_", sprintf(paste0("%0", pad_madc, "d"), as.integer(sub(".*_", "", botloci$V1))) @@ -261,30 +266,31 @@ check_botloci <- function(botloci, report, ChromPos=TRUE, mi_df = NULL, verbose= if(!any(botloci$V1 %in% report$CloneID)) stop("After matching padding, botloci markers still not found in MADC file. Check marker IDs.\n") } } else if (!(is.null(mi_df$Chr) | is.null(mi_df$Pos))){ - vmsg("It is not a padding mismatch issue.", verbose = verbose, level = 1, type = ">>") - vmsg("Checking if jointing provided Chromosome and Position information in marker_file solve the issue", verbose = verbose, level = 1, type = ">>") + vmsg("It is not a padding mismatch issue.", verbose = verbose, level = 2, type = ">>") + vmsg("Checking if jointing provided Chromosome and Position information in marker_file solve the issue", verbose = verbose, level = 2, type = ">>") if(!any(mi_df$CloneID %in% report$CloneID) & !any(mi_df$BI_markerID %in% report$CloneID)) { stop("None of the MADC CloneID could be found in the markers_info CloneID or BI_markerID. Please make sure they match.") } else { use_col <- if(any(mi_df$CloneID %in% report$CloneID)) "CloneID" else "BI_markerID" - vmsg(paste("Using", use_col, "column in marker_info to match MADC CloneID"), verbose = verbose, level = 1, type = ">>") + vmsg(paste("Using", use_col, "column in marker_info to match MADC CloneID"), verbose = verbose, level = 2, type = ">>") } mk_info_CloneID <- paste0(mi_df$Chr, "_", sprintf(paste0("%0",pad_botloci, "d"), as.integer(mi_df$Pos))) if(!any(botloci$V1 %in% mk_info_CloneID)){ - vmsg("It is not a padding mismatch issue.", verbose = verbose, level = 1, type = ">>") - vmsg("Chromosome and Position information in marker_file don't solve the issue.", verbose = verbose, level = 1, type = ">>") + vmsg("It is not a padding mismatch issue.", verbose = verbose, level = 2, type = ">>") + vmsg("Chromosome and Position information in marker_file don't solve the issue.", verbose = verbose, level = 2, type = ">>") stop("Check marker IDs in both MADC and botloci files. They should be the same.") } else { - vmsg("Chromosome and Position information in marker_file solve the issue.", verbose = verbose, level = 1, type = ">>") - vmsg("Using this information to modify MADC CloneIDs to match botloci markers.", verbose = verbose, level = 1, type = ">>") + vmsg("Chromosome and Position information in marker_file solve the issue.", verbose = verbose, level = 2, type = ">>") + vmsg("Using this information to modify MADC CloneIDs to match botloci markers.", verbose = verbose, level = 2, type = ">>") report$CloneID <- mk_info_CloneID[match(report$CloneID, mi_df[[use_col]])] + mi_df$CloneID <- mk_info_CloneID } } else { - vmsg("It is not a padding mismatch issue.", verbose = verbose, level = 1, type = ">>") - vmsg("Chromosome and Position information in marker_file not provided.", verbose = verbose, level = 1, type = ">>") + vmsg("It is not a padding mismatch issue.", verbose = verbose, level = 2, type = ">>") + vmsg("Chromosome and Position information in marker_file not provided.", verbose = verbose, level = 2, type = ">>") stop("Check marker IDs in both MADC and botloci files. They should be the same.") } } - return(list(botloci, report)) + return(list(botloci, report, mi_df)) } diff --git a/R/madc2vcf_all.R b/R/madc2vcf_all.R index 08300f6..3ec95d4 100644 --- a/R/madc2vcf_all.R +++ b/R/madc2vcf_all.R @@ -66,9 +66,9 @@ madc2vcf_all <- function(madc, vmsg("Checking inputs", verbose = verbose, level = 0, type = ">>") # Input checks - if(!(file.exists(madc) | url_exists(madc))) stop("MADC file not found. Please provide a valid path or URL.") - if(!(file.exists(botloci_file) | url_exists(botloci_file))) stop("Botloci file not found. Please provide a valid path or URL.") - if(!is.null(hap_seq_file) & !(file.exists(hap_seq_file) | url_exists(hap_seq_file))) stop("Haplotype sequence file not found. Please provide a valid path or URL.") + if(is.null(madc) || !(file.exists(madc) | url_exists(madc))) stop("MADC file not found. Please provide a valid path or URL.") + if(is.null(botloci_file) || !(file.exists(botloci_file) | url_exists(botloci_file))) stop("Botloci file not found. Please provide a valid path or URL.") + if(!is.null(hap_seq_file) && !(file.exists(hap_seq_file) | url_exists(hap_seq_file))) stop("Haplotype sequence file not found. Please provide a valid path or URL.") ## n.cores as integer if(!is.numeric(n.cores) | n.cores < 1) stop("n.cores should be a positive integer.") @@ -153,7 +153,7 @@ madc2vcf_all <- function(madc, if(any(checks$checks[c("Indels")])){ idx <- which((checks$checks[c("Indels")])) if(is.null(markers_info)) { - vmsg("The MADC file contains indels and markers_info file is not provided. Tags with indels as targets will be flagged with warnings and removed from the analysis. Provide markers_info with REF/ALT/Indel_pos if you want to include the targets indels.",verbose = verbose, level = 1, type = ">>>") + vmsg("The MADC file contains indels and markers_info file is not provided. Tags with indels as targets will be flagged with warnings and removed from the analysis. Provide markers_info with REF/ALT/Indel_pos if you want to include the targets indels.",verbose = verbose, level = 1, type = ">>") } else { if(checks$checks["Indels"] && !all(c("Ref", "Alt", "Indel_pos") %in% colnames(mi_df))) @@ -166,7 +166,7 @@ madc2vcf_all <- function(madc, verbose = verbose, level = 1, sum(mi_df$Type == "SNP"), sum(mi_df$Type == "Indel")) } - vmsg("The MADC file contains indels and markers_info file was provided with all required columns. Target indels will be exported, but no off-targets are extracted from these tags due to higher likelihood of pairwise alignment errors.",verbose = verbose, level = 1, type = ">>>") + vmsg("The MADC file contains indels and markers_info file was provided with all required columns. Target indels will be exported, but no off-targets are extracted from these tags due to higher likelihood of pairwise alignment errors.",verbose = verbose, level = 1, type = ">>") } } @@ -186,28 +186,35 @@ madc2vcf_all <- function(madc, checked_botloci <- check_botloci(botloci, report, ChromPos = checks$checks["ChromPos"], mi_df = mi_df, verbose = verbose) botloci <- checked_botloci[[1]] report <- checked_botloci[[2]] + mi_df <- checked_botloci[[3]] + + # Derive position padding width from CloneIDs in the original report + pad_width <- unique(nchar(sub(".*_", "", unique(report$CloneID)))) + if(length(pad_width) != 1) warning("CloneIDs in the MADC report have inconsistent position padding widths. IDs in the VCF may be inconsistent.") + pad_width <- pad_width[1] vmsg("Input checks done!", verbose = verbose, level = 1, type = ">>") - vmsg("Starting conversion...", verbose = verbose, level = 0, type = ">>") + vmsg("Initial filters and inputs adjustments...", verbose = verbose, level = 0, type = ">>") my_results_csv <- loop_though_dartag_report(report, botloci, hap_seq, n.cores=n.cores, alignment_score_thr = alignment_score_thr, + checks = checks, mi_df = mi_df, + pad_width = pad_width, verbose = verbose) - vmsg("All information gathered!", verbose = verbose, level = 1, type = ">>") - - vmsg("Adding information to a VCF body...", verbose = verbose, level = 0, type = ">>") + vmsg("All information gathered!", verbose = verbose, level = 0, type = ">>") vcf_body <- create_VCF_body(csv = my_results_csv, n.cores = n.cores, rm_multiallelic_SNP = rm_multiallelic_SNP, multiallelic_SNP_dp_thr = multiallelic_SNP_dp_thr, multiallelic_SNP_sample_thr = multiallelic_SNP_sample_thr, + pad_width = pad_width, verbose = verbose) #Make a header separate from the dataframe @@ -226,6 +233,8 @@ madc2vcf_all <- function(madc, vcf_term <- sapply(strsplit(out_vcf, "[.]"), function(x) x[length(x)]) if(length(vcf_term) != 0) if(vcf_term != "vcf") out_vcf <- paste0(out_vcf,".vcf") + vmsg("VCF ready! Output written to: %s", verbose = verbose, level = 0, type = ">>", out_vcf) + writeLines(vcf_header, con = out_vcf) suppressWarnings( write.table(vcf_body, file = out_vcf, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE, append = TRUE) @@ -244,7 +253,8 @@ madc2vcf_all <- function(madc, #' @import parallel #' #' @noRd -loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, alignment_score_thr=40, checks = NULL, mi_df = NULL, verbose = TRUE){ +loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, alignment_score_thr=40, + checks = NULL, mi_df = NULL, pad_width = NULL,verbose = TRUE){ if(!is.null(hap_seq) & (is.null(checks) | !isTRUE(checks$checks["RefAltSeqs"]))){ hap_seq <- get_ref_alt_hap_seq(hap_seq, botloci) @@ -274,23 +284,21 @@ loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, align if(!is.null(hap_seq)){ vmsg("The haplotype database was provided and used to recover missing Ref_0001 and Alt_0002 sequences.", verbose = verbose, level = 1) + vmsg("The Ref_0001 sequence were added for: %s tags", verbose = verbose, level = 2, type = ">>", sum(ref_index==1)) + vmsg("The Alt_0002 sequence were added for: %s tags", verbose = verbose, level = 2, type = ">>", sum(alt_index==1)) } else { vmsg("The haplotype database was not provided. Tags with missing Ref_0001 or Alt_0002 sequences were flagged with warnings and removed from the analysis.", verbose = verbose, level = 1) } - vmsg("The Ref_0001 sequence were added for: %s tags", verbose = verbose, level = 2, type = ">>>", sum(ref_index==1)) - vmsg("The Alt_0002 sequence were added for: %s tags", verbose = verbose, level = 2, type = ">>>", sum(alt_index==1)) - vmsg("Tags discarded due to lack of Ref_0001 sequence: %s tags", verbose = verbose, level = 2, type = ">>>", sum(ref_index==-1)) - vmsg("Tags discarded due to lack of Alt_0002 sequence: %s tags", verbose = verbose, level = 2, type = ">>>", sum(alt_index==-1)) + vmsg("Tags discarded due to lack of Ref_0001 sequence: %s tags", verbose = verbose, level = 2, type = ">>", sum(ref_index==-1)) + vmsg("Tags discarded due to lack of Alt_0002 sequence: %s tags", verbose = verbose, level = 2, type = ">>", sum(alt_index==-1)) - vmsg("Pairwise alignments of sequences to recover SNP position, reference and alternative bases...", verbose = verbose, level = 1) + vmsg("Pairwise alignments of sequences to recover SNP position, reference and alternative bases...", verbose = verbose, level = 0) clust <- makeCluster(n.cores) #clusterExport(clust, c("botloci", "compare", "nucleotideSubstitutionMatrix", "pairwiseAlignment", "DNAString", "reverseComplement")) #clusterExport(clust, c("botloci", "alignment_score_thr", "mi_df")) compare_results <- parLapply(clust, updated_by_cloneID, function(x) compare(x, botloci, alignment_score_thr, mi_df, verbose = FALSE)) stopCluster(clust) - vmsg("Pairwise alignments concluded.", verbose = verbose, level = 1) - my_results_csv <- lapply(compare_results, "[[", 1) my_results_csv <- do.call(rbind, my_results_csv) @@ -301,9 +309,19 @@ loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, align rm_indels <- sapply(compare_results, "[[", 4) rm_indels <- unlist(rm_indels) - vmsg("Number of tags removed because of low alignment score: %s tags", verbose = verbose, level = 2, type = ">>>", length(rm_score)) - vmsg("Number of tags removed because of N in the alternative sequence: %s tags", verbose = verbose, level = 2, type = ">>>", length(rm_N)) - vmsg("Number of tags removed because of indels as targets (not yet supported): %s tags", verbose = verbose, level = 2, type = ">>>", length(rm_indels)) + vmsg("Number of tags removed because of low alignment score (threshold = %s): %s tags", verbose = verbose, level = 2, type = ">>", alignment_score_thr, length(rm_score)) + vmsg("Number of tags removed because of N in the alternative sequence: %s tags", verbose = verbose, level = 2, type = ">>", length(rm_N)) + if(length(rm_indels) > 0) { + if(!is.null(mi_df) && all(c("Ref", "Alt", "Indel_pos") %in% colnames(mi_df))) { + vmsg("Number of tags with indels as targets: %s tags (markers_info provided with required columns; targets exported, off-targets skipped)", verbose = verbose, level = 2, type = ">>", length(rm_indels)) + } else { + vmsg("Number of tags removed because of indels as targets: %s tags (no markers_info with Ref/Alt/Indel_pos provided; tags discarded)", verbose = verbose, level = 2, type = ">>", length(rm_indels)) + } + } else { + vmsg("Number of tags removed because of indels as targets: 0 tags", verbose = verbose, level = 2, type = ">>") + } + + vmsg("Pairwise alignments concluded!", verbose = verbose, level = 1) rownames(my_results_csv) <- NULL return(my_results_csv) @@ -417,7 +435,7 @@ compare <- function(one_tag, botloci, alignment_score_thr = 40, mi_df= NULL, ver stop("Duplicate CloneID '", cloneID, "' found in markers_info with differing values in key columns (CloneID, Chr, Pos, Ref, Alt, Type). Please resolve the conflict in your markers_info file.") } } - isIndel <- tolower(one_mi_df$Type) == "indel" + isIndel <- !is.null(one_mi_df$Type) && !is.na(one_mi_df$Type) && tolower(one_mi_df$Type) == "indel" } else { isIndel <- FALSE } @@ -597,6 +615,7 @@ create_VCF_body <- function(csv, multiallelic_SNP_dp_thr = 2, multiallelic_SNP_sample_thr = 10, n.cores = 1, + pad_width = NULL, verbose = TRUE){ # Make sure counts are numeric @@ -607,13 +626,31 @@ create_VCF_body <- function(csv, clust <- makeCluster(n.cores) #clusterExport(clust, c("merge_counts","rm_multiallelic_SNP", "multiallelic_SNP_dp_thr", "multiallelic_SNP_sample_thr")) - vcf_tag_list <- parLapply(clust, cloneID, function(x) merge_counts(x, rm_multiallelic_SNP, multiallelic_SNP_dp_thr, multiallelic_SNP_sample_thr)) + vcf_tag_list <- parLapply(clust, cloneID, function(x) merge_counts(x, rm_multiallelic_SNP, multiallelic_SNP_dp_thr, multiallelic_SNP_sample_thr, pad_width)) stopCluster(clust) vcf_tag_list1 <- lapply(vcf_tag_list, "[[", 1) - rm_mks <- sapply(vcf_tag_list, "[[" ,2) - - vmsg("SNP removed because presented more than one allele: %s", verbose = verbose, level = 2, type = ">>>",sum(rm_mks)) + rm_mks <- sapply(vcf_tag_list, "[[", 2) # total removed + total_mks <- sapply(vcf_tag_list, "[[", 3) # total multiallelic found + rm_setting <- sapply(vcf_tag_list, "[[", 4) # removed by rm_multiallelic_SNP=TRUE + rm_filter <- sapply(vcf_tag_list, "[[", 5) # removed because empty after filtering + kept_multi <- sapply(vcf_tag_list, "[[", 6) # kept as multiallelic + simplified <- sapply(vcf_tag_list, "[[", 7) # simplified to biallelic + + vmsg("Performing final filterings", verbose = verbose, level = 0, type = ">>") + + vmsg("Multiallelic off-target SNPs found: %s", verbose = verbose, level = 2, type = ">>", sum(total_mks)) + if(rm_multiallelic_SNP) { + vmsg("Removed (rm_multiallelic_SNP = TRUE): %s", verbose = verbose, level = 3, type = ">>", sum(rm_setting)) + } else if(multiallelic_SNP_dp_thr > 0 & multiallelic_SNP_sample_thr > 0) { + vmsg("Removed (empty after filtering; depth thr = %s, sample thr = %s): %s", + verbose = verbose, level = 3, type = ">>", + multiallelic_SNP_dp_thr, multiallelic_SNP_sample_thr, sum(rm_filter)) + vmsg("Kept as multiallelic after filtering: %s", verbose = verbose, level = 3, type = ">>", sum(kept_multi)) + vmsg("Simplified to biallelic after filtering: %s", verbose = verbose, level = 3, type = ">>", sum(simplified)) + } else { + vmsg("All kept (rm_multiallelic_SNP = FALSE, no thresholds set): %s", verbose = verbose, level = 3, type = ">>", sum(kept_multi)) + } for(i in seq_along(vcf_tag_list1)) { if(is.vector(vcf_tag_list1[[i]])) { @@ -626,12 +663,6 @@ create_VCF_body <- function(csv, vcf_body$V3 <- as.numeric(vcf_body$V3) rownames(vcf_body) <- NULL - # Remove padding - sp <- strsplit(vcf_body$target, "_") - pos <- sapply(sp, function(x) x[length(x)]) - chr <- sapply(sp, function(x) paste0(x[-length(x)], collapse = "_")) - vcf_body$target <- paste0(chr, "_",as.numeric(pos)) - # Dealing with repeated positions # discard the ones that are not the target and keep only the first if all are off-targets if(length(which(duplicated(vcf_body[,3]))) > 0){ @@ -655,6 +686,10 @@ create_VCF_body <- function(csv, vcf_body_new <- rbind(vcf_body_new, repeated_tab_stay) } else vcf_body_new <- vcf_body + vmsg("Filters finished", verbose = verbose, level = 1, type = ">>") + + vmsg("Preparing VCF...", verbose = verbose, level = 0, type = ">>") + vcf_body_new <- vcf_body_new[,-1] colnames(vcf_body_new) <- c("#CHROM", "POS", "ID", "REF", "ALT","QUAL", "FILTER", "INFO","FORMAT", colnames(csv)[-c(1:7)]) @@ -676,18 +711,24 @@ create_VCF_body <- function(csv, #' aspect of the marker, the marker is discarded. This is likely to happen to paralogous sites. #' #' @noRd -merge_counts <- function(cloneID_unit, rm_multiallelic_SNP = FALSE, multiallelic_SNP_dp_thr = 0, multiallelic_SNP_sample_thr = 0){ +merge_counts <- function(cloneID_unit, rm_multiallelic_SNP = FALSE, multiallelic_SNP_dp_thr = 0, multiallelic_SNP_sample_thr = 0, pad_width = NULL){ #Get counts for target SNP - rm <- 0 + rm_by_setting <- 0 # removed because rm_multiallelic_SNP = TRUE + rm_by_filter <- 0 # removed because empty after threshold filtering + kept_multiallelic <- 0 # kept as-is (still multiallelic after filtering or no filter) + simplified <- 0 # simplified from multiallelic to biallelic by filtering + total_multiallelic <- 0 RefTag <- apply(cloneID_unit[which(grepl("Ref", cloneID_unit$AlleleID) & !duplicated(cloneID_unit$AlleleID)),-c(1:7)], 2, sum) AltTag <- apply(cloneID_unit[which(grepl("Alt", cloneID_unit$AlleleID) & !duplicated(cloneID_unit$AlleleID)),-c(1:7)], 2, sum) tab_counts <- paste0(RefTag + AltTag, ":", RefTag, ":", RefTag, ",", AltTag) + cloneID <- cloneID_unit$CloneID[1] + if(is.null(pad_width)) pad_width <- nchar(sub(".*_", "", cloneID)) info <- cloneID_unit[grep("Ref_", cloneID_unit$AlleleID),] info <- c(info$Chromosome, info$SNP_position_in_Genome, - paste0(info$Chromosome, "_", info$SNP_position_in_Genome), + cloneID, info$Ref, info$Alt, ".", ".", paste0("DP=", sum(c(RefTag, AltTag)),";", "ADS=",sum(RefTag),",",sum(AltTag)), "DP:RA:AD") @@ -704,19 +745,21 @@ merge_counts <- function(cloneID_unit, rm_multiallelic_SNP = FALSE, multiallelic alleles <- unique(by_pos[[i]]$AlleleID) if(length(unique(by_pos[[i]]$Alt)) > 1){ # If SNP is multiallelic + total_multiallelic <- total_multiallelic + 1 if(rm_multiallelic_SNP){ # option to remove multiallelics - rm <- rm + 1 + rm_by_setting <- rm_by_setting + 1 next() } else if(multiallelic_SNP_dp_thr > 0 & multiallelic_SNP_sample_thr > 0){ # If not removed, user can set threshold to remove low frequency alleles rm.idx <- which(apply(by_pos[[i]][,-c(1:7)], 1, function(x) sum(x > multiallelic_SNP_dp_thr) < multiallelic_SNP_sample_thr)) if(length(rm.idx)) up_by_pos <- by_pos[[i]][-rm.idx,] else up_by_pos <- by_pos[[i]] if(length(unique(up_by_pos$Alt)) == 0) { # If after applied filter all tags are gone - rm <- rm + 1 + rm_by_filter <- rm_by_filter + 1 next() } else if (length(unique(up_by_pos$Alt)) > 1 ){ # If after applied filter the SNP remains multiallelic + kept_multiallelic <- kept_multiallelic + 1 by_alt <- split.data.frame(up_by_pos, up_by_pos$Alt) by_alt_counts <- lapply(by_alt, function(x) apply(x[,-c(1:7)], 2, sum)) total_counts <- sapply(by_alt_counts, sum) @@ -731,12 +774,14 @@ merge_counts <- function(cloneID_unit, rm_multiallelic_SNP = FALSE, multiallelic info <- unique.data.frame(info) } else { # If after applied filter, only one alternative remains + simplified <- simplified + 1 alt <- apply(up_by_pos[,-c(1:7)], 2, sum) total_alt <- alt info <- unique.data.frame(up_by_pos[,2:5]) } } else { # If rm_multiallelic_SNP set to FALSE and threshold is 0, keep all multiallelics + kept_multiallelic <- kept_multiallelic + 1 by_alt <- split.data.frame(by_pos[[i]], by_pos[[i]]$Alt) by_alt_counts <- lapply(by_alt, function(x) apply(x[,-c(1:7)], 2, sum)) total_counts <- sapply(by_alt_counts, sum) @@ -763,7 +808,7 @@ merge_counts <- function(cloneID_unit, rm_multiallelic_SNP = FALSE, multiallelic info <- c(info$Chromosome, info$SNP_position_in_Genome, - paste0(info$Chromosome, "_", info$SNP_position_in_Genome), + paste0(info$Chromosome, "_", formatC(as.integer(as.numeric(info$SNP_position_in_Genome)), width = pad_width, flag = "0", format = "d")), info$Ref, info$Alt, ".", ".", paste0("DP=", sum(c(ref, total_alt)),";", "ADS=",sum(ref),",",sum(total_alt)), "DP:RA:AD") @@ -773,5 +818,5 @@ merge_counts <- function(cloneID_unit, rm_multiallelic_SNP = FALSE, multiallelic } } - return(list(vcf_tag, rm)) + return(list(vcf_tag, rm_by_setting + rm_by_filter, total_multiallelic, rm_by_setting, rm_by_filter, kept_multiallelic, simplified)) } diff --git a/R/madc2vcf_targets.R b/R/madc2vcf_targets.R index c7fd269..80af70f 100644 --- a/R/madc2vcf_targets.R +++ b/R/madc2vcf_targets.R @@ -260,6 +260,7 @@ madc2vcf_targets <- function(madc_file, checked_botloci <- check_botloci(botloci, report, ChromPos = checks$checks["ChromPos"], mi_df = mi_df, verbose = verbose) botloci <- checked_botloci[[1]] report <- checked_botloci[[2]] + mi_df <- checked_botloci[[3]] } } diff --git a/tests/testthat/test-madc2vcf_all.R b/tests/testthat/test-madc2vcf_all.R index 7904035..f88adc9 100644 --- a/tests/testthat/test-madc2vcf_all.R +++ b/tests/testthat/test-madc2vcf_all.R @@ -99,12 +99,14 @@ test_that("simu alfalfa",{ test_that("ALFALFA — clean fixed allele ID MADC", { out <- tempfile(fileext = ".vcf") + #out <- "test.vcf" + # Default parameters expect_no_error( madc2vcf_all(madc = alfalfa_madc, botloci_file = alfalfa_botloci, hap_seq_file = alfalfa_microhapDB, n.cores = 2, - rm_multiallelic_SNP = TRUE, + rm_multiallelic_SNP = FALSE, multiallelic_SNP_sample_thr = 0, multiallelic_SNP_dp_thr = 0, alignment_score_thr = 40, @@ -117,7 +119,10 @@ test_that("simu alfalfa",{ expect_true(all(!is.na(vcf@fix[, "ALT"]))) DP <- extract.gt(vcf, "DP", as.numeric = TRUE) expect_equal(sum(DP[1,]), 4534) - expect_equal(sum(DP[,5]), 233482) + expect_equal(sum(DP[,5]), 235217) + multi <- grepl(",", vcf@fix[,5]) + expect_true(any(multi)) # It has multiallelics + expect_equal(sum(multi), 9) unlink(out) expect_no_error( @@ -126,15 +131,18 @@ test_that("simu alfalfa",{ hap_seq_file = NULL, n.cores = 1, out_vcf = out, - verbose = FALSE) + verbose = TRUE) ) vcf <- read.vcfR(out, verbose = FALSE) expect_s4_class(vcf, "vcfR") - expect_true(all(is.na(vcf@fix[, "REF"]))) - expect_true(all(is.na(vcf@fix[, "ALT"]))) + expect_true(all(!is.na(vcf@fix[, "REF"]))) + expect_true(all(!is.na(vcf@fix[, "ALT"]))) DP <- extract.gt(vcf, "DP", as.numeric = TRUE) expect_equal(sum(DP[1,]), 4534) - expect_equal(sum(DP[,5]), 56547) + expect_equal(sum(DP[,5]), 235217) + multi <- grepl(",", vcf@fix[,5]) + expect_true(any(multi)) # It has multiallelics + expect_equal(sum(multi), 9) # Test error when botloci_file is NULL expect_error( @@ -146,45 +154,48 @@ test_that("simu alfalfa",{ verbose = FALSE) ) - # Test that it works when hap_seq_file is provided (REF/ALT recovered from probe sequences) + # Test that it also works when markers_info is provided together with botloci madc2vcf_all(madc = alfalfa_madc, botloci_file = alfalfa_botloci, hap_seq_file = alfalfa_microhapDB, + multiallelic_SNP_dp_thr = 80, + multiallelic_SNP_sample_thr = 2, n.cores = 1, + markers_info = alfalfa_markers_info, out_vcf = out, - verbose = FALSE) + verbose = TRUE) vcf <- read.vcfR(out, verbose = FALSE) - lut <- read.csv(alfalfa_markers_info) - vcf_infos <- vcf@fix[,c(1:5)] - lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] - check <- cbind(vcf_infos,lut_infos) - expect_equal(check$REF, check$Ref) - expect_equal(check$ALT, check$Alt) - expect_equal(as.numeric(check$POS), check$Pos) - dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 43691) + expect_s4_class(vcf, "vcfR") + expect_true(all(!is.na(vcf@fix[, "REF"]))) + expect_true(all(!is.na(vcf@fix[, "ALT"]))) + DP <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(DP[1,]), 4534) + expect_equal(sum(DP[,5]), 234777) + multi <- grepl(",", vcf@fix[,5]) + expect_true(any(multi)) # It has multiallelics + expect_equal(sum(multi), 3) - # Test that it also works when markers_info is provided together with botloci + # Remove multiallelics madc2vcf_all(madc = alfalfa_madc, botloci_file = alfalfa_botloci, hap_seq_file = alfalfa_microhapDB, + rm_multiallelic_SNP = TRUE, n.cores = 1, markers_info = alfalfa_markers_info, out_vcf = out, - verbose = FALSE) + verbose = TRUE) vcf <- read.vcfR(out, verbose = FALSE) - lut <- read.csv(alfalfa_markers_info) - vcf_infos <- vcf@fix[,c(1:5)] - lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] - check <- cbind(vcf_infos,lut_infos) - expect_equal(check$REF, check$Ref) - expect_equal(check$ALT, check$Alt) - expect_equal(as.numeric(check$POS), check$Pos) - dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 43691) - + expect_s4_class(vcf, "vcfR") + expect_true(all(!is.na(vcf@fix[, "REF"]))) + expect_true(all(!is.na(vcf@fix[, "ALT"]))) + DP <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(DP[1,]), 4534) + expect_equal(sum(DP[,5]), 233482) + multi <- grepl(",", vcf@fix[,5]) + expect_false(any(multi)) # It has multiallelics + expect_equal(sum(multi), 0) }) test_that("ALFALFA — clean fixed allele ID MADC wrong CloneID", { @@ -208,145 +219,113 @@ test_that("simu alfalfa",{ n.cores = 1, markers_info = alfalfa_markers_info, out_vcf = out, - verbose = FALSE) + verbose = FALSE), + regexp = "None of the markers_info CloneID values match the MADC CloneID column. Please make sure they use the same marker IDs." ) # Test error when markers_info_ChromPos is provided but IDs still don't match botloci - expect_error( - madc2vcf_all(madc = alfalfa_madc_wrongID, - botloci_file = alfalfa_botloci, - hap_seq_file = alfalfa_microhapDB, - n.cores = 1, - markers_info = alfalfa_markers_info_ChromPos, - out_vcf = out, - verbose = FALSE) - ) - }) - - test_that("alfalfa lower case fixed MADC", { - out <- tempfile(fileext = ".vcf") - madc2vcf_all(madc = alfalfa_lowercase, + madc2vcf_all(madc = alfalfa_madc_wrongID, botloci_file = alfalfa_botloci, hap_seq_file = alfalfa_microhapDB, n.cores = 1, + markers_info = alfalfa_markers_info_ChromPos, out_vcf = out, - verbose = FALSE) + verbose = TRUE) vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") lut <- read.csv(alfalfa_markers_info) vcf_infos <- vcf@fix[,c(1:5)] lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] check <- cbind(vcf_infos,lut_infos) + check <- check[-which(is.na(check$Pos)),] expect_equal(check$REF, check$Ref) expect_equal(check$ALT, check$Alt) expect_equal(as.numeric(check$POS), check$Pos) - dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 43691) + DP <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(DP[1,]), 4534) + expect_equal(sum(DP[,5]), 235217) + multi <- grepl(",", vcf@fix[,5]) + }) + + test_that("alfalfa lower case missing 3 ref and 1 alt fixed MADC", { + out <- tempfile(fileext = ".vcf") madc2vcf_all(madc = alfalfa_lowercase, botloci_file = alfalfa_botloci, hap_seq_file = alfalfa_microhapDB, n.cores = 1, - markers_info = alfalfa_markers_info, out_vcf = out, - verbose = FALSE) + verbose = TRUE) vcf <- read.vcfR(out, verbose = FALSE) lut <- read.csv(alfalfa_markers_info) vcf_infos <- vcf@fix[,c(1:5)] lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] check <- cbind(vcf_infos,lut_infos) + check <- check[-which(is.na(check$Pos)),] expect_equal(check$REF, check$Ref) expect_equal(check$ALT, check$Alt) expect_equal(as.numeric(check$POS), check$Pos) dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 43691) + expect_equal(sum(dp[1,]), 4534) + expect_equal(sum(dp[,5]), 233719) madc2vcf_all(madc = alfalfa_lowercase, botloci_file = alfalfa_botloci, hap_seq_file = NULL, n.cores = 1, - markers_info = alfalfa_markers_info, out_vcf = out, - verbose = FALSE) + verbose = TRUE) vcf <- read.vcfR(out, verbose = FALSE) lut <- read.csv(alfalfa_markers_info) vcf_infos <- vcf@fix[,c(1:5)] lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] check <- cbind(vcf_infos,lut_infos) + check <- check[-which(is.na(check$Pos)),] + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) expect_equal(as.numeric(check$POS), check$Pos) dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 43691) - }) - - test_that("alfalfa IUPAC code", { - out <- tempfile(fileext = ".vcf") - # IUPAC codes cause a stop in madc2vcf_all - expect_error( - madc2vcf_all(madc = alfalfa_iupac, - botloci_file = alfalfa_botloci, - hap_seq_file = alfalfa_microhapDB, - n.cores = 1, - out_vcf = out, - verbose = FALSE) - ) + expect_equal(sum(dp[1,]), 4534) + expect_equal(sum(dp[,5]), 230415) - madc2vcf_all(madc = alfalfa_iupac, + madc2vcf_all(madc = alfalfa_lowercase, botloci_file = alfalfa_botloci, - hap_seq_file = alfalfa_microhapDB, + hap_seq_file = NULL, n.cores = 1, markers_info = alfalfa_markers_info, out_vcf = out, - verbose = FALSE) + verbose = TRUE) vcf <- read.vcfR(out, verbose = FALSE) lut <- read.csv(alfalfa_markers_info) vcf_infos <- vcf@fix[,c(1:5)] lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] check <- cbind(vcf_infos,lut_infos) - expect_equal(as.numeric(check$POS), check$Pos) - dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 43691) + check <- check[-which(is.na(check$Pos)),] expect_equal(check$REF, check$Ref) expect_equal(check$ALT, check$Alt) - - madc2vcf_all(madc = alfalfa_iupac, - botloci_file = alfalfa_botloci, - hap_seq_file = NULL, - n.cores = 1, - markers_info = alfalfa_markers_info, - out_vcf = out, - verbose = FALSE) - - vcf <- read.vcfR(out, verbose = FALSE) - lut <- read.csv(alfalfa_markers_info) - vcf_infos <- vcf@fix[,c(1:5)] - lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] - check <- cbind(vcf_infos,lut_infos) expect_equal(as.numeric(check$POS), check$Pos) dp <- extract.gt(vcf, "DP", as.numeric = TRUE) expect_equal(sum(dp[1,]), 4534) - expect_equal(sum(dp[,5]), 56547) - expect_equal(check$REF, check$Ref) - expect_equal(check$ALT, check$Alt) - - madc2vcf_all(madc = alfalfa_iupac, - botloci_file = alfalfa_botloci, - hap_seq_file = NULL, - n.cores = 1, - out_vcf = out, - verbose = FALSE) + expect_equal(sum(dp[,5]), 230415) - vcf <- read.vcfR(out, verbose = FALSE) - lut <- read.csv(alfalfa_markers_info) - vcf_infos <- vcf@fix[,c(1:5)] - lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] - check <- cbind(vcf_infos,lut_infos) - expect_equal(as.numeric(check$POS), check$Pos) - dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 43691) + }) + test_that("alfalfa IUPAC code", { + out <- tempfile(fileext = ".vcf") + # IUPAC codes cause a stop in madc2vcf_all + expect_error( + madc2vcf_all(madc = alfalfa_iupac, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + out_vcf = out, + verbose = FALSE), + regexp = "IUPAC \\(non-ATCG\\) codes found in AlleleSequence\\. This codes are not currently supported by BIGr/BIGapp\\. Run HapApp to replace them" + ) }) test_that("potato indel madc chrompos=FALSE", { @@ -358,7 +337,9 @@ test_that("simu alfalfa",{ hap_seq_file = potato_microhapDB, n.cores = 1, out_vcf = out, - verbose = FALSE) + verbose = FALSE), + regexp = "CloneID does not have the expected Chromosome_Position format. Please check your CloneIDs or provide a file with this information" + ) madc2vcf_all(madc = potato_indel_madc, @@ -367,16 +348,18 @@ test_that("simu alfalfa",{ n.cores = 1, markers_info = potato_markers_info, out_vcf = out, - verbose = FALSE) + verbose = TRUE) vcf <- read.vcfR(out, verbose = FALSE) lut <- read.csv(potato_markers_info) vcf_infos <- vcf@fix[,c(1:5)] lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] check <- cbind(vcf_infos,lut_infos) + check <- check[-which(is.na(check$Ref)),] expect_equal(as.numeric(check$POS), check$Pos) dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 41656) + expect_equal(sum(dp[,10]), 43017) + expect_equal(sum(dp[3,]), 5073) expect_equal(check$REF, check$Ref) expect_equal(check$ALT, check$Alt) @@ -393,10 +376,11 @@ test_that("simu alfalfa",{ vcf_infos <- vcf@fix[,c(1:5)] lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] check <- cbind(vcf_infos,lut_infos) + check <- check[-which(is.na(check$Ref)),] expect_equal(as.numeric(check$POS), check$Pos) dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[1,]), 5163) - expect_equal(sum(dp[,5]), 58927) + expect_equal(sum(dp[1,]), 3937) + expect_equal(sum(dp[,5]), 248571) expect_equal(check$REF, check$Ref) expect_equal(check$ALT, check$Alt) @@ -407,25 +391,10 @@ test_that("simu alfalfa",{ hap_seq_file = NULL, n.cores = 1, out_vcf = out, - verbose = FALSE) + verbose = FALSE), + regexp = "CloneID does not have the expected Chromosome_Position format. Please check your CloneIDs or provide a file with this information" ) - madc2vcf_all(madc = potato_indel_madc, - botloci_file = potato_botloci, - hap_seq_file = NULL, - n.cores = 1, - markers_info = potato_markers_info, - out_vcf = out, - verbose = FALSE) - - vcf <- read.vcfR(out, verbose = FALSE) - lut <- read.csv(potato_markers_info) - vcf_infos <- vcf@fix[,c(1:5)] - lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] - check <- cbind(vcf_infos,lut_infos) - expect_equal(as.numeric(check$POS), check$Pos) - dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 41656) }) test_that("potato indel chromposFALSE", { @@ -437,7 +406,8 @@ test_that("simu alfalfa",{ hap_seq_file = potato_microhapDB, n.cores = 1, out_vcf = out, - verbose = FALSE) + verbose = FALSE), + regexp = "CloneID does not have the expected Chromosome_Position format. Please check your CloneIDs or provide a file with this information" ) madc2vcf_all(madc = potato_more_indels_chrompos_false, @@ -453,9 +423,11 @@ test_that("simu alfalfa",{ vcf_infos <- vcf@fix[,c(1:5)] lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] check <- cbind(vcf_infos,lut_infos) + check <- check[-which(is.na(check$Ref)),] expect_equal(as.numeric(check$POS), check$Pos) dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 41755) + expect_equal(sum(dp[1,]), 5397) + expect_equal(sum(dp[,5]), 215070) expect_equal(check$REF, check$Ref) expect_equal(check$ALT, check$Alt) @@ -472,10 +444,11 @@ test_that("simu alfalfa",{ vcf_infos <- vcf@fix[,c(1:5)] lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] check <- cbind(vcf_infos,lut_infos) + check <- check[-which(is.na(check$Ref)),] expect_equal(as.numeric(check$POS), check$Pos) dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[1,]), 6301) - expect_equal(sum(dp[,5]), 53613) + expect_equal(sum(dp[1,]), 5397) + expect_equal(sum(dp[,5]), 215070) expect_equal(check$REF, check$Ref) expect_equal(check$ALT, check$Alt) @@ -486,25 +459,9 @@ test_that("simu alfalfa",{ hap_seq_file = NULL, n.cores = 1, out_vcf = out, - verbose = FALSE) + verbose = FALSE), + regexp = "CloneID does not have the expected Chromosome_Position format. Please check your CloneIDs or provide a file with this information" ) - - madc2vcf_all(madc = potato_more_indels_chrompos_false, - botloci_file = potato_botloci, - hap_seq_file = NULL, - n.cores = 1, - markers_info = potato_markers_info, - out_vcf = out, - verbose = FALSE) - - vcf <- read.vcfR(out, verbose = FALSE) - lut <- read.csv(potato_markers_info) - vcf_infos <- vcf@fix[,c(1:5)] - lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] - check <- cbind(vcf_infos,lut_infos) - expect_equal(as.numeric(check$POS), check$Pos) - dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 41755) }) test_that("potato lowercase", { @@ -515,13 +472,15 @@ test_that("simu alfalfa",{ n.cores = 1, markers_info = potato_markers_info, out_vcf = out, - verbose = FALSE) + verbose = TRUE) vcf <- read.vcfR(out, verbose = FALSE) lut <- read.csv(potato_markers_info) vcf_infos <- vcf@fix[,c(1:5)] lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] check <- cbind(vcf_infos,lut_infos) + check <- check[-which(is.na(check$Ref)),] + expect_equal(as.numeric(check$POS), check$Pos) dp <- extract.gt(vcf, "DP", as.numeric = TRUE) expect_equal(sum(dp[,10]), 41755) @@ -536,165 +495,36 @@ test_that("simu alfalfa",{ n.cores = 1, markers_info = potato_markers_info_ChromPos, out_vcf = out, - verbose = FALSE) + verbose = FALSE), + regexp = "Indels detected in MADC file. The markers_info file must contain 'Ref', 'Alt', and 'Indel_pos' columns." ) - - madc2vcf_all(madc = potato_indel_lowercase, - botloci_file = potato_botloci, - hap_seq_file = potato_microhapDB, - n.cores = 1, - markers_info = potato_markers_info, - out_vcf = out, - verbose = FALSE) - - vcf <- read.vcfR(out, verbose = FALSE) - lut <- read.csv(potato_markers_info) - vcf_infos <- vcf@fix[,c(1:5)] - lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] - check <- cbind(vcf_infos,lut_infos) - expect_equal(as.numeric(check$POS), check$Pos) - dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 41755) - expect_equal(check$REF, check$Ref) - expect_equal(check$ALT, check$Alt) - - madc2vcf_all(madc = potato_indel_lowercase, - botloci_file = potato_botloci, - hap_seq_file = NULL, - n.cores = 1, - markers_info = potato_markers_info, - out_vcf = out, - verbose = FALSE) - - vcf <- read.vcfR(out, verbose = FALSE) - lut <- read.csv(potato_markers_info) - vcf_infos <- vcf@fix[,c(1:5)] - lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] - check <- cbind(vcf_infos,lut_infos) - expect_equal(as.numeric(check$POS), check$Pos) - dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[1,]), 6301) - expect_equal(sum(dp[,5]), 53613) - expect_equal(check$REF, check$Ref) - expect_equal(check$ALT, check$Alt) - - madc2vcf_all(madc = potato_indel_lowercase, - botloci_file = potato_botloci, - hap_seq_file = NULL, - n.cores = 1, - markers_info = potato_markers_info, - out_vcf = out, - verbose = FALSE) - - vcf <- read.vcfR(out, verbose = FALSE) - lut <- read.csv(potato_markers_info) - vcf_infos <- vcf@fix[,c(1:5)] - lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] - check <- cbind(vcf_infos,lut_infos) - expect_equal(as.numeric(check$POS), check$Pos) - dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 41755) }) - test_that("potato IUPAC", { out <- tempfile(fileext = ".vcf") - madc2vcf_all(madc = potato_indel_iupac, - botloci_file = potato_botloci, - hap_seq_file = potato_microhapDB, - n.cores = 1, - markers_info = potato_markers_info, - out_vcf = out, - verbose = FALSE) - - vcf <- read.vcfR(out, verbose = FALSE) - lut <- read.csv(potato_markers_info) - vcf_infos <- vcf@fix[,c(1:5)] - lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] - check <- cbind(vcf_infos,lut_infos) - expect_equal(as.numeric(check$POS), check$Pos) - dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 41755) - expect_equal(check$REF, check$Ref) - expect_equal(check$ALT, check$Alt) - - madc2vcf_all(madc = potato_indel_iupac, - botloci_file = potato_botloci, - hap_seq_file = NULL, - n.cores = 1, - markers_info = potato_markers_info, - out_vcf = out, - verbose = FALSE) - - vcf <- read.vcfR(out, verbose = FALSE) - lut <- read.csv(potato_markers_info) - vcf_infos <- vcf@fix[,c(1:5)] - lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] - check <- cbind(vcf_infos,lut_infos) - expect_equal(as.numeric(check$POS), check$Pos) - dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[1,]), 6301) - expect_equal(sum(dp[,5]), 53613) - expect_equal(check$REF, check$Ref) - expect_equal(check$ALT, check$Alt) - - madc2vcf_all(madc = potato_indel_iupac, - botloci_file = potato_botloci, - hap_seq_file = NULL, - n.cores = 1, - markers_info = potato_markers_info, - out_vcf = out, - verbose = FALSE) - - vcf <- read.vcfR(out, verbose = FALSE) - lut <- read.csv(potato_markers_info) - vcf_infos <- vcf@fix[,c(1:5)] - lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] - check <- cbind(vcf_infos,lut_infos) - expect_equal(as.numeric(check$POS), check$Pos) - dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 41755) - }) - - test_that("alfalfa raw MADC format (7-row header)", { - out <- tempfile(fileext = ".vcf") - # Raw format fails FixAlleleIDs check -> madc2vcf_all stops - expect_error( - madc2vcf_all(madc = alfalfa_madc_raw, - botloci_file = alfalfa_botloci, - hap_seq_file = alfalfa_microhapDB, - n.cores = 1, - out_vcf = out, - verbose = FALSE) - ) expect_error( - madc2vcf_all(madc = alfalfa_madc_raw, - botloci_file = alfalfa_botloci, - hap_seq_file = NULL, - n.cores = 1, - out_vcf = out, - verbose = FALSE) - ) - - expect_error( - madc2vcf_all(madc = alfalfa_madc_raw, - botloci_file = alfalfa_botloci, - hap_seq_file = NULL, + madc2vcf_all(madc = potato_indel_iupac, + botloci_file = potato_botloci, + hap_seq_file = potato_microhapDB, n.cores = 1, - markers_info = alfalfa_markers_info, + markers_info = potato_markers_info, out_vcf = out, - verbose = FALSE) + verbose = TRUE), + regexp = "IUPAC \\(non-ATCG\\) codes found in AlleleSequence. This codes are not currently supported by BIGr/BIGapp. Run HapApp to replace them" ) + }) + test_that("alfalfa raw MADC format (7-row header)", { + out <- tempfile(fileext = ".vcf") + # Raw format fails FixAlleleIDs check -> madc2vcf_all stops expect_error( madc2vcf_all(madc = alfalfa_madc_raw, botloci_file = alfalfa_botloci, hap_seq_file = alfalfa_microhapDB, n.cores = 1, - markers_info = alfalfa_markers_info, out_vcf = out, - verbose = FALSE) + verbose = FALSE), regexp = "MADC not processed by HapApp" ) }) }) diff --git a/tests/testthat/test-madc2vcf_targets.R b/tests/testthat/test-madc2vcf_targets.R index ce2ab8e..9899e17 100644 --- a/tests/testthat/test-madc2vcf_targets.R +++ b/tests/testthat/test-madc2vcf_targets.R @@ -245,11 +245,13 @@ test_that("simu alfalfa",{ test_that("alfalfa lower case fixed MADC", { out <- tempfile(fileext = ".vcf") - madc2vcf_targets(madc_file = alfalfa_lowercase, - output.file = out, - get_REF_ALT = TRUE, - botloci_file = alfalfa_botloci, - verbose = FALSE) + expect_warning( + madc2vcf_targets(madc_file = alfalfa_lowercase, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + verbose = TRUE) + ) vcf <- read.vcfR(out, verbose = FALSE) lut <- read.csv(alfalfa_markers_info) @@ -260,14 +262,16 @@ test_that("simu alfalfa",{ expect_equal(check$ALT, check$Alt) expect_equal(as.numeric(check$POS), check$Pos) dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 43691) + expect_equal(sum(dp[,10]), 43017) - madc2vcf_targets(madc_file = alfalfa_lowercase, - output.file = out, - get_REF_ALT = TRUE, - botloci_file = alfalfa_botloci, - markers_info = alfalfa_markers_info, - verbose = FALSE) + expect_warning( + madc2vcf_targets(madc_file = alfalfa_lowercase, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + markers_info = alfalfa_markers_info, + verbose = FALSE) + ) vcf <- read.vcfR(out, verbose = FALSE) lut <- read.csv(alfalfa_markers_info) @@ -278,15 +282,16 @@ test_that("simu alfalfa",{ expect_equal(check$ALT, check$Alt) expect_equal(as.numeric(check$POS), check$Pos) dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 43691) + expect_equal(sum(dp[,10]), 43017) + expect_warning( madc2vcf_targets(madc_file = alfalfa_lowercase, output.file = out, get_REF_ALT = FALSE, botloci_file = alfalfa_botloci, markers_info = alfalfa_markers_info, verbose = FALSE) - + ) vcf <- read.vcfR(out, verbose = FALSE) lut <- read.csv(alfalfa_markers_info) vcf_infos <- vcf@fix[,c(1:5)] @@ -294,7 +299,7 @@ test_that("simu alfalfa",{ check <- cbind(vcf_infos,lut_infos) expect_equal(as.numeric(check$POS), check$Pos) dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 43691) + expect_equal(sum(dp[,10]), 43017) }) test_that("alfalfa IUPAC code", { From 291ae8e66a9d5cebb65fc2dac15bfae908adb652 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Wed, 1 Apr 2026 10:38:45 -0400 Subject: [PATCH 27/37] add support for Others --- R/madc2vcf_all.R | 265 ++++++++++++++++++++++++++++++++++++-------- man/madc2vcf_all.Rd | 11 ++ 2 files changed, 230 insertions(+), 46 deletions(-) diff --git a/R/madc2vcf_all.R b/R/madc2vcf_all.R index 3ec95d4..ac20835 100644 --- a/R/madc2vcf_all.R +++ b/R/madc2vcf_all.R @@ -11,6 +11,10 @@ #' @param alignment_score_thr A numeric value specifying the minimum alignment score threshold. Default is 40. #' @param n.cores An integer specifying the number of cores to use for parallel processing. Default is 1. #' @param out_vcf A string specifying the name of the output VCF file. If the file extension is not `.vcf`, it will be appended automatically. +#' @param markers_info A string specifying the path to a CSV file with marker information (CloneID/BI_markerID, Chr, Pos, Ref, Alt, Type, Indel_pos columns as needed). +#' @param add_others A logical value. If TRUE, alleles labeled "Other" in the MADC file are included in off-target SNP extraction. Default is TRUE. +#' @param others_max_snps An integer or NULL. If not NULL, Other alleles with more than this many SNP differences versus the Ref sequence (as detected by pairwise alignment) are discarded. Default is NULL (no limit). +#' @param others_rm_with_indels A logical value. If TRUE, Other alleles that contain insertions or deletions relative to the Ref sequence (as detected by pairwise alignment) are discarded. Default is TRUE. #' @param verbose A logical value indicating whether to print metrics and progress to the console. Default is TRUE. #' #' @return This function does not return an R object. It writes the processed VCF file v4.3 to the specified `out_vcf` path. @@ -61,6 +65,9 @@ madc2vcf_all <- function(madc, alignment_score_thr = 40, out_vcf = NULL, markers_info = NULL, + add_others = TRUE, + others_max_snps = 5, + others_rm_with_indels = TRUE, verbose = TRUE){ vmsg("Checking inputs", verbose = verbose, level = 0, type = ">>") @@ -81,8 +88,11 @@ madc2vcf_all <- function(madc, ## out_vcf as string if(!is.null(out_vcf) & !is.character(out_vcf)) stop("out_vcf should be a string specifying the output file name.") - ## rm_multiallelic_SNP and verbose as logical + ## rm_multiallelic_SNP, add_others and verbose as logical if(!is.logical(rm_multiallelic_SNP)) stop("rm_multiallelic_SNP should be logical.") + if(!is.logical(add_others)) stop("add_others should be logical.") + if(!is.null(others_max_snps) && (!is.numeric(others_max_snps) || others_max_snps < 1)) stop("others_max_snps should be a positive integer or NULL.") + if(!is.logical(others_rm_with_indels)) stop("others_rm_with_indels should be logical.") if(!is.logical(verbose)) stop("verbose should be logical.") bigr_meta <- paste0('##BIGrCommandLine.madc2vcf_all= 1 | nchar(as.character(mi_df$Alt)) > 1, "Indel", "SNP") - vmsg("markers_info: 'Type' column not found. Derived from Ref/Alt lengths (%s SNPs, %s Indels).", + vmsg("markers_info: 'Type' column not found. Derived from Ref/Alt lengths (%s SNPs, %s Indels)", verbose = verbose, level = 1, sum(mi_df$Type == "SNP"), sum(mi_df$Type == "Indel")) } - vmsg("The MADC file contains indels and markers_info file was provided with all required columns. Target indels will be exported, but no off-targets are extracted from these tags due to higher likelihood of pairwise alignment errors.",verbose = verbose, level = 1, type = ">>") + vmsg("The MADC file contains indels and markers_info file was provided with all required columns",verbose = verbose, level = 1, type = ">>") + vmsg("Target indels will be exported, but no off-targets are extracted from these tags due to higher likelihood of pairwise alignment errors",verbose = verbose, level = 2, type = ">>") } } + vmsg("Inputs checks done!", verbose = verbose, level = 1, type = ">>") + + vmsg("Initial filters and inputs adjustments...", verbose = verbose, level = 0, type = ">>") if(checks$checks["LowerCase"]){ - vmsg("MADC Allele Sequences presented lower case characters. They were converted to upper case.", verbose = verbose, level = 1) + vmsg("MADC Allele Sequences presented lower case characters. They were converted to upper case", verbose = verbose, level = 1) report$AlleleSequence <- toupper(report$AlleleSequence) } if(!checks$checks["RefAltSeqs"] && is.null(hap_seq_file)){ - vmsg("Not all Ref sequences have a corresponding Alt or vice-verse. Provide hap_seq_file for this function to recover the missing tags or tags with missing pairs will be discarded.", verbose = verbose, level = 1) + vmsg("Not all Ref sequences have a corresponding Alt or vice-verse. Provide hap_seq_file for this function to recover the missing tags or tags with missing pairs will be discarded", verbose = verbose, level = 1) } botloci <- read.csv(botloci_file, header = F) @@ -193,10 +210,6 @@ madc2vcf_all <- function(madc, if(length(pad_width) != 1) warning("CloneIDs in the MADC report have inconsistent position padding widths. IDs in the VCF may be inconsistent.") pad_width <- pad_width[1] - vmsg("Input checks done!", verbose = verbose, level = 1, type = ">>") - - vmsg("Initial filters and inputs adjustments...", verbose = verbose, level = 0, type = ">>") - my_results_csv <- loop_though_dartag_report(report, botloci, hap_seq, @@ -205,6 +218,9 @@ madc2vcf_all <- function(madc, checks = checks, mi_df = mi_df, pad_width = pad_width, + add_others = add_others, + others_max_snps = others_max_snps, + others_rm_with_indels = others_rm_with_indels, verbose = verbose) vmsg("All information gathered!", verbose = verbose, level = 0, type = ">>") @@ -254,7 +270,9 @@ madc2vcf_all <- function(madc, #' #' @noRd loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, alignment_score_thr=40, - checks = NULL, mi_df = NULL, pad_width = NULL,verbose = TRUE){ + checks = NULL, mi_df = NULL, pad_width = NULL, + add_others = TRUE, others_max_snps = NULL, others_rm_with_indels = TRUE, + verbose = TRUE){ if(!is.null(hap_seq) & (is.null(checks) | !isTRUE(checks$checks["RefAltSeqs"]))){ hap_seq <- get_ref_alt_hap_seq(hap_seq, botloci) @@ -283,11 +301,11 @@ loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, align updated_by_cloneID <- lapply(add_ref_alt_results, "[[",1) if(!is.null(hap_seq)){ - vmsg("The haplotype database was provided and used to recover missing Ref_0001 and Alt_0002 sequences.", verbose = verbose, level = 1) + vmsg("The haplotype database was provided and used to recover missing Ref_0001 and Alt_0002 sequences", verbose = verbose, level = 1) vmsg("The Ref_0001 sequence were added for: %s tags", verbose = verbose, level = 2, type = ">>", sum(ref_index==1)) vmsg("The Alt_0002 sequence were added for: %s tags", verbose = verbose, level = 2, type = ">>", sum(alt_index==1)) } else { - vmsg("The haplotype database was not provided. Tags with missing Ref_0001 or Alt_0002 sequences were flagged with warnings and removed from the analysis.", verbose = verbose, level = 1) + vmsg("The haplotype database was not provided. Tags with missing Ref_0001 or Alt_0002 sequences were flagged with warnings and removed from the analysis", verbose = verbose, level = 1) } vmsg("Tags discarded due to lack of Ref_0001 sequence: %s tags", verbose = verbose, level = 2, type = ">>", sum(ref_index==-1)) vmsg("Tags discarded due to lack of Alt_0002 sequence: %s tags", verbose = verbose, level = 2, type = ">>", sum(alt_index==-1)) @@ -295,8 +313,10 @@ loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, align vmsg("Pairwise alignments of sequences to recover SNP position, reference and alternative bases...", verbose = verbose, level = 0) clust <- makeCluster(n.cores) #clusterExport(clust, c("botloci", "compare", "nucleotideSubstitutionMatrix", "pairwiseAlignment", "DNAString", "reverseComplement")) - #clusterExport(clust, c("botloci", "alignment_score_thr", "mi_df")) - compare_results <- parLapply(clust, updated_by_cloneID, function(x) compare(x, botloci, alignment_score_thr, mi_df, verbose = FALSE)) + #clusterExport(clust, c("botloci", "alignment_score_thr", "mi_df", "add_others", "others_max_snps", "others_rm_with_indels")) + compare_results <- parLapply(clust, updated_by_cloneID, function(x) compare(x, botloci, alignment_score_thr, mi_df, + add_others = add_others, others_max_snps = others_max_snps, + others_rm_with_indels = others_rm_with_indels, verbose = FALSE)) stopCluster(clust) my_results_csv <- lapply(compare_results, "[[", 1) @@ -308,6 +328,8 @@ loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, align rm_N <- unlist(rm_N) rm_indels <- sapply(compare_results, "[[", 4) rm_indels <- unlist(rm_indels) + n_rm_others_indels <- sum(sapply(compare_results, "[[", 5)) + n_rm_others_maxsnps <- sum(sapply(compare_results, "[[", 6)) vmsg("Number of tags removed because of low alignment score (threshold = %s): %s tags", verbose = verbose, level = 2, type = ">>", alignment_score_thr, length(rm_score)) vmsg("Number of tags removed because of N in the alternative sequence: %s tags", verbose = verbose, level = 2, type = ">>", length(rm_N)) @@ -320,6 +342,22 @@ loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, align } else { vmsg("Number of tags removed because of indels as targets: 0 tags", verbose = verbose, level = 2, type = ">>") } + n_others_total <- sum(sapply(compare_results, "[[", 7)) + n_others_kept <- n_others_total - n_rm_others_indels - n_rm_others_maxsnps + others_added_info <- unlist(lapply(compare_results, "[[", 8)) + if(add_others) { + vmsg("Number of Other alleles found: %s (%s kept after filters, %s discarded)", verbose = verbose, level = 2, type = ">>", n_others_total, n_others_kept, n_others_total - n_others_kept) + if(others_rm_with_indels) + vmsg("Number of Other alleles discarded due to indels vs Ref: %s", verbose = verbose, level = 2, type = ">>", n_rm_others_indels) + if(!is.null(others_max_snps)) + vmsg("Number of Other alleles discarded due to exceeding max SNPs (%s): %s", verbose = verbose, level = 2, type = ">>", others_max_snps, n_rm_others_maxsnps) + # if(length(others_added_info) > 0) { + # vmsg("Others tags added:", verbose = verbose, level = 3, type = ">>") + # for(msg in others_added_info) vmsg(" %s", verbose = verbose, level = 3, type = ">>", msg) + # } + } else { + vmsg("Number of Other alleles found: %s (not processed, add_others = FALSE)", verbose = verbose, level = 2, type = ">>", n_others_total) + } vmsg("Pairwise alignments concluded!", verbose = verbose, level = 1) @@ -416,10 +454,10 @@ add_ref_alt <- function(one_tag, hap_seq, nsamples, verbose = TRUE) { #' @importFrom pwalign pairwiseAlignment nucleotideSubstitutionMatrix #' #' @noRd -compare <- function(one_tag, botloci, alignment_score_thr = 40, mi_df= NULL, verbose = FALSE){ - # for(i in 1507:length(updated_by_cloneID)){ - # one_tag <- updated_by_cloneID[[i]] +compare <- function(one_tag, botloci, alignment_score_thr = 40, mi_df = NULL, add_others = TRUE, others_max_snps = NULL, others_rm_with_indels = TRUE, verbose = FALSE){ + #idx <- which(names(updated_by_cloneID) == "Ra01_020534029") + #one_tag <- updated_by_cloneID[[idx]] cloneID <- one_tag$CloneID[1] isBotLoci <- cloneID %in% botloci[,1] @@ -450,7 +488,11 @@ compare <- function(one_tag, botloci, alignment_score_thr = 40, mi_df= NULL, ver return(list(update_tag = update_tag, rm_score = NULL, rm_N = NULL, - rm_indels = NULL)) + rm_indels = NULL, + n_rm_others_indels = 0L, + n_rm_others_maxsnps = 0L, + n_others_total = 0L, + others_added_info = character(0))) } # If marker is present in the botloci list, get the reverse complement sequence @@ -480,7 +522,11 @@ compare <- function(one_tag, botloci, alignment_score_thr = 40, mi_df= NULL, ver return(list(update_tag = NULL, rm_score = NULL, rm_N = NULL, - rm_indels= cloneID)) + rm_indels = cloneID, + n_rm_others_indels = 0L, + n_rm_others_maxsnps = 0L, + n_others_total = 0L, + others_added_info = character(0))) } ref_base <- substring(ref_seq, align@pattern@mismatch@unlistData, align@pattern@mismatch@unlistData) alt_base <- substring(alt_seq, align@subject@mismatch@unlistData, align@subject@mismatch@unlistData) @@ -506,50 +552,126 @@ compare <- function(one_tag, botloci, alignment_score_thr = 40, mi_df= NULL, ver if(length(rm_target) >0) pos_ref_idx <- pos_ref_idx[-rm_target] # Cases found where the AltMatch is another alternative for the target SNP - they are discarted if(length(pos_ref_idx) >0){ - ref_base <- substring(ref_seq, pos_ref_idx, pos_ref_idx) + ref_base_match <- substring(ref_seq, pos_ref_idx, pos_ref_idx) pos_alt_idx <- align@subject@mismatch@unlistData # If there are indels, the position in the alternative is not the same as the reference if(length(rm_target) >0) pos_alt_idx <- pos_alt_idx[-rm_target] # remove target position when is AltMatch - but the order in the sequence is the same - alt_base <- substring(Match_seq[j,]$AlleleSequence, pos_alt_idx, pos_alt_idx) + alt_base_match <- substring(Match_seq[j,]$AlleleSequence, pos_alt_idx, pos_alt_idx) + + # If Match sequences have N, do not consider as polymorphism + if(any(!alt_base_match %in% c("A", "T", "C", "G"))) { + ref_base_match <- ref_base_match[-which(!alt_base_match %in% c("A", "T", "C", "G"))] + pos_ref_idx <- pos_ref_idx[-which(!alt_base_match %in% c("A", "T", "C", "G"))] + alt_base_match <- alt_base_match[-which(!alt_base_match %in% c("A", "T", "C", "G"))] + } + + if(length(alt_base_match) >0){ # If the N is the only polymorphis found, the Match tag will be discarted + # The reported position is always on reference + pos <- pos_target - (pos_target_idx - pos_ref_idx) + + # Sometimes there are more than one polymorphism in the sequence, we need to add rows to the table + update_tag_temp <- one_tag[grep("Match",one_tag$AlleleID)[j],][rep(1, length(alt_base_match)), ] + + update_tag_temp$Chromosome <- chr + update_tag_temp$SNP_position_in_Genome <- pos + update_tag_temp$Ref <- ref_base_match + update_tag_temp$Alt <- alt_base_match + + update_tag <- rbind(update_tag, update_tag_temp) + } + } + } + } + others_seq <- one_tag[grep("Other",one_tag$AlleleID),] + n_others_total <- nrow(others_seq) + n_rm_others_indels <- 0L + n_rm_others_maxsnps <- 0L + others_added_info <- character(0) + + if(add_others && nrow(others_seq) > 0){ + for(j in seq_len(nrow(others_seq))){ + align <- pairwiseAlignment(ref_seq, # Align with the reference + others_seq[j,]$AlleleSequence, + substitutionMatrix = sigma,gapOpening=-1.4, gapExtension=-0.1, type = "global") + # Filter: discard Others with indels relative to Ref + if(others_rm_with_indels && + (length(align@pattern@indel@unlistData) > 0 | length(align@subject@indel@unlistData) > 0)) { + n_rm_others_indels <- n_rm_others_indels + 1L + next + } + pos_ref_idx <- align@pattern@mismatch@unlistData + pos_alt_idx <- align@subject@mismatch@unlistData + # Filter: discard Others with too many SNPs vs Ref (count before removing target position) + if(!is.null(others_max_snps) && length(pos_ref_idx) > others_max_snps) { + n_rm_others_maxsnps <- n_rm_others_maxsnps + 1L + next + } + rm_target_other <- which(pos_ref_idx == pos_target_idx) # remove target position if base is the same as Ref or Alt + if(length(rm_target_other) > 0) { + other_tag_base <- substring(others_seq[j,]$AlleleSequence, pos_target_idx, pos_target_idx) + if(other_tag_base == ref_base | other_tag_base == alt_base){ # If Other has same base as REF and ALT, it won't be considered in their counts + pos_ref_idx <- pos_ref_idx[-rm_target_other] + pos_alt_idx <- pos_alt_idx[-rm_target_other] + } + } + other_ref_base <- substring(ref_seq, pos_ref_idx, pos_ref_idx) + other_alt_base <- substring(others_seq[j,]$AlleleSequence, pos_alt_idx, pos_alt_idx) + # Cases found where the AltMatch is another alternative for the target SNP - they are discarted + if(length(pos_ref_idx) >0){ # If Match sequences have N, do not consider as polymorphism - if(any(!alt_base %in% c("A", "T", "C", "G"))) { - ref_base <- ref_base[-which(!alt_base %in% c("A", "T", "C", "G"))] - pos_ref_idx <- pos_ref_idx[-which(!alt_base %in% c("A", "T", "C", "G"))] - alt_base <- alt_base[-which(!alt_base %in% c("A", "T", "C", "G"))] + if(any(!other_alt_base %in% c("A", "T", "C", "G"))) { + other_ref_base <- other_ref_base[-which(!other_alt_base %in% c("A", "T", "C", "G"))] + pos_ref_idx <- pos_ref_idx[-which(!other_alt_base %in% c("A", "T", "C", "G"))] + other_alt_base <- other_alt_base[-which(!other_alt_base %in% c("A", "T", "C", "G"))] } - if(length(alt_base) >0){ # If the N is the only polymorphis found, the Match tag will be discarted + if(length(other_alt_base) >0){ # If the N is the only polymorphis found, the Match tag will be discarted # The reported position is always on reference pos <- pos_target - (pos_target_idx - pos_ref_idx) # Sometimes there are more than one polymorphism in the sequence, we need to add rows to the table - update_tag_temp <- one_tag[grep("Match",one_tag$AlleleID)[j],][rep(1, length(alt_base)), ] + update_tag_temp <- one_tag[grep("Other",one_tag$AlleleID)[j],][rep(1, length(other_alt_base)), ] update_tag_temp$Chromosome <- chr update_tag_temp$SNP_position_in_Genome <- pos - update_tag_temp$Ref <- ref_base - update_tag_temp$Alt <- alt_base + update_tag_temp$Ref <- other_ref_base + update_tag_temp$Alt <- other_alt_base update_tag <- rbind(update_tag, update_tag_temp) + others_added_info <- c(others_added_info, + paste0(others_seq[j,]$AlleleID, " -> position(s): ", paste(pos, collapse = ", "))) } } } } + return(list(update_tag = update_tag, # updated data.frame, NULL if discarted rm_score = NULL, # cloneID if removed because of low alignment score, NULL if kept rm_N = NULL, - rm_indels = NULL)) # cloneID if removed because of N in the target alternative, NULL if kept + rm_indels = NULL, + n_rm_others_indels = n_rm_others_indels, + n_rm_others_maxsnps = n_rm_others_maxsnps, + n_others_total = n_others_total, + others_added_info = others_added_info)) } else { return(list(update_tag = NULL, rm_score = NULL, rm_N = cloneID, - rm_indels = NULL)) + rm_indels = NULL, + n_rm_others_indels = 0L, + n_rm_others_maxsnps = 0L, + n_others_total = 0L, + others_added_info = character(0))) } } else{ return(list(update_tag = NULL, rm_score = cloneID, rm_N = NULL, - rm_indels = NULL)) + rm_indels = NULL, + n_rm_others_indels = 0L, + n_rm_others_maxsnps = 0L, + n_others_total = 0L, + others_added_info = character(0))) } } @@ -636,10 +758,12 @@ create_VCF_body <- function(csv, rm_filter <- sapply(vcf_tag_list, "[[", 5) # removed because empty after filtering kept_multi <- sapply(vcf_tag_list, "[[", 6) # kept as multiallelic simplified <- sapply(vcf_tag_list, "[[", 7) # simplified to biallelic + multi_others_target <- sapply(vcf_tag_list, "[[", 8) # multiallelic target from Others vmsg("Performing final filterings", verbose = verbose, level = 0, type = ">>") vmsg("Multiallelic off-target SNPs found: %s", verbose = verbose, level = 2, type = ">>", sum(total_mks)) + vmsg("Multiallelic target SNPs with a 3rd allele from Others: %s", verbose = verbose, level = 2, type = ">>", sum(multi_others_target)) if(rm_multiallelic_SNP) { vmsg("Removed (rm_multiallelic_SNP = TRUE): %s", verbose = verbose, level = 3, type = ">>", sum(rm_setting)) } else if(multiallelic_SNP_dp_thr > 0 & multiallelic_SNP_sample_thr > 0) { @@ -668,7 +792,7 @@ create_VCF_body <- function(csv, if(length(which(duplicated(vcf_body[,3]))) > 0){ repeated <- vcf_body[which(duplicated(vcf_body[,3])), 4] - vmsg("Different primers pair capture same SNP positions in %s locations. The repeated were discarded.", verbose = verbose, level = 2, length(repeated)) + vmsg("Different primers pair capture same SNP positions in %s locations. The repeated were discarded", verbose = verbose, level = 2, type = ">>", length(repeated)) repeated_tab <- vcf_body[which(vcf_body[,4] %in% repeated),] vcf_body_new <- vcf_body[-which(vcf_body[,4] %in% repeated),] @@ -711,31 +835,80 @@ create_VCF_body <- function(csv, #' aspect of the marker, the marker is discarded. This is likely to happen to paralogous sites. #' #' @noRd -merge_counts <- function(cloneID_unit, rm_multiallelic_SNP = FALSE, multiallelic_SNP_dp_thr = 0, multiallelic_SNP_sample_thr = 0, pad_width = NULL){ - +merge_counts <- function(cloneID_unit, rm_multiallelic_SNP = FALSE, multiallelic_SNP_dp_thr = 0, + multiallelic_SNP_sample_thr = 0, pad_width = NULL){ + #cloneID_unit <- cloneID[[250]] #Get counts for target SNP rm_by_setting <- 0 # removed because rm_multiallelic_SNP = TRUE rm_by_filter <- 0 # removed because empty after threshold filtering kept_multiallelic <- 0 # kept as-is (still multiallelic after filtering or no filter) simplified <- 0 # simplified from multiallelic to biallelic by filtering total_multiallelic <- 0 - RefTag <- apply(cloneID_unit[which(grepl("Ref", cloneID_unit$AlleleID) & !duplicated(cloneID_unit$AlleleID)),-c(1:7)], 2, sum) - AltTag <- apply(cloneID_unit[which(grepl("Alt", cloneID_unit$AlleleID) & !duplicated(cloneID_unit$AlleleID)),-c(1:7)], 2, sum) - tab_counts <- paste0(RefTag + AltTag, ":", RefTag, ":", RefTag, ",", AltTag) + multiallelic_others_target <- 0 # target SNPs with a 3rd allele from Others + + # Target marker + RefTag <- apply(cloneID_unit[which((grepl("Ref_0001$", cloneID_unit$AlleleID) | grepl("RefMatch", cloneID_unit$AlleleID)) & !duplicated(cloneID_unit$AlleleID)), -c(1:7)], 2, sum) + AltTag <- apply(cloneID_unit[which((grepl("Alt_0002$", cloneID_unit$AlleleID) | grepl("AltMatch", cloneID_unit$AlleleID)) & !duplicated(cloneID_unit$AlleleID)), -c(1:7)], 2, sum) cloneID <- cloneID_unit$CloneID[1] if(is.null(pad_width)) pad_width <- nchar(sub(".*_", "", cloneID)) - info <- cloneID_unit[grep("Ref_", cloneID_unit$AlleleID),] + info <- cloneID_unit[grep("Ref_0001$", cloneID_unit$AlleleID),] + + # In case there are Others that add multiallelics to targets + others_target <- cloneID_unit[,3] %in% cloneID_unit[grep("Ref_0001$", cloneID_unit$AlleleID),3] + if(sum(others_target) > 2 & !rm_multiallelic_SNP){ # If target is multiallelic + multiallelic_others_target <- 1 + idx_other <- which(others_target & !grepl("Ref_0001$", cloneID_unit$AlleleID) & !grepl("Alt_0002$", cloneID_unit$AlleleID)) + other_alts <- unique(cloneID_unit[idx_other,5]) + other_alts_info <- cloneID_unit[idx_other,] + OtherTag_list <- list() + total <- rep(0, length(RefTag)) + ads <- vector() + tab_counts <- paste0(RefTag + AltTag + total, ":", RefTag, ":", RefTag, ",", AltTag) + for(j in 1:length(other_alts)){ + temp_other <- which(other_alts_info[,5] == other_alts[j]) + OtherTag_list[[j]] <- apply(other_alts_info[temp_other, -c(1:7)], 2, sum) + total_temp <- OtherTag_list[[j]] + + if(multiallelic_SNP_dp_thr > 0 & multiallelic_SNP_sample_thr > 0){ # If not removed, user can set threshold to remove low frequency alleles + if(sum(total_temp > multiallelic_SNP_dp_thr) < multiallelic_SNP_sample_thr) next() + } + total <- total + total_temp + tab_counts <- paste0(tab_counts, ",",OtherTag_list[[j]]) + ads_temp <- sum(OtherTag_list[[j]]) + ads <- paste0(ads, ",", ads_temp) + } + alts <- paste0(info$Alt, ",", paste0(other_alts, collapse = ",")) + info_mk <- paste0("DP=", sum(c(RefTag, AltTag,total)),";", + "ADS=",sum(RefTag),",",sum(AltTag), ads) + } else { + tab_counts <- paste0(RefTag + AltTag, ":", RefTag, ":", RefTag, AltTag) + alts <- info$Alt + info_mk <- paste0("DP=", sum(c(RefTag, AltTag)),";", + "ADS=",sum(RefTag),",",sum(AltTag)) + } + info <- c(info$Chromosome, info$SNP_position_in_Genome, cloneID, - info$Ref, info$Alt, ".", ".", paste0("DP=", sum(c(RefTag, AltTag)),";", - "ADS=",sum(RefTag),",",sum(AltTag)), "DP:RA:AD") + info$Ref, + alts, + ".", + ".", + info_mk, + "DP:RA:AD") vcf_tag <- c(info, tab_counts) # Check if there are more than one alternative allele by loci - off_tag <- cloneID_unit[-which(grepl("Ref_", cloneID_unit$AlleleID) | grepl("Alt_", cloneID_unit$AlleleID)),] + rm_tags <- which(grepl("Ref_0001$", cloneID_unit$AlleleID) | grepl("Alt_0002$", cloneID_unit$AlleleID)) + if(sum(others_target) > 2){ + idx_other <- which(others_target & !grepl("Ref_0001$", cloneID_unit$AlleleID) & !grepl("Alt_0002$", cloneID_unit$AlleleID)) + off_tag <- cloneID_unit[-c(rm_tags,idx_other),] + } else { + off_tag <- cloneID_unit[-rm_tags,] + } + if(nrow(off_tag)){ # If there are off target SNP by_pos <- split.data.frame(off_tag, off_tag$SNP_position_in_Genome) @@ -818,5 +991,5 @@ merge_counts <- function(cloneID_unit, rm_multiallelic_SNP = FALSE, multiallelic } } - return(list(vcf_tag, rm_by_setting + rm_by_filter, total_multiallelic, rm_by_setting, rm_by_filter, kept_multiallelic, simplified)) + return(list(vcf_tag, rm_by_setting + rm_by_filter, total_multiallelic, rm_by_setting, rm_by_filter, kept_multiallelic, simplified, multiallelic_others_target)) } diff --git a/man/madc2vcf_all.Rd b/man/madc2vcf_all.Rd index ac2de3f..a1e8c92 100644 --- a/man/madc2vcf_all.Rd +++ b/man/madc2vcf_all.Rd @@ -15,6 +15,9 @@ madc2vcf_all( alignment_score_thr = 40, out_vcf = NULL, markers_info = NULL, + add_others = TRUE, + others_max_snps = 5, + others_rm_with_indels = TRUE, verbose = TRUE ) } @@ -37,6 +40,14 @@ madc2vcf_all( \item{out_vcf}{A string specifying the name of the output VCF file. If the file extension is not \code{.vcf}, it will be appended automatically.} +\item{markers_info}{A string specifying the path to a CSV file with marker information (CloneID/BI_markerID, Chr, Pos, Ref, Alt, Type, Indel_pos columns as needed).} + +\item{add_others}{A logical value. If TRUE, alleles labeled "Other" in the MADC file are included in off-target SNP extraction. Default is TRUE.} + +\item{others_max_snps}{An integer or NULL. If not NULL, Other alleles with more than this many SNP differences versus the Ref sequence (as detected by pairwise alignment) are discarded. Default is NULL (no limit).} + +\item{others_rm_with_indels}{A logical value. If TRUE, Other alleles that contain insertions or deletions relative to the Ref sequence (as detected by pairwise alignment) are discarded. Default is TRUE.} + \item{verbose}{A logical value indicating whether to print metrics and progress to the console. Default is TRUE.} } \value{ From 84852dabc10214a50ffc8840a6999db1e4c1dab1 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Wed, 1 Apr 2026 10:44:36 -0400 Subject: [PATCH 28/37] up version --- DESCRIPTION | 2 +- NEWS.md | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index bf782ad..f395fbb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: BIGr Title: Breeding Insight Genomics Functions for Polyploid and Diploid Species -Version: 0.6.5 +Version: 0.6.6 Authors@R: c(person(given='Alexander M.', family='Sandercock', email='sandercock.alex@gmail.com', diff --git a/NEWS.md b/NEWS.md index 8037725..6167003 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,17 @@ +# BIGr 0.6.6 + +# Updates on `madc2vcf_all` + +- New arguments for controlling processing of `Other` alleles: + - `add_others`: if `TRUE` (default), alleles labeled "Other" in the MADC are included in off-target SNP extraction + - `others_max_snps`: discards Other alleles with more than this many SNP differences relative to the Ref sequence (default: 5) + - `others_rm_with_indels`: discards Other alleles containing insertions or deletions relative to the Ref sequence (default: `TRUE`) +- Others alleles that carry a different base at the target SNP position are now reported as a 3rd allele in the VCF instead of being silently dropped +- Target position is now correctly removed from Others alignments, preventing duplicate VCF positions and marker IDs +- Fixed a bug where Others alleles with "Ref_" or "Alt_" in their AlleleID would corrupt the target SNP REF/ALT fields and read depth counts in `merge_counts` +- Improved verbose messages throughout: counts of Other alleles found, kept, and discarded (by indel filter and by max SNP filter) are now reported; multiallelic target SNPs with a 3rd allele from Others are counted and reported +- Debug-level message (level 3) listing each Other allele added and its genomic position + # BIGr 0.6.5 # Updates on madc2vcf functions From 96a4ed14a62efa8f0cfe3f925d8c0bda0a216ccc Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Wed, 1 Apr 2026 14:31:23 -0400 Subject: [PATCH 29/37] add madc2vcf_multi --- DESCRIPTION | 3 +- NAMESPACE | 7 + NEWS.md | 18 +++ R/check_madc_sanity.R | 5 +- R/get_countsMADC.R | 10 +- R/madc2vcf_multi.R | 173 ++++++++++++++++++++++++ man/madc2vcf_multi.Rd | 59 ++++++++ tests/testthat/test-check_madc_sanity.R | 2 +- tests/testthat/test-check_ped.R | 2 +- tests/testthat/test-madc2vcf_all.R | 34 ++--- tests/testthat/test-madc2vcf_multi.R | 144 ++++++++++++++++++++ tests/testthat/test-madc2vcf_targets.R | 6 +- 12 files changed, 433 insertions(+), 30 deletions(-) create mode 100644 R/madc2vcf_multi.R create mode 100644 man/madc2vcf_multi.Rd create mode 100644 tests/testthat/test-madc2vcf_multi.R diff --git a/DESCRIPTION b/DESCRIPTION index f395fbb..255ec4f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: BIGr Title: Breeding Insight Genomics Functions for Polyploid and Diploid Species -Version: 0.6.6 +Version: 0.6.7 Authors@R: c(person(given='Alexander M.', family='Sandercock', email='sandercock.alex@gmail.com', @@ -69,5 +69,6 @@ Suggests: rmdformats, knitr (>= 1.10), rmarkdown, + polyRAD, testthat (>= 3.0.0) RdMacros: Rdpack diff --git a/NAMESPACE b/NAMESPACE index 6ecaef4..f3a2e76 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -16,6 +16,7 @@ export(get_countsMADC) export(imputation_concordance) export(madc2gmat) export(madc2vcf_all) +export(madc2vcf_multi) export(madc2vcf_targets) export(merge_MADCs) export(solve_composition_poly) @@ -39,11 +40,17 @@ importFrom(Rdpack,reprompt) importFrom(Rsamtools,bgzip) importFrom(dplyr,"%>%") importFrom(dplyr,across) +importFrom(dplyr,case_when) +importFrom(dplyr,filter) importFrom(dplyr,group_by) importFrom(dplyr,mutate) importFrom(dplyr,select) importFrom(dplyr,summarise) importFrom(dplyr,where) +importFrom(polyRAD,IterateHWE) +importFrom(polyRAD,RADdata2VCF) +importFrom(polyRAD,TestOverdispersion) +importFrom(polyRAD,readDArTag) importFrom(pwalign,nucleotideSubstitutionMatrix) importFrom(pwalign,pairwiseAlignment) importFrom(readr,read_csv) diff --git a/NEWS.md b/NEWS.md index 6167003..0eff4c3 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,21 @@ +# BIGr 0.6.7 + +# New function `madc2vcf_multi` + +- New function `madc2vcf_multi` to convert a DArTag MADC file to a VCF using the polyRAD pipeline for multiallelic genotyping +- Runs `check_madc_sanity` before loading the data and stops with informative errors if: + - Required columns are missing + - IUPAC (non-ATCG) codes are present in AlleleSequence + - Ref/Alt sequences are unpaired (`RefAltSeqs = FALSE`) + - Allele IDs have not been fixed by HapApp (`FixAlleleIDs = FALSE`) + - CloneIDs do not follow `Chr_Pos` format and no `markers_info` is provided +- New argument `markers_info`: optional path or URL to a CSV with `CloneID`/`BI_markerID`, `Chr`, and `Pos` columns; required when CloneIDs do not follow the `Chr_Pos` format +- Runs `check_botloci` to validate and reconcile CloneIDs between the MADC and botloci file, automatically fixing padding mismatches +- A corrected temp file is written and passed to `readDArTag` only when needed (all-NA rows/columns detected, CloneIDs remapped by `check_botloci`, or botloci IDs remapped) +- Accepts paths or URLs for `madc_file`, `botloci_file`, and `markers_info` +- Estimates overdispersion with `polyRAD::TestOverdispersion`, iterates priors with `polyRAD::IterateHWE`, and exports the result with `polyRAD::RADdata2VCF` +- `polyRAD` is a soft dependency (listed under `Suggests`); an informative error is raised if it is not installed + # BIGr 0.6.6 # Updates on `madc2vcf_all` diff --git a/R/check_madc_sanity.R b/R/check_madc_sanity.R index c36a92e..2248779 100644 --- a/R/check_madc_sanity.R +++ b/R/check_madc_sanity.R @@ -129,10 +129,11 @@ check_madc_sanity <- function(report) { # Complex indel: same length but >1 character difference between sequences same_len <- cmp_ok & (ref_len == alt_len) if (any(same_len)) { - n_diffs <- mapply(function(r, a) { + n_diffs <- mapply(function(r, a) { r_chars <- strsplit(r, "")[[1]] a_chars <- strsplit(a, "")[[1]] - sum(toupper(r_chars) != toupper(a_chars)) + standard <- toupper(r_chars) %in% c("A","T","C","G") & toupper(a_chars) %in% c("A","T","C","G") + sum(toupper(r_chars[standard]) != toupper(a_chars[standard])) }, merged$AlleleSequence_ref[same_len], merged$AlleleSequence_alt[same_len]) indel_mask[same_len] <- n_diffs > 1 } diff --git a/R/get_countsMADC.R b/R/get_countsMADC.R index 58a2496..396004a 100644 --- a/R/get_countsMADC.R +++ b/R/get_countsMADC.R @@ -177,7 +177,7 @@ get_countsMADC <- function(madc_file = NULL, madc_object = NULL, collapse_matche #' count columns, etc.). #' #' @importFrom dplyr mutate group_by summarise across where select -#' @importFrom dplyr %>% +#' @importFrom dplyr %>% filter case_when #' #' @keywords internal get_counts <- function(madc_file = NULL, madc_object = NULL, collapse_matches_counts = FALSE, verbose = TRUE) { @@ -224,11 +224,11 @@ get_counts <- function(madc_file = NULL, madc_object = NULL, collapse_matches_co if(collapse_matches_counts){ filtered_df <- madc_df[order(madc_df$AlleleID),] %>% # Keep only Ref/Alt alleles and their Match variants; drop other allele types - dplyr::filter(grepl("\\|(Ref|Alt)(Match)?\\b", AlleleID)) %>% + filter(grepl("\\|(Ref|Alt)(Match)?(_|$)", AlleleID)) %>% mutate( - Type = dplyr::case_when( - grepl("\\|Alt(Match)?\\b", AlleleID) ~ "Alt", - grepl("\\|Ref(Match)?\\b", AlleleID) ~ "Ref" + Type = case_when( + grepl("\\|Alt(Match)?(_|$)", AlleleID) ~ "Alt", + grepl("\\|Ref(Match)?(_|$)", AlleleID) ~ "Ref" ) ) %>% group_by(CloneID, Type) %>% diff --git a/R/madc2vcf_multi.R b/R/madc2vcf_multi.R new file mode 100644 index 0000000..6e86b45 --- /dev/null +++ b/R/madc2vcf_multi.R @@ -0,0 +1,173 @@ +#' Convert MADC file to VCF using polyRAD for multiallelic genotyping +#' +#' This function converts a DArTag MADC file to a VCF using the polyRAD package's +#' `readDArTag` and `RADdata2VCF` pipeline. It runs `check_madc_sanity` before +#' loading the data, applies corrections for lowercase sequences and all-NA +#' rows/columns, and sets `n.header.rows` automatically based on whether the +#' MADC file follows the raw DArT format (6 header rows) or the fixed allele ID +#' format (no header rows). +#' +#' @param madc_file character. Path or URL to the input MADC CSV file. +#' @param botloci_file character. Path or URL to the botloci file listing target +#' IDs designed on the bottom strand. +#' @param outfile character. Path for the output VCF file. +#' @param markers_info character or NULL. Optional path or URL to a CSV file +#' with marker metadata. Required when CloneIDs do not follow the +#' \code{Chr_Pos} format; must contain \code{CloneID} (or +#' \code{BI_markerID}), \code{Chr}, and \code{Pos} columns. +#' @param ploidy integer. Ploidy level of the samples passed to \code{taxaPloidy}. +#' Default is 2. +#' @param verbose logical. Whether to print progress messages. Default is TRUE. +#' +#' @return Invisible NULL. Writes a VCF file to \code{outfile}. +#' +#' @details +#' The function performs the following steps: +#' \enumerate{ +#' \item Reads the MADC file and runs \code{check_madc_sanity}. +#' \item Validates the botloci file against MADC CloneIDs using +#' \code{check_botloci}, fixing any padding mismatches automatically. +#' \item Converts lowercase bases in AlleleSequence to uppercase if detected. +#' \item Removes all-NA rows and columns if detected. +#' \item Writes the corrected data to a temporary file and passes it to +#' \code{polyRAD::readDArTag}. +#' \item Estimates overdispersion with \code{polyRAD::TestOverdispersion} and +#' calls \code{polyRAD::IterateHWE}, then exports the result with +#' \code{polyRAD::RADdata2VCF}. +#' } +#' +#' @importFrom utils read.csv write.csv read.table +#' +#' @export +madc2vcf_multi <- function(madc_file, + botloci_file, + outfile, + markers_info = NULL, + ploidy = 2L, + verbose = TRUE) { + + vmsg("Checking inputs", verbose = verbose, level = 0, type = ">>") + + if (!(file.exists(madc_file) | url_exists(madc_file))) stop("MADC file not found. Please provide a valid path or URL.") + if (!(file.exists(botloci_file) | url_exists(botloci_file))) stop("Botloci file not found. Please provide a valid path or URL.") + if (!is.null(markers_info) && !(file.exists(markers_info) | url_exists(markers_info))) stop("markers_info file not found. Please provide a valid path or URL.") + if (!is.numeric(ploidy) || ploidy < 1) stop("ploidy must be a positive integer.") + + # ---- Load markers_info if provided ---- + mi_df <- if (!is.null(markers_info)) read.csv(markers_info) else NULL + + # ---- Read and sanity-check ---- + report <- read.csv(madc_file, check.names = FALSE) + checks <- check_madc_sanity(report) + + messages_results <- mapply(function(check, message) { + if (check) message[1] else message[2] + }, checks$checks, checks$messages) + + for (i in seq_along(messages_results)) + vmsg(messages_results[i], verbose = verbose, level = 1, type = ">>") + + if (!checks$checks["Columns"]) + stop("The MADC file is missing required columns (CloneID, AlleleID, AlleleSequence)") + + if (checks$checks["IUPACcodes"]) + stop("MADC Allele Sequences contain IUPAC (non-ATCG) codes. Please run HapApp to clean MADC file before using this function.") + + if (!isTRUE(checks$checks["RefAltSeqs"])) + stop("Not all Ref sequences have a corresponding Alt or vice versa. Please provide a complete MADC file before using this function.") + + if (!isTRUE(checks$checks["FixAlleleIDs"])) + stop("The MADC file does not have fixed AlleleIDs. Please process the MADC file through HapApp before using this function.") + + if (!isTRUE(checks$checks["ChromPos"])) { + if (is.null(markers_info)) + stop("CloneID column does not follow the 'Chr_Pos' format. ", + "Please provide a markers_info file with at least 'CloneID'/'BI_markerID', ", + "'Chr', and 'Pos' columns.") + if (!all(c("Chr", "Pos") %in% colnames(mi_df))) + stop("CloneID column does not follow the 'Chr_Pos' format. ", + "markers_info must contain at least 'Chr' and 'Pos' columns to remap marker IDs.") + } + + # ---- Check botloci vs MADC CloneIDs ---- + vmsg("Checking botloci file", verbose = verbose, level = 0, type = ">>") + cloneids_before <- report$CloneID + botloci_df <- read.table(botloci_file, header = FALSE) + botloci_before <- botloci_df$V1 + checked_botloci <- check_botloci(botloci_df, report, ChromPos = checks$checks["ChromPos"], mi_df = mi_df, verbose = verbose) + botloci_df <- checked_botloci[[1]] + report <- checked_botloci[[2]] + mi_df <- checked_botloci[[3]] + cloneid_changed <- !identical(report$CloneID, cloneids_before) + botloci_changed <- !identical(botloci_df$V1, botloci_before) + + # ---- Botloci temp file (if IDs were remapped) ---- + if (botloci_changed) { + tmp_botloci <- tempfile() + on.exit(unlink(tmp_botloci), add = TRUE) + write.table(botloci_df, tmp_botloci, row.names = FALSE, col.names = FALSE, quote = FALSE) + botloci_input <- tmp_botloci + } else { + botloci_input <- botloci_file + } + + # ---- Corrections: only create a temp file if needed ---- + need_temp <- isTRUE(checks$checks["allNArow"]) || isTRUE(checks$checks["allNAcol"]) || cloneid_changed + + if (need_temp) { + if (checks$checks["LowerCase"]) { + vmsg("MADC Allele Sequences contain lowercase characters. Converting to uppercase", + verbose = verbose, level = 1, type = ">>") + report$AlleleSequence <- toupper(report$AlleleSequence) + } + + if (checks$checks["allNArow"]) { + idx <- apply(report, 1, function(x) all(is.na(x) | x == "")) + vmsg("Removing %s all-NA row(s)", verbose = verbose, level = 1, type = ">>", sum(idx)) + report <- report[!idx, ] + } + + if (checks$checks["allNAcol"]) { + idx <- apply(report, 2, function(x) all(is.na(x) | x == "")) + vmsg("Removing %s all-NA column(s)", verbose = verbose, level = 1, type = ">>", sum(idx)) + report <- report[, !idx] + } + + tmp_madc <- tempfile(fileext = ".csv") + on.exit(unlink(tmp_madc), add = TRUE) + write.csv(report, tmp_madc, row.names = FALSE, quote = TRUE) + input_file <- tmp_madc + } else { + if (checks$checks["LowerCase"]) + vmsg("MADC Allele Sequences contain lowercase characters. polyRAD will handle them", + verbose = verbose, level = 1, type = ">>") + input_file <- madc_file + } + + vmsg("Loading MADC into polyRAD", verbose = verbose, level = 0, type = ">>") + + raddat <- polyRAD::readDArTag( + file = input_file, + botloci = botloci_input, + n.header.rows = 0L, + sample.name.row = 1, + trim.sample.names = "", + taxaPloidy = as.integer(ploidy) + ) + + overdispersionP <- polyRAD::TestOverdispersion(raddat) + my_ovdisp <- overdispersionP$optimal + + vmsg("Running HWE iteration (overdispersion = %s)", verbose = verbose, level = 0, type = ">>", my_ovdisp) + + raddat_hwe <- polyRAD::IterateHWE(raddat, overdispersion = my_ovdisp) + + vmsg("Writing VCF to %s", verbose = verbose, level = 0, type = ">>", outfile) + + polyRAD::RADdata2VCF(raddat_hwe, file = outfile, asSNPs = FALSE, hindhe = FALSE) + + vmsg("Done!", verbose = verbose, level = 0, type = ">>") + + invisible(NULL) +} + diff --git a/man/madc2vcf_multi.Rd b/man/madc2vcf_multi.Rd new file mode 100644 index 0000000..5c3a777 --- /dev/null +++ b/man/madc2vcf_multi.Rd @@ -0,0 +1,59 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/madc2vcf_multi.R +\name{madc2vcf_multi} +\alias{madc2vcf_multi} +\title{Convert MADC file to VCF using polyRAD for multiallelic genotyping} +\usage{ +madc2vcf_multi( + madc_file, + botloci_file, + outfile, + markers_info = NULL, + ploidy = 2L, + verbose = TRUE +) +} +\arguments{ +\item{madc_file}{character. Path or URL to the input MADC CSV file.} + +\item{botloci_file}{character. Path or URL to the botloci file listing target +IDs designed on the bottom strand.} + +\item{outfile}{character. Path for the output VCF file.} + +\item{markers_info}{character or NULL. Optional path or URL to a CSV file +with marker metadata. Required when CloneIDs do not follow the +\code{Chr_Pos} format; must contain \code{CloneID} (or +\code{BI_markerID}), \code{Chr}, and \code{Pos} columns.} + +\item{ploidy}{integer. Ploidy level of the samples passed to \code{taxaPloidy}. +Default is 2.} + +\item{verbose}{logical. Whether to print progress messages. Default is TRUE.} +} +\value{ +Invisible NULL. Writes a VCF file to \code{outfile}. +} +\description{ +This function converts a DArTag MADC file to a VCF using the polyRAD package's +\code{readDArTag} and \code{RADdata2VCF} pipeline. It runs \code{check_madc_sanity} before +loading the data, applies corrections for lowercase sequences and all-NA +rows/columns, and sets \code{n.header.rows} automatically based on whether the +MADC file follows the raw DArT format (6 header rows) or the fixed allele ID +format (no header rows). +} +\details{ +The function performs the following steps: +\enumerate{ +\item Reads the MADC file and runs \code{check_madc_sanity}. +\item Validates the botloci file against MADC CloneIDs using +\code{check_botloci}, fixing any padding mismatches automatically. +\item Converts lowercase bases in AlleleSequence to uppercase if detected. +\item Removes all-NA rows and columns if detected. +\item Writes the corrected data to a temporary file and passes it to +\code{polyRAD::readDArTag}. +\item Estimates overdispersion with \code{polyRAD::TestOverdispersion} and +calls \code{polyRAD::IterateHWE}, then exports the result with +\code{polyRAD::RADdata2VCF}. +} +} diff --git a/tests/testthat/test-check_madc_sanity.R b/tests/testthat/test-check_madc_sanity.R index a185997..d0c45cf 100644 --- a/tests/testthat/test-check_madc_sanity.R +++ b/tests/testthat/test-check_madc_sanity.R @@ -15,7 +15,7 @@ test_that("check madc",{ report <- read.csv(paste0(github_path,"/alfalfa_lowercase.csv")) res <- check_madc_sanity(report) - exp <- c(TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE, TRUE, FALSE) + exp <- c(TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE) names(exp) <- names expect_equal(res$checks, exp) diff --git a/tests/testthat/test-check_ped.R b/tests/testthat/test-check_ped.R index fdf69e0..706143f 100644 --- a/tests/testthat/test-check_ped.R +++ b/tests/testthat/test-check_ped.R @@ -15,7 +15,7 @@ test_that("test imputation",{ messy_parents <- output.list$messy_parents missing_parents <- output.list$missing_parents - expect_true(df_length == 4) + expect_true(df_length == 5) # Before was 4 expect_true(all(messy_parents$id == c("grandfather2","grandfather3"))) expect_true(nrow(missing_parents) == 13) diff --git a/tests/testthat/test-madc2vcf_all.R b/tests/testthat/test-madc2vcf_all.R index f88adc9..c8c860f 100644 --- a/tests/testthat/test-madc2vcf_all.R +++ b/tests/testthat/test-madc2vcf_all.R @@ -22,7 +22,7 @@ test_that("test madc offtargets",{ multiallelic_SNP_sample_thr = 0, alignment_score_thr = 40, out_vcf = temp, - verbose = TRUE) + verbose = FALSE) set.seed(456) madc2vcf_all(madc = madc_file, @@ -34,7 +34,7 @@ test_that("test madc offtargets",{ multiallelic_SNP_sample_thr = 0, alignment_score_thr = 40, out_vcf = temp_multi, - verbose = TRUE) + verbose = FALSE) vcf <- read.vcfR(temp) vcf_multi <- read.vcfR(temp_multi) @@ -56,7 +56,7 @@ test_that("test madc offtargets",{ multiallelic_SNP_dp_thr = 0, multiallelic_SNP_sample_thr = 0, out_vcf = temp, - verbose = TRUE) + verbose = FALSE) vcf <- read.vcfR(temp) @@ -111,7 +111,7 @@ test_that("simu alfalfa",{ multiallelic_SNP_dp_thr = 0, alignment_score_thr = 40, out_vcf = out, - verbose = TRUE) + verbose = FALSE) ) vcf <- read.vcfR(out, verbose = FALSE) expect_s4_class(vcf, "vcfR") @@ -131,7 +131,7 @@ test_that("simu alfalfa",{ hap_seq_file = NULL, n.cores = 1, out_vcf = out, - verbose = TRUE) + verbose = FALSE) ) vcf <- read.vcfR(out, verbose = FALSE) expect_s4_class(vcf, "vcfR") @@ -163,7 +163,7 @@ test_that("simu alfalfa",{ n.cores = 1, markers_info = alfalfa_markers_info, out_vcf = out, - verbose = TRUE) + verbose = FALSE) vcf <- read.vcfR(out, verbose = FALSE) expect_s4_class(vcf, "vcfR") @@ -184,7 +184,7 @@ test_that("simu alfalfa",{ n.cores = 1, markers_info = alfalfa_markers_info, out_vcf = out, - verbose = TRUE) + verbose = FALSE) vcf <- read.vcfR(out, verbose = FALSE) expect_s4_class(vcf, "vcfR") @@ -230,7 +230,7 @@ test_that("simu alfalfa",{ n.cores = 1, markers_info = alfalfa_markers_info_ChromPos, out_vcf = out, - verbose = TRUE) + verbose = FALSE) vcf <- read.vcfR(out, verbose = FALSE) expect_s4_class(vcf, "vcfR") @@ -256,7 +256,7 @@ test_that("simu alfalfa",{ hap_seq_file = alfalfa_microhapDB, n.cores = 1, out_vcf = out, - verbose = TRUE) + verbose = FALSE) vcf <- read.vcfR(out, verbose = FALSE) lut <- read.csv(alfalfa_markers_info) @@ -276,7 +276,7 @@ test_that("simu alfalfa",{ hap_seq_file = NULL, n.cores = 1, out_vcf = out, - verbose = TRUE) + verbose = FALSE) vcf <- read.vcfR(out, verbose = FALSE) lut <- read.csv(alfalfa_markers_info) @@ -297,7 +297,7 @@ test_that("simu alfalfa",{ n.cores = 1, markers_info = alfalfa_markers_info, out_vcf = out, - verbose = TRUE) + verbose = FALSE) vcf <- read.vcfR(out, verbose = FALSE) lut <- read.csv(alfalfa_markers_info) @@ -348,7 +348,7 @@ test_that("simu alfalfa",{ n.cores = 1, markers_info = potato_markers_info, out_vcf = out, - verbose = TRUE) + verbose = FALSE) vcf <- read.vcfR(out, verbose = FALSE) lut <- read.csv(potato_markers_info) @@ -358,8 +358,8 @@ test_that("simu alfalfa",{ check <- check[-which(is.na(check$Ref)),] expect_equal(as.numeric(check$POS), check$Pos) dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 43017) - expect_equal(sum(dp[3,]), 5073) + expect_equal(sum(dp[,10]), 226838) + expect_equal(sum(dp[3,]), 3996) expect_equal(check$REF, check$Ref) expect_equal(check$ALT, check$Alt) @@ -472,7 +472,7 @@ test_that("simu alfalfa",{ n.cores = 1, markers_info = potato_markers_info, out_vcf = out, - verbose = TRUE) + verbose = FALSE) vcf <- read.vcfR(out, verbose = FALSE) lut <- read.csv(potato_markers_info) @@ -483,7 +483,7 @@ test_that("simu alfalfa",{ expect_equal(as.numeric(check$POS), check$Pos) dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 41755) + expect_equal(sum(dp[,10]), 219742) expect_equal(check$REF, check$Ref) expect_equal(check$ALT, check$Alt) @@ -510,7 +510,7 @@ test_that("simu alfalfa",{ n.cores = 1, markers_info = potato_markers_info, out_vcf = out, - verbose = TRUE), + verbose = FALSE), regexp = "IUPAC \\(non-ATCG\\) codes found in AlleleSequence. This codes are not currently supported by BIGr/BIGapp. Run HapApp to replace them" ) }) diff --git a/tests/testthat/test-madc2vcf_multi.R b/tests/testthat/test-madc2vcf_multi.R new file mode 100644 index 0000000..414d160 --- /dev/null +++ b/tests/testthat/test-madc2vcf_multi.R @@ -0,0 +1,144 @@ +context("MADC to VCF via polyRAD") + +# ======================================================================= +# Using Breeding-Insight/BIGapp-PanelHub test files +# ======================================================================= + +test_that("madc2vcf_multi — alfalfa (BIGapp-PanelHub)", { + + skip_if_not_installed("polyRAD") + + github_path <- "https://raw.githubusercontent.com/Breeding-Insight/BIGapp-PanelHub/refs/heads/long_seq/" + + # External alfalfa test files + alfalfa_madc <- paste0(github_path, "test_madcs/alfalfa_madc.csv") + alfalfa_madc_wrongID <- paste0(github_path, "test_madcs/alfalfa_madc_wrongID.csv") + alfalfa_madc_raw <- paste0(github_path, "test_madcs/alfalfa_madc_raw.csv") # raw DArT format (7-row header) + alfalfa_iupac <- paste0(github_path, "test_madcs/alfalfa_IUPAC.csv") + alfalfa_lowercase <- paste0(github_path, "test_madcs/alfalfa_lowercase.csv") + alfalfa_botloci <- paste0(github_path, "alfalfa/20201030-BI-Alfalfa_SNPs_DArTag-probe-design_f180bp.botloci") # botloci for alfalfa + alfalfa_markers_info <- paste0(github_path, "alfalfa/20201030-BI-Alfalfa_SNPs_DArTag-probe-design_snpID_lut.csv") # markers_info: CloneID/BI_markerID, Chr, Pos, Ref, Alt + alfalfa_markers_info_ChromPos <- paste0(github_path, "test_madcs/alfalfa_marker_info_ChromPos.csv") # markers_info: CloneID/BI_markerID, Chr, Pos + alfalfa_microhapDB <- paste0(github_path, "alfalfa/alfalfa_allele_db_v001.fa") + + # External potato test files + potato_indel_madc <- paste0(github_path, "test_madcs/potato_indel_madc.csv") + potato_indel_iupac <- paste0(github_path, "test_madcs/potato_indel_IUPAC.csv") + potato_indel_lowercase <- paste0(github_path, "test_madcs/potato_indel_lowercase.csv") + potato_more_indels_chrompos_false <- paste0(github_path, "test_madcs/potato_more_indels_madc_ChromPosFALSE.csv") + potato_botloci <- paste0(github_path, "potato/potato_dartag_v2_3915markers_rm7dupTags_6traitMarkers_f150bp_ref_alt.botloci") + potato_markers_info <- paste0(github_path, "potato/potato_dartag_v2_3915markers_rm7dupTags_6traitMarkers_rm1dup_snpID_lut.csv") # CloneID/BI_markerID, Chr, Pos, Ref, Alt + potato_markers_info_ChromPos <- paste0(github_path, "test_madcs/potato_marker_info_chrompos.csv") # markers_info: CloneID/BI_markerID, Chr, Pos + potato_microhapDB <- paste0(github_path, "potato/potato_allele_db_v001.fa") + + skip_if_offline("raw.githubusercontent.com") + + out <- tempfile(fileext = ".vcf") + on.exit(unlink(out), add = TRUE) + + # Fixed allele ID format + expect_no_error( + madc2vcf_multi( + madc_file = alfalfa_madc, + botloci_file = alfalfa_botloci, + outfile = out, + ploidy = 4L, + verbose = TRUE + ) + ) + + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + expect_equal(sum(grepl(",", vcf@fix[,5])), 281) + GT <- extract.gt(vcf) + expect_equal(GT[3,5],"0/0/0/3") + + # Don't allow raw MADC + expect_error( + madc2vcf_multi( + madc_file = alfalfa_madc_raw, + botloci_file = alfalfa_botloci, + outfile = out, + ploidy = 4L, + verbose = FALSE + ), regexp = "The MADC file does not have fixed AlleleIDs. Please process the MADC file through HapApp before using this function." + ) + + expect_no_error( + madc2vcf_multi( + madc_file = alfalfa_madc, + botloci_file = alfalfa_botloci, + outfile = out, + ploidy = 4L, + verbose = TRUE + ) + ) + + # Wrong IDs + expect_error( + madc2vcf_multi( + madc_file = alfalfa_madc_wrongID, + botloci_file = alfalfa_botloci, + outfile = out, + ploidy = 4L, + verbose = TRUE + ), regexp = "Check marker IDs in both MADC and botloci files. They should be the same." + ) + + expect_no_error( + madc2vcf_multi( + madc_file = alfalfa_madc_wrongID, + botloci_file = alfalfa_botloci, + outfile = out, + markers_info = alfalfa_markers_info_ChromPos, + ploidy = 4L, + verbose = TRUE + ) + ) + + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + expect_equal(sum(grepl(",", vcf@fix[,5])), 281) + GT <- extract.gt(vcf) + expect_equal(GT[3,5],"0/0/0/3") + + ### Avoid IUPAC codes + expect_error( + madc2vcf_multi( + madc_file = alfalfa_iupac, + botloci_file = alfalfa_botloci, + outfile = out, + ploidy = 4L, + verbose = TRUE + ), regexp = "MADC Allele Sequences contain IUPAC \\(non-ATCG\\) codes. Please run HapApp to clean MADC file before using this function." + ) + + expect_error( + madc2vcf_multi( + madc_file = alfalfa_lowercase, + botloci_file = alfalfa_botloci, + outfile = out, + ploidy = 4L, + verbose = TRUE + ), regexp = "Not all Ref sequences have a corresponding Alt or vice versa. Please provide a complete MADC file before using this function." + ) + + expect_no_error( + madc2vcf_multi( + madc_file = potato_indel_madc, + botloci_file = potato_botloci, + outfile = out, + markers_info = potato_markers_info_ChromPos, + ploidy = 4L, + verbose = TRUE + ) + ) + + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + expect_equal(sum(grepl(",", vcf@fix[,5])), 277) + GT <- extract.gt(vcf) + expect_equal(GT[3,5],"0/1/1/6") + +}) + diff --git a/tests/testthat/test-madc2vcf_targets.R b/tests/testthat/test-madc2vcf_targets.R index 9899e17..a64da34 100644 --- a/tests/testthat/test-madc2vcf_targets.R +++ b/tests/testthat/test-madc2vcf_targets.R @@ -140,7 +140,7 @@ test_that("simu alfalfa",{ output.file = out, get_REF_ALT = FALSE, collapse_matches_counts = TRUE, - verbose = FALSE) + verbose = TRUE) ) vcf <- read.vcfR(out, verbose = FALSE) expect_s4_class(vcf, "vcfR") @@ -250,7 +250,7 @@ test_that("simu alfalfa",{ output.file = out, get_REF_ALT = TRUE, botloci_file = alfalfa_botloci, - verbose = TRUE) + verbose = FALSE) ) vcf <- read.vcfR(out, verbose = FALSE) @@ -677,7 +677,7 @@ test_that("simu alfalfa",{ output.file = out, get_REF_ALT = FALSE, collapse_matches_counts = TRUE, - verbose = FALSE) + verbose = TRUE) vcf <- read.vcfR(out, verbose = FALSE) expect_s4_class(vcf, "vcfR") From cec168d8d12799dee04094b17b0253a0f21859ce Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Wed, 1 Apr 2026 15:06:54 -0400 Subject: [PATCH 30/37] fix checks --- DESCRIPTION | 3 ++- NAMESPACE | 6 +----- NEWS.md | 6 +++--- R/filterVCF.R | 5 +++-- R/imputation_concordance.R | 3 +++ R/utils.R | 11 ++++++----- man/filterVCF.Rd | 2 ++ man/imputation_concordance.Rd | 2 ++ tests/testthat/test-imputation_concordance.R | 1 - 9 files changed, 22 insertions(+), 17 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 255ec4f..fa60f51 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -62,7 +62,8 @@ Imports: janitor, quadprog, tibble, - stringr + stringr, + ggplot2 Suggests: covr, spelling, diff --git a/NAMESPACE b/NAMESPACE index f3a2e76..ae09080 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -22,7 +22,6 @@ export(merge_MADCs) export(solve_composition_poly) export(thinSNP) export(updog2vcf) -export(url_exists) export(vmsg) import(dplyr) import(ggplot2) @@ -47,16 +46,13 @@ importFrom(dplyr,mutate) importFrom(dplyr,select) importFrom(dplyr,summarise) importFrom(dplyr,where) -importFrom(polyRAD,IterateHWE) -importFrom(polyRAD,RADdata2VCF) -importFrom(polyRAD,TestOverdispersion) -importFrom(polyRAD,readDArTag) importFrom(pwalign,nucleotideSubstitutionMatrix) importFrom(pwalign,pairwiseAlignment) importFrom(readr,read_csv) importFrom(reshape2,dcast) importFrom(reshape2,melt) importFrom(stats,cor) +importFrom(stats,reorder) importFrom(stats,setNames) importFrom(utils,packageVersion) importFrom(utils,read.csv) diff --git a/NEWS.md b/NEWS.md index 0eff4c3..dd2f630 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,6 @@ # BIGr 0.6.7 -# New function `madc2vcf_multi` +## New function `madc2vcf_multi` - New function `madc2vcf_multi` to convert a DArTag MADC file to a VCF using the polyRAD pipeline for multiallelic genotyping - Runs `check_madc_sanity` before loading the data and stops with informative errors if: @@ -18,7 +18,7 @@ # BIGr 0.6.6 -# Updates on `madc2vcf_all` +## Updates on `madc2vcf_all` - New arguments for controlling processing of `Other` alleles: - `add_others`: if `TRUE` (default), alleles labeled "Other" in the MADC are included in off-target SNP extraction @@ -32,7 +32,7 @@ # BIGr 0.6.5 -# Updates on madc2vcf functions +## Updates on madc2vcf functions Details: - both functions targets and all (targets + off-targets) markers now have `check_madc_sanity` function implemented. It tests: diff --git a/R/filterVCF.R b/R/filterVCF.R index 14f94c0..b9bca78 100644 --- a/R/filterVCF.R +++ b/R/filterVCF.R @@ -17,6 +17,7 @@ #' @param filter.SAMPLE.miss Sample missing data filter #' @param filter.SNP.miss SNP missing data filter #' @param ploidy The ploidy of the species being analyzed +#' @param quality.rates Logical. If TRUE, calculates and outputs CSV files with quality metrics for each marker and sample before filtering (mean depth, genotyping rate, observed heterozygosity). #' @param output.file output file name (optional). If no output.file name provided, then a vcfR object will be returned. #' @return A gzipped vcf file #' @importFrom vcfR read.vcfR @@ -55,7 +56,7 @@ filterVCF <- function(vcf.file, # Read VCF (can be .vcf or .vcf.gz) - if (class(vcf.file) != "vcfR") { + if (!inherits(vcf.file, "vcfR")) { vcf <- read.vcfR(vcf.file) } else { vcf <- vcf.file @@ -381,7 +382,7 @@ filterVCF <- function(vcf.file, } ### Export the modified VCF file (this exports as a .vcf.gz, so make sure to have the name end in .vcf.gz) cat("Exporting VCF\n") - if (!class(vcf.file) == "vcfR"){ + if (!inherits(vcf.file, "vcfR")){ if (!is.null(output.file)){ output_name <- paste0(output.file,".vcf.gz") vcfR::write.vcf(vcf, file = output_name) diff --git a/R/imputation_concordance.R b/R/imputation_concordance.R index ac501bb..1eb441a 100644 --- a/R/imputation_concordance.R +++ b/R/imputation_concordance.R @@ -50,6 +50,7 @@ #' @import ggplot2 #' #' @examples +#' \dontrun{ #' result <- imputation_concordance( #' reference_genos = ref, #' imputed_genos = test, @@ -58,7 +59,9 @@ #' verbose = TRUE, #' plot = TRUE #' ) +#' } #' +#' @importFrom stats reorder #' @export imputation_concordance <- function(reference_genos, imputed_genos, diff --git a/R/utils.R b/R/utils.R index cf3cc6a..59e5563 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,8 +1,9 @@ #Internal Functions utils::globalVariables(c( - "ALT", "AlleleID", "CHROM", "Data", "ID", "MarkerName", "POS", - "QPseparate", "QPsolve_par", "REF", "Var1", "Variant", "geno", + "ALT", "AlleleID", "AlleleSequence", "CHROM", "Concordance", "Data", "ID", + "MarkerName", "POS", + "QPseparate", "QPsolve_par", "REF", "Type", "Var1", "Variant", "geno", "ind", "ref", "row_name", "size", "snp", "CloneID", "Count", "qualifying_sites_count", "MarkerID", "SampleID", "Dosage", @@ -48,10 +49,10 @@ vmsg <- function(text, verbose = FALSE, level = 1, type = ">>", ...) { # Create indentation based on level indent <- switch(as.character(level), "0" = "", # Section headers - "1" = " ∙ ", # Main steps (medium bullet) + "1" = " \u2219 ", # Main steps (medium bullet) "2" = " - ", # Details "3" = " > ", # Sub-details - paste0(paste(rep(" ", level), collapse = ""), "• ") # Fallback for level > 3 + paste0(paste(rep(" ", level), collapse = ""), "\u2022 ") # Fallback for level > 3 ) # Format type label (only show for level 0) @@ -82,7 +83,7 @@ vmsg <- function(text, verbose = FALSE, level = 1, type = ">>", ...) { #' #' @keywords internal #' @noRd -#' @export +#' url_exists <- function(u) { tryCatch({ con <- url(u, open = "rb") diff --git a/man/filterVCF.Rd b/man/filterVCF.Rd index 2a3ab62..0342fe1 100644 --- a/man/filterVCF.Rd +++ b/man/filterVCF.Rd @@ -23,6 +23,8 @@ filterVCF( \arguments{ \item{vcf.file}{vcfR object or path to VCF file. Can be unzipped (.vcf) or gzipped (.vcf.gz).} +\item{quality.rates}{Logical. If TRUE, calculates and outputs CSV files with quality metrics for each marker and sample before filtering (mean depth, genotyping rate, observed heterozygosity).} + \item{filter.OD}{Updog filter} \item{filter.BIAS.min}{Updog filter (requires a value for both BIAS.min and BIAS.max)} diff --git a/man/imputation_concordance.Rd b/man/imputation_concordance.Rd index dc1cc65..31f54a8 100644 --- a/man/imputation_concordance.Rd +++ b/man/imputation_concordance.Rd @@ -64,6 +64,7 @@ When \code{plot = TRUE}, a bar plot showing concordance percentage per sample is generated using \pkg{ggplot2}. } \examples{ +\dontrun{ result <- imputation_concordance( reference_genos = ref, imputed_genos = test, @@ -72,5 +73,6 @@ result <- imputation_concordance( verbose = TRUE, plot = TRUE ) +} } diff --git a/tests/testthat/test-imputation_concordance.R b/tests/testthat/test-imputation_concordance.R index f1fb421..459998c 100644 --- a/tests/testthat/test-imputation_concordance.R +++ b/tests/testthat/test-imputation_concordance.R @@ -1,6 +1,5 @@ context("Imputation Concordance") - test_that("test imputation",{ #Input variables ignore_file <- system.file("imputation_ignore.txt", package="BIGr") From 0be2e0fb06e3accde737195790a868b450ee0ff3 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Wed, 1 Apr 2026 15:30:37 -0400 Subject: [PATCH 31/37] fix checks 2 --- tests/testthat/test-madc2vcf_multi.R | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test-madc2vcf_multi.R b/tests/testthat/test-madc2vcf_multi.R index 414d160..40ececd 100644 --- a/tests/testthat/test-madc2vcf_multi.R +++ b/tests/testthat/test-madc2vcf_multi.R @@ -34,8 +34,6 @@ test_that("madc2vcf_multi — alfalfa (BIGapp-PanelHub)", { skip_if_offline("raw.githubusercontent.com") out <- tempfile(fileext = ".vcf") - on.exit(unlink(out), add = TRUE) - # Fixed allele ID format expect_no_error( madc2vcf_multi( @@ -54,6 +52,7 @@ test_that("madc2vcf_multi — alfalfa (BIGapp-PanelHub)", { expect_equal(GT[3,5],"0/0/0/3") # Don't allow raw MADC + out <- tempfile(fileext = ".vcf") expect_error( madc2vcf_multi( madc_file = alfalfa_madc_raw, @@ -64,6 +63,7 @@ test_that("madc2vcf_multi — alfalfa (BIGapp-PanelHub)", { ), regexp = "The MADC file does not have fixed AlleleIDs. Please process the MADC file through HapApp before using this function." ) + out <- tempfile(fileext = ".vcf") expect_no_error( madc2vcf_multi( madc_file = alfalfa_madc, @@ -75,6 +75,7 @@ test_that("madc2vcf_multi — alfalfa (BIGapp-PanelHub)", { ) # Wrong IDs + out <- tempfile(fileext = ".vcf") expect_error( madc2vcf_multi( madc_file = alfalfa_madc_wrongID, @@ -85,6 +86,7 @@ test_that("madc2vcf_multi — alfalfa (BIGapp-PanelHub)", { ), regexp = "Check marker IDs in both MADC and botloci files. They should be the same." ) + out <- tempfile(fileext = ".vcf") expect_no_error( madc2vcf_multi( madc_file = alfalfa_madc_wrongID, @@ -103,6 +105,7 @@ test_that("madc2vcf_multi — alfalfa (BIGapp-PanelHub)", { expect_equal(GT[3,5],"0/0/0/3") ### Avoid IUPAC codes + out <- tempfile(fileext = ".vcf") expect_error( madc2vcf_multi( madc_file = alfalfa_iupac, @@ -113,6 +116,7 @@ test_that("madc2vcf_multi — alfalfa (BIGapp-PanelHub)", { ), regexp = "MADC Allele Sequences contain IUPAC \\(non-ATCG\\) codes. Please run HapApp to clean MADC file before using this function." ) + out <- tempfile(fileext = ".vcf") expect_error( madc2vcf_multi( madc_file = alfalfa_lowercase, @@ -123,6 +127,7 @@ test_that("madc2vcf_multi — alfalfa (BIGapp-PanelHub)", { ), regexp = "Not all Ref sequences have a corresponding Alt or vice versa. Please provide a complete MADC file before using this function." ) + out <- tempfile(fileext = ".vcf") expect_no_error( madc2vcf_multi( madc_file = potato_indel_madc, From 33fc87c10aaedbb788b518f1f5b391446fea1cdd Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Wed, 1 Apr 2026 15:49:45 -0400 Subject: [PATCH 32/37] add VariantAnnotation to test env --- .github/workflows/R-CMD-check.yaml | 2 ++ tests/testthat/test-madc2vcf_multi.R | 1 + 2 files changed, 3 insertions(+) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 969e02d..f7c1b98 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -43,6 +43,8 @@ jobs: extra-packages: | any::rcmdcheck any::covr + any::polyRAD + bioc::VariantAnnotation needs: check - uses: r-lib/actions/check-r-package@v2 diff --git a/tests/testthat/test-madc2vcf_multi.R b/tests/testthat/test-madc2vcf_multi.R index 40ececd..adca090 100644 --- a/tests/testthat/test-madc2vcf_multi.R +++ b/tests/testthat/test-madc2vcf_multi.R @@ -7,6 +7,7 @@ context("MADC to VCF via polyRAD") test_that("madc2vcf_multi — alfalfa (BIGapp-PanelHub)", { skip_if_not_installed("polyRAD") + skip_if_not_installed("VariantAnnotation") github_path <- "https://raw.githubusercontent.com/Breeding-Insight/BIGapp-PanelHub/refs/heads/long_seq/" From 77107ba5c638874ec56e23744a0f8dfbf5fa9017 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Wed, 1 Apr 2026 16:32:52 -0400 Subject: [PATCH 33/37] ignore madc2vcf_multi tests in actions --- .github/workflows/R-CMD-check.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index f7c1b98..969e02d 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -43,8 +43,6 @@ jobs: extra-packages: | any::rcmdcheck any::covr - any::polyRAD - bioc::VariantAnnotation needs: check - uses: r-lib/actions/check-r-package@v2 From ccf9e776a7051cd4f893064c77ccdc8cb193ca99 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Thu, 2 Apr 2026 09:00:47 -0400 Subject: [PATCH 34/37] more messages and tests --- .github/workflows/R-CMD-check.yaml | 5 +++++ R/madc2vcf_all.R | 14 ++++++++++++++ R/madc2vcf_multi.R | 7 +++++++ R/madc2vcf_targets.R | 8 ++++++++ 4 files changed, 34 insertions(+) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 969e02d..587188c 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -43,7 +43,12 @@ jobs: extra-packages: | any::rcmdcheck any::covr + any::polyRAD needs: check + + - name: Install VariantAnnotation (no Suggests) + run: pak::pkg_install("bioc::VariantAnnotation", dependencies = c("Depends", "Imports", "LinkingTo")) + shell: Rscript {0} - uses: r-lib/actions/check-r-package@v2 - name: Generate test coverage report diff --git a/R/madc2vcf_all.R b/R/madc2vcf_all.R index ac20835..17490f3 100644 --- a/R/madc2vcf_all.R +++ b/R/madc2vcf_all.R @@ -70,6 +70,20 @@ madc2vcf_all <- function(madc, others_rm_with_indels = TRUE, verbose = TRUE){ + vmsg("Running BIGr madc2vcf_all", verbose = verbose, level = 0, type = ">>") + vmsg("madc : %s", verbose = verbose, level = 1, madc) + vmsg("botloci_file : %s", verbose = verbose, level = 1, botloci_file) + vmsg("hap_seq_file : %s", verbose = verbose, level = 1, if (is.null(hap_seq_file)) "NULL" else hap_seq_file) + vmsg("markers_info : %s", verbose = verbose, level = 1, if (is.null(markers_info)) "NULL" else markers_info) + vmsg("n.cores : %s", verbose = verbose, level = 1, n.cores) + vmsg("alignment_score_thr : %s", verbose = verbose, level = 1, alignment_score_thr) + vmsg("rm_multiallelic_SNP : %s", verbose = verbose, level = 1, rm_multiallelic_SNP) + vmsg("multiallelic_SNP_dp_thr : %s", verbose = verbose, level = 1, multiallelic_SNP_dp_thr) + vmsg("multiallelic_SNP_sample_thr: %s", verbose = verbose, level = 1, multiallelic_SNP_sample_thr) + vmsg("add_others : %s", verbose = verbose, level = 1, add_others) + vmsg("others_max_snps : %s", verbose = verbose, level = 1, if (is.null(others_max_snps)) "NULL" else others_max_snps) + vmsg("others_rm_with_indels : %s", verbose = verbose, level = 1, others_rm_with_indels) + vmsg("out_vcf : %s", verbose = verbose, level = 1, if (is.null(out_vcf)) "NULL" else out_vcf) vmsg("Checking inputs", verbose = verbose, level = 0, type = ">>") # Input checks diff --git a/R/madc2vcf_multi.R b/R/madc2vcf_multi.R index 6e86b45..0b67f3f 100644 --- a/R/madc2vcf_multi.R +++ b/R/madc2vcf_multi.R @@ -46,6 +46,13 @@ madc2vcf_multi <- function(madc_file, ploidy = 2L, verbose = TRUE) { + vmsg("Running BIGr madc2vcf_multi", verbose = verbose, level = 0, type = ">>") + vmsg("madc_file : %s", verbose = verbose, level = 1, madc_file) + vmsg("botloci_file : %s", verbose = verbose, level = 1, botloci_file) + vmsg("markers_info : %s", verbose = verbose, level = 1, if (is.null(markers_info)) "NULL" else markers_info) + vmsg("outfile : %s", verbose = verbose, level = 1, outfile) + vmsg("ploidy : %s", verbose = verbose, level = 1, ploidy) + vmsg("Checking inputs", verbose = verbose, level = 0, type = ">>") if (!(file.exists(madc_file) | url_exists(madc_file))) stop("MADC file not found. Please provide a valid path or URL.") diff --git a/R/madc2vcf_targets.R b/R/madc2vcf_targets.R index 80af70f..19c486e 100644 --- a/R/madc2vcf_targets.R +++ b/R/madc2vcf_targets.R @@ -136,6 +136,14 @@ madc2vcf_targets <- function(madc_file, collapse_matches_counts = FALSE, verbose = TRUE) { + vmsg("Running BIGr madc2vcf_targets", verbose = verbose, level = 0, type = ">>") + vmsg("madc_file : %s", verbose = verbose, level = 1, madc_file) + vmsg("output.file : %s", verbose = verbose, level = 1, output.file) + vmsg("botloci_file : %s", verbose = verbose, level = 1, if (is.null(botloci_file)) "NULL" else botloci_file) + vmsg("markers_info : %s", verbose = verbose, level = 1, if (is.null(markers_info)) "NULL" else markers_info) + vmsg("get_REF_ALT : %s", verbose = verbose, level = 1, get_REF_ALT) + vmsg("collapse_matches_counts : %s", verbose = verbose, level = 1, collapse_matches_counts) + vmsg("Checking inputs", verbose = verbose, level = 0, type = ">>") # Input checks From 8a00c9e1b0f69e462e185867e46082eb7c85ee50 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Thu, 2 Apr 2026 09:03:42 -0400 Subject: [PATCH 35/37] bugfix --- R/madc2vcf_all.R | 26 +++++++++++++------------- R/madc2vcf_multi.R | 10 +++++----- R/madc2vcf_targets.R | 12 ++++++------ 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/R/madc2vcf_all.R b/R/madc2vcf_all.R index 17490f3..b25e6da 100644 --- a/R/madc2vcf_all.R +++ b/R/madc2vcf_all.R @@ -71,19 +71,19 @@ madc2vcf_all <- function(madc, verbose = TRUE){ vmsg("Running BIGr madc2vcf_all", verbose = verbose, level = 0, type = ">>") - vmsg("madc : %s", verbose = verbose, level = 1, madc) - vmsg("botloci_file : %s", verbose = verbose, level = 1, botloci_file) - vmsg("hap_seq_file : %s", verbose = verbose, level = 1, if (is.null(hap_seq_file)) "NULL" else hap_seq_file) - vmsg("markers_info : %s", verbose = verbose, level = 1, if (is.null(markers_info)) "NULL" else markers_info) - vmsg("n.cores : %s", verbose = verbose, level = 1, n.cores) - vmsg("alignment_score_thr : %s", verbose = verbose, level = 1, alignment_score_thr) - vmsg("rm_multiallelic_SNP : %s", verbose = verbose, level = 1, rm_multiallelic_SNP) - vmsg("multiallelic_SNP_dp_thr : %s", verbose = verbose, level = 1, multiallelic_SNP_dp_thr) - vmsg("multiallelic_SNP_sample_thr: %s", verbose = verbose, level = 1, multiallelic_SNP_sample_thr) - vmsg("add_others : %s", verbose = verbose, level = 1, add_others) - vmsg("others_max_snps : %s", verbose = verbose, level = 1, if (is.null(others_max_snps)) "NULL" else others_max_snps) - vmsg("others_rm_with_indels : %s", verbose = verbose, level = 1, others_rm_with_indels) - vmsg("out_vcf : %s", verbose = verbose, level = 1, if (is.null(out_vcf)) "NULL" else out_vcf) + vmsg("madc : %s", verbose = verbose, level = 1, type = ">>", madc) + vmsg("botloci_file : %s", verbose = verbose, level = 1, type = ">>", if (is.null(botloci_file)) "NULL" else botloci_file) + vmsg("hap_seq_file : %s", verbose = verbose, level = 1, type = ">>", if (is.null(hap_seq_file)) "NULL" else hap_seq_file) + vmsg("markers_info : %s", verbose = verbose, level = 1, type = ">>", if (is.null(markers_info)) "NULL" else markers_info) + vmsg("n.cores : %s", verbose = verbose, level = 1, type = ">>", n.cores) + vmsg("alignment_score_thr : %s", verbose = verbose, level = 1, type = ">>", alignment_score_thr) + vmsg("rm_multiallelic_SNP : %s", verbose = verbose, level = 1, type = ">>", rm_multiallelic_SNP) + vmsg("multiallelic_SNP_dp_thr : %s", verbose = verbose, level = 1, type = ">>", multiallelic_SNP_dp_thr) + vmsg("multiallelic_SNP_sample_thr: %s", verbose = verbose, level = 1, type = ">>", multiallelic_SNP_sample_thr) + vmsg("add_others : %s", verbose = verbose, level = 1, type = ">>", add_others) + vmsg("others_max_snps : %s", verbose = verbose, level = 1, type = ">>", if (is.null(others_max_snps)) "NULL" else others_max_snps) + vmsg("others_rm_with_indels : %s", verbose = verbose, level = 1, type = ">>", others_rm_with_indels) + vmsg("out_vcf : %s", verbose = verbose, level = 1, type = ">>", if (is.null(out_vcf)) "NULL" else out_vcf) vmsg("Checking inputs", verbose = verbose, level = 0, type = ">>") # Input checks diff --git a/R/madc2vcf_multi.R b/R/madc2vcf_multi.R index 0b67f3f..46c4fb3 100644 --- a/R/madc2vcf_multi.R +++ b/R/madc2vcf_multi.R @@ -47,11 +47,11 @@ madc2vcf_multi <- function(madc_file, verbose = TRUE) { vmsg("Running BIGr madc2vcf_multi", verbose = verbose, level = 0, type = ">>") - vmsg("madc_file : %s", verbose = verbose, level = 1, madc_file) - vmsg("botloci_file : %s", verbose = verbose, level = 1, botloci_file) - vmsg("markers_info : %s", verbose = verbose, level = 1, if (is.null(markers_info)) "NULL" else markers_info) - vmsg("outfile : %s", verbose = verbose, level = 1, outfile) - vmsg("ploidy : %s", verbose = verbose, level = 1, ploidy) + vmsg("madc_file : %s", verbose = verbose, level = 1, type = ">>", madc_file) + vmsg("botloci_file : %s", verbose = verbose, level = 1, type = ">>", botloci_file) + vmsg("markers_info : %s", verbose = verbose, level = 1, type = ">>", if (is.null(markers_info)) "NULL" else markers_info) + vmsg("outfile : %s", verbose = verbose, level = 1, type = ">>", outfile) + vmsg("ploidy : %s", verbose = verbose, level = 1, type = ">>", ploidy) vmsg("Checking inputs", verbose = verbose, level = 0, type = ">>") diff --git a/R/madc2vcf_targets.R b/R/madc2vcf_targets.R index 19c486e..888c445 100644 --- a/R/madc2vcf_targets.R +++ b/R/madc2vcf_targets.R @@ -137,12 +137,12 @@ madc2vcf_targets <- function(madc_file, verbose = TRUE) { vmsg("Running BIGr madc2vcf_targets", verbose = verbose, level = 0, type = ">>") - vmsg("madc_file : %s", verbose = verbose, level = 1, madc_file) - vmsg("output.file : %s", verbose = verbose, level = 1, output.file) - vmsg("botloci_file : %s", verbose = verbose, level = 1, if (is.null(botloci_file)) "NULL" else botloci_file) - vmsg("markers_info : %s", verbose = verbose, level = 1, if (is.null(markers_info)) "NULL" else markers_info) - vmsg("get_REF_ALT : %s", verbose = verbose, level = 1, get_REF_ALT) - vmsg("collapse_matches_counts : %s", verbose = verbose, level = 1, collapse_matches_counts) + vmsg("madc_file : %s", verbose = verbose, level = 1, type = ">>", madc_file) + vmsg("output.file : %s", verbose = verbose, level = 1, type = ">>", output.file) + vmsg("botloci_file : %s", verbose = verbose, level = 1, type = ">>", if (is.null(botloci_file)) "NULL" else botloci_file) + vmsg("markers_info : %s", verbose = verbose, level = 1, type = ">>", if (is.null(markers_info)) "NULL" else markers_info) + vmsg("get_REF_ALT : %s", verbose = verbose, level = 1, type = ">>", get_REF_ALT) + vmsg("collapse_matches_counts : %s", verbose = verbose, level = 1, type = ">>", collapse_matches_counts) vmsg("Checking inputs", verbose = verbose, level = 0, type = ">>") From f2013e3d9d37be0fc98c61da0831ceb92b8d5593 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Thu, 2 Apr 2026 11:18:11 -0400 Subject: [PATCH 36/37] update man --- R/get_countsMADC.R | 3 ++- R/madc2vcf_all.R | 14 ++++++++++++++ R/madc2vcf_multi.R | 13 +++++++++++++ man/madc2vcf_all.Rd | 10 ++++++++++ man/madc2vcf_multi.Rd | 12 ++++++++++++ 5 files changed, 51 insertions(+), 1 deletion(-) diff --git a/R/get_countsMADC.R b/R/get_countsMADC.R index 396004a..3a9bc2b 100644 --- a/R/get_countsMADC.R +++ b/R/get_countsMADC.R @@ -95,8 +95,9 @@ get_countsMADC <- function(madc_file = NULL, madc_object = NULL, collapse_matche vmsg(paste("There are", n_singles,"Ref tags without corresponding Alt tags, or vice versa"), verbose = verbose, level = 2, type = ">>") vmsg("Only the markers with both Ref and Alt tags will be retained for the conversion", verbose = verbose, level = 1, type = ">>") + vmsg("Consider providing a haplotype database file to resolve unpaired Ref/Alt sequences", verbose = verbose, level = 1, type = ">>") - warning(paste("There are", n_singles,"Ref tags without corresponding Alt tags, or vice versa. Only the markers with both Ref and Alt tags will be retained for the conversion")) + warning(paste("There are", n_singles,"Ref tags without corresponding Alt tags, or vice versa. Only the markers with both Ref and Alt tags will be retained for the conversion. Consider providing a haplotype database file to resolve unpaired Ref/Alt sequences.")) # Subset both dataframes to retain only the common rows ref_df <- ref_df[common_ids, ] diff --git a/R/madc2vcf_all.R b/R/madc2vcf_all.R index b25e6da..63c031d 100644 --- a/R/madc2vcf_all.R +++ b/R/madc2vcf_all.R @@ -22,6 +22,18 @@ #' @details #' The function processes a MADC file to generate a VCF file containing both target and off-target SNPs. It uses parallel processing to improve performance and provides options to filter multiallelic SNPs based on user-defined thresholds. The alignment score threshold can be adjusted using the `alignment_score_thr` parameter. The generated VCF file includes metadata about the processing parameters and the BIGr package version. If the `alignment_score_thr` is not met, the corresponding SNPs are discarded. #' +#' **Sanity check behaviour and requirements** +#' +#' | Check | Status | Required | +#' |---|---|---| +#' | **Indels** | detected | `markers_info` with `Ref`/`Alt`/`Indel_pos`/`Indel_length` + `botloci_file` | +#' | | not detected | `botloci_file` | +#' | **ChromPos** | valid | `botloci_file` | +#' | | invalid | `markers_info` with `Chr`/`Pos` + `botloci_file` | +#' | **RefAltSeqs** | all paired | `botloci_file` | +#' | | unpaired | `botloci_file` + `hap_seq_file` (microhaplotype DB) | +#' +#' #' @examples #' # Example usage: #' @@ -323,6 +335,8 @@ loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, align } vmsg("Tags discarded due to lack of Ref_0001 sequence: %s tags", verbose = verbose, level = 2, type = ">>", sum(ref_index==-1)) vmsg("Tags discarded due to lack of Alt_0002 sequence: %s tags", verbose = verbose, level = 2, type = ">>", sum(alt_index==-1)) + if(sum(ref_index==-1) > 0) warning(sprintf("%s tags discarded due to lack of Ref_0001 sequence. Consider providing the haplotype database file to recover these tags", sum(ref_index==-1))) + if(sum(alt_index==-1) > 0) warning(sprintf("%s tags discarded due to lack of Alt_0002 sequence. Consider providing the haplotype database file to recover these tags", sum(alt_index==-1))) vmsg("Pairwise alignments of sequences to recover SNP position, reference and alternative bases...", verbose = verbose, level = 0) clust <- makeCluster(n.cores) diff --git a/R/madc2vcf_multi.R b/R/madc2vcf_multi.R index 46c4fb3..bcbae02 100644 --- a/R/madc2vcf_multi.R +++ b/R/madc2vcf_multi.R @@ -36,6 +36,19 @@ #' \code{polyRAD::RADdata2VCF}. #' } #' +#' **Sanity check behaviour and requirements** +#' +#' The function always stops if IUPAC codes, unpaired Ref/Alt sequences, or +#' unfixed AlleleIDs are detected (see \code{check_madc_sanity}). For the +#' remaining checks the required inputs are: +#' +#' | Check | Status | Required | +#' |---|---|---| +#' | **Indels** | detected | `botloci_file` | +#' | | not detected | `botloci_file` | +#' | **ChromPos** | valid | `botloci_file` | +#' | | invalid | `markers_info` with `Chr`/`Pos` + `botloci_file` | +#' #' @importFrom utils read.csv write.csv read.table #' #' @export diff --git a/man/madc2vcf_all.Rd b/man/madc2vcf_all.Rd index a1e8c92..c15e69b 100644 --- a/man/madc2vcf_all.Rd +++ b/man/madc2vcf_all.Rd @@ -58,6 +58,16 @@ This function processes a MADC file to generate a VCF file containing both targe } \details{ The function processes a MADC file to generate a VCF file containing both target and off-target SNPs. It uses parallel processing to improve performance and provides options to filter multiallelic SNPs based on user-defined thresholds. The alignment score threshold can be adjusted using the \code{alignment_score_thr} parameter. The generated VCF file includes metadata about the processing parameters and the BIGr package version. If the \code{alignment_score_thr} is not met, the corresponding SNPs are discarded. + +\strong{Sanity check behaviour and requirements}\tabular{lll}{ + Check \tab Status \tab Required \cr + \strong{Indels} \tab detected \tab \code{markers_info} with \code{Ref}/\code{Alt}/\code{Indel_pos}/\code{Indel_length} + \code{botloci_file} \cr + \tab not detected \tab \code{botloci_file} \cr + \strong{ChromPos} \tab valid \tab \code{botloci_file} \cr + \tab invalid \tab \code{markers_info} with \code{Chr}/\code{Pos} + \code{botloci_file} \cr + \strong{RefAltSeqs} \tab all paired \tab \code{botloci_file} \cr + \tab unpaired \tab \code{botloci_file} + \code{hap_seq_file} (microhaplotype DB) \cr +} } \examples{ # Example usage: diff --git a/man/madc2vcf_multi.Rd b/man/madc2vcf_multi.Rd index 5c3a777..70bc59d 100644 --- a/man/madc2vcf_multi.Rd +++ b/man/madc2vcf_multi.Rd @@ -56,4 +56,16 @@ The function performs the following steps: calls \code{polyRAD::IterateHWE}, then exports the result with \code{polyRAD::RADdata2VCF}. } + +\strong{Sanity check behaviour and requirements} + +The function always stops if IUPAC codes, unpaired Ref/Alt sequences, or +unfixed AlleleIDs are detected (see \code{check_madc_sanity}). For the +remaining checks the required inputs are:\tabular{lll}{ + Check \tab Status \tab Required \cr + \strong{Indels} \tab detected \tab \code{botloci_file} \cr + \tab not detected \tab \code{botloci_file} \cr + \strong{ChromPos} \tab valid \tab \code{botloci_file} \cr + \tab invalid \tab \code{markers_info} with \code{Chr}/\code{Pos} + \code{botloci_file} \cr +} } From b01c12ba1aaf5b62bafab31f410a241363bf8018 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Thu, 2 Apr 2026 14:17:33 -0400 Subject: [PATCH 37/37] minor version up --- DESCRIPTION | 2 +- NEWS.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index fa60f51..48ede7c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: BIGr Title: Breeding Insight Genomics Functions for Polyploid and Diploid Species -Version: 0.6.7 +Version: 0.7.0 Authors@R: c(person(given='Alexander M.', family='Sandercock', email='sandercock.alex@gmail.com', diff --git a/NEWS.md b/NEWS.md index dd2f630..1b9559b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -# BIGr 0.6.7 +# BIGr 0.7.0 ## New function `madc2vcf_multi`