diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 969e02d..587188c 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -43,7 +43,12 @@ jobs: extra-packages: | any::rcmdcheck any::covr + any::polyRAD needs: check + + - name: Install VariantAnnotation (no Suggests) + run: pak::pkg_install("bioc::VariantAnnotation", dependencies = c("Depends", "Imports", "LinkingTo")) + shell: Rscript {0} - uses: r-lib/actions/check-r-package@v2 - name: Generate test coverage report diff --git a/.gitignore b/.gitignore index b64a99f..d3ffaad 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ .RData .Ruserdata revdep/ +.DS_Store diff --git a/BIGr.Rproj b/BIGr.Rproj index 69fafd4..5638e2e 100644 --- a/BIGr.Rproj +++ b/BIGr.Rproj @@ -1,4 +1,5 @@ Version: 1.0 +ProjectId: 0eeaab63-2615-4da7-b10a-927160fc78a3 RestoreWorkspace: No SaveWorkspace: No diff --git a/DESCRIPTION b/DESCRIPTION index 0cef1b8..48ede7c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: BIGr Title: Breeding Insight Genomics Functions for Polyploid and Diploid Species -Version: 0.6.3 +Version: 0.7.0 Authors@R: c(person(given='Alexander M.', family='Sandercock', email='sandercock.alex@gmail.com', @@ -23,7 +23,7 @@ Authors@R: c(person(given='Alexander M.', person(given='Dongyan', family='Zhao', role='ctb'), - person('Cornell', 'University', + person('University', "of Florida", role=c('cph'), comment = "Breeding Insight")) Maintainer: Alexander M. 
Sandercock @@ -44,7 +44,7 @@ URL: https://github.com/Breeding-Insight/BIGr BugReports: https://github.com/Breeding-Insight/BIGr/issues Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 Depends: R (>= 4.4.0) biocViews: Imports: @@ -62,12 +62,14 @@ Imports: janitor, quadprog, tibble, - stringr + stringr, + ggplot2 Suggests: covr, spelling, rmdformats, knitr (>= 1.10), rmarkdown, + polyRAD, testthat (>= 3.0.0) RdMacros: Rdpack diff --git a/NAMESPACE b/NAMESPACE index d47d95d..ae09080 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,6 +4,7 @@ export(allele_freq_poly) export(calculate_Het) export(calculate_MAF) export(check_homozygous_trios) +export(check_madc_sanity) export(check_ped) export(check_replicates) export(dosage2vcf) @@ -15,12 +16,15 @@ export(get_countsMADC) export(imputation_concordance) export(madc2gmat) export(madc2vcf_all) +export(madc2vcf_multi) export(madc2vcf_targets) export(merge_MADCs) export(solve_composition_poly) export(thinSNP) export(updog2vcf) +export(vmsg) import(dplyr) +import(ggplot2) import(janitor) import(parallel) import(quadprog) @@ -33,12 +37,22 @@ importFrom(Biostrings,DNAString) importFrom(Biostrings,reverseComplement) importFrom(Rdpack,reprompt) importFrom(Rsamtools,bgzip) +importFrom(dplyr,"%>%") +importFrom(dplyr,across) +importFrom(dplyr,case_when) +importFrom(dplyr,filter) +importFrom(dplyr,group_by) +importFrom(dplyr,mutate) +importFrom(dplyr,select) +importFrom(dplyr,summarise) +importFrom(dplyr,where) importFrom(pwalign,nucleotideSubstitutionMatrix) importFrom(pwalign,pairwiseAlignment) importFrom(readr,read_csv) importFrom(reshape2,dcast) importFrom(reshape2,melt) importFrom(stats,cor) +importFrom(stats,reorder) importFrom(stats,setNames) importFrom(utils,packageVersion) importFrom(utils,read.csv) diff --git a/NEWS.md b/NEWS.md index 19d2509..1b9559b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,90 @@ +# BIGr 0.7.0 + +## New function `madc2vcf_multi` + +- New function `madc2vcf_multi` to 
convert a DArTag MADC file to a VCF using the polyRAD pipeline for multiallelic genotyping +- Runs `check_madc_sanity` before loading the data and stops with informative errors if: + - Required columns are missing + - IUPAC (non-ATCG) codes are present in AlleleSequence + - Ref/Alt sequences are unpaired (`RefAltSeqs = FALSE`) + - Allele IDs have not been fixed by HapApp (`FixAlleleIDs = FALSE`) + - CloneIDs do not follow `Chr_Pos` format and no `markers_info` is provided +- New argument `markers_info`: optional path or URL to a CSV with `CloneID`/`BI_markerID`, `Chr`, and `Pos` columns; required when CloneIDs do not follow the `Chr_Pos` format +- Runs `check_botloci` to validate and reconcile CloneIDs between the MADC and botloci file, automatically fixing padding mismatches +- A corrected temp file is written and passed to `readDArTag` only when needed (all-NA rows/columns detected, CloneIDs remapped by `check_botloci`, or botloci IDs remapped) +- Accepts paths or URLs for `madc_file`, `botloci_file`, and `markers_info` +- Estimates overdispersion with `polyRAD::TestOverdispersion`, iterates priors with `polyRAD::IterateHWE`, and exports the result with `polyRAD::RADdata2VCF` +- `polyRAD` is a soft dependency (listed under `Suggests`); an informative error is raised if it is not installed + +# BIGr 0.6.6 + +## Updates on `madc2vcf_all` + +- New arguments for controlling processing of `Other` alleles: + - `add_others`: if `TRUE` (default), alleles labeled "Other" in the MADC are included in off-target SNP extraction + - `others_max_snps`: discards Other alleles with more than this many SNP differences relative to the Ref sequence (default: 5) + - `others_rm_with_indels`: discards Other alleles containing insertions or deletions relative to the Ref sequence (default: `TRUE`) +- Others alleles that carry a different base at the target SNP position are now reported as a 3rd allele in the VCF instead of being silently dropped +- Target position is now correctly 
removed from Others alignments, preventing duplicate VCF positions and marker IDs +- Fixed a bug where Others alleles with "Ref_" or "Alt_" in their AlleleID would corrupt the target SNP REF/ALT fields and read depth counts in `merge_counts` +- Improved verbose messages throughout: counts of Other alleles found, kept, and discarded (by indel filter and by max SNP filter) are now reported; multiallelic target SNPs with a 3rd allele from Others are counted and reported +- Debug-level message (level 3) listing each Other allele added and its genomic position + +# BIGr 0.6.5 + +## Updates on madc2vcf functions +Details: + +- both functions targets and all (targets + off-targets) markers now have `check_madc_sanity` function implemented. It tests: + - [Columns] If MADC has the expected columns + - [allNArow | allNAcol] Presence of columns and rows with all NA (happens often when people open the MADC in excel before loading in R) + - [IUPACcodes] Presence of IUPAC codes on AlleleSequence + - [LowerCase] Presence of lower case bases on AlleleSequence + - [Indels] Presence of Indels + - [ChromPos] If CloneID follows the format Chr_Pos + - [RefAltSeqs] If all Ref Allele has corresponding Alt and vice-versa + - [OtherAlleles] If "Other" exists in the MADC AlleleID + +- Better messages if `verbose = TRUE` in `madc2vcf_all` +- `madc2vcf_all` support for Indels - markers_info with Indels position is required; only the target indel is extracted, off-targets are ignored for the tag +- `madc2vcf_targets` doesn’t run if: + - MADC Column names are not correct + - Ignore Other alleles - but inform the user if they exist or not and direct them to `madc2vcf_all` in case they want to extract them as well +- See the table for madc2vcf_targets requirements accordingly to MADC content: + +  | check status | get_REF_ALT | Requires +-- | -- | -- | -- +IUPAC | TRUE | TRUE | markers_info REF/ALT +  | TRUE | FALSE | - +  | FALSE | TRUE | botloci or markers_info REF/ALT +  | FALSE | FALSE | - 
+Indels | TRUE | TRUE | markers_info REF/ALT +  | TRUE | FALSE | - +  | FALSE | TRUE | botloci or markers_info REF/ALT +  | FALSE | FALSE | - +ChromPos | TRUE | TRUE | botloci or markers_info REF/ALT +  | TRUE | FALSE | - +  | FALSE | TRUE | markers_info CHR/POS/REF/ALT or markers_info CHR/POS/ + botloci +  | FALSE | FALSE | markers_info CHR/POS +FixAlleleIDs | TRUE | TRUE | botloci or markers_info REF/ALT +  | TRUE | FALSE | - +  | FALSE | TRUE | markers_info REF/ALT +  | FALSE | FALSE | - + +# BIGr 0.6.4 + +- Add function `vmsg` to organize messages printed on the console +- Add metadata to VCF header from madc2vcf_targets +- Add argument `madc_object` to `get_countsMADC` to avoid reading the MADC file twice and to get directly the MADC fixed padding output from `check_botloci` +- Organize messages from `madc2vcf_targets` checks +- Add argument `collapse_matches_counts` and `verbose` to `madc2vcf_targets` function + # BIGr 0.6.3 -- Ignore tags when targets are indels +- New function to check MADC files: `check_madc_sanity`. Currently, it checks for the presence of required columns, whether fixed allele IDs were assigned, the presence of IUPAC codes, lowercase sequence bases, indels, and chromosome and position information. +- Added new argument `markers_info`, which allows users to provide a CSV file with marker information such as CHROM, POS, marker type, and position of indels. For BI species, this information is available from [PanelHub](https://github.com/Breeding-Insight/BIGapp-PanelHub). +- Checked inputs for `madc2vcf_all`. +- Updated affiliation in `DESCRIPTION`. 
# BIGr 0.6.2 diff --git a/R/check_madc_sanity.R b/R/check_madc_sanity.R new file mode 100644 index 0000000..2248779 --- /dev/null +++ b/R/check_madc_sanity.R @@ -0,0 +1,297 @@ +#' Run basic sanity checks on a MADC-style allele report +#' +#' @description +#' Performs nine quick validations on an allele report: +#' 1) **Columns** - required columns are present (`CloneID`, `AlleleID`, `AlleleSequence`); +#' 2) **FixAlleleIDs** - first column's first up-to-6 rows are not all blank or `"*"` +#' *and* both `_0001` and `_0002` appear in `AlleleID`; +#' 3) **IUPACcodes** - presence of non-ATCG characters in `AlleleSequence`; +#' 4) **LowerCase** - presence of lowercase a/t/c/g in `AlleleSequence`; +#' 5) **Indels** - reference/alternate allele lengths differ for the same `CloneID`, +#' or a `"-"` character is present in `AlleleSequence`; +#' 6) **ChromPos** - all `CloneID` values follow the `Chr_Position` format +#' (prefix matches `"chr"` case-insensitively, suffix is a positive integer); +#' 7) **allNAcol** - at least one column contains only `NA` or empty values; +#' 8) **allNArow** - at least one row contains only `NA` or empty values; +#' 9) **RefAltSeqs** - every `CloneID` has at least one `Ref` and one `Alt` allele row. +#' +#' @param report A `data.frame` with at least the columns +#' `CloneID`, `AlleleID`, and `AlleleSequence`. The first column is also +#' used in the `FixAlleleIDs` check to inspect its first up to six entries. +#' If `FixAlleleIDs` is `FALSE` (raw DArT format), the first 7 rows are +#' treated as header filler and skipped before further checks are run. +#' +#' @details +#' - **FixAlleleIDs:** When the first six rows of the first column are all blank +#' or `"*"` (raw DArT format), row 7 is promoted to column headers and the +#' first 7 rows are dropped before subsequent checks are run. The check is +#' `TRUE` when the file has already been processed by HapApp (fixed IDs with +#' `_0001`/`_0002` suffixes). 
+#' - **IUPAC check:** Flags any character outside `A`, `T`, `C`, `G` and `"-"` +#' (case-insensitive), which includes ambiguity codes (`N`, `R`, `Y`, etc.). +#' - **Indels:** Rows are split by `AlleleID` containing `"Ref_0001"` vs +#' `"Alt_0002"`, merged by `CloneID`, and flagged as indels if either (a) the +#' lengths of `AlleleSequence` differ, (b) the sequences have the same length +#' but more than one character differs between them (complex indel / local +#' rearrangement), or (c) a `"-"` character is present anywhere in +#' `AlleleSequence`. +#' - **ChromPos:** Each `CloneID` is split on `"_"` into exactly two parts; the +#' first part must match `"Chr"` (case-insensitive) and the second must be a +#' positive integer. Returns `FALSE` when any `CloneID` is `NA`. +#' - **allNAcol / allNArow:** Detected via `apply()` over columns/rows +#' respectively; a cell is treated as missing when it is `NA` or an empty +#' string (`""`). Useful for flagging empty or corrupt entries. +#' - **RefAltSeqs:** For each unique `CloneID`, checks whether at least one row +#' with a `Ref` (`|Ref_` when `FixAlleleIDs = TRUE`, `|Ref$` otherwise) and +#' one row with an `Alt` (`|Alt_` / `|Alt$`) allele exist. `CloneID`s that +#' lack a `Ref` row are stored in `missRef`; those lacking an `Alt` row in +#' `missAlt`. The check is `TRUE` when both sets are empty. +#' - If required columns are missing (`Columns = FALSE`), only `Columns` and +#' `FixAlleleIDs` are evaluated; all other checks remain `NA` and +#' `indel_clone_ids`, `missRef`, and `missAlt` are returned as `NULL`. +#' +#' @return A named list with five elements: +#' \describe{ +#' \item{checks}{Named logical vector with nine entries: +#' `Columns`, `FixAlleleIDs`, `IUPACcodes`, `LowerCase`, `Indels`, +#' `ChromPos`, `allNAcol`, `allNArow`, `RefAltSeqs`. 
+#' `TRUE` means the condition was detected (or passed for `Columns`, +#' `FixAlleleIDs`, `ChromPos`, and `RefAltSeqs`); `NA` means the check +#' was skipped.} +#' \item{messages}{Named list of length-2 character vectors, one per check. +#' Element `[[1]]` is the message when the check is `TRUE`, element `[[2]]` +#' when it is `FALSE`. Indexed by the same names as `checks`.} +#' \item{indel_clone_ids}{Character vector of `CloneID`s where ref/alt +#' lengths differ. Returns `character(0)` if none are found, or `NULL` +#' when required columns are missing.} +#' \item{missRef}{Character vector of `CloneID`s that have no `Ref` allele +#' row. Returns `character(0)` if all `CloneID`s have a `Ref` row, or +#' `NULL` when required columns are missing.} +#' \item{missAlt}{Character vector of `CloneID`s that have no `Alt` allele +#' row. Returns `character(0)` if all `CloneID`s have an `Alt` row, or +#' `NULL` when required columns are missing.} +#' } +#' +#' @export +check_madc_sanity <- function(report) { + + # Initialize + checks <- c(Columns = NA, FixAlleleIDs = NA, IUPACcodes = NA, LowerCase = NA, Indels = NA, + ChromPos = NA, allNAcol = NA, allNArow = NA, RefAltSeqs = NA, OtherAlleles = NA) + messages <- list(Columns = NA, FixAlleleIDs = NA, IUPACcodes = NA, LowerCase = NA, Indels = NA, + ChromPos = NA, allNAcol = NA, allNArow = NA, RefAltSeqs = NA, OtherAlleles = NA) + + # ---- FixAlleleIDs ---- + # Check if first up-to-6 entries in the *first column* are all "" or "*" + n <- nrow(report) + idx <- seq_len(min(6L, n)) + first_col_vals <- report[[1]][idx] + all_blank_or_star <- all(first_col_vals %in% c("", "*"), na.rm = TRUE) + # Also require that both _0001 and _0002 appear in AlleleID + if(all_blank_or_star) { + colnames(report) <- report[7,] + report <- report[-c(1:7),] + } + has_0001 <- any(grepl("_0001", report$AlleleID, fixed = TRUE), na.rm = TRUE) + has_0002 <- any(grepl("_0002", report$AlleleID, fixed = TRUE), na.rm = TRUE) + checks["FixAlleleIDs"] <- 
(!all_blank_or_star) & has_0001 & has_0002 + + # Validate required columns + required <- c("CloneID", "AlleleID", "AlleleSequence") + missing_cols <- setdiff(required, names(report)) + checks["Columns"] <- length(missing_cols) == 0 + + if(checks[["Columns"]]){ + # ---- IUPACcodes ---- + iu <- grepl("[^ATCG-]", report$AlleleSequence, ignore.case = TRUE) + checks["IUPACcodes"] <- any(iu, na.rm = TRUE) + + # ---- LowerCase ---- + lc <- grepl("[atcg]", report$AlleleSequence) + checks["LowerCase"] <- any(lc, na.rm = TRUE) + + # ---- Indels ---- + refs <- subset(report, grepl("Ref_0001", AlleleID, fixed = TRUE), + select = c(CloneID, AlleleID, AlleleSequence)) + alts <- subset(report, grepl("Alt_0002", AlleleID, fixed = TRUE), + select = c(CloneID, AlleleID, AlleleSequence)) + + merged <- merge(refs, alts, by = "CloneID", suffixes = c("_ref", "_alt"), all = FALSE) + + if (nrow(merged) > 0) { + ref_len <- nchar(merged$AlleleSequence_ref, keepNA = TRUE) + alt_len <- nchar(merged$AlleleSequence_alt, keepNA = TRUE) + cmp_ok <- !is.na(ref_len) & !is.na(alt_len) + + # Classic indel: different lengths + indel_mask <- cmp_ok & (ref_len != alt_len) + + # Complex indel: same length but >1 character difference between sequences + same_len <- cmp_ok & (ref_len == alt_len) + if (any(same_len)) { + n_diffs <- mapply(function(r, a) { + r_chars <- strsplit(r, "")[[1]] + a_chars <- strsplit(a, "")[[1]] + standard <- toupper(r_chars) %in% c("A","T","C","G") & toupper(a_chars) %in% c("A","T","C","G") + sum(toupper(r_chars[standard]) != toupper(a_chars[standard])) + }, merged$AlleleSequence_ref[same_len], merged$AlleleSequence_alt[same_len]) + indel_mask[same_len] <- n_diffs > 1 + } + + checks["Indels"] <- any(indel_mask) | any(grepl("-", report$AlleleSequence), na.rm = TRUE) + indels <- if (any(indel_mask)) merged$CloneID[indel_mask] else character(0) + + } else { + checks["Indels"] <- FALSE + indels <- character(0) + } + + # --- All NA ---- + checks["allNArow"] <- any(apply(report, 1, 
function(x) all(is.na(x) | x == ""))) + checks["allNAcol"] <- any(apply(report, 2, function(x) all(is.na(x) | x == ""))) + + # ---- Chrom Pos ---- + if(!any(is.na(report$CloneID))) { + pos <- strsplit(report$CloneID, "_") + format <- all(sapply(pos, length) == 2) + first <- all(grepl("^[A-Za-z]", sapply(pos, "[", 1))) + second <- suppressWarnings(all(sapply(pos, function(x) as.numeric(x[2])) > 0)) + checks["ChromPos"] <- all(format, first, second) + } else checks["ChromPos"] <- FALSE + + # ---- RefAltSeqs ---- + all_clones <- unique(report$CloneID) + if (isTRUE(checks["FixAlleleIDs"])) { + has_ref <- unique(report$CloneID[grepl("\\|Ref_", report$AlleleID)]) + has_alt <- unique(report$CloneID[grepl("\\|Alt_", report$AlleleID)]) + } else { + has_ref <- unique(report$CloneID[grepl("\\|Ref$", report$AlleleID)]) + has_alt <- unique(report$CloneID[grepl("\\|Alt$", report$AlleleID)]) + } + missRef <- setdiff(all_clones, has_ref) + missAlt <- setdiff(all_clones, has_alt) + checks["RefAltSeqs"] <- length(missRef) == 0 & length(missAlt) == 0 + + # ---- OtherAlleles ---- + checks["OtherAlleles"] <- any(grepl("[|]Other", report$AlleleID)) + + } else { + indels <- NULL + missRef <- NULL + missAlt <- NULL + } + + messages[["Columns"]] <- c("Required columns are present", + "One or more required columns missing. Verify if your file has columns: CloneID, AlleleID, AlleleSequence") + messages[["FixAlleleIDs"]] <- c("Fixed Allele IDs look good", + "MADC not processed by HapApp") + messages[["IUPACcodes"]] <- c("IUPAC (non-ATCG) codes found in AlleleSequence. This codes are not currently supported by BIGr/BIGapp. 
Run HapApp to replace them", + "No IUPAC (non-ATCG) codes found in AlleleSequence") + messages[["LowerCase"]] <- c("Lowercase bases found in AlleleSequence", + "No lowercase bases found in AlleleSequence") + messages[["Indels"]] <- c(paste("Indels found (ref/alt lengths differ or >1 mismatch between same-length sequences) for the CloneIDs:",paste(indels, collapse = " ")), + "No indels found (ref/alt lengths match and at most 1 mismatch) for all CloneIDs") + messages[["ChromPos"]] <- c("Chromosome and Position format in CloneID look good", + "CloneID does not have the expected Chromosome_Position format. Please check your CloneIDs or provide a file with this information") + messages[["allNArow"]] <- c("One or more rows contain all NA values", + "No rows with all NA values") + messages[["allNAcol"]] <- c("One or more columns contain all NA values", + "No columns with all NA values") + messages[["RefAltSeqs"]] <- c("All CloneIDs have both Ref and Alt alleles", + paste0("Some CloneIDs are missing Ref and/or Alt alleles ", + "Missing Ref: ", paste(missRef, collapse = " "), ". ", + "Missing Alt: ", paste(missAlt, collapse = " "), ".")) + messages[["OtherAlleles"]] <- c("Alleles other than Ref and Alt were found in AlleleID", + "No alleles other than Ref and Alt found in AlleleID") + + list(checks = checks, messages = messages, indel_clone_ids = indels, + missRef = missRef, missAlt = missAlt) +} + +#' Check and Adjust Botloci and MADC Marker Compatibility +#' +#' This internal function checks the compatibility between botloci and MADC markers. It ensures that the marker IDs in the botloci file match those in the MADC file. If discrepancies are found, such as mismatched padding, the function attempts to adjust the IDs to ensure compatibility. +#' +#' @param botloci A data frame containing the botloci markers. +#' @param report A data frame containing the MADC markers. 
+#' @param ChromPos logical value indicating whether the CloneID in the MADC file contains chromosome and position information in the format "Chr_Pos". Default is TRUE +#' @param mi_df A data frame containing marker information with columns CloneID, Chr, and Pos. Required if `ChromPos` is FALSE. +#' @param verbose A logical value indicating whether to print detailed messages about the adjustments. Default is TRUE. Required if `ChromPos` is FALSE. +#' +#' @return A list containing the adjusted botloci and MADC data frames. +#' +#' @details +#' The function checks if the marker IDs in the botloci file are present in the MADC file. If no matches are found, it examines the padding (number of digits) in the marker IDs and adjusts them to match the longest padding. If the IDs still do not match after adjustment, an error is raised. This function is intended for internal use and helps ensure that the botloci and MADC files are compatible for downstream analysis. +#' +#' @keywords internal +#' @noRd +check_botloci <- function(botloci, report, ChromPos=TRUE, mi_df = NULL, verbose=TRUE){ + + # Check inputs + if(!ChromPos) { + if(is.null(mi_df)) stop("When MADC CloneID don't follow the format Chr_Pos, a marker_info file with CloneID, Chr and Pos columns must be provided.") + # if exists, it must contain CloneID or BI_markerID that matches the report$CloneID, and Chr and Pos columns + if(!any(mi_df$CloneID %in% report$CloneID) & !any(mi_df$BI_markerID %in% report$CloneID)) { + stop("None of the MADC CloneID could be found in the markers_info CloneID or BI_markerID. 
Please make sure they match.") + } else { + use_col <- if(any(mi_df$CloneID %in% report$CloneID)) "CloneID" else "BI_markerID" + vmsg(paste("Using", use_col, "column in marker_info to match MADC CloneID"), verbose = verbose, level = 1, type = ">>") + } + if(is.null(mi_df$Chr) | is.null(mi_df$Pos)) stop("When MADC CloneID don't follow the format Chr_Pos, Chr and Pos columns must be provided in the markers_info file.") + } + + if(!any(botloci$V1 %in% report$CloneID)) { # First check if any botloci markers are found in MADC file. If not, check for padding mismatch. + vmsg("No botloci markers found in MADC file. Checking for padding mismatch...", verbose = verbose, level = 1, type = ">>") + + pad_madc <- unique(nchar(sub(".*_", "", report$CloneID))) + pad_botloci <- unique(nchar(sub(".*_", "", botloci$V1))) + + if(length(pad_madc) > 1 | length(pad_botloci) > 1) stop("Check marker IDs in both MADC and botloci files. They should be the same.") + + if(pad_madc != pad_botloci) { + vmsg("Padding between MADC and botloci files do not match. Markers ID modified to match longest padding.", verbose = verbose, level = 2, type = ">>") + if (pad_madc < pad_botloci) { + report$CloneID <- paste0(sub("_(.*)", "", report$CloneID), "_", + sprintf(paste0("%0", pad_botloci, "d"), as.integer(sub(".*_", "", report$CloneID))) + ) + report$AlleleID <- paste0(report$CloneID, "|", sapply(strsplit(report$AlleleID, "[|]"), "[[",2)) + if(!is.null(mi_df)) { + mi_df$CloneID <- paste0(sub("_(.*)", "", mi_df$CloneID), "_", + sprintf(paste0("%0", pad_botloci, "d"), as.integer(sub(".*_", "", mi_df$CloneID))) + ) + } + } else { + botloci$V1 <- paste0(sub("_(.*)", "", botloci$V1), "_", + sprintf(paste0("%0", pad_madc, "d"), as.integer(sub(".*_", "", botloci$V1))) + ) + if(!any(botloci$V1 %in% report$CloneID)) stop("After matching padding, botloci markers still not found in MADC file. 
Check marker IDs.\n") + } + } else if (!(is.null(mi_df$Chr) | is.null(mi_df$Pos))){ + vmsg("It is not a padding mismatch issue.", verbose = verbose, level = 2, type = ">>") + vmsg("Checking if jointing provided Chromosome and Position information in marker_file solve the issue", verbose = verbose, level = 2, type = ">>") + if(!any(mi_df$CloneID %in% report$CloneID) & !any(mi_df$BI_markerID %in% report$CloneID)) { + stop("None of the MADC CloneID could be found in the markers_info CloneID or BI_markerID. Please make sure they match.") + } else { + use_col <- if(any(mi_df$CloneID %in% report$CloneID)) "CloneID" else "BI_markerID" + vmsg(paste("Using", use_col, "column in marker_info to match MADC CloneID"), verbose = verbose, level = 2, type = ">>") + } + mk_info_CloneID <- paste0(mi_df$Chr, "_", sprintf(paste0("%0",pad_botloci, "d"), as.integer(mi_df$Pos))) + + if(!any(botloci$V1 %in% mk_info_CloneID)){ + vmsg("It is not a padding mismatch issue.", verbose = verbose, level = 2, type = ">>") + vmsg("Chromosome and Position information in marker_file don't solve the issue.", verbose = verbose, level = 2, type = ">>") + stop("Check marker IDs in both MADC and botloci files. They should be the same.") + } else { + vmsg("Chromosome and Position information in marker_file solve the issue.", verbose = verbose, level = 2, type = ">>") + vmsg("Using this information to modify MADC CloneIDs to match botloci markers.", verbose = verbose, level = 2, type = ">>") + report$CloneID <- mk_info_CloneID[match(report$CloneID, mi_df[[use_col]])] + mi_df$CloneID <- mk_info_CloneID + } + } else { + vmsg("It is not a padding mismatch issue.", verbose = verbose, level = 2, type = ">>") + vmsg("Chromosome and Position information in marker_file not provided.", verbose = verbose, level = 2, type = ">>") + stop("Check marker IDs in both MADC and botloci files. 
They should be the same.") + } + } + return(list(botloci, report, mi_df)) +} diff --git a/R/check_ped.R b/R/check_ped.R index 3f4831b..0ebbd5a 100644 --- a/R/check_ped.R +++ b/R/check_ped.R @@ -1,99 +1,142 @@ -#' Evaluate Pedigree File for Accuracy +#' Check a pedigree file for accuracy and report/correct common errors #' -#' Check a pedigree file for accuracy and output suspected errors +#' `check_ped` reads a 3-column pedigree file (tab-separated, columns labeled `id`, `sire`, `dam` in any order) +#' and performs quality checks, optionally correcting or flagging errors. #' -#'check_ped takes a 3-column pedigree tab separated file with columns labeled as id sire dam in any order and checks for: -#'* Ids that appear more than once in the id column -#'* Ids that appear in both sire and dam columns -#'* Direct (e.g. parent is a offspring of his own daughter) and indirect (e.g. a great grandparent is son of its grandchild) dependencies within the pedigree. -#'* Individuals included in the pedigree as sire or dam but not on the id column and reports them back with unknown parents (0). +#' The function checks for: +#' * Exact duplicate rows and removes them (keeping one copy) +#' * IDs that appear more than once with conflicting sire/dam assignments (sets sire/dam to "0") +#' * IDs that appear in both sire and dam columns +#' * Missing parents (IDs referenced as sire/dam but not in `id` column), adds them with sire/dam = "0" +#' * Direct and indirect pedigree dependencies (cycles), such as a parent being its own descendant #' -#'When using check_ped, do a first run to check for repeated ids and parents that appear as sire and dam. -#'Once these errors are cleaned run the function again to check for dependencies as this will provide the most accurate report. +#' After an initial run to clean exact duplicates and repeated IDs, you can run the function again to detect cycles more accurately. 
#' -#'Note: This function does not change the input file but prints any errors found in the console. +#' The function does **not** overwrite the input file. Instead, it prints findings to the console and optionally saves: +#' * Corrected pedigree as a dataframe in the global environment +#' * A report listing all detected issues +#' +#' @param ped.file Path to the pedigree text file. +#' @param seed Optional seed for reproducibility. +#' @param verbose Logical. If TRUE (default), prints errors and prompts for interactive saving. +#' +#' @return A list of data.frames containing detected issues: +#' * `exact_duplicates`: rows that were exact duplicates +#' * `repeated_ids_diff`: IDs appearing more than once with conflicting sire/dam +#' * `messy_parents`: IDs appearing as both sire and dam +#' * `missing_parents`: parents added to the pedigree with 0 as sire/dam +#' * `dependencies`: detected cycles in the pedigree #' -#' @param ped.file path to pedigree text file. The pedigree file is a -#' 3-column pedigree tab separated file with columns labeled as id sire dam in any order -#' @param seed Optional seed for reproducibility -#' @param verbose Logical. If TRUE, print the errors to the console. 
-#' @return A list of data.frames of error types, and the output printed to the console #' @examples -#' ##Get list with a dataframe for each error type -#' ped_file <- system.file("check_ped_test.txt", package="BIGr") -#' ped_errors <- check_ped(ped.file = ped_file, -#' seed = 101919) +#' ped_file <- system.file("check_ped_test.txt", package = "BIGr") +#' ped_errors <- check_ped(ped.file = ped_file, seed = 101919) #' -#' ##Access the "messy parents" dataframe result +#' # Access messy parents #' ped_errors$messy_parents #' -#' ##Get list of sample IDs with messy parents error -#' messy_parent_ids <- ped_errors$messy_parents$id -#' print(messy_parent_ids) +#' # IDs with messy parents +#' messy_ids <- ped_errors$messy_parents$id +#' print(messy_ids) +#' #' @import dplyr #' @import janitor #' @importFrom stats setNames #' @importFrom utils read.table #' @export check_ped <- function(ped.file, seed = NULL, verbose = TRUE) { - #### Function to check for hierarchical errors missing parents and repeated ids #### - if(!is.null(seed)){ - set.seed(seed) - } - #### read in data #### - data = utils::read.table(ped.file, header = T) - data <- data %>% + + #### setup #### + if (!is.null(seed)) set.seed(seed) + + # Read and clean data + data <- utils::read.table(ped.file, header = TRUE) %>% janitor::clean_names() %>% mutate( id = as.character(id), sire = as.character(sire), dam = as.character(dam) - ) - #Missing parents dataframe initialize - missing_parents <- data.frame(id = character(), sire = character(), dam = character(), stringsAsFactors = FALSE) + ) + + original_data <- data errors <- list() - # repeated id checks - n_occur <- data.frame(table(data$id)) - repeated_ids = n_occur[n_occur$Freq > 1,] %>% - rename(id = Var1) - # Check for ids that appear as both sire and dam ###This is possible for plants so maybe do not control for this or do not delete these rows just print them - messy_parents <- as.data.frame(intersect(data$sire, data$dam)) %>% - rename(id = 1) %>% - 
filter(id != 0) - # Missing parents check - for (i in 1:nrow(data)) { + missing_parents <- data.frame(id = character(), sire = character(), dam = character(), stringsAsFactors = FALSE) + + #### check 1: exact duplicates #### + exact_duplicates <- data[duplicated(data), ] + if (nrow(exact_duplicates) > 0) { + data <- distinct(data) # remove exact duplicates + } + + #### check 2: repeated IDs with conflicting sire/dam #### + repeated_ids <- data %>% + group_by(id) %>% + filter(n() > 1) %>% + ungroup() + + # Only IDs with actual conflicting sire/dam + conflicting_ids <- repeated_ids %>% + group_by(id) %>% + filter(n_distinct(sire) > 1 | n_distinct(dam) > 1) %>% + ungroup() + + if (nrow(conflicting_ids) > 0) { + # Keep one row per ID, set sire/dam to "0" + data <- data %>% + group_by(id) %>% + summarize( + sire = if(n_distinct(sire) > 1) "0" else first(sire), + dam = if(n_distinct(dam) > 1) "0" else first(dam), + .groups = "drop" + ) + } + + repeated_ids_report <- conflicting_ids + + #### check 3: missing parents #### + for (i in seq_len(nrow(data))) { id <- data$id[i] sire <- data$sire[i] dam <- data$dam[i] + if (sire != "0" && sire != id && !sire %in% data$id) { missing_parents <- rbind(missing_parents, data.frame(id = sire, sire = "0", dam = "0", stringsAsFactors = FALSE)) } if (dam != "0" && dam != id && !dam %in% data$id) { missing_parents <- rbind(missing_parents, data.frame(id = dam, sire = "0", dam = "0", stringsAsFactors = FALSE)) } + if (sire == id || dam == id) { errors <- append(errors, paste("Dependency: Individual", id, "cannot be its own parent")) } } - # Remove duplicates + missing_parents <- distinct(missing_parents) - # Combine original data with missing parents - corrected_data <- bind_rows(data, missing_parents) - # Function to detect cycles in the pedigree graph and identify the nodes involved + if (nrow(missing_parents) > 0) { + data <- bind_rows(data, missing_parents) + } + + #### check 4: messy parents #### + sire_ids <- 
unique(data$sire[data$sire != "0"]) + dam_ids <- unique(data$dam[data$dam != "0"]) + messy_ids <- intersect(sire_ids, dam_ids) + messy_parents <- data %>% filter(id %in% messy_ids) + + #### check 5: dependencies (cycles) #### detect_all_cycles <- function(data) { - # Create an adjacency list - adj_list <- list() - for (i in 1:nrow(data)) { - adj_list[[data$id[i]]] <- c(data$sire[i], data$dam[i]) - } - # Helper function to perform DFS and detect cycles + adj_list <- lapply(data$id, function(x) { + row <- data[data$id == x, ] + c(row$sire, row$dam) + }) + names(adj_list) <- data$id + dfs <- function(node, visited, rec_stack, path) { visited[node] <- TRUE rec_stack[node] <- TRUE path <- append(path, node) cycles <- list() + for (neighbor in adj_list[[node]]) { - if (neighbor != "0") { + if (neighbor %in% names(adj_list)) { if (!visited[neighbor]) { cycles <- append(cycles, dfs(neighbor, visited, rec_stack, path)) } else if (rec_stack[neighbor]) { @@ -102,14 +145,15 @@ check_ped <- function(ped.file, seed = NULL, verbose = TRUE) { } } } + rec_stack[node] <- FALSE return(cycles) } - # Initialize visited and recursion stack + visited <- stats::setNames(rep(FALSE, length(adj_list)), names(adj_list)) rec_stack <- stats::setNames(rep(FALSE, length(adj_list)), names(adj_list)) all_cycles <- list() - # Check for cycles in the graph and return the nodes involved + for (node in names(adj_list)) { if (!visited[node]) { node_cycles <- dfs(node, visited, rec_stack, character()) @@ -120,75 +164,75 @@ check_ped <- function(ped.file, seed = NULL, verbose = TRUE) { } return(all_cycles) } - # Check for cycles in the corrected pedigree data - cycles <- detect_all_cycles(corrected_data) + + cycles <- detect_all_cycles(data) if (length(cycles) > 0) { - cycle_number <- 1 for (cycle_group in cycles) { cycle_ids <- unique(unlist(cycle_group)) - errors <- append(errors, paste("Cycle detected involving nodes:", paste(cycle_ids, collapse = " -> "))) + errors <- append(errors, paste("Cycle 
detected involving IDs:", paste(cycle_ids, collapse = " -> "))) } } - results <- list(missing_parents = missing_parents, dependencies = data.frame(Dependency = unlist(errors)), repeated_ids = repeated_ids, messy_parents = messy_parents) - repeated_ids <- results$repeated_ids - missing_parents <- results$missing_parents - messy_parents <- results$messy_parents - errors <- results$dependencies - # Adding the dataframes as an output list - output.results <- list() - #### Print errors and cycles #### - # Print repeated ids if any - if (nrow(repeated_ids) > 0) { - if (verbose) { - cat("Repeated ids found:\n") - message(repeated_ids) - } - output.results$repeated_ids <- repeated_ids - } else { - if (verbose) { - cat("No repeated ids found.\n") - } - } - #Print parents that appear as male and female - if (nrow(messy_parents) > 0) { - if (verbose) { - cat("Ids found as male and female parent:\n") - message(messy_parents) - } - output.results$messy_parents <- messy_parents + #### compile findings #### + input_ped_report <- list( + exact_duplicates = exact_duplicates, + repeated_ids_diff = repeated_ids_report, + messy_parents = messy_parents, + missing_parents = missing_parents, + dependencies = data.frame(Dependency = unique(unlist(errors))) + ) - } else { - if (verbose) { - cat("No ids found as male and female parent.\n") - } - } - # Print missing parents if any - if (nrow(missing_parents) > 0) { - if (verbose) { - cat("Missing parents found:\n") - message(missing_parents) - } - output.results$missing_parents <- missing_parents + #### file names #### + file_base <- tools::file_path_sans_ext(basename(ped.file)) + corrected_name <- paste0(file_base, "_corrected") + report_name <- paste0(file_base, "_report") - } else { - if (verbose) { - cat("No missing parents found.\n") - } - } - # Print errors if any - if (nrow(errors) > 0) { - if (verbose) { - cat("Dependencies found:\n") - message(unique(errors$Dependency)) + #### output #### + if (verbose) { + cat("\n=== Pedigree 
Quality Check Report ===\n") + + if (nrow(exact_duplicates) > 0) { + cat("\n Exact duplicate trios detected (only one copy will be kept in corrected pedigree):\n") + print(exact_duplicates) + } else cat("\nNo exact duplicate trios found.\n") + + if (nrow(repeated_ids_report) > 0) { + cat("\nConflicting trios detected (sire/dam set to 0 in corrected pedigree):\n") + print(repeated_ids_report) + } else cat("\nNo conflicting repeated trios found.\n") + + if (nrow(missing_parents) > 0) { + cat("\n Parents missing as IDs found in the pedigree (will be added as founders in corrected pedigree):\n") + print(missing_parents) + } else cat("\nNo missing parents found.\n") + + if (nrow(messy_parents) > 0) { + cat("\n IDs found as both sire and dam (is selfing or hermaphroditism possible?):\n") + print(messy_parents) + } else cat("\nNo IDs found as both sire and dam.\n") + + + if (nrow(input_ped_report$dependencies) > 0) { + cat("\nDependencies detected:\n") + print(input_ped_report$dependencies) + } else cat("\nNo dependencies detected.\n") + + #### interactive save #### + cat(paste0("\nDo you want to save the corrected pedigree as dataframe `", corrected_name, "`? 
(y/n): ")) + ans <- tolower(trimws(readline())) + if (ans == "y") { + assign(corrected_name, data, envir = .GlobalEnv) + assign("input_ped_report", input_ped_report, envir = .GlobalEnv) + cat(paste0("Saved corrected pedigree as `", corrected_name, "` and report as `input_ped_report`.\n")) + } else { + cat("No corrected pedigree was saved.\n") } - output.results$dependencies <- data.frame(Dependency = unlist(errors)) } else { - if (verbose) { - cat("No dependencies found.\n") - } + # Silent automatic mode + assign(corrected_name, data, envir = .GlobalEnv) + assign(report_name, input_ped_report, envir = .GlobalEnv) } - return(results) + invisible(input_ped_report) } diff --git a/R/filterVCF.R b/R/filterVCF.R index a54e32e..b9bca78 100644 --- a/R/filterVCF.R +++ b/R/filterVCF.R @@ -17,6 +17,7 @@ #' @param filter.SAMPLE.miss Sample missing data filter #' @param filter.SNP.miss SNP missing data filter #' @param ploidy The ploidy of the species being analyzed +#' @param quality.rates Logical. If TRUE, calculates and outputs CSV files with quality metrics for each marker and sample before filtering (mean depth, genotyping rate, observed heterozygosity). #' @param output.file output file name (optional). If no output.file name provided, then a vcfR object will be returned. 
#' @return A gzipped vcf file #' @importFrom vcfR read.vcfR @@ -26,42 +27,114 @@ #' @examples #' ## Use file paths for each file on the local system #' -#' #Temp location (only for example) -#' output_file <- tempfile() #' -#' filterVCF(vcf.file = system.file("iris_DArT_VCF.vcf.gz", package = "BIGr"), -#' filter.OD = 0.5, -#' filter.MAF = 0.05, -#' ploidy = 2, -#' output.file = output_file) -#' -#' # Removing the output for the example -#' rm(output_file) +#' #filterVCF(vcf.file = "example_dart_Dosage_Report.csv", +#' # filter.OD = 0.5, +#' # ploidy = 2, +#' # output.file = "name_for_vcf") #' #' ##The function will output the filtered VCF to the current working directory #' #' @export filterVCF <- function(vcf.file, - filter.OD = NULL, - filter.BIAS.min = NULL, - filter.BIAS.max = NULL, - filter.DP = NULL, - filter.MPP = NULL, - filter.PMC = NULL, - filter.MAF = NULL, - filter.SAMPLE.miss = NULL, - filter.SNP.miss = NULL, - ploidy, - output.file = NULL) { + quality.rates = FALSE, + filter.OD = NULL, + filter.BIAS.min = NULL, + filter.BIAS.max = NULL, + filter.DP = NULL, + filter.MPP = NULL, + filter.PMC = NULL, + filter.MAF = NULL, + filter.SAMPLE.miss = NULL, + filter.SNP.miss = NULL, + ploidy, + output.file = NULL) { #Should allow for any INFO field to be entered to be filtered - # Import VCF (can be .vcf or .vcf.gz) + + + # Read VCF (can be .vcf or .vcf.gz) + if (!inherits(vcf.file, "vcfR")) { - vcf <- read.vcfR(vcf.file, verbose = FALSE) + vcf <- read.vcfR(vcf.file) } else { vcf <- vcf.file - #rm(vcf.file) + } + + # Keep original VCF for pre‑filter statistics + vcf_orig <- vcf + + + # pre‑filtering quality rates + + if (quality.rates) { + ## Extract genotypes, depth and DP matrix + gt_orig <- extract.gt(vcf_orig, element = "GT", as.numeric = FALSE) + + dfmt <- strsplit(vcf_orig@gt[1, "FORMAT"], ":")[[1]] + if ("DP" %in% dfmt) { + dp_orig <- extract.gt(vcf_orig, element = "DP", as.numeric = TRUE) + } else { + dp_orig <- matrix(NA_real_, + nrow = nrow(gt_orig), 
ncol = ncol(gt_orig), + dimnames = dimnames(gt_orig)) + } + + + # 1. Observed heterozygosity (per‑marker & per‑sample) + + # Helper: TRUE if a genotype is heterozygous (any two different + # alleles, excluding missing "./.") + is_het <- function(g) { + if (is.na(g) || g == "./.") return(FALSE) + alleles <- strsplit(g, split = "[/|]")[[1]] + return(length(unique(alleles)) > 1) + } + #matrix of heterozygous calls + het_mat <- apply(gt_orig, c(1, 2), is_het) + + #Observed heterozygosity per marker and per sample + obs_het_marker <- rowMeans(het_mat, na.rm = TRUE) + obs_het_sample <- colMeans(het_mat, na.rm = TRUE) + + + #Per‑marker stats + + mean_depth_marker <- rowMeans(dp_orig, na.rm = TRUE) + genotype_present <- !is.na(gt_orig) + genotyping_rate_marker <- rowMeans(genotype_present) + + markers_df <- data.frame( + marker = vcf_orig@fix[, "ID"], + mean_depth = round(mean_depth_marker, 2), + genotyping_rate = round(genotyping_rate_marker, 2), + obs_het = round(obs_het_marker, 2), + stringsAsFactors = FALSE + ) + + + #Per‑sample stats + + mean_depth_sample <- colMeans(dp_orig, na.rm = TRUE) + genotyping_rate_sample <- colMeans(genotype_present) + + samples_df <- data.frame( + sample = colnames(gt_orig), + mean_depth = round(mean_depth_sample, 2), + genotyping_rate = round(genotyping_rate_sample, 2), + obs_het = round(obs_het_sample, 2), + stringsAsFactors = FALSE + ) + + + #Write CSV + + base_name <- if (!is.null(output.file)) output.file else "pre_filter" + write.csv(markers_df, paste0(base_name, "_marker_stats.csv"), + row.names = FALSE, quote = FALSE) + write.csv(samples_df, paste0(base_name, "_sample_stats.csv"), + row.names = FALSE, quote = FALSE) } #Update header based on user filtering parameters @@ -102,7 +175,7 @@ filterVCF <- function(vcf.file, # Extract the DP values if ("DP" %in% format_fields && !is.null(filter.DP)) { - message("Filtering by DP\n") + cat("Filtering by DP\n") dp <- extract.gt(vcf, element = "DP", as.numeric = TRUE) # Identify cells to 
modify based on the DP threshold threshold <- as.numeric(filter.DP) @@ -116,7 +189,7 @@ filterVCF <- function(vcf.file, #Filter if the MPP field is present if ("MPP" %in% format_fields && !is.null(filter.MPP)) { - message("Filtering by MPP\n") + cat("Filtering by MPP\n") # Extract the MPP values mpp <- extract.gt(vcf, element = "MPP", as.numeric = TRUE) # Identify cells to modify based on the DP threshold @@ -156,13 +229,13 @@ filterVCF <- function(vcf.file, # Filtering by OD if ("OD" %in% info_ids && !is.null(filter.OD)) { info <- vcf@fix[, "INFO"] #Need to get after each filter.. - message("Filtering by OD\n") + cat("Filtering by OD\n") od_values <- extract_info_value(info, "OD") # Ensure no NA values before filtering if (!all(is.na(od_values))) { vcf <- vcf[od_values < as.numeric(filter.OD), ] } else { - warning("No valid OD values found.\n") + cat("No valid OD values found.\n") } } @@ -171,26 +244,26 @@ filterVCF <- function(vcf.file, # Filtering by BIAS if ("BIAS" %in% info_ids && !is.null(filter.BIAS.min) && !is.null(filter.BIAS.max)) { info <- vcf@fix[, "INFO"] #Need to get after each filter.. - message("Filtering by BIAS\n") + cat("Filtering by BIAS\n") bias_values <- extract_info_value(info, "BIAS") # Ensure no NA values before filtering if (!all(is.na(bias_values))) { vcf <- vcf[bias_values > as.numeric(filter.BIAS.min) & bias_values < as.numeric(filter.BIAS.max), ] } else { - warning("No valid BIAS values found.\n") + cat("No valid BIAS values found.\n") } } # Filtering by PMC if ("PMC" %in% info_ids && !is.null(filter.PMC)) { info <- vcf@fix[, "INFO"] #Need to get after each filter.. 
- message("Filtering by PMC\n") + cat("Filtering by PMC\n") pmc_values <- extract_info_value(info, "PMC") # Ensure no NA values before filtering if (!all(is.na(pmc_values))) { vcf <- vcf[pmc_values < as.numeric(filter.PMC), ] } else { - warning("No valid PMC values found.\n") + cat("No valid PMC values found.\n") } } @@ -200,14 +273,14 @@ filterVCF <- function(vcf.file, gt_matrix <- extract.gt(vcf, element = "GT", as.numeric = FALSE)#as.matrix(vcfR2genlight(vcf)) if (!is.null(filter.SNP.miss)) { - message("Filtering by SNP missing data\n") + cat("Filtering by SNP missing data\n") snp_missing_data <- rowMeans(is.na(gt_matrix)) vcf <- vcf[snp_missing_data < as.numeric(filter.SNP.miss), ] gt_matrix <- extract.gt(vcf, element = "GT", as.numeric = FALSE) } if (!is.null(filter.SAMPLE.miss)) { - message("Filtering by Sample missing data\n") + cat("Filtering by Sample missing data\n") # Calculate the proportion of missing data for each sample sample_missing_data <- colMeans(is.na(gt_matrix)) # Identify samples to keep based on the missing data threshold @@ -222,30 +295,112 @@ filterVCF <- function(vcf.file, rm(gt_matrix) } + ##Convert GT to dosage + #gt_matrix <- extract.gt(vcf, element = "GT", as.numeric = FALSE)#as.matrix(vcfR2genlight(vcf)) + + # Function to determine the ploidy level from a genotype string + #determine_ploidy <- function(gt) { + # if (is.na(gt)) { + # return(NA) + # } + # return(length(strsplit(gt, "[|/]")[[1]])) + #} + + # Function to find a non-NA example genotype to determine ploidy + #find_example_gt <- function(matrix) { + # for (i in seq_len(nrow(matrix))) { + # for (j in seq_len(ncol(matrix))) { + # if (!is.na(matrix[i, j])) { + # return(matrix[i, j]) + # } + # } + # } + # return(NA) # Return NA if no non-NA genotype is found + #} + + # Find a non-NA example genotype + #example_gt <- find_example_gt(gt_matrix) + + # Determine the ploidy level + #if (!is.na(example_gt)) { + # ploidy <- determine_ploidy(example_gt) + #} else { + # stop("No non-NA 
genotype found to determine ploidy.") + #} + + # Generate lookup table for genotypes to dosage conversion + #generate_lookup_table <- function(ploidy) { + # possible_alleles <- 0:ploidy + # genotypes <- expand.grid(rep(list(possible_alleles), ploidy)) + # genotypes <- apply(genotypes, 1, function(x) paste(x, collapse = "/")) + # dosage_values <- rowSums(expand.grid(rep(list(possible_alleles), ploidy))) + # lookup_table <- setNames(dosage_values, genotypes) + # return(lookup_table) + #} + + # Generate the lookup table + #lookup_table <- generate_lookup_table(ploidy) + + # Function to convert genotype to dosage using the lookup table + #genotype_to_dosage <- function(gt, lookup_table) { + # if (is.na(gt)) { + # return(NA) + # } + # return(lookup_table[[gt]]) + #} + + # Function to convert genotype matrix to dosage matrix using vectorized operations + #convert_genotypes_to_dosage <- function(gt_matrix, lookup_table) { + # unique_gts <- unique(gt_matrix) + # gt_to_dosage <- setNames(rep(NA, length(unique_gts)), unique_gts) + # valid_gts <- unique_gts[unique_gts %in% names(lookup_table)] + # gt_to_dosage[valid_gts] <- lookup_table[valid_gts] + # dosage_matrix <- gt_to_dosage[gt_matrix] + #colnames(dosage_matrix) <- colnames(gt_matrix) + #row.names(dosage_matrix) <- row.names(gt_matrix) + # return(matrix(as.numeric(dosage_matrix), nrow = nrow(gt_matrix), ncol = ncol(gt_matrix))) + #} + + # Convert the genotype matrix to dosage matrix + #dosage_matrix <- convert_genotypes_to_dosage(gt_matrix, lookup_table) + ##MAF filter + #Compare my lengthy process to estimate MAF with vcfR::maf() function + #The BIGr::calculate_MAF(dosage_matrix, ploidy) is the exact same as the vcfR::maf() calculations + #The step where I extract UD and calculate MAF is different... 
+ #if ("UD" %in% format_fields) { + # maf_df <- BIGr::calculate_MAF(extract.gt(vcf, element = "UD", as.numeric = TRUE), ploidy = ploidy) + #} else { + #convert genotypes to dosage and filter + # maf_df <- BIGr::calculate_MAF(dosage_matrix, ploidy) + #} + #Need to confirm that vcfR::maf will work with any ploidy...if not, use my code if (!is.null(filter.MAF)) { - message("Filtering by MAF\n") + cat("Filtering by MAF\n") maf_df <- data.frame(vcfR::maf(vcf, element = 2)) vcf <- vcf[maf_df$Frequency > as.numeric(filter.MAF), ] } ### Export the modified VCF file (this exports as a .vcf.gz, so make sure to have the name end in .vcf.gz) - message("Exporting VCF\n") - if (!inherits(vcf.file, "vcfR")) { - if (!is.null(output.file)) { - output_name <- paste0(output.file, ".vcf.gz") + cat("Exporting VCF\n") + if (!inherits(vcf.file, "vcfR")){ + if (!is.null(output.file)){ + output_name <- paste0(output.file,".vcf.gz") vcfR::write.vcf(vcf, file = output_name) - } else { + }else{ return(vcf) } - } else { - if (!is.null(output.file)) { - output_name <- paste0(output.file, "_filtered.vcf.gz") + }else{ + if (!is.null(output.file)){ + output_name <- paste0(output.file,"_filtered.vcf.gz") vcfR::write.vcf(vcf, file = output_name) - } else { + }else{ return(vcf) } } + #Message that includes the output vcf stats + print(vcf) + #Message samples_removed <- starting_samples - (ncol(vcf@gt)-1) SNPs_removed <- starting_snps - nrow(vcf) @@ -253,3 +408,81 @@ filterVCF <- function(vcf.file, message("SNPs removed due to filtering: ",SNPs_removed) message("Complete!") } +#This is not reliable, so no longer use this shortcut to get dosage matrix +#test2 <- vcfR2genlight(vcf) + + +#####Testing custom VCF reading function###### +# Open the gzipped VCF file +#con <- gzfile("/Users/ams866/Desktop/output.vcf", "rt") + +# Read in the entire file +#lines <- readLines(con) +#close(con) +# Read in the entire file +#lines <- readLines("/Users/ams866/Desktop/output.vcf") +# Filter out lines that start with 
## +#filtered_lines <- lines[!grepl("^##", lines)] +# Create a temporary file to write the filtered lines +#temp_file <- tempfile() +#writeLines(filtered_lines, temp_file) +# Read in the filtered data using read.table or read.csv +#vcf_data <- read.table(temp_file, header = TRUE, sep = "\t", comment.char = "", check.names = FALSE) +# Clean up the temporary file +#unlink(temp_file) + +##Extract INFO column and Filter SNPs by those values +#Update the filtering options by the items present in the INFO column? + +# Load required library +#library(dplyr) + +# Split INFO column into key-value pairs +#vcf_data_parsed <- vcf_data %>% +# mutate(INFO_PARSED = strsplit(INFO, ";")) %>% +# unnest(INFO_PARSED) %>% +# separate(INFO_PARSED, into = c("KEY", "VALUE"), sep = "=") %>% +# spread(KEY, VALUE) + +#Filter by DP +#filtered_vcf_data <- vcf_data_parsed %>% +# filter(as.numeric(DP) > 10) + +# View the filtered dataframe +#print(filtered_vcf_data) + +##Extracting and filtering by FORMAT column +# Identify the columns that are not sample columns +#non_sample_cols <- c("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT") +# Identify the sample columns +#sample_cols <- setdiff(names(vcf_data), non_sample_cols) +# Extract FORMAT keys +#format_keys <- strsplit(as.character(vcf_data$FORMAT[1]), ":")[[1]] +# Split SAMPLE columns based on FORMAT +#vcf_data_samples <- vcf_data %>% +# mutate(across(all_of(sample_cols), ~strsplit(as.character(.), ":"))) %>% +# mutate(across(all_of(sample_cols), ~map(., ~setNames(as.list(.), format_keys)))) %>% +# unnest_wider(all_of(sample_cols), names_sep = "_") + +# View the parsed dataframe +#print(head(vcf_data_samples)) + +# Create separate dataframes for each FORMAT variable +#format_dfs <- lapply(format_keys, function(format_key) { +# vcf_data_samples %>% +# select(ID, ends_with(paste0("_", format_key))) %>% +# column_to_rownames("ID") +#}) + +# Assign names to the list elements +#names(format_dfs) <- format_keys + +# Access 
the separate dataframes +#gt_df <- format_dfs$GT # Genotype dataframe +#ad_df <- format_dfs$AD # Allelic depths dataframe + +#*I think the above method is okay if you only need to filter at the INFO level, +#*But I think if you want to filter for FORMAT, that vcfR is probably best, +#*Will need to explore further if I can easily just filter for MPP by checking if it is above a +#*threshold, and then converting the GT and UD values to NA if so... +#*If that is efficient and works, then I will just use this custom VCF method... diff --git a/R/get_countsMADC.R b/R/get_countsMADC.R index 8be1cda..3a9bc2b 100644 --- a/R/get_countsMADC.R +++ b/R/get_countsMADC.R @@ -1,13 +1,38 @@ #' Obtain Read Counts from MADC File #' -#' This function takes the MADC file as input and retrieves the ref and alt counts for each sample, -#' and converts them to ref, alt, and size(total count) matrices for dosage calling tools. At the moment, -#' only the read counts for the Ref and Alt target loci are obtained while the additional loci are ignored. +#' Reads a DArTag MADC report and returns reference and total read count matrices +#' per marker and sample. Only `Ref` and `Alt` target loci are retained; +#' `|AltMatch` / `|RefMatch` rows are either discarded or collapsed depending on +#' `collapse_matches_counts`. #' +#' @details +#' Either `madc_file` or `madc_object` must be provided (not both `NULL`). +#' When `madc_object` is supplied it is passed directly to `get_counts()`, +#' skipping file I/O. The function constructs: +#' - `ref_matrix` — per-sample reference allele counts. +#' - `size_matrix` — per-sample total counts (ref + alt). +#' +#' Markers whose `CloneID` appears only in the `Ref` or only in the `Alt` rows +#' are removed with a warning. A summary of the proportion of zero-count +#' data points (missing data) is reported via `vmsg()`. +#' +#' @param madc_file character or `NULL`. Path to the input MADC CSV file. 
+#' At least one of `madc_file` or `madc_object` must be provided. +#' @param madc_object data frame or `NULL`. A pre-read MADC data frame +#' (e.g., as returned by `check_botloci()`). When supplied, file reading is +#' skipped. At least one of `madc_file` or `madc_object` must be provided. +#' @param collapse_matches_counts logical. If `TRUE`, counts for `|AltMatch` +#' and `|RefMatch` rows are summed into their corresponding `|Ref` and `|Alt` +#' rows. If `FALSE` (default), `|AltMatch` and `|RefMatch` rows are discarded. +#' @param verbose logical. Whether to print progress messages. Default is `TRUE`. +#' +#' @return A named list with two numeric matrices, both with markers as rows +#' and samples as columns: +#' \describe{ +#' \item{`ref_matrix`}{Reference allele read counts.} +#' \item{`size_matrix`}{Total read counts (reference + alternative).} +#' } #' -#' @param madc_file Path to MADC file -#' @import dplyr -#' @return A list of read count matrices for reference, alternate, and total read count values #' @examples #' # Get the path to the MADC file #' madc_path <- system.file("iris_DArT_MADC.csv", package = "BIGr") @@ -15,17 +40,37 @@ #' # Extract the read count matrices #' counts_matrices <- get_countsMADC(madc_path) #' -#' # Access the reference, alternate, and size matrices -#' -#' # ref_matrix <- counts_matrices$ref_matrix -#' # alt_matrix <- counts_matrices$alt_matrix +#' # Access the reference and size matrices +#' # ref_matrix <- counts_matrices$ref_matrix #' # size_matrix <- counts_matrices$size_matrix #' #' rm(counts_matrices) +#' +#' @seealso [get_counts()], [check_madc_sanity()] +#' +#' @import dplyr #' @export -get_countsMADC <- function(madc_file) { +get_countsMADC <- function(madc_file = NULL, madc_object = NULL, collapse_matches_counts = FALSE, verbose = TRUE) { + + # Add check inputs + if(is.null(madc_file) && is.null(madc_object)) stop("Please provide either madc_file or madc_object.") + if(!is.null(madc_file) && !file.exists(madc_file)) 
stop("MADC file not found. Please provide a valid path.") + if(!is.null(madc_object) && !is.data.frame(madc_object)) stop("madc_object must be a data frame.") + + vmsg(paste0("Extracting read counts from ", ifelse(!is.null(madc_file), paste0("MADC file: ", madc_file), "madc_object")), verbose = verbose, level = 0, type = ">>") + vmsg(ifelse(collapse_matches_counts, + "|AltMatch and |RefMatch counts will be collapsed into their respective |Ref and |Alt alleles.", + "|AltMatch and |RefMatch rows will be discarded (collapse_matches_counts = FALSE)."), + verbose = verbose, level = 1, type = ">>") + # This function takes the MADC file as input and generates a Ref and Alt counts dataframe as output - update_df <- get_counts(madc_file) + if (is.null(madc_object)) { + update_df <- get_counts(madc_file = madc_file, collapse_matches_counts = collapse_matches_counts, verbose = verbose) + } else { + update_df <- get_counts(madc_object = madc_object, collapse_matches_counts = collapse_matches_counts, verbose = verbose) + } + # Ensure plain data.frame so row.names<- does not trigger tibble deprecation warning + update_df <- as.data.frame(update_df) # Filter rows where 'AlleleID' ends with 'Ref' ref_df <- subset(update_df, grepl("Ref$", AlleleID)) @@ -43,9 +88,17 @@ get_countsMADC <- function(madc_file) { #Retain only the rows in common if they are not identical and provide warning if (same == FALSE) { - warning("Mismatch between Ref and Alt Markers. 
Markers without a Ref or Alt match removed.") # Find the common CloneIDs between the two dataframes + all_mks <- unique(c(rownames(ref_df), rownames(alt_df))) common_ids <- intersect(rownames(ref_df), rownames(alt_df)) + n_singles <- length(all_mks) - length(common_ids) + + vmsg(paste("There are", n_singles,"Ref tags without corresponding Alt tags, or vice versa"), verbose = verbose, level = 2, type = ">>") + vmsg("Only the markers with both Ref and Alt tags will be retained for the conversion", verbose = verbose, level = 1, type = ">>") + vmsg("Consider providing a haplotype database file to resolve unpaired Ref/Alt sequences", verbose = verbose, level = 1, type = ">>") + + warning(paste("There are", n_singles,"Ref tags without corresponding Alt tags, or vice versa. Only the markers with both Ref and Alt tags will be retained for the conversion. Consider providing a haplotype database file to resolve unpaired Ref/Alt sequences.")) + # Subset both dataframes to retain only the common rows ref_df <- ref_df[common_ids, ] alt_df <- alt_df[common_ids, ] @@ -77,7 +130,7 @@ get_countsMADC <- function(madc_file) { # Print the result ratio_missing_data <- count_zeros / length(size_matrix) - message("Ratio of missing data =", ratio_missing_data, "\n") + vmsg(paste0("Percentage of missing data (datapoints with 0 total count): ", round(ratio_missing_data * 100, 2), "%"), verbose = verbose, level = 2, type = ">>") # Return the ref and alt matrices as a list matrices_list <- list(ref_matrix = ref_matrix, size_matrix = size_matrix) @@ -86,41 +139,116 @@ get_countsMADC <- function(madc_file) { } -get_counts <- function(madc_file) { +#' Read and Pre-process a MADC File +#' +#' Reads a DArTag MADC CSV file (or accepts a pre-read data frame), detects the +#' file format, and returns a filtered data frame containing only `Ref` and `Alt` +#' haplotype rows ready for count-matrix construction. 
+#' +#' @details +#' **Input**: either `madc_file` (path to CSV) or `madc_object` (pre-read data +#' frame) must be supplied; at least one is required. +#' +#' **Format detection** (applied to file or object alike): the first seven rows +#' of the first column are inspected: +#' - **Standard format**: all entries are blank or `"*"` — the first 7 rows are +#' treated as DArT placeholder rows and skipped. +#' - **Fixed-allele-ID format**: no filler rows — data are used as-is. +#' +#' **`|AltMatch` / `|RefMatch` handling** (controlled by `collapse_matches_counts`): +#' - `FALSE` (default): these rows are simply discarded. +#' - `TRUE`: their counts are summed into the corresponding `|Ref` or `|Alt` +#' row for the same `CloneID`. +#' +#' In all cases, trailing suffixes on `AlleleID` (e.g., `|Ref_001`, `|Alt_002`) +#' are stripped to the canonical `|Ref` / `|Alt` form. +#' +#' @param madc_file character or `NULL`. Path to the input MADC CSV file. +#' At least one of `madc_file` or `madc_object` must be provided. +#' @param madc_object data frame or `NULL`. A pre-read MADC data frame +#' (e.g., from `check_botloci()`). When supplied, file reading is skipped. +#' At least one of `madc_file` or `madc_object` must be provided. +#' @param collapse_matches_counts logical. If `TRUE`, counts for `|AltMatch` +#' and `|RefMatch` rows are summed into their corresponding `|Ref` and `|Alt` +#' rows. If `FALSE` (default), those rows are discarded. +#' @param verbose logical. Whether to print progress messages. Default is `TRUE`. +#' +#' @return A data frame with one row per `Ref` or `Alt` allele entry, retaining +#' all original columns (`AlleleID`, `CloneID`, `AlleleSequence`, sample +#' count columns, etc.). 
+#' +#' @importFrom dplyr mutate group_by summarise across where select +#' @importFrom dplyr %>% filter case_when +#' +#' @keywords internal +get_counts <- function(madc_file = NULL, madc_object = NULL, collapse_matches_counts = FALSE, verbose = TRUE) { + + # Add check inputs + if(is.null(madc_file) && is.null(madc_object)) stop("Please provide either madc_file or madc_object.") + if(!is.null(madc_file) && !file.exists(madc_file)) stop("MADC file not found. Please provide a valid path.") + if(!is.null(madc_object) && !is.data.frame(madc_object)) stop("madc_object must be a data frame.") + # Read the MADC file + + if(!is.null(madc_file)){ #Read only the first column for the first seven rows first_seven_rows <- read.csv(madc_file, header = FALSE, nrows = 7, colClasses = c(NA, "NULL")) #Check if all entries in the first column are either blank or "*" check_entries <- all(first_seven_rows[, 1] %in% c("", "*")) + } else { + check_entries <- all(madc_object[1:min(7L, nrow(madc_object)), 1] %in% c("", "*")) + } + #Check if the MADC file has the filler rows or is processed from updated fixed allele ID pipeline if (check_entries) { #Note: This assumes that the first 7 rows are placeholder info from DArT processing - #Read the madc file - madc_df <- read.csv(madc_file, sep = ',', skip = 7, check.names = FALSE) - - #Retain only the Ref and Alt haplotypes - filtered_df <- madc_df[!grepl("\\|AltMatch|\\|RefMatch", madc_df$AlleleID), ] - - #Remove extra text after Ref and Alt (_001 or _002) - filtered_df$AlleleID <- sub("\\|Ref.*", "|Ref", filtered_df$AlleleID) - filtered_df$AlleleID <- sub("\\|Alt.*", "|Alt", filtered_df$AlleleID) - + vmsg("Detected raw MADC format with 7-row header. 
Reading file while skipping the first 7 rows.", verbose = verbose, level = 1, type = ">>") + if(!is.null(madc_file)){ + madc_df <- read.csv(madc_file, sep = ',', skip = 7, check.names = FALSE) + } else { + madc_df <- madc_object[-(1:7), ] + } } else { #Read the madc file - madc_df <- read.csv(madc_file, sep = ',', check.names = FALSE) - - # Retain only the Ref and Alt haplotypes - filtered_df <- madc_df[!grepl("\\|AltMatch|\\|RefMatch", madc_df$AlleleID), ] + vmsg("Detected fixed allele IDs MADC format", verbose = verbose, level = 1, type = ">>") + if(!is.null(madc_file)){ + madc_df <- read.csv(madc_file, sep = ',', check.names = FALSE) + } else { + madc_df <- madc_object + } + } - #Remove extra text after Ref and Alt (_001 or _002) - filtered_df$AlleleID <- sub("\\|Ref.*", "|Ref", filtered_df$AlleleID) - filtered_df$AlleleID <- sub("\\|Alt.*", "|Alt", filtered_df$AlleleID) + if(collapse_matches_counts){ + filtered_df <- madc_df[order(madc_df$AlleleID),] %>% + # Keep only Ref/Alt alleles and their Match variants; drop other allele types + filter(grepl("\\|(Ref|Alt)(Match)?(_|$)", AlleleID)) %>% + mutate( + Type = case_when( + grepl("\\|Alt(Match)?(_|$)", AlleleID) ~ "Alt", + grepl("\\|Ref(Match)?(_|$)", AlleleID) ~ "Ref" + ) + ) %>% + group_by(CloneID, Type) %>% + summarise( + AlleleID = paste0(unique(CloneID), "|", unique(Type)), + AlleleSequence = first(AlleleSequence), + across(where(is.numeric), ~ sum(.x, na.rm = TRUE)), + .groups = "drop" + ) %>% + select(AlleleID, CloneID, AlleleSequence, everything(), -Type) + } else { + #Retain only the Ref and Alt haplotypes + filtered_df <- madc_df[!grepl("\\|AltMatch|\\|RefMatch", madc_df$AlleleID), ] } + #Remove extra text after Ref and Alt (_001 or _002) + filtered_df$AlleleID <- sub("\\|Ref.*", "|Ref", filtered_df$AlleleID) + filtered_df$AlleleID <- sub("\\|Alt.*", "|Alt", filtered_df$AlleleID) + return(filtered_df) } diff --git a/R/imputation_concordance.R b/R/imputation_concordance.R index 6ab2eba..1eb441a 100644 
--- a/R/imputation_concordance.R +++ b/R/imputation_concordance.R @@ -1,89 +1,124 @@ #' Calculate Concordance between Imputed and Reference Genotypes #' -#' This function calculates the concordance between imputed and reference genotypes. It assumes that samples are rows and markers are columns. -#' It is recommended to use allele dosages (0, 1, 2) but will work with other formats. Missing data in reference or imputed genotypes -#' will not be considered for concordance if the `missing_code` argument is used. If a specific subset of markers should be excluded, -#' it can be provided using the `snps_2_exclude` argument. +#' This function calculates the concordance between imputed and reference +#' genotypes. It assumes that samples are rows and markers are columns. +#' Allele dosages (0, 1, 2) are recommended but other numeric formats are supported. +#' Missing data in either dataset can be excluded from the concordance calculation +#' using the `missing_code` argument. Specific markers can be excluded using +#' the `snps_2_exclude` argument. #' -#' @param reference_genos A data frame containing reference genotype data, with rows as samples and columns as markers. Dosage format (0, 1, 2) is recommended. -#' @param imputed_genos A data frame containing imputed genotype data, with rows as samples and columns as markers. Dosage format (0, 1, 2) is recommended. -#' @param missing_code An optional value to specify missing data. If provided, loci with this value in either dataset will be excluded from the concordance calculation. -#' @param snps_2_exclude An optional vector of marker IDs to exclude from the concordance calculation. -#' @param verbose A logical value indicating whether to print a summary of the concordance results. Default is FALSE. +#' @param reference_genos A data frame containing reference genotype data, +#' with rows as samples and columns as markers. Must include a column named `ID`. 
#' -#' @return A list with two elements: -#' \itemize{ -#' \item \code{result_df}: A data frame with sample IDs and their concordance percentages. -#' \item \code{summary_concordance}: A summary of concordance percentages, including minimum, maximum, mean, and quartiles. -#' } +#' @param imputed_genos A data frame containing imputed genotype data, +#' with rows as samples and columns as markers. Must include a column named `ID`. #' -#' @details The function identifies common samples and markers between the reference and imputed genotype datasets. It calculates the percentage of matching genotypes for each sample, excluding missing data and specified markers. The concordance is reported as a percentage for each sample, along with a summary of the overall concordance distribution. +#' @param missing_code Optional value specifying missing data. If provided, +#' loci with this value in either dataset will be excluded from the concordance calculation. #' -#' @import dplyr +#' @param snps_2_exclude Optional vector of marker IDs to exclude from the concordance calculation. #' -#' @examples +#' @param verbose Logical. If `TRUE`, prints summary statistics (minimum, quartiles, +#' median, mean, maximum) of concordance percentages. #' -#' # Example Input variables -#' ignore_file <- system.file("imputation_ignore.txt", package="BIGr") -#' ref_file <- system.file("imputation_reference.txt", package="BIGr") -#' test_file <- system.file("imputation_test.txt", package="BIGr") +#' @param plot Logical. If `TRUE`, produces a bar plot of concordance percentage +#' by sample. #' -#' # Import files -#' snps = read.table(ignore_file, header = TRUE) -#' ref = read.table(ref_file, header = TRUE) -#' test = read.table(test_file, header = TRUE) +#' @param print_result Logical. If `TRUE` (default), prints the concordance +#' results data frame to the console. If `FALSE`, results are returned invisibly. 
#' -#' #Calculations -#' result <- imputation_concordance(reference_genos = ref, -#' imputed_genos = test, -#' snps_2_exclude = snps, -#' missing_code = 5, -#' verbose = FALSE) +#' @return A data frame with: +#' \itemize{ +#' \item \code{ID}: Sample identifiers shared between the datasets. +#' \item \code{Concordance}: Percentage of matching genotypes per sample. +#' } +#' If \code{print_result = FALSE}, the data frame is returned invisibly. #' +#' @details +#' The function: +#' \enumerate{ +#' \item Identifies common samples and markers between the datasets. +#' \item Optionally excludes specified SNPs. +#' \item Removes loci with missing data (if \code{missing_code} is provided). +#' \item Computes per-sample concordance as the percentage of matching genotypes. +#' } #' +#' When \code{plot = TRUE}, a bar plot showing concordance percentage per sample +#' is generated using \pkg{ggplot2}. #' -#' @export +#' @import dplyr +#' @import ggplot2 +#' +#' @examples +#' \dontrun{ +#' result <- imputation_concordance( +#' reference_genos = ref, +#' imputed_genos = test, +#' snps_2_exclude = snps, +#' missing_code = 5, +#' verbose = TRUE, +#' plot = TRUE +#' ) +#' } #' +#' @importFrom stats reorder +#' @export imputation_concordance <- function(reference_genos, imputed_genos, missing_code = NULL, snps_2_exclude = NULL, - verbose = FALSE) { + verbose = FALSE, + plot = FALSE, + print_result = TRUE) { # Find common IDs common_ids <- intersect(imputed_genos$ID, reference_genos$ID) - imputed_genos <- imputed_genos %>% filter(ID %in% common_ids) %>% arrange(ID) - reference_genos <- reference_genos %>% filter(ID %in% common_ids) %>% arrange(ID) + imputed_genos <- imputed_genos %>% + filter(ID %in% common_ids) %>% + arrange(ID) + + reference_genos <- reference_genos %>% + filter(ID %in% common_ids) %>% + arrange(ID) - # Find common SNPs, excluding those in snps_2_exclude if provided + # Find common SNPs common_snps <- setdiff( intersect(colnames(imputed_genos), 
colnames(reference_genos)), as.vector(unlist(snps_2_exclude)) ) - # Subset and convert to matrices for faster computation + # Remove ID column if present + common_snps <- setdiff(common_snps, "ID") + + # Convert to matrices imputed_matrix <- as.matrix(imputed_genos[, common_snps]) reference_matrix <- as.matrix(reference_genos[, common_snps]) - # Identify valid SNPs that are not missing in either dataset + # Identify valid SNPs if (!is.null(missing_code)) { - valid_snps <- (imputed_matrix != missing_code) & (reference_matrix != missing_code) + valid_snps <- (imputed_matrix != missing_code) & + (reference_matrix != missing_code) } else { - valid_snps <- matrix(TRUE, nrow = nrow(imputed_matrix), ncol = ncol(imputed_matrix)) + valid_snps <- matrix(TRUE, + nrow = nrow(imputed_matrix), + ncol = ncol(imputed_matrix)) } - # Compute concordance (row-wise percentage of matching SNPs) + # Compute concordance matches <- (imputed_matrix == reference_matrix) & valid_snps - percentage_match <- rowSums(matches, na.rm = TRUE) / rowSums(valid_snps, na.rm = TRUE) + percentage_match <- rowSums(matches, na.rm = TRUE) / + rowSums(valid_snps, na.rm = TRUE) + + percentage_match[is.nan(percentage_match)] <- NA - # Create output data frame + # Output data frame (original structure preserved) result_df <- data.frame( - ID = common_ids, + ID = imputed_genos$ID, Concordance = paste0(round(percentage_match * 100, 2), "%") ) - # Print mean concordance + # Summary statistics summary_concordance <- summary(percentage_match, na.rm = TRUE) * 100 names(summary_concordance) <- c("Min", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max") @@ -94,6 +129,35 @@ imputation_concordance <- function(reference_genos, } } - return(result_df) -} + # Print results to console (NEW OPTION) + if (print_result) { + print(result_df) + } + + # Optional plot + if (plot) { + + plot_df <- data.frame( + ID = imputed_genos$ID, + Concordance = percentage_match * 100 + ) + + concordance_plot <- ggplot(plot_df, + aes(x = reorder(ID, 
Concordance), + y = Concordance)) + + geom_bar(stat = "identity") + + labs(title = "Imputation Concordance by Sample", + x = "Sample ID", + y = "Concordance (%)") + + theme_minimal() + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + print(concordance_plot) + } + + if (print_result) { + return(result_df) + } else { + invisible(result_df) + } +} diff --git a/R/madc2vcf_all.R b/R/madc2vcf_all.R index 2dc4fb5..63c031d 100644 --- a/R/madc2vcf_all.R +++ b/R/madc2vcf_all.R @@ -2,8 +2,8 @@ #' #' This function processes a MADC file to generate a VCF file containing both target and off-target SNPs. It includes options for filtering multiallelic SNPs and parallel processing to improve performance. #' -#' @param madc A string specifying the path to the MADC file. -#' @param botloci_file A string specifying the path to the file containing the target IDs designed in the bottom strand. +#' @param madc Required. A string specifying the path or URL to the MADC file. +#' @param botloci_file Required. A string specifying the path or URL to the file containing the target IDs designed in the bottom strand. #' @param hap_seq_file A string specifying the path to the haplotype database fasta file. #' @param rm_multiallelic_SNP A logical value. If TRUE, SNPs with more than one alternative base are removed. If FALSE, the thresholds specified by `multiallelic_SNP_dp_thr` and `multiallelic_SNP_sample_thr` are used to filter low-frequency SNP alleles. Default is FALSE. #' @param multiallelic_SNP_dp_thr A numeric value specifying the minimum depth by tag threshold for filtering low-frequency SNP alleles when `rm_multiallelic_SNP` is FALSE. Default is 0. @@ -11,6 +11,10 @@ #' @param alignment_score_thr A numeric value specifying the minimum alignment score threshold. Default is 40. #' @param n.cores An integer specifying the number of cores to use for parallel processing. Default is 1. #' @param out_vcf A string specifying the name of the output VCF file. 
If the file extension is not `.vcf`, it will be appended automatically. +#' @param markers_info A string specifying the path to a CSV file with marker information (CloneID/BI_markerID, Chr, Pos, Ref, Alt, Type, Indel_pos columns as needed). +#' @param add_others A logical value. If TRUE, alleles labeled "Other" in the MADC file are included in off-target SNP extraction. Default is TRUE. +#' @param others_max_snps An integer or NULL. If not NULL, Other alleles with more than this many SNP differences versus the Ref sequence (as detected by pairwise alignment) are discarded. Default is NULL (no limit). +#' @param others_rm_with_indels A logical value. If TRUE, Other alleles that contain insertions or deletions relative to the Ref sequence (as detected by pairwise alignment) are discarded. Default is TRUE. #' @param verbose A logical value indicating whether to print metrics and progress to the console. Default is TRUE. #' #' @return This function does not return an R object. It writes the processed VCF file v4.3 to the specified `out_vcf` path. @@ -18,6 +22,18 @@ #' @details #' The function processes a MADC file to generate a VCF file containing both target and off-target SNPs. It uses parallel processing to improve performance and provides options to filter multiallelic SNPs based on user-defined thresholds. The alignment score threshold can be adjusted using the `alignment_score_thr` parameter. The generated VCF file includes metadata about the processing parameters and the BIGr package version. If the `alignment_score_thr` is not met, the corresponding SNPs are discarded. 
#' +#' **Sanity check behaviour and requirements** +#' +#' | Check | Status | Required | +#' |---|---|---| +#' | **Indels** | detected | `markers_info` with `Ref`/`Alt`/`Indel_pos`/`Indel_length` + `botloci_file` | +#' | | not detected | `botloci_file` | +#' | **ChromPos** | valid | `botloci_file` | +#' | | invalid | `markers_info` with `Chr`/`Pos` + `botloci_file` | +#' | **RefAltSeqs** | all paired | `botloci_file` | +#' | | unpaired | `botloci_file` + `hap_seq_file` (microhaplotype DB) | +#' +#' #' @examples #' # Example usage: #' @@ -51,8 +67,8 @@ #' @import vcfR #' #' @export -madc2vcf_all <- function(madc = NULL, - botloci_file = NULL, +madc2vcf_all <- function(madc, + botloci_file, hap_seq_file = NULL, n.cores = 1, rm_multiallelic_SNP = FALSE, @@ -60,13 +76,56 @@ madc2vcf_all <- function(madc = NULL, multiallelic_SNP_sample_thr = 0, alignment_score_thr = 40, out_vcf = NULL, + markers_info = NULL, + add_others = TRUE, + others_max_snps = 5, + others_rm_with_indels = TRUE, verbose = TRUE){ + vmsg("Running BIGr madc2vcf_all", verbose = verbose, level = 0, type = ">>") + vmsg("madc : %s", verbose = verbose, level = 1, type = ">>", madc) + vmsg("botloci_file : %s", verbose = verbose, level = 1, type = ">>", if (is.null(botloci_file)) "NULL" else botloci_file) + vmsg("hap_seq_file : %s", verbose = verbose, level = 1, type = ">>", if (is.null(hap_seq_file)) "NULL" else hap_seq_file) + vmsg("markers_info : %s", verbose = verbose, level = 1, type = ">>", if (is.null(markers_info)) "NULL" else markers_info) + vmsg("n.cores : %s", verbose = verbose, level = 1, type = ">>", n.cores) + vmsg("alignment_score_thr : %s", verbose = verbose, level = 1, type = ">>", alignment_score_thr) + vmsg("rm_multiallelic_SNP : %s", verbose = verbose, level = 1, type = ">>", rm_multiallelic_SNP) + vmsg("multiallelic_SNP_dp_thr : %s", verbose = verbose, level = 1, type = ">>", multiallelic_SNP_dp_thr) + vmsg("multiallelic_SNP_sample_thr: %s", verbose = verbose, level = 1, type = ">>", 
multiallelic_SNP_sample_thr) + vmsg("add_others : %s", verbose = verbose, level = 1, type = ">>", add_others) + vmsg("others_max_snps : %s", verbose = verbose, level = 1, type = ">>", if (is.null(others_max_snps)) "NULL" else others_max_snps) + vmsg("others_rm_with_indels : %s", verbose = verbose, level = 1, type = ">>", others_rm_with_indels) + vmsg("out_vcf : %s", verbose = verbose, level = 1, type = ">>", if (is.null(out_vcf)) "NULL" else out_vcf) + vmsg("Checking inputs", verbose = verbose, level = 0, type = ">>") + + # Input checks + if(is.null(madc) || !(file.exists(madc) | url_exists(madc))) stop("MADC file not found. Please provide a valid path or URL.") + if(is.null(botloci_file) || !(file.exists(botloci_file) | url_exists(botloci_file))) stop("Botloci file not found. Please provide a valid path or URL.") + if(!is.null(hap_seq_file) && !(file.exists(hap_seq_file) | url_exists(hap_seq_file))) stop("Haplotype sequence file not found. Please provide a valid path or URL.") + + ## n.cores as integer + if(!is.numeric(n.cores) | n.cores < 1) stop("n.cores should be a positive integer.") + + ## alignment_score_thr, multiallelic_SNP_dp_thr, multiallelic_SNP_sample_thr as numeric + if(!is.numeric(alignment_score_thr)) stop("alignment_score_thr should be numeric.") + if(!is.numeric(multiallelic_SNP_dp_thr)) stop("multiallelic_SNP_dp_thr should be numeric.") + if(!is.numeric(multiallelic_SNP_sample_thr)) stop("multiallelic_SNP_sample_thr should be numeric.") + + ## out_vcf as string + if(!is.null(out_vcf) & !is.character(out_vcf)) stop("out_vcf should be a string specifying the output file name.") + + ## rm_multiallelic_SNP, add_others and verbose as logical + if(!is.logical(rm_multiallelic_SNP)) stop("rm_multiallelic_SNP should be logical.") + if(!is.logical(add_others)) stop("add_others should be logical.") + if(!is.null(others_max_snps) && (!is.numeric(others_max_snps) || others_max_snps < 1)) stop("others_max_snps should be a positive integer or NULL.") + 
if(!is.logical(others_rm_with_indels)) stop("others_rm_with_indels should be logical.") + if(!is.logical(verbose)) stop("verbose should be logical.") + bigr_meta <- paste0('##BIGrCommandLine.madc2vcf_all=') - if(!is.null(madc)) report <- read.csv(madc, check.names = FALSE) else stop("Please provide a MADC file") - if(!is.null(botloci_file)) botloci <- read.csv(botloci_file, header = F) else stop("Please provide a botloci file") + report <- read.csv(madc, check.names = FALSE) + checks <- check_madc_sanity(report) + + messages_results <- mapply(function(check, message) { + if (check) message[1] else message[2] + }, checks$checks, checks$messages) + + for(i in seq_along(messages_results)) + vmsg(messages_results[i], verbose = verbose, level = 1, type = ">>") + + if(any(!(checks$checks[c("Columns", "FixAlleleIDs")]))){ + idx <- which(!(checks$checks[c("Columns", "FixAlleleIDs")])) + stop(paste("The MADC file does not pass the sanity checks:\n", + paste(messages_results[c("Columns", "FixAlleleIDs")[idx]], collapse = "\n"))) + } + + if(any(checks$checks[c("IUPACcodes")])){ + idx <- which((checks$checks[c("IUPACcodes")])) + stop(paste(messages_results[c("IUPACcodes")[idx]], collapse = "\n")) + } + + # Check whether markers_info is present and contains Ref + Alt columns + if(!is.null(markers_info)) { + mi_df <- read.csv(markers_info) + # Standardize marker ID column to CloneID + if(!"CloneID" %in% colnames(mi_df) && "BI_markerID" %in% colnames(mi_df)) { + colnames(mi_df)[colnames(mi_df) == "BI_markerID"] <- "CloneID" + vmsg("markers_info: 'BI_markerID' column renamed to 'CloneID' for internal use", verbose = verbose, level = 1) + } else if(!"CloneID" %in% colnames(mi_df) && !"BI_markerID" %in% colnames(mi_df)) { + stop("markers_info must contain a marker ID column named either 'CloneID' or 'BI_markerID'.") + } + # Validate CloneID values + if(any(is.na(mi_df$CloneID) | mi_df$CloneID == "")) + stop("markers_info CloneID column contains empty or NA values. 
Please check your markers_info file.") + if(!any(mi_df$CloneID %in% report$CloneID)) + stop("None of the markers_info CloneID values match the MADC CloneID column. Please make sure they use the same marker IDs.") + n_match <- sum(mi_df$CloneID %in% report$CloneID) + n_total <- length(unique(report$CloneID)) + if(n_match < n_total) + vmsg("%s of %s MADC CloneIDs found in markers_info. Unmatched markers will be removed", verbose = verbose, level = 1, n_match, n_total) + } else mi_df <- NULL + + if(any(!checks$checks[c("ChromPos")])){ + if(is.null(markers_info)) { + stop(paste(messages_results[c("ChromPos")], collapse = "\n")) + } else { + if(!all(c("Chr", "Pos") %in% colnames(mi_df))) + stop("ChromPos check failed: CloneID values do not follow the Chr_Position format. ", + "The markers_info file must contain 'Chr' and 'Pos' columns to supply CHROM and POS.") + } + } + + if(any(checks$checks[c("Indels")])){ + idx <- which((checks$checks[c("Indels")])) + if(is.null(markers_info)) { + vmsg("The MADC file contains indels and markers_info file is not provided",verbose = verbose, level = 1, type = ">>") + vmsg("Tags with indels as targets will be flagged with warnings and removed from the analysis ",verbose = verbose, level = 2, type = ">>") + vmsg("Provide markers_info with REF/ALT/Indel_pos if you want to include the targets indels",verbose = verbose, level = 2, type = ">>") + + } else { + if(checks$checks["Indels"] && + !all(c("Ref", "Alt", "Indel_pos") %in% colnames(mi_df))) + stop("Indels detected in MADC file. ", + "The markers_info file must contain 'Ref', 'Alt', and 'Indel_pos' columns.") + if(!"Type" %in% colnames(mi_df)) { + mi_df$Type <- ifelse(nchar(as.character(mi_df$Ref)) > 1 | nchar(as.character(mi_df$Alt)) > 1, + "Indel", "SNP") + vmsg("markers_info: 'Type' column not found. 
Derived from Ref/Alt lengths (%s SNPs, %s Indels)", + verbose = verbose, level = 1, + sum(mi_df$Type == "SNP"), sum(mi_df$Type == "Indel")) + } + vmsg("The MADC file contains indels and markers_info file was provided with all required columns",verbose = verbose, level = 1, type = ">>") + vmsg("Target indels will be exported, but no off-targets are extracted from these tags due to higher likelihood of pairwise alignment errors",verbose = verbose, level = 2, type = ">>") + } + } + vmsg("Inputs checks done!", verbose = verbose, level = 1, type = ">>") + + vmsg("Initial filters and inputs adjustments...", verbose = verbose, level = 0, type = ">>") + + if(checks$checks["LowerCase"]){ + vmsg("MADC Allele Sequences presented lower case characters. They were converted to upper case", verbose = verbose, level = 1) + report$AlleleSequence <- toupper(report$AlleleSequence) + } + + if(!checks$checks["RefAltSeqs"] && is.null(hap_seq_file)){ + vmsg("Not all Ref sequences have a corresponding Alt or vice-versa. Provide hap_seq_file for this function to recover the missing tags or tags with missing pairs will be discarded", verbose = verbose, level = 1) + } + + botloci <- read.csv(botloci_file, header = F) if(!is.null(hap_seq_file)) hap_seq <- read.table(hap_seq_file, header = F) else hap_seq <- NULL # Check marker names compatibility between MADC and botloci - checked_botloci <- check_botloci(botloci, report) + checked_botloci <- check_botloci(botloci, report, ChromPos = checks$checks["ChromPos"], mi_df = mi_df, verbose = verbose) botloci <- checked_botloci[[1]] report <- checked_botloci[[2]] + mi_df <- checked_botloci[[3]] + + # Derive position padding width from CloneIDs in the original report + pad_width <- unique(nchar(sub(".*_", "", unique(report$CloneID)))) + if(length(pad_width) != 1) warning("CloneIDs in the MADC report have inconsistent position padding widths. 
IDs in the VCF may be inconsistent.") + pad_width <- pad_width[1] my_results_csv <- loop_though_dartag_report(report, botloci, hap_seq, n.cores=n.cores, alignment_score_thr = alignment_score_thr, + checks = checks, + mi_df = mi_df, + pad_width = pad_width, + add_others = add_others, + others_max_snps = others_max_snps, + others_rm_with_indels = others_rm_with_indels, verbose = verbose) + vmsg("All information gathered!", verbose = verbose, level = 0, type = ">>") + vcf_body <- create_VCF_body(csv = my_results_csv, n.cores = n.cores, rm_multiallelic_SNP = rm_multiallelic_SNP, multiallelic_SNP_dp_thr = multiallelic_SNP_dp_thr, multiallelic_SNP_sample_thr = multiallelic_SNP_sample_thr, + pad_width = pad_width, verbose = verbose) #Make a header separate from the dataframe @@ -114,6 +275,8 @@ madc2vcf_all <- function(madc = NULL, vcf_term <- sapply(strsplit(out_vcf, "[.]"), function(x) x[length(x)]) if(length(vcf_term) != 0) if(vcf_term != "vcf") out_vcf <- paste0(out_vcf,".vcf") + vmsg("VCF ready! 
Output written to: %s", verbose = verbose, level = 0, type = ">>", out_vcf) + writeLines(vcf_header, con = out_vcf) suppressWarnings( write.table(vcf_body, file = out_vcf, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE, append = TRUE) @@ -132,10 +295,13 @@ madc2vcf_all <- function(madc = NULL, #' @import parallel #' #' @noRd -loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, alignment_score_thr=40, verbose = TRUE){ +loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, alignment_score_thr=40, + checks = NULL, mi_df = NULL, pad_width = NULL, + add_others = TRUE, others_max_snps = NULL, others_rm_with_indels = TRUE, + verbose = TRUE){ - if(!is.null(hap_seq)){ - hap_seq <- get_ref_alt_hap_seq(hap_seq) + if(!is.null(hap_seq) & (is.null(checks) | !isTRUE(checks$checks["RefAltSeqs"]))){ + hap_seq <- get_ref_alt_hap_seq(hap_seq, botloci) } nsamples <- ncol(report) - 3 @@ -148,7 +314,7 @@ loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, align clust <- makeCluster(n.cores) #clusterExport(clust, c("hap_seq","add_ref_alt", "nsamples")) - add_ref_alt_results <- parLapply(clust, by_cloneID, function(x) add_ref_alt(x, hap_seq, nsamples, verbose = verbose)) + add_ref_alt_results <- parLapply(clust, by_cloneID, function(x) add_ref_alt(x, hap_seq, nsamples, verbose = FALSE)) stopCluster(clust) ref_index <- sapply(add_ref_alt_results, "[[",2) @@ -160,17 +326,25 @@ loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, align updated_by_cloneID <- lapply(add_ref_alt_results, "[[",1) - if(verbose){ - cat("The Ref_0001 sequence had to be added for:", sum(ref_index==1),"tags\n") - cat("The Alt_0002 sequence had to be added for:", sum(alt_index==1),"tags\n") - cat("Tags discarded due to lack of Ref_0001 sequence:", sum(ref_index==-1),"tags\n") - cat("Tags discarded due to lack of Alt_0002 sequence:", sum(alt_index==-1),"tags\n") + if(!is.null(hap_seq)){ + vmsg("The haplotype 
database was provided and used to recover missing Ref_0001 and Alt_0002 sequences", verbose = verbose, level = 1) + vmsg("The Ref_0001 sequences were added for: %s tags", verbose = verbose, level = 2, type = ">>", sum(ref_index==1)) + vmsg("The Alt_0002 sequences were added for: %s tags", verbose = verbose, level = 2, type = ">>", sum(alt_index==1)) + } else { + vmsg("The haplotype database was not provided. Tags with missing Ref_0001 or Alt_0002 sequences were flagged with warnings and removed from the analysis", verbose = verbose, level = 1) + } + vmsg("Tags discarded due to lack of Ref_0001 sequence: %s tags", verbose = verbose, level = 2, type = ">>", sum(ref_index==-1)) + vmsg("Tags discarded due to lack of Alt_0002 sequence: %s tags", verbose = verbose, level = 2, type = ">>", sum(alt_index==-1)) + if(sum(ref_index==-1) > 0) warning(sprintf("%s tags discarded due to lack of Ref_0001 sequence. Consider providing the haplotype database file to recover these tags", sum(ref_index==-1))) + if(sum(alt_index==-1) > 0) warning(sprintf("%s tags discarded due to lack of Alt_0002 sequence. 
Consider providing the haplotype database file to recover these tags", sum(alt_index==-1))) + vmsg("Pairwise alignments of sequences to recover SNP position, reference and alternative bases...", verbose = verbose, level = 0) clust <- makeCluster(n.cores) #clusterExport(clust, c("botloci", "compare", "nucleotideSubstitutionMatrix", "pairwiseAlignment", "DNAString", "reverseComplement")) - #clusterExport(clust, c("botloci", "alignment_score_thr")) - compare_results <- parLapply(clust, updated_by_cloneID, function(x) compare(x, botloci, alignment_score_thr)) + #clusterExport(clust, c("botloci", "alignment_score_thr", "mi_df", "add_others", "others_max_snps", "others_rm_with_indels")) + compare_results <- parLapply(clust, updated_by_cloneID, function(x) compare(x, botloci, alignment_score_thr, mi_df, + add_others = add_others, others_max_snps = others_max_snps, + others_rm_with_indels = others_rm_with_indels, verbose = FALSE)) stopCluster(clust) my_results_csv <- lapply(compare_results, "[[", 1) @@ -182,13 +356,39 @@ loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, align rm_N <- unlist(rm_N) rm_indels <- sapply(compare_results, "[[", 4) rm_indels <- unlist(rm_indels) - - if(verbose){ - cat("Number of tags removed because of low alignment score:", length(rm_score),"tags\n") - cat("Number of tags removed because of N in the alternative sequence:", length(rm_N),"tags\n") - cat("Number of tags removed because of indels as targets (not yet supported):", length(rm_indels),"tags\n") + n_rm_others_indels <- sum(sapply(compare_results, "[[", 5)) + n_rm_others_maxsnps <- sum(sapply(compare_results, "[[", 6)) + + vmsg("Number of tags removed because of low alignment score (threshold = %s): %s tags", verbose = verbose, level = 2, type = ">>", alignment_score_thr, length(rm_score)) + vmsg("Number of tags removed because of N in the alternative sequence: %s tags", verbose = verbose, level = 2, type = ">>", length(rm_N)) + if(length(rm_indels) > 0) { + 
if(!is.null(mi_df) && all(c("Ref", "Alt", "Indel_pos") %in% colnames(mi_df))) { + vmsg("Number of tags with indels as targets: %s tags (markers_info provided with required columns; targets exported, off-targets skipped)", verbose = verbose, level = 2, type = ">>", length(rm_indels)) + } else { + vmsg("Number of tags removed because of indels as targets: %s tags (no markers_info with Ref/Alt/Indel_pos provided; tags discarded)", verbose = verbose, level = 2, type = ">>", length(rm_indels)) + } + } else { + vmsg("Number of tags removed because of indels as targets: 0 tags", verbose = verbose, level = 2, type = ">>") + } + n_others_total <- sum(sapply(compare_results, "[[", 7)) + n_others_kept <- n_others_total - n_rm_others_indels - n_rm_others_maxsnps + others_added_info <- unlist(lapply(compare_results, "[[", 8)) + if(add_others) { + vmsg("Number of Other alleles found: %s (%s kept after filters, %s discarded)", verbose = verbose, level = 2, type = ">>", n_others_total, n_others_kept, n_others_total - n_others_kept) + if(others_rm_with_indels) + vmsg("Number of Other alleles discarded due to indels vs Ref: %s", verbose = verbose, level = 2, type = ">>", n_rm_others_indels) + if(!is.null(others_max_snps)) + vmsg("Number of Other alleles discarded due to exceeding max SNPs (%s): %s", verbose = verbose, level = 2, type = ">>", others_max_snps, n_rm_others_maxsnps) + # if(length(others_added_info) > 0) { + # vmsg("Others tags added:", verbose = verbose, level = 3, type = ">>") + # for(msg in others_added_info) vmsg(" %s", verbose = verbose, level = 3, type = ">>", msg) + # } + } else { + vmsg("Number of Other alleles found: %s (not processed, add_others = FALSE)", verbose = verbose, level = 2, type = ">>", n_others_total) } + vmsg("Pairwise alignments concluded!", verbose = verbose, level = 1) + rownames(my_results_csv) <- NULL return(my_results_csv) } @@ -282,10 +482,48 @@ add_ref_alt <- function(one_tag, hap_seq, nsamples, verbose = TRUE) { #' @importFrom pwalign 
pairwiseAlignment nucleotideSubstitutionMatrix #' #' @noRd -compare <- function(one_tag, botloci, alignment_score_thr = 40){ +compare <- function(one_tag, botloci, alignment_score_thr = 40, mi_df = NULL, add_others = TRUE, others_max_snps = NULL, others_rm_with_indels = TRUE, verbose = FALSE){ + + #idx <- which(names(updated_by_cloneID) == "Ra01_020534029") + #one_tag <- updated_by_cloneID[[idx]] cloneID <- one_tag$CloneID[1] + isBotLoci <- cloneID %in% botloci[,1] - # If marker is present in the botloc list, get the reverse complement sequence + if(!is.null(mi_df)) { + one_mi_df <- mi_df[which(mi_df$CloneID %in% cloneID), ] + # Handle duplicate CloneIDs in markers_info + if(nrow(one_mi_df) > 1) { + key_cols <- intersect(c("CloneID", "Chr", "Pos", "Ref", "Alt", "Type"), colnames(one_mi_df)) + if(nrow(unique(one_mi_df[, key_cols])) == 1) { + warning("Duplicate CloneID '", cloneID, "' found in markers_info with identical key columns. Keeping the first entry.") + one_mi_df <- one_mi_df[1, ] + } else { + stop("Duplicate CloneID '", cloneID, "' found in markers_info with differing values in key columns (CloneID, Chr, Pos, Ref, Alt, Type). 
Please resolve the conflict in your markers_info file.") + } + } + isIndel <- !is.null(one_mi_df$Type) && !is.na(one_mi_df$Type) && tolower(one_mi_df$Type) == "indel" + } else { + isIndel <- FALSE + } + + if(isIndel){ + update_tag <- one_tag[grep("Ref_0001$",one_tag$AlleleID),] + update_tag[,2:5] <- c(one_mi_df$Chr, one_mi_df$Pos, one_mi_df$Ref, one_mi_df$Alt) + update_tag_temp <- one_tag[grep("Alt_0002$",one_tag$AlleleID),] + update_tag_temp[,2:5] <- c(one_mi_df$Chr, one_mi_df$Pos, one_mi_df$Ref, one_mi_df$Alt) + update_tag <- rbind(update_tag, update_tag_temp) + + return(list(update_tag = update_tag, + rm_score = NULL, + rm_N = NULL, + rm_indels = NULL, + n_rm_others_indels = 0L, + n_rm_others_maxsnps = 0L, + n_others_total = 0L, + others_added_info = character(0))) + } + + # If marker is present in the botloci list, get the reverse complement sequence if(isBotLoci) one_tag$AlleleSequence <- sapply(one_tag$AlleleSequence, function(x) as.character(reverseComplement(DNAString(x)))) chr <- sapply(strsplit(cloneID, "_"), function(x) x[-length(x)]) @@ -299,7 +537,10 @@ compare <- function(one_tag, botloci, alignment_score_thr = 40){ sigma <- nucleotideSubstitutionMatrix(match = 1, mismatch = -0.5, baseOnly = FALSE) # baseOnly = FALSE avoid breaking when N is present (they will be filtered after)) align <- pairwiseAlignment(ref_seq, alt_seq, - substitutionMatrix = sigma,gapOpening=-1.4, gapExtension=-0.1, type = "global") + substitutionMatrix = sigma, + gapOpening=-1.4, + gapExtension=-0.1, + type = "global") # The score is a bit different from the python script despite same weights if(align@score > alignment_score_thr){ # if score for the target sequence is smaller than the threshold, the tag will be discarted @@ -309,7 +550,11 @@ compare <- function(one_tag, botloci, alignment_score_thr = 40){ return(list(update_tag = NULL, rm_score = NULL, rm_N = NULL, - rm_indels= cloneID)) + rm_indels = cloneID, + n_rm_others_indels = 0L, + n_rm_others_maxsnps = 0L, + 
n_others_total = 0L, + others_added_info = character(0))) } ref_base <- substring(ref_seq, align@pattern@mismatch@unlistData, align@pattern@mismatch@unlistData) alt_base <- substring(alt_seq, align@subject@mismatch@unlistData, align@subject@mismatch@unlistData) @@ -335,50 +580,126 @@ compare <- function(one_tag, botloci, alignment_score_thr = 40){ if(length(rm_target) >0) pos_ref_idx <- pos_ref_idx[-rm_target] # Cases found where the AltMatch is another alternative for the target SNP - they are discarted if(length(pos_ref_idx) >0){ - ref_base <- substring(ref_seq, pos_ref_idx, pos_ref_idx) + ref_base_match <- substring(ref_seq, pos_ref_idx, pos_ref_idx) pos_alt_idx <- align@subject@mismatch@unlistData # If there are indels, the position in the alternative is not the same as the reference if(length(rm_target) >0) pos_alt_idx <- pos_alt_idx[-rm_target] # remove target position when is AltMatch - but the order in the sequence is the same - alt_base <- substring(Match_seq[j,]$AlleleSequence, pos_alt_idx, pos_alt_idx) + alt_base_match <- substring(Match_seq[j,]$AlleleSequence, pos_alt_idx, pos_alt_idx) + + # If Match sequences have N, do not consider as polymorphism + if(any(!alt_base_match %in% c("A", "T", "C", "G"))) { + ref_base_match <- ref_base_match[-which(!alt_base_match %in% c("A", "T", "C", "G"))] + pos_ref_idx <- pos_ref_idx[-which(!alt_base_match %in% c("A", "T", "C", "G"))] + alt_base_match <- alt_base_match[-which(!alt_base_match %in% c("A", "T", "C", "G"))] + } + + if(length(alt_base_match) >0){ # If the N is the only polymorphis found, the Match tag will be discarted + # The reported position is always on reference + pos <- pos_target - (pos_target_idx - pos_ref_idx) + + # Sometimes there are more than one polymorphism in the sequence, we need to add rows to the table + update_tag_temp <- one_tag[grep("Match",one_tag$AlleleID)[j],][rep(1, length(alt_base_match)), ] + + update_tag_temp$Chromosome <- chr + update_tag_temp$SNP_position_in_Genome <- pos + 
update_tag_temp$Ref <- ref_base_match + update_tag_temp$Alt <- alt_base_match + + update_tag <- rbind(update_tag, update_tag_temp) + } + } + } + } + others_seq <- one_tag[grep("Other",one_tag$AlleleID),] + n_others_total <- nrow(others_seq) + n_rm_others_indels <- 0L + n_rm_others_maxsnps <- 0L + others_added_info <- character(0) + + if(add_others && nrow(others_seq) > 0){ + for(j in seq_len(nrow(others_seq))){ + align <- pairwiseAlignment(ref_seq, # Align with the reference + others_seq[j,]$AlleleSequence, + substitutionMatrix = sigma,gapOpening=-1.4, gapExtension=-0.1, type = "global") + # Filter: discard Others with indels relative to Ref + if(others_rm_with_indels && + (length(align@pattern@indel@unlistData) > 0 | length(align@subject@indel@unlistData) > 0)) { + n_rm_others_indels <- n_rm_others_indels + 1L + next + } + pos_ref_idx <- align@pattern@mismatch@unlistData + pos_alt_idx <- align@subject@mismatch@unlistData + # Filter: discard Others with too many SNPs vs Ref (count before removing target position) + if(!is.null(others_max_snps) && length(pos_ref_idx) > others_max_snps) { + n_rm_others_maxsnps <- n_rm_others_maxsnps + 1L + next + } + rm_target_other <- which(pos_ref_idx == pos_target_idx) # remove target position if base is the same as Ref or Alt + if(length(rm_target_other) > 0) { + other_tag_base <- substring(others_seq[j,]$AlleleSequence, pos_target_idx, pos_target_idx) + if(other_tag_base == ref_base | other_tag_base == alt_base){ # If Other has same base as REF and ALT, it won't be considered in their counts + pos_ref_idx <- pos_ref_idx[-rm_target_other] + pos_alt_idx <- pos_alt_idx[-rm_target_other] + } + } + other_ref_base <- substring(ref_seq, pos_ref_idx, pos_ref_idx) + other_alt_base <- substring(others_seq[j,]$AlleleSequence, pos_alt_idx, pos_alt_idx) + # Cases found where the AltMatch is another alternative for the target SNP - they are discarted + if(length(pos_ref_idx) >0){ # If Match sequences have N, do not consider as polymorphism - 
if(any(!alt_base %in% c("A", "T", "C", "G"))) { - ref_base <- ref_base[-which(!alt_base %in% c("A", "T", "C", "G"))] - pos_ref_idx <- pos_ref_idx[-which(!alt_base %in% c("A", "T", "C", "G"))] - alt_base <- alt_base[-which(!alt_base %in% c("A", "T", "C", "G"))] + if(any(!other_alt_base %in% c("A", "T", "C", "G"))) { + other_ref_base <- other_ref_base[-which(!other_alt_base %in% c("A", "T", "C", "G"))] + pos_ref_idx <- pos_ref_idx[-which(!other_alt_base %in% c("A", "T", "C", "G"))] + other_alt_base <- other_alt_base[-which(!other_alt_base %in% c("A", "T", "C", "G"))] } - if(length(alt_base) >0){ # If the N is the only polymorphis found, the Match tag will be discarted + if(length(other_alt_base) >0){ # If the N is the only polymorphis found, the Match tag will be discarted # The reported position is always on reference pos <- pos_target - (pos_target_idx - pos_ref_idx) # Sometimes there are more than one polymorphism in the sequence, we need to add rows to the table - update_tag_temp <- one_tag[grep("Match",one_tag$AlleleID)[j],][rep(1, length(alt_base)), ] + update_tag_temp <- one_tag[grep("Other",one_tag$AlleleID)[j],][rep(1, length(other_alt_base)), ] update_tag_temp$Chromosome <- chr update_tag_temp$SNP_position_in_Genome <- pos - update_tag_temp$Ref <- ref_base - update_tag_temp$Alt <- alt_base + update_tag_temp$Ref <- other_ref_base + update_tag_temp$Alt <- other_alt_base update_tag <- rbind(update_tag, update_tag_temp) + others_added_info <- c(others_added_info, + paste0(others_seq[j,]$AlleleID, " -> position(s): ", paste(pos, collapse = ", "))) } } } } + return(list(update_tag = update_tag, # updated data.frame, NULL if discarted rm_score = NULL, # cloneID if removed because of low alignment score, NULL if kept rm_N = NULL, - rm_indels = NULL)) # cloneID if removed because of N in the target alternative, NULL if kept + rm_indels = NULL, + n_rm_others_indels = n_rm_others_indels, + n_rm_others_maxsnps = n_rm_others_maxsnps, + n_others_total = n_others_total, + 
others_added_info = others_added_info)) } else { return(list(update_tag = NULL, rm_score = NULL, rm_N = cloneID, - rm_indels = NULL)) + rm_indels = NULL, + n_rm_others_indels = 0L, + n_rm_others_maxsnps = 0L, + n_others_total = 0L, + others_added_info = character(0))) } } else{ return(list(update_tag = NULL, rm_score = cloneID, rm_N = NULL, - rm_indels = NULL)) + rm_indels = NULL, + n_rm_others_indels = 0L, + n_rm_others_maxsnps = 0L, + n_others_total = 0L, + others_added_info = character(0))) } } @@ -388,7 +709,8 @@ compare <- function(one_tag, botloci, alignment_score_thr = 40){ #' @param hap_seq haplotype db #' #' @noRd -get_ref_alt_hap_seq <- function(hap_seq){ +get_ref_alt_hap_seq <- function(hap_seq, botloci){ + headers <- hap_seq$V1[grep(">",hap_seq$V1)] headers <- gsub(">", "", headers) @@ -406,6 +728,17 @@ get_ref_alt_hap_seq <- function(hap_seq){ seqs <- sapply(seqs, function(x) paste0(x, collapse = "")) hap_seq <- data.frame(AlleleID = headers, AlleleSequence = seqs) + + # Check padding + hap_cloneID <- sapply(strsplit(hap_seq$AlleleID, "[|]"), function(x) x[1]) + botloci_cloneID <- botloci$V1 + + pad_hap <- unique(nchar(sub(".*_", "", hap_cloneID))) + pad_botloci <- unique(nchar(sub(".*_", "", botloci_cloneID))) + + if(length(pad_hap) > 1) stop("Check marker IDs in haplotype DB file. They should have the same padding.") + if(pad_hap != pad_botloci) stop("Check marker IDs padding in haplotype DB file. 
They should match the botloci file.") + return(hap_seq) } @@ -432,6 +765,7 @@ create_VCF_body <- function(csv, multiallelic_SNP_dp_thr = 2, multiallelic_SNP_sample_thr = 10, n.cores = 1, + pad_width = NULL, verbose = TRUE){ # Make sure counts are numeric @@ -442,14 +776,32 @@ create_VCF_body <- function(csv, clust <- makeCluster(n.cores) #clusterExport(clust, c("merge_counts","rm_multiallelic_SNP", "multiallelic_SNP_dp_thr", "multiallelic_SNP_sample_thr")) - vcf_tag_list <- parLapply(clust, cloneID, function(x) merge_counts(x, rm_multiallelic_SNP, multiallelic_SNP_dp_thr, multiallelic_SNP_sample_thr)) + vcf_tag_list <- parLapply(clust, cloneID, function(x) merge_counts(x, rm_multiallelic_SNP, multiallelic_SNP_dp_thr, multiallelic_SNP_sample_thr, pad_width)) stopCluster(clust) vcf_tag_list1 <- lapply(vcf_tag_list, "[[", 1) - rm_mks <- sapply(vcf_tag_list, "[[" ,2) - - if(verbose){ - print(paste("SNP removed because presented more than one allele:", sum(rm_mks))) + rm_mks <- sapply(vcf_tag_list, "[[", 2) # total removed + total_mks <- sapply(vcf_tag_list, "[[", 3) # total multiallelic found + rm_setting <- sapply(vcf_tag_list, "[[", 4) # removed by rm_multiallelic_SNP=TRUE + rm_filter <- sapply(vcf_tag_list, "[[", 5) # removed because empty after filtering + kept_multi <- sapply(vcf_tag_list, "[[", 6) # kept as multiallelic + simplified <- sapply(vcf_tag_list, "[[", 7) # simplified to biallelic + multi_others_target <- sapply(vcf_tag_list, "[[", 8) # multiallelic target from Others + + vmsg("Performing final filterings", verbose = verbose, level = 0, type = ">>") + + vmsg("Multiallelic off-target SNPs found: %s", verbose = verbose, level = 2, type = ">>", sum(total_mks)) + vmsg("Multiallelic target SNPs with a 3rd allele from Others: %s", verbose = verbose, level = 2, type = ">>", sum(multi_others_target)) + if(rm_multiallelic_SNP) { + vmsg("Removed (rm_multiallelic_SNP = TRUE): %s", verbose = verbose, level = 3, type = ">>", sum(rm_setting)) + } else 
if(multiallelic_SNP_dp_thr > 0 & multiallelic_SNP_sample_thr > 0) { + vmsg("Removed (empty after filtering; depth thr = %s, sample thr = %s): %s", + verbose = verbose, level = 3, type = ">>", + multiallelic_SNP_dp_thr, multiallelic_SNP_sample_thr, sum(rm_filter)) + vmsg("Kept as multiallelic after filtering: %s", verbose = verbose, level = 3, type = ">>", sum(kept_multi)) + vmsg("Simplified to biallelic after filtering: %s", verbose = verbose, level = 3, type = ">>", sum(simplified)) + } else { + vmsg("All kept (rm_multiallelic_SNP = FALSE, no thresholds set): %s", verbose = verbose, level = 3, type = ">>", sum(kept_multi)) } for(i in seq_along(vcf_tag_list1)) { @@ -463,20 +815,12 @@ create_VCF_body <- function(csv, vcf_body$V3 <- as.numeric(vcf_body$V3) rownames(vcf_body) <- NULL - # Remove padding - sp <- strsplit(vcf_body$target, "_") - pos <- sapply(sp, function(x) x[length(x)]) - chr <- sapply(sp, function(x) paste0(x[-length(x)], collapse = "_")) - vcf_body$target <- paste0(chr, "_",as.numeric(pos)) - # Dealing with repeated positions # discard the ones that are not the target and keep only the first if all are off-targets if(length(which(duplicated(vcf_body[,3]))) > 0){ repeated <- vcf_body[which(duplicated(vcf_body[,3])), 4] - if(verbose){ - print(paste("Different primers pair capture same SNP positions in", length(repeated), "locations. The repeated were discarded.")) - } + vmsg("Different primers pair capture same SNP positions in %s locations. 
The repeated were discarded", verbose = verbose, level = 2, type = ">>", length(repeated)) repeated_tab <- vcf_body[which(vcf_body[,4] %in% repeated),] vcf_body_new <- vcf_body[-which(vcf_body[,4] %in% repeated),] @@ -494,6 +838,10 @@ create_VCF_body <- function(csv, vcf_body_new <- rbind(vcf_body_new, repeated_tab_stay) } else vcf_body_new <- vcf_body + vmsg("Filters finished", verbose = verbose, level = 1, type = ">>") + + vmsg("Preparing VCF...", verbose = verbose, level = 0, type = ">>") + vcf_body_new <- vcf_body_new[,-1] colnames(vcf_body_new) <- c("#CHROM", "POS", "ID", "REF", "ALT","QUAL", "FILTER", "INFO","FORMAT", colnames(csv)[-c(1:7)]) @@ -515,25 +863,80 @@ create_VCF_body <- function(csv, #' aspect of the marker, the marker is discarded. This is likely to happen to paralogous sites. #' #' @noRd -merge_counts <- function(cloneID_unit, rm_multiallelic_SNP = FALSE, multiallelic_SNP_dp_thr = 0, multiallelic_SNP_sample_thr = 0){ - +merge_counts <- function(cloneID_unit, rm_multiallelic_SNP = FALSE, multiallelic_SNP_dp_thr = 0, + multiallelic_SNP_sample_thr = 0, pad_width = NULL){ + #cloneID_unit <- cloneID[[250]] #Get counts for target SNP - rm <- 0 - RefTag <- apply(cloneID_unit[which(grepl("Ref", cloneID_unit$AlleleID) & !duplicated(cloneID_unit$AlleleID)),-c(1:7)], 2, sum) - AltTag <- apply(cloneID_unit[which(grepl("Alt", cloneID_unit$AlleleID) & !duplicated(cloneID_unit$AlleleID)),-c(1:7)], 2, sum) - tab_counts <- paste0(RefTag + AltTag, ":", RefTag, ":", RefTag, ",", AltTag) + rm_by_setting <- 0 # removed because rm_multiallelic_SNP = TRUE + rm_by_filter <- 0 # removed because empty after threshold filtering + kept_multiallelic <- 0 # kept as-is (still multiallelic after filtering or no filter) + simplified <- 0 # simplified from multiallelic to biallelic by filtering + total_multiallelic <- 0 + multiallelic_others_target <- 0 # target SNPs with a 3rd allele from Others + + # Target marker + RefTag <- apply(cloneID_unit[which((grepl("Ref_0001$", 
cloneID_unit$AlleleID) | grepl("RefMatch", cloneID_unit$AlleleID)) & !duplicated(cloneID_unit$AlleleID)), -c(1:7)], 2, sum) + AltTag <- apply(cloneID_unit[which((grepl("Alt_0002$", cloneID_unit$AlleleID) | grepl("AltMatch", cloneID_unit$AlleleID)) & !duplicated(cloneID_unit$AlleleID)), -c(1:7)], 2, sum) + + cloneID <- cloneID_unit$CloneID[1] + if(is.null(pad_width)) pad_width <- nchar(sub(".*_", "", cloneID)) + info <- cloneID_unit[grep("Ref_0001$", cloneID_unit$AlleleID),] + + # In case there are Others that add multiallelics to targets + others_target <- cloneID_unit[,3] %in% cloneID_unit[grep("Ref_0001$", cloneID_unit$AlleleID),3] + if(sum(others_target) > 2 & !rm_multiallelic_SNP){ # If target is multiallelic + multiallelic_others_target <- 1 + idx_other <- which(others_target & !grepl("Ref_0001$", cloneID_unit$AlleleID) & !grepl("Alt_0002$", cloneID_unit$AlleleID)) + other_alts <- unique(cloneID_unit[idx_other,5]) + other_alts_info <- cloneID_unit[idx_other,] + OtherTag_list <- list() + total <- rep(0, length(RefTag)) + ads <- vector() + tab_counts <- paste0(RefTag + AltTag + total, ":", RefTag, ":", RefTag, ",", AltTag) + for(j in 1:length(other_alts)){ + temp_other <- which(other_alts_info[,5] == other_alts[j]) + OtherTag_list[[j]] <- apply(other_alts_info[temp_other, -c(1:7)], 2, sum) + total_temp <- OtherTag_list[[j]] + + if(multiallelic_SNP_dp_thr > 0 & multiallelic_SNP_sample_thr > 0){ # If not removed, user can set threshold to remove low frequency alleles + if(sum(total_temp > multiallelic_SNP_dp_thr) < multiallelic_SNP_sample_thr) next() + } + total <- total + total_temp + tab_counts <- paste0(tab_counts, ",",OtherTag_list[[j]]) + ads_temp <- sum(OtherTag_list[[j]]) + ads <- paste0(ads, ",", ads_temp) + } + alts <- paste0(info$Alt, ",", paste0(other_alts, collapse = ",")) + info_mk <- paste0("DP=", sum(c(RefTag, AltTag,total)),";", + "ADS=",sum(RefTag),",",sum(AltTag), ads) + } else { + tab_counts <- paste0(RefTag + AltTag, ":", RefTag, ":", RefTag, 
AltTag) + alts <- info$Alt + info_mk <- paste0("DP=", sum(c(RefTag, AltTag)),";", + "ADS=",sum(RefTag),",",sum(AltTag)) + } - info <- cloneID_unit[grep("Ref_", cloneID_unit$AlleleID),] info <- c(info$Chromosome, info$SNP_position_in_Genome, - paste0(info$Chromosome, "_", info$SNP_position_in_Genome), - info$Ref, info$Alt, ".", ".", paste0("DP=", sum(c(RefTag, AltTag)),";", - "ADS=",sum(RefTag),",",sum(AltTag)), "DP:RA:AD") + cloneID, + info$Ref, + alts, + ".", + ".", + info_mk, + "DP:RA:AD") vcf_tag <- c(info, tab_counts) # Check if there are more than one alternative allele by loci - off_tag <- cloneID_unit[-which(grepl("Ref_", cloneID_unit$AlleleID) | grepl("Alt_", cloneID_unit$AlleleID)),] + rm_tags <- which(grepl("Ref_0001$", cloneID_unit$AlleleID) | grepl("Alt_0002$", cloneID_unit$AlleleID)) + if(sum(others_target) > 2){ + idx_other <- which(others_target & !grepl("Ref_0001$", cloneID_unit$AlleleID) & !grepl("Alt_0002$", cloneID_unit$AlleleID)) + off_tag <- cloneID_unit[-c(rm_tags,idx_other),] + } else { + off_tag <- cloneID_unit[-rm_tags,] + } + if(nrow(off_tag)){ # If there are off target SNP by_pos <- split.data.frame(off_tag, off_tag$SNP_position_in_Genome) @@ -543,19 +946,21 @@ merge_counts <- function(cloneID_unit, rm_multiallelic_SNP = FALSE, multiallelic alleles <- unique(by_pos[[i]]$AlleleID) if(length(unique(by_pos[[i]]$Alt)) > 1){ # If SNP is multiallelic + total_multiallelic <- total_multiallelic + 1 if(rm_multiallelic_SNP){ # option to remove multiallelics - rm <- rm + 1 + rm_by_setting <- rm_by_setting + 1 next() } else if(multiallelic_SNP_dp_thr > 0 & multiallelic_SNP_sample_thr > 0){ # If not removed, user can set threshold to remove low frequency alleles rm.idx <- which(apply(by_pos[[i]][,-c(1:7)], 1, function(x) sum(x > multiallelic_SNP_dp_thr) < multiallelic_SNP_sample_thr)) if(length(rm.idx)) up_by_pos <- by_pos[[i]][-rm.idx,] else up_by_pos <- by_pos[[i]] if(length(unique(up_by_pos$Alt)) == 0) { # If after applied filter all tags are gone 
- rm <- rm + 1 + rm_by_filter <- rm_by_filter + 1 next() } else if (length(unique(up_by_pos$Alt)) > 1 ){ # If after applied filter the SNP remains multiallelic + kept_multiallelic <- kept_multiallelic + 1 by_alt <- split.data.frame(up_by_pos, up_by_pos$Alt) by_alt_counts <- lapply(by_alt, function(x) apply(x[,-c(1:7)], 2, sum)) total_counts <- sapply(by_alt_counts, sum) @@ -570,12 +975,14 @@ merge_counts <- function(cloneID_unit, rm_multiallelic_SNP = FALSE, multiallelic info <- unique.data.frame(info) } else { # If after applied filter, only one alternative remains + simplified <- simplified + 1 alt <- apply(up_by_pos[,-c(1:7)], 2, sum) total_alt <- alt info <- unique.data.frame(up_by_pos[,2:5]) } } else { # If rm_multiallelic_SNP set to FALSE and threshold is 0, keep all multiallelics + kept_multiallelic <- kept_multiallelic + 1 by_alt <- split.data.frame(by_pos[[i]], by_pos[[i]]$Alt) by_alt_counts <- lapply(by_alt, function(x) apply(x[,-c(1:7)], 2, sum)) total_counts <- sapply(by_alt_counts, sum) @@ -602,7 +1009,7 @@ merge_counts <- function(cloneID_unit, rm_multiallelic_SNP = FALSE, multiallelic info <- c(info$Chromosome, info$SNP_position_in_Genome, - paste0(info$Chromosome, "_", info$SNP_position_in_Genome), + paste0(info$Chromosome, "_", formatC(as.integer(as.numeric(info$SNP_position_in_Genome)), width = pad_width, flag = "0", format = "d")), info$Ref, info$Alt, ".", ".", paste0("DP=", sum(c(ref, total_alt)),";", "ADS=",sum(ref),",",sum(total_alt)), "DP:RA:AD") @@ -612,5 +1019,5 @@ merge_counts <- function(cloneID_unit, rm_multiallelic_SNP = FALSE, multiallelic } } - return(list(vcf_tag, rm)) + return(list(vcf_tag, rm_by_setting + rm_by_filter, total_multiallelic, rm_by_setting, rm_by_filter, kept_multiallelic, simplified, multiallelic_others_target)) } diff --git a/R/madc2vcf_multi.R b/R/madc2vcf_multi.R new file mode 100644 index 0000000..bcbae02 --- /dev/null +++ b/R/madc2vcf_multi.R @@ -0,0 +1,193 @@ +#' Convert MADC file to VCF using polyRAD for 
multiallelic genotyping +#' +#' This function converts a DArTag MADC file to a VCF using the polyRAD package's +#' `readDArTag` and `RADdata2VCF` pipeline. It runs `check_madc_sanity` before +#' loading the data, applies corrections for lowercase sequences and all-NA +#' rows/columns, and sets `n.header.rows` automatically based on whether the +#' MADC file follows the raw DArT format (6 header rows) or the fixed allele ID +#' format (no header rows). +#' +#' @param madc_file character. Path or URL to the input MADC CSV file. +#' @param botloci_file character. Path or URL to the botloci file listing target +#' IDs designed on the bottom strand. +#' @param outfile character. Path for the output VCF file. +#' @param markers_info character or NULL. Optional path or URL to a CSV file +#' with marker metadata. Required when CloneIDs do not follow the +#' \code{Chr_Pos} format; must contain \code{CloneID} (or +#' \code{BI_markerID}), \code{Chr}, and \code{Pos} columns. +#' @param ploidy integer. Ploidy level of the samples passed to \code{taxaPloidy}. +#' Default is 2. +#' @param verbose logical. Whether to print progress messages. Default is TRUE. +#' +#' @return Invisible NULL. Writes a VCF file to \code{outfile}. +#' +#' @details +#' The function performs the following steps: +#' \enumerate{ +#' \item Reads the MADC file and runs \code{check_madc_sanity}. +#' \item Validates the botloci file against MADC CloneIDs using +#' \code{check_botloci}, fixing any padding mismatches automatically. +#' \item Converts lowercase bases in AlleleSequence to uppercase if detected. +#' \item Removes all-NA rows and columns if detected. +#' \item Writes the corrected data to a temporary file and passes it to +#' \code{polyRAD::readDArTag}. +#' \item Estimates overdispersion with \code{polyRAD::TestOverdispersion} and +#' calls \code{polyRAD::IterateHWE}, then exports the result with +#' \code{polyRAD::RADdata2VCF}. 
+#' } +#' +#' **Sanity check behaviour and requirements** +#' +#' The function always stops if IUPAC codes, unpaired Ref/Alt sequences, or +#' unfixed AlleleIDs are detected (see \code{check_madc_sanity}). For the +#' remaining checks the required inputs are: +#' +#' | Check | Status | Required | +#' |---|---|---| +#' | **Indels** | detected | `botloci_file` | +#' | | not detected | `botloci_file` | +#' | **ChromPos** | valid | `botloci_file` | +#' | | invalid | `markers_info` with `Chr`/`Pos` + `botloci_file` | +#' +#' @importFrom utils read.csv write.csv read.table +#' +#' @export +madc2vcf_multi <- function(madc_file, + botloci_file, + outfile, + markers_info = NULL, + ploidy = 2L, + verbose = TRUE) { + + vmsg("Running BIGr madc2vcf_multi", verbose = verbose, level = 0, type = ">>") + vmsg("madc_file : %s", verbose = verbose, level = 1, type = ">>", madc_file) + vmsg("botloci_file : %s", verbose = verbose, level = 1, type = ">>", botloci_file) + vmsg("markers_info : %s", verbose = verbose, level = 1, type = ">>", if (is.null(markers_info)) "NULL" else markers_info) + vmsg("outfile : %s", verbose = verbose, level = 1, type = ">>", outfile) + vmsg("ploidy : %s", verbose = verbose, level = 1, type = ">>", ploidy) + + vmsg("Checking inputs", verbose = verbose, level = 0, type = ">>") + + if (!(file.exists(madc_file) | url_exists(madc_file))) stop("MADC file not found. Please provide a valid path or URL.") + if (!(file.exists(botloci_file) | url_exists(botloci_file))) stop("Botloci file not found. Please provide a valid path or URL.") + if (!is.null(markers_info) && !(file.exists(markers_info) | url_exists(markers_info))) stop("markers_info file not found. 
Please provide a valid path or URL.") + if (!is.numeric(ploidy) || ploidy < 1) stop("ploidy must be a positive integer.") + + # ---- Load markers_info if provided ---- + mi_df <- if (!is.null(markers_info)) read.csv(markers_info) else NULL + + # ---- Read and sanity-check ---- + report <- read.csv(madc_file, check.names = FALSE) + checks <- check_madc_sanity(report) + + messages_results <- mapply(function(check, message) { + if (check) message[1] else message[2] + }, checks$checks, checks$messages) + + for (i in seq_along(messages_results)) + vmsg(messages_results[i], verbose = verbose, level = 1, type = ">>") + + if (!checks$checks["Columns"]) + stop("The MADC file is missing required columns (CloneID, AlleleID, AlleleSequence)") + + if (checks$checks["IUPACcodes"]) + stop("MADC Allele Sequences contain IUPAC (non-ATCG) codes. Please run HapApp to clean MADC file before using this function.") + + if (!isTRUE(checks$checks["RefAltSeqs"])) + stop("Not all Ref sequences have a corresponding Alt or vice versa. Please provide a complete MADC file before using this function.") + + if (!isTRUE(checks$checks["FixAlleleIDs"])) + stop("The MADC file does not have fixed AlleleIDs. Please process the MADC file through HapApp before using this function.") + + if (!isTRUE(checks$checks["ChromPos"])) { + if (is.null(markers_info)) + stop("CloneID column does not follow the 'Chr_Pos' format. ", + "Please provide a markers_info file with at least 'CloneID'/'BI_markerID', ", + "'Chr', and 'Pos' columns.") + if (!all(c("Chr", "Pos") %in% colnames(mi_df))) + stop("CloneID column does not follow the 'Chr_Pos' format. 
", + "markers_info must contain at least 'Chr' and 'Pos' columns to remap marker IDs.") + } + + # ---- Check botloci vs MADC CloneIDs ---- + vmsg("Checking botloci file", verbose = verbose, level = 0, type = ">>") + cloneids_before <- report$CloneID + botloci_df <- read.table(botloci_file, header = FALSE) + botloci_before <- botloci_df$V1 + checked_botloci <- check_botloci(botloci_df, report, ChromPos = checks$checks["ChromPos"], mi_df = mi_df, verbose = verbose) + botloci_df <- checked_botloci[[1]] + report <- checked_botloci[[2]] + mi_df <- checked_botloci[[3]] + cloneid_changed <- !identical(report$CloneID, cloneids_before) + botloci_changed <- !identical(botloci_df$V1, botloci_before) + + # ---- Botloci temp file (if IDs were remapped) ---- + if (botloci_changed) { + tmp_botloci <- tempfile() + on.exit(unlink(tmp_botloci), add = TRUE) + write.table(botloci_df, tmp_botloci, row.names = FALSE, col.names = FALSE, quote = FALSE) + botloci_input <- tmp_botloci + } else { + botloci_input <- botloci_file + } + + # ---- Corrections: only create a temp file if needed ---- + need_temp <- isTRUE(checks$checks["allNArow"]) || isTRUE(checks$checks["allNAcol"]) || cloneid_changed + + if (need_temp) { + if (checks$checks["LowerCase"]) { + vmsg("MADC Allele Sequences contain lowercase characters. 
Converting to uppercase", + verbose = verbose, level = 1, type = ">>") + report$AlleleSequence <- toupper(report$AlleleSequence) + } + + if (checks$checks["allNArow"]) { + idx <- apply(report, 1, function(x) all(is.na(x) | x == "")) + vmsg("Removing %s all-NA row(s)", verbose = verbose, level = 1, type = ">>", sum(idx)) + report <- report[!idx, ] + } + + if (checks$checks["allNAcol"]) { + idx <- apply(report, 2, function(x) all(is.na(x) | x == "")) + vmsg("Removing %s all-NA column(s)", verbose = verbose, level = 1, type = ">>", sum(idx)) + report <- report[, !idx] + } + + tmp_madc <- tempfile(fileext = ".csv") + on.exit(unlink(tmp_madc), add = TRUE) + write.csv(report, tmp_madc, row.names = FALSE, quote = TRUE) + input_file <- tmp_madc + } else { + if (checks$checks["LowerCase"]) + vmsg("MADC Allele Sequences contain lowercase characters. polyRAD will handle them", + verbose = verbose, level = 1, type = ">>") + input_file <- madc_file + } + + vmsg("Loading MADC into polyRAD", verbose = verbose, level = 0, type = ">>") + + raddat <- polyRAD::readDArTag( + file = input_file, + botloci = botloci_input, + n.header.rows = 0L, + sample.name.row = 1, + trim.sample.names = "", + taxaPloidy = as.integer(ploidy) + ) + + overdispersionP <- polyRAD::TestOverdispersion(raddat) + my_ovdisp <- overdispersionP$optimal + + vmsg("Running HWE iteration (overdispersion = %s)", verbose = verbose, level = 0, type = ">>", my_ovdisp) + + raddat_hwe <- polyRAD::IterateHWE(raddat, overdispersion = my_ovdisp) + + vmsg("Writing VCF to %s", verbose = verbose, level = 0, type = ">>", outfile) + + polyRAD::RADdata2VCF(raddat_hwe, file = outfile, asSNPs = FALSE, hindhe = FALSE) + + vmsg("Done!", verbose = verbose, level = 0, type = ">>") + + invisible(NULL) +} + diff --git a/R/madc2vcf_targets.R b/R/madc2vcf_targets.R index 1b02c31..888c445 100644 --- a/R/madc2vcf_targets.R +++ b/R/madc2vcf_targets.R @@ -1,47 +1,293 @@ #' Format MADC Target Loci Read Counts Into VCF #' -#' This function will 
extract the read count information from a MADC file target markers and convert to VCF file format. +#' Convert DArTag MADC target read counts to a VCF #' -#' The DArTag MADC file format is not commonly supported through existing tools. This function -#' will extract the read count information from a MADC file for the target markers and convert it to a VCF file format for the -#' genotyping panel target markers only +#' @description +#' Parses a DArTag **MADC** report and writes a **VCF v4.3** containing per-target +#' read counts for the panel’s target loci. This is useful because MADC is not +#' widely supported by general-purpose tools, while VCF is. #' -#' @param madc_file Path to MADC file -#' @param output.file output file name and path -#' @param botloci_file A string specifying the path to the file containing the target IDs designed in the bottom strand. -#' @param get_REF_ALT if TRUE recovers the reference and alternative bases by comparing the sequences. If more than one polymorphism are found for a tag, it is discarded. +#' @details +#' **What this function does** +#' - Runs basic sanity checks on the MADC file via `check_madc_sanity()` (column +#' presence, fixed allele IDs, IUPAC/ambiguous bases, lowercase bases, indels, +#' chromosome/position format, all-NA rows/columns, Ref/Alt sequence presence). +#' - Extracts reference and total read counts per sample and target. +#' - Derives `AD` (ref,alt) by subtraction (alt = total − ref). +#' - If `get_REF_ALT = TRUE`, recovers REF/ALT bases either from `markers_info` +#' (when `Ref`/`Alt` columns are present) or by comparing the Ref/Alt probe +#' sequences in the MADC file (with strand correction via `botloci_file`). +#' Targets with >1 polymorphism between sequences are discarded. +#' - Optionally accepts a `markers_info` CSV to supply `CHROM`, `POS`, `REF`, +#' `ALT`, bypassing sequence-based inference. 
+#' +#' **Output VCF layout** +#' - `INFO` fields: +#' * `DP` — total depth across all samples for the locus +#' * `ADS` — total counts across samples in the order `ref,alt` +#' - `FORMAT` fields (per sample): +#' * `DP` — total reads (ref + alt) +#' * `RA` — reads supporting the reference allele +#' * `AD` — `"ref,alt"` counts +#' +#' **Strand handling** +#' If a target ID appears in `botloci_file`, its probe sequences are reverse- +#' complemented prior to base comparison so that REF/ALT are reported in the +#' top-strand genomic orientation. +#' +#' **Sanity check behaviour and requirements** +#' +#' The function always stops if required columns (`CloneID`, `AlleleID`, +#' `AlleleSequence`) are missing. +#' +#' For the remaining checks the required inputs depend on the combination of +#' check result and `get_REF_ALT`: +#' +#' | Check | Status | `get_REF_ALT` | Required | +#' |---|---|---|---| +#' | **IUPAC codes** | detected | `TRUE` | `markers_info` with `Ref`/`Alt` | +#' | | detected | `FALSE` | — | +#' | | not detected | `TRUE` | `botloci_file` **or** `markers_info` with `Ref`/`Alt` | +#' | | not detected | `FALSE` | — | +#' | **Indels** | detected | `TRUE` | `markers_info` with `Ref`/`Alt` | +#' | | detected | `FALSE` | — | +#' | | not detected | `TRUE` | `botloci_file` **or** `markers_info` with `Ref`/`Alt` | +#' | | not detected | `FALSE` | — | +#' | **ChromPos** | valid | `TRUE` | `botloci_file` **or** `markers_info` with `Ref`/`Alt` | +#' | | valid | `FALSE` | — | +#' | | invalid | `TRUE` | `markers_info` with `Chr`/`Pos`/`Ref`/`Alt` **or** `markers_info` with `Chr`/`Pos` + `botloci_file` | +#' | | invalid | `FALSE` | `markers_info` with `Chr`/`Pos` | +#' | **FixAlleleIDs** | fixed | `TRUE` | `botloci_file` **or** `markers_info` with `Ref`/`Alt` | +#' | | fixed | `FALSE` | — | +#' | | not fixed | `TRUE` | `markers_info` with `Ref`/`Alt` | +#' | | not fixed | `FALSE` | — | +#' +#' @param madc_file character. Path to the input MADC CSV file. 
+#' @param output.file character. Path to the output VCF file to write. +#' @param botloci_file character or `NULL` (default `NULL`). Path to a plain-text +#' file listing target IDs designed on the **bottom** strand (one ID per line). +#' Used for strand-correcting probe sequences when `get_REF_ALT = TRUE` and +#' `markers_info` does not supply `Ref` and `Alt` columns. Also required when +#' `ChromPos` is invalid and `markers_info` does not provide `Ref`/`Alt`. +#' @param markers_info character or `NULL`. Optional path to a CSV providing target +#' metadata. Accepted columns: +#' - `CloneID` or `BI_markerID` (required as marker identifier); +#' - `Chr`, `Pos` — required when `CloneID` does not follow the `Chr_Pos` format; +#' - `Ref`, `Alt` — required when `get_REF_ALT = TRUE` and probe-sequence +#' inference is not possible (IUPAC codes, indels, or unfixed allele IDs). +#' @param get_REF_ALT logical (default `FALSE`). If `TRUE`, attempts to recover +#' REF/ALT bases. The source is chosen automatically: `markers_info` `Ref`/`Alt` +#' columns take priority; otherwise probe sequences from the MADC are compared +#' (with `botloci_file` for strand correction). Targets with more than one +#' difference between Ref/Alt sequences are removed. When `FALSE`, REF and ALT +#' are set to `"."` in the output VCF. +#' @param collapse_matches_counts logical (default `FALSE`). If `TRUE`, counts for +#' `|AltMatch` and `|RefMatch` rows are summed into their corresponding `|Ref` +#' and `|Alt` rows before building the matrices. Useful when the MADC contains +#' multiple allele rows per target that should be aggregated. +#' @param verbose logical (default `TRUE`). If `TRUE`, prints detailed progress +#' messages about each processing step. +#' +#' @return (Invisibly) returns the path to `output.file`. The side effect is a +#' **VCF v4.3** written to disk containing one row per target and columns for all +#' samples in the MADC file. 
+#' +#' @section Dependencies: +#' Uses **dplyr**, **tidyr**, **tibble**, **reshape2**, **Biostrings** and base +#' **utils**. Helper functions expected in this package: `check_madc_sanity()`, +#' `get_countsMADC()`, `get_counts()`, and `check_botloci()`. +#' +#' @examples +#' # Example files shipped with the package +#' madc_file <- system.file("example_MADC_FixedAlleleID.csv", package = "BIGr") +#' bot_file <- system.file("example_SNPs_DArTag-probe-design_f180bp.botloci", +#' package = "BIGr") +#' out_vcf <- tempfile(fileext = ".vcf") +#' +#' # Convert MADC to VCF (attempting to recover REF/ALT from probe sequences) +#' \dontrun{ +#' madc2vcf_targets( +#' madc_file = madc_file, +#' output.file = out_vcf, +#' botloci_file = bot_file, +#' get_REF_ALT = TRUE +#' ) +#' } +#' +#' # Clean up (example) +#' unlink(out_vcf) +#' +#' @seealso +#' `check_madc_sanity()`, `get_countsMADC()`, `check_botloci()` #' -#' @return A VCF file v4.3 with the target marker read count information #' @import dplyr #' @import tidyr #' @import tibble -#' @importFrom Rdpack reprompt #' @importFrom reshape2 melt dcast #' @importFrom utils write.table #' @importFrom Biostrings DNAString reverseComplement -#' @return A VCF file v4.3 with the target marker read count information -#' -#' @examples -#' # Load example files -#' madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") -#' bot_file <- system.file("example_SNPs_DArTag-probe-design_f180bp.botloci", package="BIGr") -#' -#' #Temp location (only for example) -#' output_file <- tempfile() -#' -#' # Convert MADC to VCF -#' madc2vcf_targets(madc_file = madc_file, -#' output.file = output_file, -#' get_REF_ALT = TRUE, -#' botloci_file = bot_file) -#' -#' rm(output_file) #' #' @export -madc2vcf_targets <- function(madc_file, output.file, botloci_file, get_REF_ALT = FALSE) { - #Making the VCF (This is highly dependent on snps being in a format where the SNP IDs are the CHR_POS) +madc2vcf_targets <- function(madc_file, + 
output.file, + botloci_file = NULL, + markers_info = NULL, + get_REF_ALT = FALSE, + collapse_matches_counts = FALSE, + verbose = TRUE) { + + vmsg("Running BIGr madc2vcf_targets", verbose = verbose, level = 0, type = ">>") + vmsg("madc_file : %s", verbose = verbose, level = 1, type = ">>", madc_file) + vmsg("output.file : %s", verbose = verbose, level = 1, type = ">>", output.file) + vmsg("botloci_file : %s", verbose = verbose, level = 1, type = ">>", if (is.null(botloci_file)) "NULL" else botloci_file) + vmsg("markers_info : %s", verbose = verbose, level = 1, type = ">>", if (is.null(markers_info)) "NULL" else markers_info) + vmsg("get_REF_ALT : %s", verbose = verbose, level = 1, type = ">>", get_REF_ALT) + vmsg("collapse_matches_counts : %s", verbose = verbose, level = 1, type = ">>", collapse_matches_counts) + + vmsg("Checking inputs", verbose = verbose, level = 0, type = ">>") + + # Input checks + if(!(file.exists(madc_file) | url_exists(madc_file))) stop("The MADC file does not exist.") + if(!is.character(output.file)) stop("output.file must be a character string.") + if(!is.null(markers_info) && !is.character(markers_info)) stop("markers_info must be a character string or NULL.") + if(!is.null(markers_info) && !file.exists(markers_info) && !url_exists(markers_info)) stop("The markers_info file does not exist.") + if(!is.logical(get_REF_ALT)) stop("get_REF_ALT must be a logical value (TRUE or FALSE).") + if(!is.logical(verbose)) stop("verbose must be a logical value (TRUE or FALSE).") + + # Create a VCF header line with metadata about the command and its parameters + bigr_meta <- paste0('##BIGrCommandLine.madc2vcf_targets=') + + # MADC checks + report <- read.csv(madc_file) + checks <- check_madc_sanity(report) + + messages_results <- mapply(function(check, message) { + if (check) message[1] else message[2] + }, checks$checks, checks$messages) + + for(i in seq_along(messages_results)) + vmsg(messages_results[i], verbose = verbose, level = 1, type = ">>") + + 
if(any(!(checks$checks[c("Columns")]))){ + idx <- which(!(checks$checks[c("Columns")])) + if(length(idx) > 0) + stop(paste("The MADC file does not pass the sanity checks:\n", + paste(messages_results[c("Columns")[idx]], collapse = "\n"))) + } + + if(any(checks$checks[c("IUPACcodes", "Indels")]) && get_REF_ALT){ + idx <- which((checks$checks[c("IUPACcodes", "Indels")])) + if(is.null(markers_info)) stop(paste("Please provide a markers_info file to proceed. The MADC file does not pass the sanity checks:", + paste(messages_results[c("IUPACcodes", "Indels")[idx]], collapse = "\n"))) + else vmsg("MADC file has IUPAC codes and/or indels, but a markers_info file is provided, so proceeding with VCF generation.", verbose = verbose, level = 1, type = ">>") + } + + if(checks$checks["LowerCase"]){ + vmsg("MADC Allele Sequences presented lower case characters. They were converted to upper case.", verbose = verbose, level = 1) + report$AlleleSequence <- toupper(report$AlleleSequence) + } + + # ---- Validate botloci and pre-process CloneIDs based on get_REF_ALT logic ---- + mi_df <- NULL # markers_info data frame (loaded once, reused below) + mi_has_ref_alt <- FALSE # TRUE when markers_info provides Ref and Alt columns + botloci <- NULL # botloci data frame (set when needed) + + # Check whether markers_info is present and contains Ref + Alt columns + if(!is.null(markers_info)) { + mi_df <- read.csv(markers_info) + mi_has_ref_alt <- all(c("Ref", "Alt") %in% colnames(mi_df)) + } + + if(!checks$checks["FixAlleleIDs"]){ + vmsg("MADC file has not been processed by HapApp.", verbose = verbose, level = 1) + if(get_REF_ALT){ + if(!mi_has_ref_alt) stop("MADC file has not been processed by HapApp. 
BIGr can only provide results if get_REF_ALT is set to FALSE, or if it is TRUE and a markers_info file with REF and ALT information is provided.")
+    }
+    # The check points to FALSE if the 6 initial rows exist or if there are no fixed allele ID (aka _0001, _0002, etc)
+    n <- nrow(report)
+    idx <- seq_len(min(6L, n))
+    first_col_vals <- report[[1]][idx]
+    all_blank_or_star <- all(first_col_vals %in% c("", "*"), na.rm = TRUE)
+    # Also require that both _0001 and _0002 appear in AlleleID
+    if(all_blank_or_star) {
+      colnames(report) <- report[7,]
+      report <- report[-c(1:7),]
+    }
+  }
+
+  if(checks$checks["allNArow"]){
+    idx <- apply(report, 1, function(x) all(is.na(x) | x == ""))
+    report <- report[!idx, ]
+    vmsg("MADC contains rows with all NA values. Rows %s will be removed.", verbose = verbose, level = 1, type = ">>", paste(which(idx), collapse = ", "))
+  }
+
+  if(checks$checks["allNAcol"]){
+    idx <- apply(report, 2, function(x) all(is.na(x) | x == ""))
+    report <- report[, !idx]
+    vmsg("MADC contains columns with all NA values. Columns %s will be removed.", verbose = verbose, level = 1, type = ">>", paste0(which(idx), collapse = ","))
+  }
+
+  if(!isTRUE(checks$checks["ChromPos"])) {
+    if(is.null(markers_info)){
+      stop("CloneID column does not follow the 'Chr_Pos' format. ",
+           "Please provide a markers_info file with at least 'CloneID'/'BI_markerID', ",
+           "'Chr', and 'Pos' columns.")
+    } else {
+
+      if(!all(c("Chr", "Pos") %in% colnames(mi_df)))
+        stop("CloneID column does not follow the 'Chr_Pos' format. ",
+             "markers_info must contain at least 'Chr' and 'Pos' columns to remap marker IDs.")
+
+    }
+  }
+
+  if(get_REF_ALT) {
+
+    if(mi_has_ref_alt) {
+      # markers_info supplies REF and ALT — no botloci required
+      vmsg("markers_info contains Ref and Alt columns. REF and ALT will be taken from markers_info.",
+           verbose = verbose, level = 1, type = ">>")
+
+    } else {
+      if(checks$checks["Indels"])
+        stop("Indels detected in MADC file. 
Since get_REF_ALT = TRUE, a markers_info file with REF/ALT information is required.")
+
+      # REF/ALT must be extracted from probe sequences — botloci is required
+      if(is.null(botloci_file) || (!file.exists(botloci_file) && !url_exists(botloci_file)))
+        stop("get_REF_ALT = TRUE but no markers_info file with Ref and Alt columns was provided, nor a botloci_file to extract REF/ALT from probe sequences. Please provide one of these files or set get_REF_ALT to FALSE.")
+
+      # Validate that CloneIDs match botloci marker names (after any remapping)
+      botloci <- read.table(botloci_file, header = FALSE)
+      checked_botloci <- check_botloci(botloci, report, ChromPos = checks$checks["ChromPos"], mi_df = mi_df, verbose = verbose)
+      botloci <- checked_botloci[[1]]
+      report <- checked_botloci[[2]]
+      mi_df <- checked_botloci[[3]]
+
+    }
+  }
+
+  # Throw message if OtherAlleles are present
+  if(checks$checks["OtherAlleles"]) {
+    vmsg("AlleleID contains alleles other than Ref and Alt. These will be ignored in the VCF output. 
Use function madc2vcf_all to include them.", verbose = verbose, level = 1, type = ">>") + } + + # Make sure counts are numeric + count_cols <- setdiff(colnames(report), c("CloneID", "AlleleID", "AlleleSequence")) + report[count_cols] <- lapply(report[count_cols], function(x) as.numeric(as.character(x))) + + vmsg("Input checks done!", verbose = verbose, level = 1, type = ">>") + + vmsg("Extracting depth information", verbose = verbose, level = 0, type = ">>") + + matrices <- get_countsMADC(madc_object = report, collapse_matches_counts = collapse_matches_counts, verbose = verbose) - matrices <- get_countsMADC(madc_file) ref_df <- data.frame(matrices[[1]], check.names = FALSE) alt_df <- data.frame(matrices[[2]]-matrices[[1]], check.names = FALSE) size_df <- data.frame(matrices[[2]], check.names = FALSE) @@ -55,20 +301,168 @@ madc2vcf_targets <- function(madc_file, output.file, botloci_file, get_REF_ALT = ) row.names(ad_df) <- row.names(ref_df) - #Obtaining Chr and Pos information from the row_names - new_df <- size_df %>% - rownames_to_column(var = "row_name") %>% - separate(row_name, into = c("CHROM", "POS"), sep = "_") %>% - select(CHROM, POS) + vmsg("Depth information extracted", verbose = verbose, level = 1, type = ">>") + + if(get_REF_ALT && mi_has_ref_alt) { + vmsg("Using markers_info for CHROM, POS, REF and ALT.", verbose = verbose, level = 0, type = ">>") + + if(is.null(mi_df)) mi_df <- read.csv(markers_info) + id_col <- if ("BI_markerID" %in% colnames(mi_df)) "BI_markerID" else + if ("CloneID" %in% colnames(mi_df)) "CloneID" else + stop("The markers_info file must contain a marker ID column named either 'CloneID' or 'BI_markerID'.") + + if(checks$checks["Indels"]) + vmsg("Indels detected in MADC file. 
But it is okay because Ref and Alt are provided in markers_info.", + verbose = verbose, level = 1, type = ">>") + + if(!all(c(id_col, "Chr", "Pos", "Ref", "Alt") %in% colnames(mi_df))) + stop(paste0("The markers_info dataframe must contain the following columns: ", + id_col, ", Chr, Pos, Ref, Alt")) + + if(!all(rownames(ad_df) %in% mi_df[[id_col]])) { + miss_CloneIDs <- rownames(ad_df)[!rownames(ad_df) %in% mi_df[[id_col]]] + if(length(miss_CloneIDs) == nrow(ad_df)) stop("None of the MADC CloneID could be found in the markers_info CloneID or BI_markerID. Please make sure they match.") + vmsg(paste("Not all MADC CloneID was found in the markers_info file. These markers will be removed:", + paste(miss_CloneIDs, collapse = " ")), verbose = verbose, level = 2, type = ">>") + warning("Not all MADC CloneID was found in the markers_info file. These markers will be removed.") + } + matched <- mi_df[match(rownames(ad_df), mi_df[[id_col]]), ] + + new_df <- data.frame(CHROM = matched$Chr, POS = matched$Pos) + new_df$TotalRef <- rowSums(ref_df) + new_df$TotalAlt <- rowSums(alt_df) + new_df$TotalSize <- rowSums(size_df) + + ref_base <- matched$Ref + alt_base <- matched$Alt + + } else if(!is.null(markers_info) && !get_REF_ALT) { + vmsg("markers_info file provided. Using CHROM and POS from the file.", verbose = verbose, level = 0, type = ">>") + + if(is.null(mi_df)) mi_df <- read.csv(markers_info) + id_col <- if ("BI_markerID" %in% colnames(mi_df)) "BI_markerID" else + if ("CloneID" %in% colnames(mi_df)) "CloneID" else + stop("The markers_info file must contain a marker ID column named either 'CloneID' or 'BI_markerID'.") + + if(checks$checks["Indels"]) + vmsg("Indels detected in MADC file. 
Since get_REF_ALT = FALSE, Type and Indel_pos are not required in markers_info.", + verbose = verbose, level = 1, type = ">>") + + if(!all(c(id_col, "Chr", "Pos") %in% colnames(mi_df))) + stop(paste0("The markers_info dataframe must contain the following columns: ", id_col, ", Chr, Pos")) + + if(!all(rownames(ad_df) %in% mi_df[[id_col]])) { + miss_CloneIDs <- rownames(ad_df)[!rownames(ad_df) %in% mi_df[[id_col]]] + vmsg(paste("Not all MADC CloneID was found in the markers_info file. These markers will be removed:", + paste(miss_CloneIDs, collapse = " ")), verbose = verbose, level = 2, type = ">>") + warning("Not all MADC CloneID was found in the markers_info file. These markers will be removed.") + } + matched <- mi_df[match(rownames(ad_df), mi_df[[id_col]]), ] + + new_df <- data.frame(CHROM = matched$Chr, POS = matched$Pos) + new_df$TotalRef <- rowSums(ref_df) + new_df$TotalAlt <- rowSums(alt_df) + new_df$TotalSize <- rowSums(size_df) + + ref_base <- "." + alt_base <- "." + vmsg("REF and ALT not recovered (get_REF_ALT = FALSE).", verbose = verbose, level = 1, type = ">>") + + } else { + vmsg(ifelse(get_REF_ALT, + "Recovering CHROM and POS from CloneID for probe-sequence REF/ALT extraction...", + "No markers_info file provided. Recovering CHROM and POS from CloneID..."), + verbose = verbose, level = 0, type = ">>") + + # Split on the last underscore to handle chromosome names containing underscores + # (e.g. Chr_01_000123456). When ChromPos was FALSE, check_botloci already + # remapped CloneIDs to Chr_PaddedPos, so this split is always valid. 
+ new_df <- size_df %>% + rownames_to_column(var = "row_name") %>% + separate(row_name, into = c("CHROM", "POS"), sep = "_(?=[^_]*$)") %>% + select(CHROM, POS) + new_df$POS <- sub("^0+", "", new_df$POS) + vmsg("CHROM and POS recovered from CloneID.", verbose = verbose, level = 1, type = ">>") + + new_df$TotalRef <- rowSums(ref_df) + new_df$TotalAlt <- rowSums(alt_df) + new_df$TotalSize <- rowSums(size_df) + + if(get_REF_ALT) { + vmsg("get_REF_ALT = TRUE. Attempting to recover REF and ALT bases from probe sequences...", + verbose = verbose, level = 0, type = ">>") + + csv <- get_counts(madc_object = report, collapse_matches_counts = collapse_matches_counts, verbose = FALSE) + csv <- csv[which(csv$CloneID %in% rownames(ad_df)), ] + + ref_ord <- csv$CloneID[grep("\\|Ref.*", csv$AlleleID)] + alt_ord <- csv$CloneID[grep("\\|Alt.*", csv$AlleleID)] + orig_ref_seq <- csv$AlleleSequence[grep("\\|Ref.*", csv$AlleleID)] + orig_alt_seq <- csv$AlleleSequence[grep("\\|Alt.*", csv$AlleleID)] + + if(all(sort(ref_ord) == sort(alt_ord))) { + # Key sequences by CloneID, then reorder to MADC row order so that + # loop index i always corresponds to rownames(size_df)[i]. + ref_seq_by_id <- setNames(orig_ref_seq, ref_ord) + alt_seq_by_id <- setNames(orig_alt_seq, alt_ord) + madc_ids <- rownames(size_df) + orig_ref_seq <- ref_seq_by_id[madc_ids] + orig_alt_seq <- alt_seq_by_id[madc_ids] + + more_poly <- no_diff <- 0 + ref_base <- alt_base <- rep(NA_character_, length(madc_ids)) + names(ref_base) <- names(alt_base) <- madc_ids + for(i in seq_along(madc_ids)) { + if(is.na(orig_ref_seq[i]) || is.na(orig_alt_seq[i])) next + temp_list <- strsplit(c(orig_ref_seq[i], orig_alt_seq[i]), "") + if(length(temp_list[[1]]) != length(temp_list[[2]])) + stop(paste0("Marker '", madc_ids[i], "' has Ref and Alt probe sequences of different lengths ", + "(", length(temp_list[[1]]), " vs ", length(temp_list[[2]]), "). ", + "This should not happen for SNP markers. 
", + "If this is an indel, please provide a markers_info file with Ref and Alt columns.")) + idx_diff <- which(temp_list[[1]] != temp_list[[2]]) - # Remove leading zeros from the POS column - new_df$POS <- sub("^0+", "", new_df$POS) + if(length(idx_diff) > 1) { + ref_base[i] <- NA + alt_base[i] <- NA + more_poly <- more_poly + 1 + } else if(length(idx_diff) == 1) { + orig_ref_base <- temp_list[[1]][idx_diff] + orig_alt_base <- temp_list[[2]][idx_diff] + if(madc_ids[i] %in% botloci[, 1]) { + ref_base[i] <- as.character(reverseComplement(DNAString(orig_ref_base))) + alt_base[i] <- as.character(reverseComplement(DNAString(orig_alt_base))) + } else { + ref_base[i] <- orig_ref_base + alt_base[i] <- orig_alt_base + } + } else { + ref_base[i] <- NA + alt_base[i] <- NA + no_diff <- no_diff + 1 + } + } + if(more_poly > 0) vmsg(paste(more_poly, "markers removed: more than one polymorphism between Ref and Alt sequences."), verbose = verbose, level = 2, type = ">>") + if(no_diff > 0) vmsg(paste(no_diff, "markers removed: no differences found between Ref and Alt sequences."), verbose = verbose, level = 2, type = ">>") - #Get read count sums - new_df$TotalRef <- rowSums(ref_df) - new_df$TotalAlt <- rowSums(alt_df) - new_df$TotalSize <- rowSums(size_df) + } else { + ref_base <- "." + alt_base <- "." + vmsg("REF and ALT bases could not be recovered: missing reference or alternative sequences.", + verbose = verbose, level = 1, type = ">>") + } + } else { + # ── get_REF_ALT = FALSE, no markers_info ───────────────────────── + ref_base <- "." + alt_base <- "." 
+ vmsg("REF and ALT not recovered (get_REF_ALT = FALSE).", verbose = verbose, level = 1, type = ">>") + } + } + + vmsg("CHROM, POS, REF and ALT columns prepared", verbose = verbose, level = 1, type = ">>") + + vmsg("Preparing VCF dataframe", verbose = verbose, level = 0, type = ">>") #Make a header separate from the dataframe vcf_header <- c( "##fileformat=VCFv4.3", @@ -79,86 +473,10 @@ madc2vcf_targets <- function(madc_file, output.file, botloci_file, get_REF_ALT = '##INFO=', '##FORMAT=', '##FORMAT=', - '##FORMAT=' + '##FORMAT=', + bigr_meta ) - # Get REF and ALT - if(get_REF_ALT){ - if(is.null(botloci_file)) stop("Please provide the botloci file to recover the reference and alternative bases.") - csv <- get_counts(madc_file) - # Keep only the ones that have alt and ref - csv <- csv[which(csv$CloneID %in% rownames(ad_df)),] - - # Get reverse complement the tag is present in botloci - botloci <- read.table(botloci_file, header = FALSE) - - # Check if the botloci file marker IDs match with the MADC file - checked_botloci <- check_botloci(botloci, csv) - botloci <- checked_botloci[[1]] - csv <- checked_botloci[[2]] - - # FIXED: Store original sequences before any transformation - csv$OriginalAlleleSequence <- csv$AlleleSequence - - # Apply reverse complement to sequences for bottom strand markers - idx <- which(csv$CloneID %in% botloci[,1]) - csv$AlleleSequence[idx] <- sapply(csv$AlleleSequence[idx], function(sequence) as.character(reverseComplement(DNAString(sequence)))) - - ref_seq <- csv$AlleleSequence[grep("\\|Ref.*", csv$AlleleID)] - ref_ord <- csv$CloneID[grep("\\|Ref.*", csv$AlleleID)] - alt_seq <- csv$AlleleSequence[grep("\\|Alt.*", csv$AlleleID)] - alt_ord <- csv$CloneID[grep("\\|Alt.*", csv$AlleleID)] - - # FIXED: Get original sequences for SNP calling - orig_ref_seq <- csv$OriginalAlleleSequence[grep("\\|Ref.*", csv$AlleleID)] - orig_alt_seq <- csv$OriginalAlleleSequence[grep("\\|Alt.*", csv$AlleleID)] - - if(all(sort(ref_ord) == sort(alt_ord))){ - # Order 
sequences consistently - ref_seq <- ref_seq[order(ref_ord)] - alt_seq <- alt_seq[order(alt_ord)] - orig_ref_seq <- orig_ref_seq[order(ref_ord)] - orig_alt_seq <- orig_alt_seq[order(alt_ord)] - ordered_clone_ids <- sort(ref_ord) - - ref_base <- alt_base <- vector() - for(i in 1:length(orig_ref_seq)){ - # FIXED: Use original sequences for SNP calling - temp_list <- strsplit(c(orig_ref_seq[i], orig_alt_seq[i]), "") - idx_diff <- which(temp_list[[1]] != temp_list[[2]]) - - if(length(idx_diff) > 1) { # If finds more than one polymorphism between Ref and Alt sequences - ref_base[i] <- NA - alt_base[i] <- NA - } else if(length(idx_diff) == 1) { - orig_ref_base <- temp_list[[1]][idx_diff] - orig_alt_base <- temp_list[[2]][idx_diff] - - # FIXED: Apply reverse complement to bases only if marker is in botloci - if(ordered_clone_ids[i] %in% botloci[,1]) { - ref_base[i] <- as.character(reverseComplement(DNAString(orig_ref_base))) - alt_base[i] <- as.character(reverseComplement(DNAString(orig_alt_base))) - } else { - ref_base[i] <- orig_ref_base - alt_base[i] <- orig_alt_base - } - } else { - # No differences found - ref_base[i] <- NA - alt_base[i] <- NA - } - } - } else { - warning("There are missing reference or alternative sequence, the SNP bases could not be recovery.") - ref_base <- "." - alt_base <- "." - } - - } else { - ref_base <- "." - alt_base <- "." - } - #Make the header#Make the VCF df vcf_df <- data.frame( CHROM = new_df$CHROM, @@ -180,40 +498,24 @@ madc2vcf_targets <- function(madc_file, output.file, botloci_file, get_REF_ALT = vcf_df$FORMAT <- paste("DP","RA","AD",sep=":") #Combine info from the matrices to form the VCF information for each sample - # Combine the matrices into a single matrix with elements separated by ":" - make_vcf_format <- function(..., separator = ":") { - matrices <- list(...) 
- n <- length(matrices) - - # Convert matrices to long form - long_forms <- lapply(matrices, function(mat) { - suppressMessages(reshape2::melt(mat, varnames = c("Row", "Col"), value.name = "Value")) - }) - - # Concatenate the elements - combined_long <- long_forms[[1]] - combined_long$Combined <- combined_long$Value - - for (i in 2:n) { - combined_long$Combined <- paste(combined_long$Combined, long_forms[[i]]$Value, sep = separator) - } + m_size <- melt(as.matrix(size_df), varnames = c("Row", "Col"), value.name = "Value") + m_ref <- melt(as.matrix(ref_df), varnames = c("Row", "Col"), value.name = "Value") + m_ad <- melt(as.matrix(ad_df), varnames = c("Row", "Col"), value.name = "Value") - # Convert back to wide form - combined_wide <- suppressMessages(reshape2::dcast(combined_long, Row ~ Col, value.var = "Combined")) + combined_long <- m_size + combined_long$Combined <- paste(m_size$Value, m_ref$Value, m_ad$Value, sep = ":") - # Restore row and column names - rownames(combined_wide) <- combined_wide$Row - combined_wide$Row <- NULL - colnames(combined_wide) <- colnames(matrices[[1]]) + combined_wide <- suppressMessages(dcast(combined_long, Row ~ Col, value.var = "Combined")) + rownames(combined_wide) <- combined_wide$Row + combined_wide$Row <- NULL + colnames(combined_wide) <- colnames(size_df) - return(as.matrix(combined_wide)) - } - - # Combine the matrices - geno_df <- make_vcf_format(as.matrix(size_df), as.matrix(ref_df), as.matrix(ad_df)) + geno_df <- as.matrix(combined_wide) + vmsg("Sample columns formatted for VCF", verbose = verbose, level = 1, type = ">>") #Combine the dataframes together vcf_df <- cbind(vcf_df,geno_df) + vmsg("VCF dataframe prepared", verbose = verbose, level = 1, type = ">>") # Add # to the CHROM column name colnames(vcf_df)[1] <- "#CHROM" @@ -221,8 +523,30 @@ madc2vcf_targets <- function(madc_file, output.file, botloci_file, get_REF_ALT = # Sort vcf_df <- vcf_df[order(vcf_df[,1],as.numeric(as.character(vcf_df[,2]))),] - 
if(sum(is.na(vcf_df$REF)) >1) { - warning(paste("Markers removed because of presence of more than one polymorphism between ref and alt sequences:",sum(is.na(vcf_df$REF)))) + # Remove markers with NA CHROM/POS (unmatched in markers_info, Case 3) + na_coord <- is.na(vcf_df[, 1]) | is.na(vcf_df$POS) + if(any(na_coord)) { + vmsg(paste(sum(na_coord), "markers removed: no matching entry found in markers_info."), verbose = verbose, level = 1, type = ">>") + warning(paste(sum(na_coord), "markers removed: no matching entry found in markers_info.")) + vcf_df <- vcf_df[!na_coord, ] + } + + if(sum(is.na(vcf_df$REF)) > 0) { + vmsg( + paste( + sum(is.na(vcf_df$REF)), + "markers removed because REF could not be reliably determined (e.g., multiple polymorphisms or no difference between probe sequences)." + ), + verbose = verbose, + level = 1, + type = ">>" + ) + warning( + paste( + "Markers removed because REF could not be reliably determined (e.g., multiple polymorphisms or no difference between probe sequences):", + sum(is.na(vcf_df$REF)) + ) + ) vcf_df <- vcf_df[-which(is.na(vcf_df$REF)),] } @@ -233,12 +557,5 @@ madc2vcf_targets <- function(madc_file, output.file, botloci_file, get_REF_ALT = suppressWarnings( write.table(vcf_df, file = output.file, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE, append = TRUE) ) - #Unload all items from memory - rm(matrices) - rm(ref_df) - rm(alt_df) - rm(size_df) - rm(ad_df) - rm(vcf_df) - rm(geno_df) + vmsg(paste("VCF file written to", output.file), verbose = verbose, level = 0, type = ">>") } diff --git a/R/utils.R b/R/utils.R index c280ad2..59e5563 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,8 +1,9 @@ #Internal Functions utils::globalVariables(c( - "ALT", "AlleleID", "CHROM", "Data", "ID", "MarkerName", "POS", - "QPseparate", "QPsolve_par", "REF", "Var1", "Variant", "geno", + "ALT", "AlleleID", "AlleleSequence", "CHROM", "Concordance", "Data", "ID", + "MarkerName", "POS", + "QPseparate", "QPsolve_par", "REF", "Type", 
"Var1", "Variant", "geno", "ind", "ref", "row_name", "size", "snp", "CloneID", "Count", "qualifying_sites_count", "MarkerID", "SampleID", "Dosage", @@ -26,45 +27,67 @@ convert_to_dosage <- function(gt) { }) } -#' Check and Adjust Botloci and MADC Marker Compatibility -#' -#' This internal function checks the compatibility between botloci and MADC markers. It ensures that the marker IDs in the botloci file match those in the MADC file. If discrepancies are found, such as mismatched padding, the function attempts to adjust the IDs to ensure compatibility. +##' Verbose Message Utility +##' +##' Prints a formatted verbose message with timestamp, indentation, and type label, if verbose is TRUE. +##' +##' @param text Character string, the message to print (supports sprintf formatting). +##' @param verbose Logical. If TRUE, prints the message; if FALSE, suppresses output. +##' @param level Integer, indentation level (0=header, 1=main step, 2=detail, 3=sub-detail). +##' @param type Character string, message type (e.g., "INFO", "WARN", "ERROR"). Only shown for level 0. +##' @param ... Additional arguments passed to sprintf for formatting. +##' +##' @details Use the verbose argument to control message output. Typically, pass the function's verbose parameter to vmsg. +##' +##' @return No return value, called for side effects. +##' @export +vmsg <- function(text, verbose = FALSE, level = 1, type = ">>", ...) { + if (!verbose) return(invisible()) + # Format timestamp + timestamp <- format(Sys.time(), "[%H:%M:%S]") + + # Create indentation based on level + indent <- switch(as.character(level), + "0" = "", # Section headers + "1" = " \u2219 ", # Main steps (medium bullet) + "2" = " - ", # Details + "3" = " > ", # Sub-details + paste0(paste(rep(" ", level), collapse = ""), "\u2022 ") # Fallback for level > 3 + ) + + # Format type label (only show for level 0) + type_label <- if (level == 0) sprintf("%-1s ", type) else "" + + # Format message text + dots <- list(...) 
+ if(length(dots) == 0) { + msg_text <- text + } else { + msg_text <- sprintf(text, ...) + } + # Combine everything + formatted_msg <- sprintf("%s %s%s%s", timestamp, type_label, indent, msg_text) + message(formatted_msg) +} + + +#' Check Whether a URL Is Accessible #' -#' @param botloci A data frame containing the botloci markers. -#' @param report A data frame containing the MADC markers. -#' @param verbose A logical value indicating whether to print detailed messages about the adjustments. Default is TRUE. +#' Attempts to open a connection to the given URL and returns `TRUE` if +#' successful, `FALSE` otherwise. Errors and warnings are both treated as +#' inaccessible. #' -#' @return A list containing the adjusted botloci and MADC data frames. +#' @param u character. The URL to test. #' -#' @details -#' The function checks if the marker IDs in the botloci file are present in the MADC file. If no matches are found, it examines the padding (number of digits) in the marker IDs and adjusts them to match the longest padding. If the IDs still do not match after adjustment, an error is raised. This function is intended for internal use and helps ensure that the botloci and MADC files are compatible for downstream analysis. +#' @return A single logical: `TRUE` if the URL can be opened, `FALSE` if not. #' #' @keywords internal #' @noRd -check_botloci <- function(botloci, report, verbose=TRUE){ - if(!any(botloci$V1 %in% report$CloneID)) { - if(verbose) message("None of the botloci markers could be found in the MADC file. Checking padding match...\n") - - pad_madc <- unique(nchar(sub(".*_", "", report$CloneID))) - pad_botloci <- unique(nchar(sub(".*_", "", botloci$V1))) - - if(length(pad_madc) > 1 | length(pad_botloci) > 1) stop("Check marker IDs in both MADC and botloci files. They should be the same.") - - if(pad_madc != pad_botloci) { - if(verbose) message("Padding between MADC and botloci files do not match. 
Markers ID modified to match longest padding.\n") - if (pad_madc < pad_botloci) { - report$CloneID <- paste0(sub("_(.*)", "", report$CloneID), "_", - sprintf(paste0("%0", pad_botloci, "d"), as.integer(sub(".*_", "", report$CloneID))) - ) - } else { - botloci$V1 <- paste0(sub("_(.*)", "", botloci$V1), "_", - sprintf(paste0("%0", pad_madc, "d"), as.integer(sub(".*_", "", botloci$V1))) - ) - if(!any(botloci$V1 %in% report$CloneID)) stop("After matching padding, botloci markers still not found in MADC file. Check marker IDs.\n") - } - } else { - stop("Check marker IDs in both MADC and botloci files. They should be the same.") - } - } - return(list(botloci, report)) +#' +url_exists <- function(u) { + tryCatch({ + con <- url(u, open = "rb") + close(con) + TRUE + }, error = function(e) FALSE, warning = function(w) FALSE) } diff --git a/man/check_madc_sanity.Rd b/man/check_madc_sanity.Rd new file mode 100644 index 0000000..0398625 --- /dev/null +++ b/man/check_madc_sanity.Rd @@ -0,0 +1,86 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check_madc_sanity.R +\name{check_madc_sanity} +\alias{check_madc_sanity} +\title{Run basic sanity checks on a MADC-style allele report} +\usage{ +check_madc_sanity(report) +} +\arguments{ +\item{report}{A \code{data.frame} with at least the columns +\code{CloneID}, \code{AlleleID}, and \code{AlleleSequence}. The first column is also +used in the \code{FixAlleleIDs} check to inspect its first up to six entries. +If \code{FixAlleleIDs} is \code{FALSE} (raw DArT format), the first 7 rows are +treated as header filler and skipped before further checks are run.} +} +\value{ +A named list with five elements: +\describe{ +\item{checks}{Named logical vector with nine entries: +\code{Columns}, \code{FixAlleleIDs}, \code{IUPACcodes}, \code{LowerCase}, \code{Indels}, +\code{ChromPos}, \code{allNAcol}, \code{allNArow}, \code{RefAltSeqs}. 
+\code{TRUE} means the condition was detected (or passed for \code{Columns}, +\code{FixAlleleIDs}, \code{ChromPos}, and \code{RefAltSeqs}); \code{NA} means the check +was skipped.} +\item{messages}{Named list of length-2 character vectors, one per check. +Element \verb{[[1]]} is the message when the check is \code{TRUE}, element \verb{[[2]]} +when it is \code{FALSE}. Indexed by the same names as \code{checks}.} +\item{indel_clone_ids}{Character vector of \code{CloneID}s where ref/alt +lengths differ. Returns \code{character(0)} if none are found, or \code{NULL} +when required columns are missing.} +\item{missRef}{Character vector of \code{CloneID}s that have no \code{Ref} allele +row. Returns \code{character(0)} if all \code{CloneID}s have a \code{Ref} row, or +\code{NULL} when required columns are missing.} +\item{missAlt}{Character vector of \code{CloneID}s that have no \code{Alt} allele +row. Returns \code{character(0)} if all \code{CloneID}s have an \code{Alt} row, or +\code{NULL} when required columns are missing.} +} +} +\description{ +Performs nine quick validations on an allele report: +\enumerate{ +\item \strong{Columns} - required columns are present (\code{CloneID}, \code{AlleleID}, \code{AlleleSequence}); +\item \strong{FixAlleleIDs} - first column's first up-to-6 rows are not all blank or \code{"*"} +\emph{and} both \verb{_0001} and \verb{_0002} appear in \code{AlleleID}; +\item \strong{IUPACcodes} - presence of non-ATCG characters in \code{AlleleSequence}; +\item \strong{LowerCase} - presence of lowercase a/t/c/g in \code{AlleleSequence}; +\item \strong{Indels} - reference/alternate allele lengths differ for the same \code{CloneID}, +or a \code{"-"} character is present in \code{AlleleSequence}; +\item \strong{ChromPos} - all \code{CloneID} values follow the \code{Chr_Position} format +(prefix matches \code{"chr"} case-insensitively, suffix is a positive integer); +\item \strong{allNAcol} - at least one column contains only \code{NA} or empty values; 
+\item \strong{allNArow} - at least one row contains only \code{NA} or empty values; +\item \strong{RefAltSeqs} - every \code{CloneID} has at least one \code{Ref} and one \code{Alt} allele row. +} +} +\details{ +\itemize{ +\item \strong{FixAlleleIDs:} When the first six rows of the first column are all blank +or \code{"*"} (raw DArT format), row 7 is promoted to column headers and the +first 7 rows are dropped before subsequent checks are run. The check is +\code{TRUE} when the file has already been processed by HapApp (fixed IDs with +\verb{_0001}/\verb{_0002} suffixes). +\item \strong{IUPAC check:} Flags any character outside \code{A}, \code{T}, \code{C}, \code{G} and \code{"-"} +(case-insensitive), which includes ambiguity codes (\code{N}, \code{R}, \code{Y}, etc.). +\item \strong{Indels:} Rows are split by \code{AlleleID} containing \code{"Ref_0001"} vs +\code{"Alt_0002"}, merged by \code{CloneID}, and flagged as indels if either (a) the +lengths of \code{AlleleSequence} differ, (b) the sequences have the same length +but more than one character differs between them (complex indel / local +rearrangement), or (c) a \code{"-"} character is present anywhere in +\code{AlleleSequence}. +\item \strong{ChromPos:} Each \code{CloneID} is split on \code{"_"} into exactly two parts; the +first part must match \code{"Chr"} (case-insensitive) and the second must be a +positive integer. Returns \code{FALSE} when any \code{CloneID} is \code{NA}. +\item \strong{allNAcol / allNArow:} Detected via \code{apply()} over columns/rows +respectively; a cell is treated as missing when it is \code{NA} or an empty +string (\code{""}). Useful for flagging empty or corrupt entries. +\item \strong{RefAltSeqs:} For each unique \code{CloneID}, checks whether at least one row +with a \code{Ref} (\verb{|Ref_} when \code{FixAlleleIDs = TRUE}, \verb{|Ref$} otherwise) and +one row with an \code{Alt} (\verb{|Alt_} / \verb{|Alt$}) allele exist. 
\code{CloneID}s that +lack a \code{Ref} row are stored in \code{missRef}; those lacking an \code{Alt} row in +\code{missAlt}. The check is \code{TRUE} when both sets are empty. +\item If required columns are missing (\code{Columns = FALSE}), only \code{Columns} and +\code{FixAlleleIDs} are evaluated; all other checks remain \code{NA} and +\code{indel_clone_ids}, \code{missRef}, and \code{missAlt} are returned as \code{NULL}. +} +} diff --git a/man/check_ped.Rd b/man/check_ped.Rd index 693bfe0..ea63de7 100644 --- a/man/check_ped.Rd +++ b/man/check_ped.Rd @@ -2,48 +2,58 @@ % Please edit documentation in R/check_ped.R \name{check_ped} \alias{check_ped} -\title{Evaluate Pedigree File for Accuracy} +\title{Check a pedigree file for accuracy and report/correct common errors} \usage{ check_ped(ped.file, seed = NULL, verbose = TRUE) } \arguments{ -\item{ped.file}{path to pedigree text file. The pedigree file is a -3-column pedigree tab separated file with columns labeled as id sire dam in any order} +\item{ped.file}{Path to the pedigree text file.} -\item{seed}{Optional seed for reproducibility} +\item{seed}{Optional seed for reproducibility.} -\item{verbose}{Logical. If TRUE, print the errors to the console.} +\item{verbose}{Logical. 
If TRUE (default), prints errors and prompts for interactive saving.} } \value{ -A list of data.frames of error types, and the output printed to the console +A list of data.frames containing detected issues: +\itemize{ +\item \code{exact_duplicates}: rows that were exact duplicates +\item \code{repeated_ids_diff}: IDs appearing more than once with conflicting sire/dam +\item \code{messy_parents}: IDs appearing as both sire and dam +\item \code{missing_parents}: parents added to the pedigree with 0 as sire/dam +\item \code{dependencies}: detected cycles in the pedigree +} } \description{ -Check a pedigree file for accuracy and output suspected errors +\code{check_ped} reads a 3-column pedigree file (tab-separated, columns labeled \code{id}, \code{sire}, \code{dam} in any order) +and performs quality checks, optionally correcting or flagging errors. } \details{ -check_ped takes a 3-column pedigree tab separated file with columns labeled as id sire dam in any order and checks for: +The function checks for: \itemize{ -\item Ids that appear more than once in the id column -\item Ids that appear in both sire and dam columns -\item Direct (e.g. parent is a offspring of his own daughter) and indirect (e.g. a great grandparent is son of its grandchild) dependencies within the pedigree. -\item Individuals included in the pedigree as sire or dam but not on the id column and reports them back with unknown parents (0). +\item Exact duplicate rows and removes them (keeping one copy) +\item IDs that appear more than once with conflicting sire/dam assignments (sets sire/dam to "0") +\item IDs that appear in both sire and dam columns +\item Missing parents (IDs referenced as sire/dam but not in \code{id} column), adds them with sire/dam = "0" +\item Direct and indirect pedigree dependencies (cycles), such as a parent being its own descendant } -When using check_ped, do a first run to check for repeated ids and parents that appear as sire and dam. 
-Once these errors are cleaned run the function again to check for dependencies as this will provide the most accurate report. +After an initial run to clean exact duplicates and repeated IDs, you can run the function again to detect cycles more accurately. -Note: This function does not change the input file but prints any errors found in the console. +The function does \strong{not} overwrite the input file. Instead, it prints findings to the console and optionally saves: +\itemize{ +\item Corrected pedigree as a dataframe in the global environment +\item A report listing all detected issues +} } \examples{ -##Get list with a dataframe for each error type -ped_file <- system.file("check_ped_test.txt", package="BIGr") -ped_errors <- check_ped(ped.file = ped_file, - seed = 101919) +ped_file <- system.file("check_ped_test.txt", package = "BIGr") +ped_errors <- check_ped(ped.file = ped_file, seed = 101919) -##Access the "messy parents" dataframe result +# Access messy parents ped_errors$messy_parents -##Get list of sample IDs with messy parents error -messy_parent_ids <- ped_errors$messy_parents$id -print(messy_parent_ids) +# IDs with messy parents +messy_ids <- ped_errors$messy_parents$id +print(messy_ids) + } diff --git a/man/filterVCF.Rd b/man/filterVCF.Rd index 676ef7f..0342fe1 100644 --- a/man/filterVCF.Rd +++ b/man/filterVCF.Rd @@ -6,6 +6,7 @@ \usage{ filterVCF( vcf.file, + quality.rates = FALSE, filter.OD = NULL, filter.BIAS.min = NULL, filter.BIAS.max = NULL, @@ -22,6 +23,8 @@ filterVCF( \arguments{ \item{vcf.file}{vcfR object or path to VCF file. Can be unzipped (.vcf) or gzipped (.vcf.gz).} +\item{quality.rates}{Logical. 
If TRUE, calculates and outputs CSV files with quality metrics for each marker and sample before filtering (mean depth, genotyping rate, observed heterozygosity).} + \item{filter.OD}{Updog filter} \item{filter.BIAS.min}{Updog filter (requires a value for both BIAS.min and BIAS.max)} @@ -58,17 +61,11 @@ The VCF format is v4.3 \examples{ ## Use file paths for each file on the local system -#Temp location (only for example) -output_file <- tempfile() - -filterVCF(vcf.file = system.file("iris_DArT_VCF.vcf.gz", package = "BIGr"), - filter.OD = 0.5, - filter.MAF = 0.05, - ploidy = 2, - output.file = output_file) -# Removing the output for the example -rm(output_file) +#filterVCF(vcf.file = "example_dart_Dosage_Report.csv", + # filter.OD = 0.5, + # ploidy = 2, + # output.file = "name_for_vcf") ##The function will output the filtered VCF to the current working directory diff --git a/man/get_counts.Rd b/man/get_counts.Rd new file mode 100644 index 0000000..1879e07 --- /dev/null +++ b/man/get_counts.Rd @@ -0,0 +1,60 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_countsMADC.R +\name{get_counts} +\alias{get_counts} +\title{Read and Pre-process a MADC File} +\usage{ +get_counts( + madc_file = NULL, + madc_object = NULL, + collapse_matches_counts = FALSE, + verbose = TRUE +) +} +\arguments{ +\item{madc_file}{character or \code{NULL}. Path to the input MADC CSV file. +At least one of \code{madc_file} or \code{madc_object} must be provided.} + +\item{madc_object}{data frame or \code{NULL}. A pre-read MADC data frame +(e.g., from \code{check_botloci()}). When supplied, file reading is skipped. +At least one of \code{madc_file} or \code{madc_object} must be provided.} + +\item{collapse_matches_counts}{logical. If \code{TRUE}, counts for \verb{|AltMatch} +and \verb{|RefMatch} rows are summed into their corresponding \verb{|Ref} and \verb{|Alt} +rows. If \code{FALSE} (default), those rows are discarded.} + +\item{verbose}{logical. 
Whether to print progress messages. Default is \code{TRUE}.}
+}
+\value{
+A data frame with one row per \code{Ref} or \code{Alt} allele entry, retaining
+all original columns (\code{AlleleID}, \code{CloneID}, \code{AlleleSequence}, sample
+count columns, etc.).
+}
+\description{
+Reads a DArTag MADC CSV file (or accepts a pre-read data frame), detects the
+file format, and returns a filtered data frame containing only \code{Ref} and \code{Alt}
+haplotype rows ready for count-matrix construction.
+}
+\details{
+\strong{Input}: either \code{madc_file} (path to CSV) or \code{madc_object} (pre-read data
+frame) must be supplied; at least one is required.
+
+\strong{Format detection} (applied to file or object alike): the first seven rows
+of the first column are inspected:
+\itemize{
+\item \strong{Standard format}: all entries are blank or \code{"*"} — the first 7 rows are
+treated as DArT placeholder rows and skipped.
+\item \strong{Fixed-allele-ID format}: no filler rows — data are used as-is.
+}
+
+\strong{\verb{|AltMatch} / \verb{|RefMatch} handling} (controlled by \code{collapse_matches_counts}):
+\itemize{
+\item \code{FALSE} (default): these rows are simply discarded.
+\item \code{TRUE}: their counts are summed into the corresponding \verb{|Ref} or \verb{|Alt}
+row for the same \code{CloneID}.
+}
+
+In all cases, trailing suffixes on \code{AlleleID} (e.g., \verb{|Ref_0001}, \verb{|Alt_0002})
+are stripped to the canonical \verb{|Ref} / \verb{|Alt} form.
+}
+\keyword{internal}
diff --git a/man/get_countsMADC.Rd b/man/get_countsMADC.Rd
index 66c5708..28fca1e 100644
--- a/man/get_countsMADC.Rd
+++ b/man/get_countsMADC.Rd
@@ -4,18 +4,53 @@
 \alias{get_countsMADC}
 \title{Obtain Read Counts from MADC File}
 \usage{
-get_countsMADC(madc_file)
+get_countsMADC(
+  madc_file = NULL,
+  madc_object = NULL,
+  collapse_matches_counts = FALSE,
+  verbose = TRUE
+)
 }
 \arguments{
-\item{madc_file}{Path to MADC file}
+\item{madc_file}{character or \code{NULL}. 
Path to the input MADC CSV file. +At least one of \code{madc_file} or \code{madc_object} must be provided.} + +\item{madc_object}{data frame or \code{NULL}. A pre-read MADC data frame +(e.g., as returned by \code{check_botloci()}). When supplied, file reading is +skipped. At least one of \code{madc_file} or \code{madc_object} must be provided.} + +\item{collapse_matches_counts}{logical. If \code{TRUE}, counts for \verb{|AltMatch} +and \verb{|RefMatch} rows are summed into their corresponding \verb{|Ref} and \verb{|Alt} +rows. If \code{FALSE} (default), \verb{|AltMatch} and \verb{|RefMatch} rows are discarded.} + +\item{verbose}{logical. Whether to print progress messages. Default is \code{TRUE}.} } \value{ -A list of read count matrices for reference, alternate, and total read count values +A named list with two numeric matrices, both with markers as rows +and samples as columns: +\describe{ +\item{\code{ref_matrix}}{Reference allele read counts.} +\item{\code{size_matrix}}{Total read counts (reference + alternative).} +} } \description{ -This function takes the MADC file as input and retrieves the ref and alt counts for each sample, -and converts them to ref, alt, and size(total count) matrices for dosage calling tools. At the moment, -only the read counts for the Ref and Alt target loci are obtained while the additional loci are ignored. +Reads a DArTag MADC report and returns reference and total read count matrices +per marker and sample. Only \code{Ref} and \code{Alt} target loci are retained; +\verb{|AltMatch} / \verb{|RefMatch} rows are either discarded or collapsed depending on +\code{collapse_matches_counts}. +} +\details{ +Either \code{madc_file} or \code{madc_object} must be provided (not both \code{NULL}). +When \code{madc_object} is supplied it is passed directly to \code{get_counts()}, +skipping file I/O. The function constructs: +\itemize{ +\item \code{ref_matrix} — per-sample reference allele counts. 
+\item \code{size_matrix} — per-sample total counts (ref + alt). +} + +Markers whose \code{CloneID} appears only in the \code{Ref} or only in the \code{Alt} rows +are removed with a warning. A summary of the proportion of zero-count +data points (missing data) is reported via \code{vmsg()}. } \examples{ # Get the path to the MADC file @@ -24,11 +59,13 @@ madc_path <- system.file("iris_DArT_MADC.csv", package = "BIGr") # Extract the read count matrices counts_matrices <- get_countsMADC(madc_path) -# Access the reference, alternate, and size matrices - -# ref_matrix <- counts_matrices$ref_matrix -# alt_matrix <- counts_matrices$alt_matrix +# Access the reference and size matrices +# ref_matrix <- counts_matrices$ref_matrix # size_matrix <- counts_matrices$size_matrix rm(counts_matrices) + +} +\seealso{ +\code{\link[=get_counts]{get_counts()}}, \code{\link[=check_madc_sanity]{check_madc_sanity()}} } diff --git a/man/imputation_concordance.Rd b/man/imputation_concordance.Rd index 0c06134..31f54a8 100644 --- a/man/imputation_concordance.Rd +++ b/man/imputation_concordance.Rd @@ -9,55 +9,70 @@ imputation_concordance( imputed_genos, missing_code = NULL, snps_2_exclude = NULL, - verbose = FALSE + verbose = FALSE, + plot = FALSE, + print_result = TRUE ) } \arguments{ -\item{reference_genos}{A data frame containing reference genotype data, with rows as samples and columns as markers. Dosage format (0, 1, 2) is recommended.} +\item{reference_genos}{A data frame containing reference genotype data, +with rows as samples and columns as markers. Must include a column named \code{ID}.} -\item{imputed_genos}{A data frame containing imputed genotype data, with rows as samples and columns as markers. Dosage format (0, 1, 2) is recommended.} +\item{imputed_genos}{A data frame containing imputed genotype data, +with rows as samples and columns as markers. Must include a column named \code{ID}.} -\item{missing_code}{An optional value to specify missing data. 
If provided, loci with this value in either dataset will be excluded from the concordance calculation.} +\item{missing_code}{Optional value specifying missing data. If provided, +loci with this value in either dataset will be excluded from the concordance calculation.} -\item{snps_2_exclude}{An optional vector of marker IDs to exclude from the concordance calculation.} +\item{snps_2_exclude}{Optional vector of marker IDs to exclude from the concordance calculation.} -\item{verbose}{A logical value indicating whether to print a summary of the concordance results. Default is FALSE.} +\item{verbose}{Logical. If \code{TRUE}, prints summary statistics (minimum, quartiles, +median, mean, maximum) of concordance percentages.} + +\item{plot}{Logical. If \code{TRUE}, produces a bar plot of concordance percentage +by sample.} + +\item{print_result}{Logical. If \code{TRUE} (default), prints the concordance +results data frame to the console. If \code{FALSE}, results are returned invisibly.} } \value{ -A list with two elements: +A data frame with: \itemize{ -\item \code{result_df}: A data frame with sample IDs and their concordance percentages. -\item \code{summary_concordance}: A summary of concordance percentages, including minimum, maximum, mean, and quartiles. +\item \code{ID}: Sample identifiers shared between the datasets. +\item \code{Concordance}: Percentage of matching genotypes per sample. } +If \code{print_result = FALSE}, the data frame is returned invisibly. } \description{ -This function calculates the concordance between imputed and reference genotypes. It assumes that samples are rows and markers are columns. -It is recommended to use allele dosages (0, 1, 2) but will work with other formats. Missing data in reference or imputed genotypes -will not be considered for concordance if the \code{missing_code} argument is used. If a specific subset of markers should be excluded, -it can be provided using the \code{snps_2_exclude} argument. 
+This function calculates the concordance between imputed and reference +genotypes. It assumes that samples are rows and markers are columns. +Allele dosages (0, 1, 2) are recommended but other numeric formats are supported. +Missing data in either dataset can be excluded from the concordance calculation +using the \code{missing_code} argument. Specific markers can be excluded using +the \code{snps_2_exclude} argument. } \details{ -The function identifies common samples and markers between the reference and imputed genotype datasets. It calculates the percentage of matching genotypes for each sample, excluding missing data and specified markers. The concordance is reported as a percentage for each sample, along with a summary of the overall concordance distribution. +The function: +\enumerate{ +\item Identifies common samples and markers between the datasets. +\item Optionally excludes specified SNPs. +\item Removes loci with missing data (if \code{missing_code} is provided). +\item Computes per-sample concordance as the percentage of matching genotypes. } -\examples{ - -# Example Input variables -ignore_file <- system.file("imputation_ignore.txt", package="BIGr") -ref_file <- system.file("imputation_reference.txt", package="BIGr") -test_file <- system.file("imputation_test.txt", package="BIGr") - -# Import files -snps = read.table(ignore_file, header = TRUE) -ref = read.table(ref_file, header = TRUE) -test = read.table(test_file, header = TRUE) - -#Calculations -result <- imputation_concordance(reference_genos = ref, - imputed_genos = test, - snps_2_exclude = snps, - missing_code = 5, - verbose = FALSE) - +When \code{plot = TRUE}, a bar plot showing concordance percentage per sample +is generated using \pkg{ggplot2}. 
+} +\examples{ +\dontrun{ +result <- imputation_concordance( + reference_genos = ref, + imputed_genos = test, + snps_2_exclude = snps, + missing_code = 5, + verbose = TRUE, + plot = TRUE +) +} } diff --git a/man/madc2vcf_all.Rd b/man/madc2vcf_all.Rd index 6fe7f11..c15e69b 100644 --- a/man/madc2vcf_all.Rd +++ b/man/madc2vcf_all.Rd @@ -5,8 +5,8 @@ \title{Converts MADC file to VCF recovering target and off-target SNPs} \usage{ madc2vcf_all( - madc = NULL, - botloci_file = NULL, + madc, + botloci_file, hap_seq_file = NULL, n.cores = 1, rm_multiallelic_SNP = FALSE, @@ -14,13 +14,17 @@ madc2vcf_all( multiallelic_SNP_sample_thr = 0, alignment_score_thr = 40, out_vcf = NULL, + markers_info = NULL, + add_others = TRUE, + others_max_snps = 5, + others_rm_with_indels = TRUE, verbose = TRUE ) } \arguments{ -\item{madc}{A string specifying the path to the MADC file.} +\item{madc}{Required. A string specifying the path or URL to the MADC file.} -\item{botloci_file}{A string specifying the path to the file containing the target IDs designed in the bottom strand.} +\item{botloci_file}{Required. A string specifying the path or URL to the file containing the target IDs designed in the bottom strand.} \item{hap_seq_file}{A string specifying the path to the haplotype database fasta file.} @@ -36,6 +40,14 @@ madc2vcf_all( \item{out_vcf}{A string specifying the name of the output VCF file. If the file extension is not \code{.vcf}, it will be appended automatically.} +\item{markers_info}{A string specifying the path to a CSV file with marker information (CloneID/BI_markerID, Chr, Pos, Ref, Alt, Type, Indel_pos columns as needed).} + +\item{add_others}{A logical value. If TRUE, alleles labeled "Other" in the MADC file are included in off-target SNP extraction. Default is TRUE.} + +\item{others_max_snps}{An integer or NULL. If not NULL, Other alleles with more than this many SNP differences versus the Ref sequence (as detected by pairwise alignment) are discarded. 
Default is 5.}
+
+\item{others_rm_with_indels}{A logical value. If TRUE, Other alleles that contain insertions or deletions relative to the Ref sequence (as detected by pairwise alignment) are discarded. Default is TRUE.}
+
 \item{verbose}{A logical value indicating whether to print metrics and progress to the console. Default is TRUE.}
 }
 \value{
@@ -46,6 +58,16 @@ This function processes a MADC file to generate a VCF file containing both targe
 }
 \details{
 The function processes a MADC file to generate a VCF file containing both target and off-target SNPs. It uses parallel processing to improve performance and provides options to filter multiallelic SNPs based on user-defined thresholds. The alignment score threshold can be adjusted using the \code{alignment_score_thr} parameter. The generated VCF file includes metadata about the processing parameters and the BIGr package version. If the \code{alignment_score_thr} is not met, the corresponding SNPs are discarded.
+
+\strong{Sanity check behaviour and requirements}\tabular{lll}{
+  Check \tab Status \tab Required \cr
+  \strong{Indels} \tab detected \tab \code{markers_info} with \code{Ref}/\code{Alt}/\code{Indel_pos}/\code{Indel_length} + \code{botloci_file} \cr
+  \tab not detected \tab \code{botloci_file} \cr
+  \strong{ChromPos} \tab valid \tab \code{botloci_file} \cr
+  \tab invalid \tab \code{markers_info} with \code{Chr}/\code{Pos} + \code{botloci_file} \cr
+  \strong{RefAltSeqs} \tab all paired \tab \code{botloci_file} \cr
+  \tab unpaired \tab \code{botloci_file} + \code{hap_seq_file} (microhaplotype DB) \cr
+}
 }
 \examples{
 # Example usage:
diff --git a/man/madc2vcf_multi.Rd b/man/madc2vcf_multi.Rd
new file mode 100644
index 0000000..70bc59d
--- /dev/null
+++ b/man/madc2vcf_multi.Rd
@@ -0,0 +1,71 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/madc2vcf_multi.R
+\name{madc2vcf_multi}
+\alias{madc2vcf_multi}
+\title{Convert MADC file to VCF using polyRAD for 
multiallelic genotyping} +\usage{ +madc2vcf_multi( + madc_file, + botloci_file, + outfile, + markers_info = NULL, + ploidy = 2L, + verbose = TRUE +) +} +\arguments{ +\item{madc_file}{character. Path or URL to the input MADC CSV file.} + +\item{botloci_file}{character. Path or URL to the botloci file listing target +IDs designed on the bottom strand.} + +\item{outfile}{character. Path for the output VCF file.} + +\item{markers_info}{character or NULL. Optional path or URL to a CSV file +with marker metadata. Required when CloneIDs do not follow the +\code{Chr_Pos} format; must contain \code{CloneID} (or +\code{BI_markerID}), \code{Chr}, and \code{Pos} columns.} + +\item{ploidy}{integer. Ploidy level of the samples passed to \code{taxaPloidy}. +Default is 2.} + +\item{verbose}{logical. Whether to print progress messages. Default is TRUE.} +} +\value{ +Invisible NULL. Writes a VCF file to \code{outfile}. +} +\description{ +This function converts a DArTag MADC file to a VCF using the polyRAD package's +\code{readDArTag} and \code{RADdata2VCF} pipeline. It runs \code{check_madc_sanity} before +loading the data, applies corrections for lowercase sequences and all-NA +rows/columns, and sets \code{n.header.rows} automatically based on whether the +MADC file follows the raw DArT format (6 header rows) or the fixed allele ID +format (no header rows). +} +\details{ +The function performs the following steps: +\enumerate{ +\item Reads the MADC file and runs \code{check_madc_sanity}. +\item Validates the botloci file against MADC CloneIDs using +\code{check_botloci}, fixing any padding mismatches automatically. +\item Converts lowercase bases in AlleleSequence to uppercase if detected. +\item Removes all-NA rows and columns if detected. +\item Writes the corrected data to a temporary file and passes it to +\code{polyRAD::readDArTag}. 
+\item Estimates overdispersion with \code{polyRAD::TestOverdispersion} and +calls \code{polyRAD::IterateHWE}, then exports the result with +\code{polyRAD::RADdata2VCF}. +} + +\strong{Sanity check behaviour and requirements} + +The function always stops if IUPAC codes, unpaired Ref/Alt sequences, or +unfixed AlleleIDs are detected (see \code{check_madc_sanity}). For the +remaining checks the required inputs are:\tabular{lll}{ + Check \tab Status \tab Required \cr + \strong{Indels} \tab detected \tab \code{botloci_file} \cr + \tab not detected \tab \code{botloci_file} \cr + \strong{ChromPos} \tab valid \tab \code{botloci_file} \cr + \tab invalid \tab \code{markers_info} with \code{Chr}/\code{Pos} + \code{botloci_file} \cr +} +} diff --git a/man/madc2vcf_targets.Rd b/man/madc2vcf_targets.Rd index a790460..30363a6 100644 --- a/man/madc2vcf_targets.Rd +++ b/man/madc2vcf_targets.Rd @@ -4,44 +4,153 @@ \alias{madc2vcf_targets} \title{Format MADC Target Loci Read Counts Into VCF} \usage{ -madc2vcf_targets(madc_file, output.file, botloci_file, get_REF_ALT = FALSE) +madc2vcf_targets( + madc_file, + output.file, + botloci_file = NULL, + markers_info = NULL, + get_REF_ALT = FALSE, + collapse_matches_counts = FALSE, + verbose = TRUE +) } \arguments{ -\item{madc_file}{Path to MADC file} +\item{madc_file}{character. Path to the input MADC CSV file.} -\item{output.file}{output file name and path} +\item{output.file}{character. Path to the output VCF file to write.} -\item{botloci_file}{A string specifying the path to the file containing the target IDs designed in the bottom strand.} +\item{botloci_file}{character or \code{NULL} (default \code{NULL}). Path to a plain-text +file listing target IDs designed on the \strong{bottom} strand (one ID per line). +Used for strand-correcting probe sequences when \code{get_REF_ALT = TRUE} and +\code{markers_info} does not supply \code{Ref} and \code{Alt} columns. 
Also required when +\code{ChromPos} is invalid and \code{markers_info} does not provide \code{Ref}/\code{Alt}.} -\item{get_REF_ALT}{if TRUE recovers the reference and alternative bases by comparing the sequences. If more than one polymorphism are found for a tag, it is discarded.} +\item{markers_info}{character or \code{NULL}. Optional path to a CSV providing target +metadata. Accepted columns: +\itemize{ +\item \code{CloneID} or \code{BI_markerID} (required as marker identifier); +\item \code{Chr}, \code{Pos} — required when \code{CloneID} does not follow the \code{Chr_Pos} format; +\item \code{Ref}, \code{Alt} — required when \code{get_REF_ALT = TRUE} and probe-sequence +inference is not possible (IUPAC codes, indels, or unfixed allele IDs). +}} + +\item{get_REF_ALT}{logical (default \code{FALSE}). If \code{TRUE}, attempts to recover +REF/ALT bases. The source is chosen automatically: \code{markers_info} \code{Ref}/\code{Alt} +columns take priority; otherwise probe sequences from the MADC are compared +(with \code{botloci_file} for strand correction). Targets with more than one +difference between Ref/Alt sequences are removed. When \code{FALSE}, REF and ALT +are set to \code{"."} in the output VCF.} + +\item{collapse_matches_counts}{logical (default \code{FALSE}). If \code{TRUE}, counts for +\verb{|AltMatch} and \verb{|RefMatch} rows are summed into their corresponding \verb{|Ref} +and \verb{|Alt} rows before building the matrices. Useful when the MADC contains +multiple allele rows per target that should be aggregated.} + +\item{verbose}{logical (default \code{TRUE}). If \code{TRUE}, prints detailed progress +messages about each processing step.} } \value{ -A VCF file v4.3 with the target marker read count information - -A VCF file v4.3 with the target marker read count information +(Invisibly) returns the path to \code{output.file}. 
The side effect is a +\strong{VCF v4.3} written to disk containing one row per target and columns for all +samples in the MADC file. } \description{ -This function will extract the read count information from a MADC file target markers and convert to VCF file format. +Parses a DArTag \strong{MADC} report and writes a \strong{VCF v4.3} containing per-target +read counts for the panel’s target loci. This is useful because MADC is not +widely supported by general-purpose tools, while VCF is. } \details{ -The DArTag MADC file format is not commonly supported through existing tools. This function -will extract the read count information from a MADC file for the target markers and convert it to a VCF file format for the -genotyping panel target markers only +Convert DArTag MADC target read counts to a VCF + +\strong{What this function does} +\itemize{ +\item Runs basic sanity checks on the MADC file via \code{check_madc_sanity()} (column +presence, fixed allele IDs, IUPAC/ambiguous bases, lowercase bases, indels, +chromosome/position format, all-NA rows/columns, Ref/Alt sequence presence). +\item Extracts reference and total read counts per sample and target. +\item Derives \code{AD} (ref,alt) by subtraction (alt = total − ref). +\item If \code{get_REF_ALT = TRUE}, recovers REF/ALT bases either from \code{markers_info} +(when \code{Ref}/\code{Alt} columns are present) or by comparing the Ref/Alt probe +sequences in the MADC file (with strand correction via \code{botloci_file}). +Targets with >1 polymorphism between sequences are discarded. +\item Optionally accepts a \code{markers_info} CSV to supply \code{CHROM}, \code{POS}, \code{REF}, +\code{ALT}, bypassing sequence-based inference. 
} -\examples{ -# Load example files -madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") -bot_file <- system.file("example_SNPs_DArTag-probe-design_f180bp.botloci", package="BIGr") -#Temp location (only for example) -output_file <- tempfile() +\strong{Output VCF layout} +\itemize{ +\item \code{INFO} fields: +\itemize{ +\item \code{DP} — total depth across all samples for the locus +\item \code{ADS} — total counts across samples in the order \verb{ref,alt} +} +\item \code{FORMAT} fields (per sample): +\itemize{ +\item \code{DP} — total reads (ref + alt) +\item \code{RA} — reads supporting the reference allele +\item \code{AD} — \code{"ref,alt"} counts +} +} -# Convert MADC to VCF -madc2vcf_targets(madc_file = madc_file, - output.file = output_file, - get_REF_ALT = TRUE, - botloci_file = bot_file) +\strong{Strand handling} +If a target ID appears in \code{botloci_file}, its probe sequences are reverse- +complemented prior to base comparison so that REF/ALT are reported in the +top-strand genomic orientation. -rm(output_file) +\strong{Sanity check behaviour and requirements} +The function always stops if required columns (\code{CloneID}, \code{AlleleID}, +\code{AlleleSequence}) are missing. 
+ +For the remaining checks the required inputs depend on the combination of +check result and \code{get_REF_ALT}:\tabular{llll}{ + Check \tab Status \tab \code{get_REF_ALT} \tab Required \cr + \strong{IUPAC codes} \tab detected \tab \code{TRUE} \tab \code{markers_info} with \code{Ref}/\code{Alt} \cr + \tab detected \tab \code{FALSE} \tab — \cr + \tab not detected \tab \code{TRUE} \tab \code{botloci_file} \strong{or} \code{markers_info} with \code{Ref}/\code{Alt} \cr + \tab not detected \tab \code{FALSE} \tab — \cr + \strong{Indels} \tab detected \tab \code{TRUE} \tab \code{markers_info} with \code{Ref}/\code{Alt} \cr + \tab detected \tab \code{FALSE} \tab — \cr + \tab not detected \tab \code{TRUE} \tab \code{botloci_file} \strong{or} \code{markers_info} with \code{Ref}/\code{Alt} \cr + \tab not detected \tab \code{FALSE} \tab — \cr + \strong{ChromPos} \tab valid \tab \code{TRUE} \tab \code{botloci_file} \strong{or} \code{markers_info} with \code{Ref}/\code{Alt} \cr + \tab valid \tab \code{FALSE} \tab — \cr + \tab invalid \tab \code{TRUE} \tab \code{markers_info} with \code{Chr}/\code{Pos}/\code{Ref}/\code{Alt} \strong{or} \code{markers_info} with \code{Chr}/\code{Pos} + \code{botloci_file} \cr + \tab invalid \tab \code{FALSE} \tab \code{markers_info} with \code{Chr}/\code{Pos} \cr + \strong{FixAlleleIDs} \tab fixed \tab \code{TRUE} \tab \code{botloci_file} \strong{or} \code{markers_info} with \code{Ref}/\code{Alt} \cr + \tab fixed \tab \code{FALSE} \tab — \cr + \tab not fixed \tab \code{TRUE} \tab \code{markers_info} with \code{Ref}/\code{Alt} \cr + \tab not fixed \tab \code{FALSE} \tab — \cr +} +} +\section{Dependencies}{ + +Uses \strong{dplyr}, \strong{tidyr}, \strong{tibble}, \strong{reshape2}, \strong{Biostrings} and base +\strong{utils}. Helper functions expected in this package: \code{check_madc_sanity()}, +\code{get_countsMADC()}, \code{get_counts()}, and \code{check_botloci()}. 
+} + +\examples{ +# Example files shipped with the package +madc_file <- system.file("example_MADC_FixedAlleleID.csv", package = "BIGr") +bot_file <- system.file("example_SNPs_DArTag-probe-design_f180bp.botloci", + package = "BIGr") +out_vcf <- tempfile(fileext = ".vcf") + +# Convert MADC to VCF (attempting to recover REF/ALT from probe sequences) +\dontrun{ +madc2vcf_targets( + madc_file = madc_file, + output.file = out_vcf, + botloci_file = bot_file, + get_REF_ALT = TRUE +) +} + +# Clean up (example) +unlink(out_vcf) + +} +\seealso{ +\code{check_madc_sanity()}, \code{get_countsMADC()}, \code{check_botloci()} } diff --git a/man/vmsg.Rd b/man/vmsg.Rd new file mode 100644 index 0000000..abcc768 --- /dev/null +++ b/man/vmsg.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{vmsg} +\alias{vmsg} +\title{Verbose Message Utility} +\usage{ +vmsg(text, verbose = FALSE, level = 1, type = ">>", ...) +} +\arguments{ +\item{text}{Character string, the message to print (supports sprintf formatting).} + +\item{verbose}{Logical. If TRUE, prints the message; if FALSE, suppresses output.} + +\item{level}{Integer, indentation level (0=header, 1=main step, 2=detail, 3=sub-detail).} + +\item{type}{Character string, message type (e.g., "INFO", "WARN", "ERROR"). Only shown for level 0.} + +\item{...}{Additional arguments passed to sprintf for formatting.} +} +\value{ +No return value, called for side effects. +} +\description{ +Prints a formatted verbose message with timestamp, indentation, and type label, if verbose is TRUE. +} +\details{ +Use the verbose argument to control message output. Typically, pass the function's verbose parameter to vmsg. 
+} diff --git a/tests/testthat/test-check_madc_sanity.R b/tests/testthat/test-check_madc_sanity.R new file mode 100644 index 0000000..d0c45cf --- /dev/null +++ b/tests/testthat/test-check_madc_sanity.R @@ -0,0 +1,71 @@ +test_that("check madc",{ + + github_path <- "https://raw.githubusercontent.com/Breeding-Insight/BIGapp-PanelHub/refs/heads/long_seq/test_madcs/" + names <- c("Columns", "FixAlleleIDs", "IUPACcodes", "LowerCase", "Indels", "ChromPos", "allNAcol", "allNArow", "RefAltSeqs", "OtherAlleles") + + # raw madc + report <- read.csv(paste0(github_path,"/alfalfa_madc_raw.csv")) + + res <- check_madc_sanity(report) + exp <- c(TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, FALSE, TRUE, FALSE) + names(exp) <- names + expect_equal(res$checks, exp) + + # test lower case + report <- read.csv(paste0(github_path,"/alfalfa_lowercase.csv")) + + res <- check_madc_sanity(report) + exp <- c(TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE) + names(exp) <- names + expect_equal(res$checks, exp) + + # test IUPAC + report <- read.csv(paste0(github_path,"/alfalfa_IUPAC.csv")) + + res <- check_madc_sanity(report) + exp <- c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE,TRUE, FALSE) + names(exp) <- names + expect_equal(res$checks, exp) + + # clean alfalfa madc (fixed allele IDs, Chr_Pos CloneIDs, no issues) + report <- read.csv(paste0(github_path,"/alfalfa_madc.csv")) + + res <- check_madc_sanity(report) + exp <- c(TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE,TRUE, FALSE) + names(exp) <- names + expect_equal(res$checks, exp) + + # potato indel madc (ChromPos FALSE because IDs are not Chr_Pos) + report <- read.csv(paste0(github_path,"/potato_indel_madc.csv")) + + res <- check_madc_sanity(report) + exp <- c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE,TRUE, FALSE) + names(exp) <- names + expect_equal(res$checks, exp) + + # potato indel IUPAC (IUPAC codes + lowercase + indels) + report <- read.csv(paste0(github_path,"/potato_indel_IUPAC.csv")) + + 
res <- check_madc_sanity(report) + exp <- c(TRUE, TRUE, TRUE, FALSE, TRUE, FALSE, FALSE, FALSE,TRUE, FALSE) + names(exp) <- names + expect_equal(res$checks, exp) + + # potato indel lowercase (lowercase + indels) + report <- read.csv(paste0(github_path,"/potato_indel_lowercase.csv")) + + res <- check_madc_sanity(report) + exp <- c(TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE,TRUE, FALSE) + names(exp) <- names + expect_equal(res$checks, exp) + + # potato more indels madc ChromPosFALSE + report <- read.csv(paste0(github_path,"/potato_more_indels_madc_ChromPosFALSE.csv")) + + res <- check_madc_sanity(report) + exp <- c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE,TRUE, FALSE) + names(exp) <- names + expect_equal(res$checks, exp) +}) + + diff --git a/tests/testthat/test-check_ped.R b/tests/testthat/test-check_ped.R index fdf69e0..706143f 100644 --- a/tests/testthat/test-check_ped.R +++ b/tests/testthat/test-check_ped.R @@ -15,7 +15,7 @@ test_that("test imputation",{ messy_parents <- output.list$messy_parents missing_parents <- output.list$missing_parents - expect_true(df_length == 4) + expect_true(df_length == 5) # Before was 4 expect_true(all(messy_parents$id == c("grandfather2","grandfather3"))) expect_true(nrow(missing_parents) == 13) diff --git a/tests/testthat/test-imputation_concordance.R b/tests/testthat/test-imputation_concordance.R index f1fb421..459998c 100644 --- a/tests/testthat/test-imputation_concordance.R +++ b/tests/testthat/test-imputation_concordance.R @@ -1,6 +1,5 @@ context("Imputation Concordance") - test_that("test imputation",{ #Input variables ignore_file <- system.file("imputation_ignore.txt", package="BIGr") diff --git a/tests/testthat/test-madc2vcf_all.R b/tests/testthat/test-madc2vcf_all.R index f2095c5..c8c860f 100644 --- a/tests/testthat/test-madc2vcf_all.R +++ b/tests/testthat/test-madc2vcf_all.R @@ -22,7 +22,7 @@ test_that("test madc offtargets",{ multiallelic_SNP_sample_thr = 0, alignment_score_thr = 40, out_vcf = temp, - 
verbose = TRUE) + verbose = FALSE) set.seed(456) madc2vcf_all(madc = madc_file, @@ -34,7 +34,7 @@ test_that("test madc offtargets",{ multiallelic_SNP_sample_thr = 0, alignment_score_thr = 40, out_vcf = temp_multi, - verbose = TRUE) + verbose = FALSE) vcf <- read.vcfR(temp) vcf_multi <- read.vcfR(temp_multi) @@ -56,7 +56,7 @@ test_that("test madc offtargets",{ multiallelic_SNP_dp_thr = 0, multiallelic_SNP_sample_thr = 0, out_vcf = temp, - verbose = TRUE) + verbose = FALSE) vcf <- read.vcfR(temp) @@ -65,3 +65,467 @@ test_that("test madc offtargets",{ rm(vcf) }) + +# ======================================================================= +# Using Breeding-Insight/BIGapp-PanelHub test files +# ======================================================================= + +test_that("simu alfalfa",{ + + github_path <- "https://raw.githubusercontent.com/Breeding-Insight/BIGapp-PanelHub/refs/heads/long_seq/" + + # External alfalfa test files + alfalfa_madc <- paste0(github_path, "test_madcs/alfalfa_madc.csv") + alfalfa_madc_wrongID <- paste0(github_path, "test_madcs/alfalfa_madc_wrongID.csv") + alfalfa_madc_raw <- paste0(github_path, "test_madcs/alfalfa_madc_raw.csv") # raw DArT format (7-row header) + alfalfa_iupac <- paste0(github_path, "test_madcs/alfalfa_IUPAC.csv") + alfalfa_lowercase <- paste0(github_path, "test_madcs/alfalfa_lowercase.csv") + alfalfa_botloci <- paste0(github_path, "alfalfa/20201030-BI-Alfalfa_SNPs_DArTag-probe-design_f180bp.botloci") # botloci for alfalfa + alfalfa_markers_info <- paste0(github_path, "alfalfa/20201030-BI-Alfalfa_SNPs_DArTag-probe-design_snpID_lut.csv") # markers_info: CloneID/BI_markerID, Chr, Pos, Ref, Alt + alfalfa_markers_info_ChromPos <- paste0(github_path, "test_madcs/alfalfa_marker_info_ChromPos.csv") # markers_info: CloneID/BI_markerID, Chr, Pos + alfalfa_microhapDB <- paste0(github_path, "alfalfa/alfalfa_allele_db_v001.fa") + + # External potato test files + potato_indel_madc <- paste0(github_path, 
"test_madcs/potato_indel_madc.csv") + potato_indel_iupac <- paste0(github_path, "test_madcs/potato_indel_IUPAC.csv") + potato_indel_lowercase <- paste0(github_path, "test_madcs/potato_indel_lowercase.csv") + potato_more_indels_chrompos_false <- paste0(github_path, "test_madcs/potato_more_indels_madc_ChromPosFALSE.csv") + potato_botloci <- paste0(github_path, "potato/potato_dartag_v2_3915markers_rm7dupTags_6traitMarkers_f150bp_ref_alt.botloci") + potato_markers_info <- paste0(github_path, "potato/potato_dartag_v2_3915markers_rm7dupTags_6traitMarkers_rm1dup_snpID_lut.csv") # CloneID/BI_markerID, Chr, Pos, Ref, Alt + potato_markers_info_ChromPos <- paste0(github_path, "test_madcs/potato_marker_info_chrompos.csv") # markers_info: CloneID/BI_markerID, Chr, Pos + potato_microhapDB <- paste0(github_path, "potato/potato_allele_db_v001.fa") + + skip_if_offline("raw.githubusercontent.com") + + test_that("ALFALFA — clean fixed allele ID MADC", { + out <- tempfile(fileext = ".vcf") + #out <- "test.vcf" + # Default parameters + expect_no_error( + madc2vcf_all(madc = alfalfa_madc, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 2, + rm_multiallelic_SNP = FALSE, + multiallelic_SNP_sample_thr = 0, + multiallelic_SNP_dp_thr = 0, + alignment_score_thr = 40, + out_vcf = out, + verbose = FALSE) + ) + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + expect_true(all(!is.na(vcf@fix[, "REF"]))) + expect_true(all(!is.na(vcf@fix[, "ALT"]))) + DP <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(DP[1,]), 4534) + expect_equal(sum(DP[,5]), 235217) + multi <- grepl(",", vcf@fix[,5]) + expect_true(any(multi)) # It has multiallelics + expect_equal(sum(multi), 9) + unlink(out) + + expect_no_error( + madc2vcf_all(madc = alfalfa_madc, + botloci_file = alfalfa_botloci, + hap_seq_file = NULL, + n.cores = 1, + out_vcf = out, + verbose = FALSE) + ) + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + 
expect_true(all(!is.na(vcf@fix[, "REF"]))) + expect_true(all(!is.na(vcf@fix[, "ALT"]))) + DP <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(DP[1,]), 4534) + expect_equal(sum(DP[,5]), 235217) + multi <- grepl(",", vcf@fix[,5]) + expect_true(any(multi)) # It has multiallelics + expect_equal(sum(multi), 9) + + # Test error when botloci_file is NULL + expect_error( + madc2vcf_all(madc = alfalfa_madc, + botloci_file = NULL, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + out_vcf = out, + verbose = FALSE) + ) + + # Test that it also works when markers_info is provided together with botloci + madc2vcf_all(madc = alfalfa_madc, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + multiallelic_SNP_dp_thr = 80, + multiallelic_SNP_sample_thr = 2, + n.cores = 1, + markers_info = alfalfa_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + expect_true(all(!is.na(vcf@fix[, "REF"]))) + expect_true(all(!is.na(vcf@fix[, "ALT"]))) + DP <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(DP[1,]), 4534) + expect_equal(sum(DP[,5]), 234777) + multi <- grepl(",", vcf@fix[,5]) + expect_true(any(multi)) # It has multiallelics + expect_equal(sum(multi), 3) + + # Remove multiallelics + madc2vcf_all(madc = alfalfa_madc, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + rm_multiallelic_SNP = TRUE, + n.cores = 1, + markers_info = alfalfa_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + expect_true(all(!is.na(vcf@fix[, "REF"]))) + expect_true(all(!is.na(vcf@fix[, "ALT"]))) + DP <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(DP[1,]), 4534) + expect_equal(sum(DP[,5]), 233482) + multi <- grepl(",", vcf@fix[,5]) + expect_false(any(multi)) # Multiallelics were removed + expect_equal(sum(multi), 0) + }) + + test_that("ALFALFA — clean fixed allele ID MADC wrong CloneID", 
{ + out <- tempfile(fileext = ".vcf") + # Test error when botloci provided but no matching CloneID between botloci and MADC + expect_error( + madc2vcf_all(madc = alfalfa_madc_wrongID, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + out_vcf = out, + verbose = FALSE), + regexp = "Check marker IDs in both MADC and botloci files. They should be the same." + ) + + # Test error when markers_info does not match MADC CloneIDs + expect_error( + madc2vcf_all(madc = alfalfa_madc_wrongID, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + markers_info = alfalfa_markers_info, + out_vcf = out, + verbose = FALSE), + regexp = "None of the markers_info CloneID values match the MADC CloneID column. Please make sure they use the same marker IDs." + ) + + # With markers_info providing Chr/Pos, the mismatched CloneIDs can be recovered, so this call is expected to succeed + madc2vcf_all(madc = alfalfa_madc_wrongID, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + markers_info = alfalfa_markers_info_ChromPos, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + check <- check[-which(is.na(check$Pos)),] + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + DP <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(DP[1,]), 4534) + expect_equal(sum(DP[,5]), 235217) + multi <- grepl(",", vcf@fix[,5]) + + }) + + test_that("alfalfa lower case missing 3 ref and 1 alt fixed MADC", { + out <- tempfile(fileext = ".vcf") + madc2vcf_all(madc = alfalfa_lowercase, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, 
verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + check <- check[-which(is.na(check$Pos)),] + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 4534) + expect_equal(sum(dp[,5]), 233719) + + madc2vcf_all(madc = alfalfa_lowercase, + botloci_file = alfalfa_botloci, + hap_seq_file = NULL, + n.cores = 1, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + check <- check[-which(is.na(check$Pos)),] + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 4534) + expect_equal(sum(dp[,5]), 230415) + + madc2vcf_all(madc = alfalfa_lowercase, + botloci_file = alfalfa_botloci, + hap_seq_file = NULL, + n.cores = 1, + markers_info = alfalfa_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + check <- check[-which(is.na(check$Pos)),] + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 4534) + expect_equal(sum(dp[,5]), 230415) + + }) + + test_that("alfalfa IUPAC code", { + out <- tempfile(fileext = ".vcf") + # IUPAC codes cause a stop in madc2vcf_all + expect_error( + madc2vcf_all(madc = alfalfa_iupac, + 
botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + out_vcf = out, + verbose = FALSE), + regexp = "IUPAC \\(non-ATCG\\) codes found in AlleleSequence\\. This codes are not currently supported by BIGr/BIGapp\\. Run HapApp to replace them" + ) + }) + + test_that("potato indel madc chrompos=FALSE", { + out <- tempfile(fileext = ".vcf") + # Indels detected, no markers_info with Ref/Alt/Indel_pos -> error + expect_error( + madc2vcf_all(madc = potato_indel_madc, + botloci_file = potato_botloci, + hap_seq_file = potato_microhapDB, + n.cores = 1, + out_vcf = out, + verbose = FALSE), + regexp = "CloneID does not have the expected Chromosome_Position format. Please check your CloneIDs or provide a file with this information" + + ) + + madc2vcf_all(madc = potato_indel_madc, + botloci_file = potato_botloci, + hap_seq_file = potato_microhapDB, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + check <- check[-which(is.na(check$Ref)),] + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 226838) + expect_equal(sum(dp[3,]), 3996) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_all(madc = potato_indel_madc, + botloci_file = potato_botloci, + hap_seq_file = NULL, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + check <- check[-which(is.na(check$Ref)),] + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", 
as.numeric = TRUE) + expect_equal(sum(dp[1,]), 3937) + expect_equal(sum(dp[,5]), 248571) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + # ChromPos=FALSE and no markers_info -> error + expect_error( + madc2vcf_all(madc = potato_indel_madc, + botloci_file = potato_botloci, + hap_seq_file = NULL, + n.cores = 1, + out_vcf = out, + verbose = FALSE), + regexp = "CloneID does not have the expected Chromosome_Position format. Please check your CloneIDs or provide a file with this information" + ) + + }) + + test_that("potato indel chromposFALSE", { + out <- tempfile(fileext = ".vcf") + # Indels detected, no markers_info with Ref/Alt/Indel_pos -> error + expect_error( + madc2vcf_all(madc = potato_more_indels_chrompos_false, + botloci_file = potato_botloci, + hap_seq_file = potato_microhapDB, + n.cores = 1, + out_vcf = out, + verbose = FALSE), + regexp = "CloneID does not have the expected Chromosome_Position format. Please check your CloneIDs or provide a file with this information" + ) + + madc2vcf_all(madc = potato_more_indels_chrompos_false, + botloci_file = potato_botloci, + hap_seq_file = potato_microhapDB, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + check <- check[-which(is.na(check$Ref)),] + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 5397) + expect_equal(sum(dp[,5]), 215070) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_all(madc = potato_more_indels_chrompos_false, + botloci_file = potato_botloci, + hap_seq_file = NULL, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- 
read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + check <- check[-which(is.na(check$Ref)),] + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 5397) + expect_equal(sum(dp[,5]), 215070) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + # ChromPos=FALSE and no markers_info -> error + expect_error( + madc2vcf_all(madc = potato_more_indels_chrompos_false, + botloci_file = potato_botloci, + hap_seq_file = NULL, + n.cores = 1, + out_vcf = out, + verbose = FALSE), + regexp = "CloneID does not have the expected Chromosome_Position format. Please check your CloneIDs or provide a file with this information" + ) + }) + + test_that("potato lowercase", { + out <- tempfile(fileext = ".vcf") + madc2vcf_all(madc = potato_indel_lowercase, + botloci_file = potato_botloci, + hap_seq_file = potato_microhapDB, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + check <- check[-which(is.na(check$Ref)),] + + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 219742) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + # markers_info without Ref/Alt/Indel_pos while indels present -> error + expect_error( + madc2vcf_all(madc = potato_indel_lowercase, + botloci_file = potato_botloci, + hap_seq_file = potato_microhapDB, + n.cores = 1, + markers_info = potato_markers_info_ChromPos, + out_vcf = out, + verbose = FALSE), + regexp = "Indels detected in MADC file. The markers_info file must contain 'Ref', 'Alt', and 'Indel_pos' columns." 
+ ) + }) + + test_that("potato IUPAC", { + out <- tempfile(fileext = ".vcf") + + expect_error( + madc2vcf_all(madc = potato_indel_iupac, + botloci_file = potato_botloci, + hap_seq_file = potato_microhapDB, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE), + regexp = "IUPAC \\(non-ATCG\\) codes found in AlleleSequence. This codes are not currently supported by BIGr/BIGapp. Run HapApp to replace them" + ) + }) + + test_that("alfalfa raw MADC format (7-row header)", { + out <- tempfile(fileext = ".vcf") + # Raw format fails FixAlleleIDs check -> madc2vcf_all stops + expect_error( + madc2vcf_all(madc = alfalfa_madc_raw, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + out_vcf = out, + verbose = FALSE), regexp = "MADC not processed by HapApp" + ) + }) +}) + diff --git a/tests/testthat/test-madc2vcf_multi.R b/tests/testthat/test-madc2vcf_multi.R new file mode 100644 index 0000000..adca090 --- /dev/null +++ b/tests/testthat/test-madc2vcf_multi.R @@ -0,0 +1,150 @@ +context("MADC to VCF via polyRAD") + +# ======================================================================= +# Using Breeding-Insight/BIGapp-PanelHub test files +# ======================================================================= + +test_that("madc2vcf_multi — alfalfa (BIGapp-PanelHub)", { + + skip_if_not_installed("polyRAD") + skip_if_not_installed("VariantAnnotation") + + github_path <- "https://raw.githubusercontent.com/Breeding-Insight/BIGapp-PanelHub/refs/heads/long_seq/" + + # External alfalfa test files + alfalfa_madc <- paste0(github_path, "test_madcs/alfalfa_madc.csv") + alfalfa_madc_wrongID <- paste0(github_path, "test_madcs/alfalfa_madc_wrongID.csv") + alfalfa_madc_raw <- paste0(github_path, "test_madcs/alfalfa_madc_raw.csv") # raw DArT format (7-row header) + alfalfa_iupac <- paste0(github_path, "test_madcs/alfalfa_IUPAC.csv") + alfalfa_lowercase <- paste0(github_path, "test_madcs/alfalfa_lowercase.csv") + 
alfalfa_botloci <- paste0(github_path, "alfalfa/20201030-BI-Alfalfa_SNPs_DArTag-probe-design_f180bp.botloci") # botloci for alfalfa + alfalfa_markers_info <- paste0(github_path, "alfalfa/20201030-BI-Alfalfa_SNPs_DArTag-probe-design_snpID_lut.csv") # markers_info: CloneID/BI_markerID, Chr, Pos, Ref, Alt + alfalfa_markers_info_ChromPos <- paste0(github_path, "test_madcs/alfalfa_marker_info_ChromPos.csv") # markers_info: CloneID/BI_markerID, Chr, Pos + alfalfa_microhapDB <- paste0(github_path, "alfalfa/alfalfa_allele_db_v001.fa") + + # External potato test files + potato_indel_madc <- paste0(github_path, "test_madcs/potato_indel_madc.csv") + potato_indel_iupac <- paste0(github_path, "test_madcs/potato_indel_IUPAC.csv") + potato_indel_lowercase <- paste0(github_path, "test_madcs/potato_indel_lowercase.csv") + potato_more_indels_chrompos_false <- paste0(github_path, "test_madcs/potato_more_indels_madc_ChromPosFALSE.csv") + potato_botloci <- paste0(github_path, "potato/potato_dartag_v2_3915markers_rm7dupTags_6traitMarkers_f150bp_ref_alt.botloci") + potato_markers_info <- paste0(github_path, "potato/potato_dartag_v2_3915markers_rm7dupTags_6traitMarkers_rm1dup_snpID_lut.csv") # CloneID/BI_markerID, Chr, Pos, Ref, Alt + potato_markers_info_ChromPos <- paste0(github_path, "test_madcs/potato_marker_info_chrompos.csv") # markers_info: CloneID/BI_markerID, Chr, Pos + potato_microhapDB <- paste0(github_path, "potato/potato_allele_db_v001.fa") + + skip_if_offline("raw.githubusercontent.com") + + out <- tempfile(fileext = ".vcf") + # Fixed allele ID format + expect_no_error( + madc2vcf_multi( + madc_file = alfalfa_madc, + botloci_file = alfalfa_botloci, + outfile = out, + ploidy = 4L, + verbose = TRUE + ) + ) + + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + expect_equal(sum(grepl(",", vcf@fix[,5])), 281) + GT <- extract.gt(vcf) + expect_equal(GT[3,5],"0/0/0/3") + + # Don't allow raw MADC + out <- tempfile(fileext = ".vcf") + expect_error( + 
madc2vcf_multi( + madc_file = alfalfa_madc_raw, + botloci_file = alfalfa_botloci, + outfile = out, + ploidy = 4L, + verbose = FALSE + ), regexp = "The MADC file does not have fixed AlleleIDs. Please process the MADC file through HapApp before using this function." + ) + + out <- tempfile(fileext = ".vcf") + expect_no_error( + madc2vcf_multi( + madc_file = alfalfa_madc, + botloci_file = alfalfa_botloci, + outfile = out, + ploidy = 4L, + verbose = TRUE + ) + ) + + # Wrong IDs + out <- tempfile(fileext = ".vcf") + expect_error( + madc2vcf_multi( + madc_file = alfalfa_madc_wrongID, + botloci_file = alfalfa_botloci, + outfile = out, + ploidy = 4L, + verbose = TRUE + ), regexp = "Check marker IDs in both MADC and botloci files. They should be the same." + ) + + out <- tempfile(fileext = ".vcf") + expect_no_error( + madc2vcf_multi( + madc_file = alfalfa_madc_wrongID, + botloci_file = alfalfa_botloci, + outfile = out, + markers_info = alfalfa_markers_info_ChromPos, + ploidy = 4L, + verbose = TRUE + ) + ) + + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + expect_equal(sum(grepl(",", vcf@fix[,5])), 281) + GT <- extract.gt(vcf) + expect_equal(GT[3,5],"0/0/0/3") + + ### Avoid IUPAC codes + out <- tempfile(fileext = ".vcf") + expect_error( + madc2vcf_multi( + madc_file = alfalfa_iupac, + botloci_file = alfalfa_botloci, + outfile = out, + ploidy = 4L, + verbose = TRUE + ), regexp = "MADC Allele Sequences contain IUPAC \\(non-ATCG\\) codes. Please run HapApp to clean MADC file before using this function." + ) + + out <- tempfile(fileext = ".vcf") + expect_error( + madc2vcf_multi( + madc_file = alfalfa_lowercase, + botloci_file = alfalfa_botloci, + outfile = out, + ploidy = 4L, + verbose = TRUE + ), regexp = "Not all Ref sequences have a corresponding Alt or vice versa. Please provide a complete MADC file before using this function." 
+ ) + + out <- tempfile(fileext = ".vcf") + expect_no_error( + madc2vcf_multi( + madc_file = potato_indel_madc, + botloci_file = potato_botloci, + outfile = out, + markers_info = potato_markers_info_ChromPos, + ploidy = 4L, + verbose = TRUE + ) + ) + + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + expect_equal(sum(grepl(",", vcf@fix[,5])), 277) + GT <- extract.gt(vcf) + expect_equal(GT[3,5],"0/1/1/6") + +}) + diff --git a/tests/testthat/test-madc2vcf_targets.R b/tests/testthat/test-madc2vcf_targets.R index 524b687..a64da34 100644 --- a/tests/testthat/test-madc2vcf_targets.R +++ b/tests/testthat/test-madc2vcf_targets.R @@ -69,33 +69,10 @@ test_that("bottom strand markers have correct REF/ALT", { # Get results from both functions suppressWarnings( madc2vcf_targets(madc_file = madc_file, output.file = temp_targets, - get_REF_ALT = TRUE, botloci_file = bot_file) - ) - - suppressWarnings( - madc2vcf_all(madc = madc_file, botloci_file = bot_file, - hap_seq_file = NULL, out_vcf = temp_all, verbose = FALSE) + get_REF_ALT = TRUE, botloci_file = bot_file) ) vcf_targets <- read.vcfR(temp_targets, verbose = FALSE) - vcf_all <- read.vcfR(temp_all, verbose = FALSE) - - # Find common markers between both outputs - common_markers <- intersect(vcf_targets@fix[,"ID"], vcf_all@fix[,"ID"]) - - if(length(common_markers) > 0) { - # Compare REF/ALT for common markers - targets_subset <- vcf_targets@fix[vcf_targets@fix[,"ID"] %in% common_markers,] - all_subset <- vcf_all@fix[vcf_all@fix[,"ID"] %in% common_markers,] - - # Sort both by ID for comparison - targets_subset <- targets_subset[order(targets_subset[,"ID"]),] - all_subset <- all_subset[order(all_subset[,"ID"]),] - - # Check that REF/ALT match between the two functions - expect_equal(targets_subset[,"REF"], all_subset[,"REF"]) - expect_equal(targets_subset[,"ALT"], all_subset[,"ALT"]) - } # Validate that all REF/ALT are valid nucleotides expect_true(all(vcf_targets@fix[,"REF"] %in% c("A", "T", "G", "C", 
"."))) @@ -107,5 +84,644 @@ test_that("bottom strand markers have correct REF/ALT", { expect_true(all(vcf_targets@fix[valid_snps,"REF"] != vcf_targets@fix[valid_snps,"ALT"])) } - rm(vcf_targets, vcf_all, temp_targets, temp_all) + rm(vcf_targets, temp_targets) +}) + + +# ======================================================================= +# Using Breeding-Insight/BIGapp-PanelHub test files +# ======================================================================= + +test_that("simu alfalfa",{ + + github_path <- "https://raw.githubusercontent.com/Breeding-Insight/BIGapp-PanelHub/refs/heads/long_seq/" + + # External alfalfa test files + alfalfa_madc <- paste0(github_path, "test_madcs/alfalfa_madc.csv") + alfalfa_madc_wrongID <- paste0(github_path, "test_madcs/alfalfa_madc_wrongID.csv") + alfalfa_madc_raw <- paste0(github_path, "test_madcs/alfalfa_madc_raw.csv") # raw DArT format (7-row header) + alfalfa_iupac <- paste0(github_path, "test_madcs/alfalfa_IUPAC.csv") + alfalfa_lowercase <- paste0(github_path, "test_madcs/alfalfa_lowercase.csv") + alfalfa_botloci <- paste0(github_path, "alfalfa/20201030-BI-Alfalfa_SNPs_DArTag-probe-design_f180bp.botloci") # botloci for alfalfa + alfalfa_markers_info <- paste0(github_path, "alfalfa/20201030-BI-Alfalfa_SNPs_DArTag-probe-design_snpID_lut.csv") # markers_info: CloneID/BI_markerID, Chr, Pos, Ref, Alt + alfalfa_markers_info_ChromPos <- paste0(github_path, "test_madcs/alfalfa_marker_info_ChromPos.csv") # markers_info: CloneID/BI_markerID, Chr, Pos + + + # External potato test files + potato_indel_madc <- paste0(github_path, "test_madcs/potato_indel_madc.csv") + potato_indel_iupac <- paste0(github_path, "test_madcs/potato_indel_IUPAC.csv") + potato_indel_lowercase <- paste0(github_path, "test_madcs/potato_indel_lowercase.csv") + potato_more_indels_chrompos_false <- paste0(github_path, "test_madcs/potato_more_indels_madc_ChromPosFALSE.csv") + potato_botloci <- paste0(github_path, 
"potato/potato_dartag_v2_3915markers_rm7dupTags_6traitMarkers_f150bp_ref_alt.botloci") + potato_markers_info <- paste0(github_path, "potato/potato_dartag_v2_3915markers_rm7dupTags_6traitMarkers_rm1dup_snpID_lut.csv") # CloneID/BI_markerID, Chr, Pos, Ref, Alt + potato_markers_info_ChromPos <- paste0(github_path, "test_madcs/potato_marker_info_chrompos.csv") # markers_info: CloneID/BI_markerID, Chr, Pos + + skip_if_offline("raw.githubusercontent.com") + + test_that("ALFALFA — clean fixed allele ID MADC", { + out <- tempfile(fileext = ".vcf") + expect_no_error( + madc2vcf_targets(madc_file = alfalfa_madc, + output.file = out, + get_REF_ALT = FALSE, + verbose = FALSE) + ) + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + expect_true(all(is.na(vcf@fix[, "REF"]))) + expect_true(all(is.na(vcf@fix[, "ALT"]))) + DP <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(DP[1,]), 4139) + expect_equal(sum(DP[,5]), 42869) + unlink(out) + + expect_no_error( + madc2vcf_targets(madc_file = alfalfa_madc, + output.file = out, + get_REF_ALT = FALSE, + collapse_matches_counts = TRUE, + verbose = TRUE) + ) + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + expect_true(all(is.na(vcf@fix[, "REF"]))) + expect_true(all(is.na(vcf@fix[, "ALT"]))) + DP <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(DP[1,]), 4534) + expect_equal(sum(DP[,5]), 56547) + + # Test error when get_REF_ALT = TRUE but no markers_info or botloci provided to extract REF/ALT + expect_error( + madc2vcf_targets(madc_file = alfalfa_madc, + output.file = out, + get_REF_ALT = TRUE, + verbose = FALSE), + regexp = "get_REF_ALT = TRUE but no markers_info file with Ref and Alt columns was provided neither a botloci_file to extrat REF/ALT from probe sequences. Please provide one of the these files or set get_REF_ALT to FALSE." 
+ ) + + # Test that it works when get_REF_ALT = TRUE and botloci is provided (REF/ALT recovered from probe sequences) + madc2vcf_targets(madc_file = alfalfa_madc, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + + # Test that it also works when markers_info is provided together with botloci (should give same result as above but just to confirm no interference between the two) + madc2vcf_targets(madc_file = alfalfa_madc, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + markers_info = alfalfa_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + + }) + + test_that("ALFALFA — clean fixed allele ID MADC wrong CloneID", { + out <- tempfile(fileext = ".vcf") + # Test error when botloci provided but no matching CloneID between botloci and MADC (even after trying to fix potential padding mismatch with ChromPos info in markers_info) + expect_error( + madc2vcf_targets(madc_file = alfalfa_madc_wrongID, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + verbose = FALSE), + regexp = "Check marker IDs in both MADC and botloci files. They should be the same." 
+ ) + + # Test error when no matching CloneID between markers_info and MADC to fix the botloci mismatch issue (even if botloci file is not used, the function should still check that the provided markers_info can match with MADC CloneIDs to be able to use the ChromPos info to fix potential padding mismatch) + expect_error( + madc2vcf_targets(madc_file = alfalfa_madc_wrongID, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + markers_info = alfalfa_markers_info, + verbose = FALSE), + "None of the MADC CloneID could be found in the markers_info CloneID or BI_markerID. Please make sure they match." + ) + + # Test that it works when the function can find a matching ID in markers_info to fix the botloci mismatch issue + # (even if botloci file is not used, the function should still be able to use the ChromPos info in markers_info to + # fix potential padding mismatch and find matching IDs between MADC and botloci) + madc2vcf_targets(madc_file = alfalfa_madc_wrongID, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + markers_info = alfalfa_markers_info_ChromPos, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + }) + + test_that("alfalfa lower case fixed MADC", { + out <- tempfile(fileext = ".vcf") + expect_warning( + madc2vcf_targets(madc_file = alfalfa_lowercase, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + verbose = FALSE) + ) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- 
lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43017) + + expect_warning( + madc2vcf_targets(madc_file = alfalfa_lowercase, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + markers_info = alfalfa_markers_info, + verbose = FALSE) + ) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43017) + + expect_warning( + madc2vcf_targets(madc_file = alfalfa_lowercase, + output.file = out, + get_REF_ALT = FALSE, + botloci_file = alfalfa_botloci, + markers_info = alfalfa_markers_info, + verbose = FALSE) + ) + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43017) + }) + + test_that("alfalfa IUPAC code", { + out <- tempfile(fileext = ".vcf") + expect_error( + madc2vcf_targets(madc_file = alfalfa_iupac, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + verbose = FALSE) + ) + + madc2vcf_targets(madc_file = alfalfa_iupac, + output.file = out, + get_REF_ALT = TRUE, + markers_info = alfalfa_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos 
<- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_targets(madc_file = alfalfa_iupac, + output.file = out, + get_REF_ALT = TRUE, + markers_info = alfalfa_markers_info, + collapse_matches_counts = TRUE, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 4534) + expect_equal(sum(dp[,5]), 56547) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_targets(madc_file = alfalfa_iupac, + output.file = out, + get_REF_ALT = FALSE, + botloci_file = alfalfa_botloci, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + + }) + + test_that("potato indel madc chrompos=FALSE", { + out <- tempfile(fileext = ".vcf") + expect_error( + madc2vcf_targets(madc_file = potato_indel_madc, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = potato_botloci, + verbose = FALSE) + ) + + madc2vcf_targets(madc_file = potato_indel_madc, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- 
lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41656) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_targets(madc_file = potato_indel_madc, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info, + collapse_matches_counts = TRUE, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 5163) + expect_equal(sum(dp[,5]), 58927) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + expect_error( + madc2vcf_targets(madc_file = potato_indel_madc, + output.file = out, + get_REF_ALT = FALSE, + botloci_file = potato_botloci, + verbose = FALSE) + ) + + madc2vcf_targets(madc_file = potato_indel_madc, + output.file = out, + get_REF_ALT = FALSE, + markers_info = potato_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41656) + }) + + test_that("potato indel chromposFALSE", { + out <- tempfile(fileext = ".vcf") + expect_error( + madc2vcf_targets(madc_file = potato_more_indels_chrompos_false, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = potato_botloci, + verbose = FALSE) + ) + + madc2vcf_targets(madc_file = potato_more_indels_chrompos_false, + output.file = out, + get_REF_ALT = TRUE, + markers_info = 
potato_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_targets(madc_file = potato_more_indels_chrompos_false, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info, + collapse_matches_counts = TRUE, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 6301) + expect_equal(sum(dp[,5]), 53613) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + expect_error( + madc2vcf_targets(madc_file = potato_more_indels_chrompos_false, + output.file = out, + get_REF_ALT = FALSE, + botloci_file = potato_botloci, + verbose = FALSE) + ) + + madc2vcf_targets(madc_file = potato_more_indels_chrompos_false, + output.file = out, + get_REF_ALT = FALSE, + markers_info = potato_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + }) + + test_that("potato lowercase", { + out <- tempfile(fileext = ".vcf") + madc2vcf_targets(madc_file = potato_indel_lowercase, + output.file = out, + get_REF_ALT = TRUE, + 
markers_info = potato_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + expect_error( + madc2vcf_targets(madc_file = potato_indel_lowercase, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info_ChromPos, + botloci_file = potato_botloci, + verbose = FALSE) + ) + + madc2vcf_targets(madc_file = potato_indel_lowercase, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info, + botloci_file = potato_botloci, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_targets(madc_file = potato_indel_lowercase, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info, + collapse_matches_counts = TRUE, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 6301) + expect_equal(sum(dp[,5]), 53613) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_targets(madc_file = 
potato_indel_lowercase, + output.file = out, + get_REF_ALT = FALSE, + markers_info = potato_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + }) + + + test_that("potato IUPAC", { + out <- tempfile(fileext = ".vcf") + madc2vcf_targets(madc_file = potato_indel_iupac, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_targets(madc_file = potato_indel_iupac, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info, + collapse_matches_counts = TRUE, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 6301) + expect_equal(sum(dp[,5]), 53613) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_targets(madc_file = potato_indel_iupac, + output.file = out, + get_REF_ALT = FALSE, + markers_info = potato_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) 
+ vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + }) + + test_that("alfalfa raw MADC format (7-row header)", { + out <- tempfile(fileext = ".vcf") + # get_REF_ALT = FALSE: same counts as alfalfa_madc + expect_error( + madc2vcf_targets(madc_file = alfalfa_madc_raw, + output.file = out, + get_REF_ALT = TRUE, + verbose = FALSE) + ) + + expect_error( + madc2vcf_targets(madc_file = alfalfa_madc_raw, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + verbose = FALSE) + ) + + madc2vcf_targets(madc_file = alfalfa_madc_raw, + output.file = out, + get_REF_ALT = FALSE, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + + madc2vcf_targets(madc_file = alfalfa_madc_raw, + output.file = out, + get_REF_ALT = FALSE, + collapse_matches_counts = TRUE, + verbose = TRUE) + + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + expect_true(all(is.na(vcf@fix[, "REF"]))) + expect_true(all(is.na(vcf@fix[, "ALT"]))) + DP <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(DP[1,]), 4534) + expect_equal(sum(DP[,5]), 56547) + + madc2vcf_targets(madc_file = alfalfa_madc_raw, + output.file = out, + get_REF_ALT = TRUE, + markers_info = alfalfa_markers_info, + collapse_matches_counts = FALSE, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- 
cbind(vcf_infos,lut_infos) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + + madc2vcf_targets(madc_file = alfalfa_madc_raw, + output.file = out, + get_REF_ALT = TRUE, + markers_info = alfalfa_markers_info, + collapse_matches_counts = TRUE, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 4534) + expect_equal(sum(dp[,5]), 56547) + }) })