diff --git a/.gitignore b/.gitignore index 3b74a301..9e2f8816 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,6 @@ *.ASOIAF.ged *.Rproj *.[rR][dD]ata -*.[rR]ds *.dbf *.doc* *.eps @@ -52,5 +51,7 @@ tests/testthat/Rplots.pdf vignettes/articles/paper.html vignettes/rewritten_relatedness_vignette.Rmd vignettes/rewritten_relatedness_vignette.Xmd +revdep/ +/.claude vignettes/understanding_relatedness.Rmd vignettes/understanding_relatedness.Xmd diff --git a/NAMESPACE b/NAMESPACE index fe6d1480..4de83c70 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -21,6 +21,7 @@ export(computeParentAdjacency) export(createGenDataFrame) export(determineSex) export(dropLink) +export(findLeaves) export(fitComponentModel) export(fitPedigreeModel) export(getWikiTreeSummary) @@ -57,6 +58,7 @@ export(summarizeMatrilines) export(summarizePatrilines) export(summarizePedigrees) export(traceTreePaths) +export(trimPedigree) export(vech) importFrom(Matrix,Diagonal) importFrom(Matrix,sparseMatrix) diff --git a/NEWS.md b/NEWS.md index aa6b581a..7bb74d58 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,8 @@ * Fixed missing checkpoint for ram_checkpoint * Try a chunk_size argument for ped2com to reduce memory usage during transpose * Try filter method for whose relatedness to return by individual ID +* Renamed `ytemp` parameter to `obs_ids` in `buildOneFamilyGroup()` and `buildFamilyGroups()` for clarity +* Expanded v6 vignette with data requirements reference and real-data workflow using the `hazard` dataset # BGmisc 1.6.0.1 ## CRAN submission diff --git a/R/buildComponent.R b/R/buildComponent.R index 2802b229..4b2cdbd0 100644 --- a/R/buildComponent.R +++ b/R/buildComponent.R @@ -222,9 +222,26 @@ ped2com <- function(ped, component, } # isPar is the adjacency matrix. 'A' matrix from RAM - if (config$component %in% c("common nuclear")) { Matrix::diag(isPar) <- 1 + + if (!is.null(config$keep_ids)) { + isPar <- .subsetKeepIds( + component = isPar, + keep_ids = keep_ids, + available_ids = rownames(isPar), + config = config, + drop = FALSE, + verbose_message = "Subsetting adjacency matrix to %d target individuals\n" + ) # also need to drop columns here because the adjacency matrix is used in the path tracing and we want to make sure the paths are correct for the target individuals. We will keep all columns for the path tracing but subset to the target rows so that the relatedness values are correct for the target individuals. + + if (length(rownames(isPar)) > 1) { + isPar <- isPar[, rownames(isPar), drop = FALSE] + } # else { + # isPar <- isPar[rownames(isPar)] + # } + # isPar <- isPar[, rownames(isPar)] # + } if (config$sparse == FALSE) { isPar <- as.matrix(isPar) } diff --git a/R/buildmxPedigrees.R b/R/buildmxPedigrees.R index 7bbcff3e..3e61fe83 100644 --- a/R/buildmxPedigrees.R +++ b/R/buildmxPedigrees.R @@ -20,7 +20,8 @@ buildPedigreeModelCovariance <- function( vars = list( ad2 = 0.5, dd2 = 0.3, - cn2 = 0.2, ce2 = 0.4, + cn2 = 0.2, + ce2 = 0.4, mt2 = 0.1, am2 = 0.25, ee2 = 0.6 @@ -104,8 +105,10 @@ buildPedigreeModelCovariance <- function( #' @param Mtdmat Mitochondrial genetic relatedness matrix (from \code{\link{ped2mit}}). #' @param Amimat Additive by mitochondrial interaction relatedness matrix. #' @param Dmgmat Dominance genetic relatedness matrix. -#' @param full_df_row A 1-row matrix of observed data with column names matching \code{ytemp}. -#' @param ytemp A character vector of variable names corresponding to the observed data columns. +#' @param full_df_row A 1-row matrix of observed data with column names matching \code{obs_ids}. +#' @param obs_ids A character vector of individual IDs corresponding to the columns of +#' \code{full_df_row} and the rows/columns of the relatedness matrices. Must be in the +#' same order as the relatedness matrix rows. #' @return An OpenMx model for the specified family group. #' @export @@ -118,7 +121,7 @@ buildOneFamilyGroup <- function( Amimat = NULL, Dmgmat = NULL, full_df_row, - ytemp + obs_ids ) { if (!requireNamespace("OpenMx", quietly = TRUE)) { stop("OpenMx package is required for buildOneFamilyGroup function. Please install it.") @@ -208,10 +211,10 @@ buildOneFamilyGroup <- function( OpenMx::mxData(observed = full_df_row, type = "raw", sort = FALSE), OpenMx::mxMatrix("Full", nrow = 1, ncol = fsize, name = "M", free = TRUE, - labels = "meanLI", dimnames = list(NULL, ytemp) + labels = "meanLI", dimnames = list(NULL, obs_ids) ), OpenMx::mxAlgebraFromString(algebra_str, - name = "V", dimnames = list(ytemp, ytemp) + name = "V", dimnames = list(obs_ids, obs_ids) ), OpenMx::mxExpectationNormal(covariance = "V", means = "M"), OpenMx::mxFitFunctionML() @@ -227,7 +230,8 @@ buildOneFamilyGroup <- function( #' provided relatedness matrices and observed data. #' #' @param dat A data frame where each row represents a family group and columns correspond to observed variables. -#' @param ytemp A vector of variable names corresponding to the observed data. +#' @param obs_ids A character vector of individual IDs corresponding to the columns of \code{dat} +#' and the rows/columns of the relatedness matrices. #' @param Addmat Additive genetic relatedness matrix. #' @param Nucmat Nuclear family shared environment relatedness matrix. #' @param Extmat Extended family shared environment relatedness matrix. @@ -239,7 +243,7 @@ buildOneFamilyGroup <- function( #' @export buildFamilyGroups <- function( - dat, ytemp, + dat, obs_ids, Addmat = NULL, Nucmat = NULL, Extmat = NULL, @@ -256,17 +260,17 @@ buildFamilyGroups <- function( groups <- vector("list", numfam) for (afam in seq_len(numfam)) { - full_df_row <- matrix(dat[afam, ], nrow = 1, dimnames = list(NULL, ytemp)) + full_df_row <- matrix(dat[afam, ], nrow = 1, dimnames = list(NULL, obs_ids)) groups[[afam]] <- buildOneFamilyGroup( - group_name = paste0(prefix, afam), - Addmat = Addmat, - Nucmat = Nucmat, - Extmat = Extmat, - Mtdmat = Mtdmat, - Amimat = Amimat, - Dmgmat = Dmgmat, + group_name = paste0(prefix, afam), + Addmat = Addmat, + Nucmat = Nucmat, + Extmat = Extmat, + Mtdmat = Mtdmat, + Amimat = Amimat, + Dmgmat = Dmgmat, full_df_row = full_df_row, - ytemp = ytemp + obs_ids = obs_ids ) } @@ -347,6 +351,7 @@ buildPedigreeMx <- function(model_name, vars, group_models) { #' @param group_models Optional list of pre-built OpenMx family group models #' (from \code{\link{buildOneFamilyGroup}}). If NULL, they are generated from \code{data} #' using the provided relatedness matrices. +#' @param intervals Logical. If TRUE (default), compute confidence intervals for the parameters using \code{mxSE} and \code{mxCI}. #' @param Addmat Additive genetic relatedness matrix. Required when \code{group_models} is NULL. #' @param Nucmat Common nuclear environment relatedness matrix. Optional. #' @param Extmat Common extended environment relatedness matrix. Optional. @@ -363,7 +368,8 @@ fitPedigreeModel <- function( vars = list( ad2 = 0.5, dd2 = 0.3, - cn2 = 0.2, ce2 = 0.4, + cn2 = 0.2, + ce2 = 0.4, mt2 = 0.1, am2 = 0.25, ee2 = 0.6 @@ -371,6 +377,7 @@ fitPedigreeModel <- function( data = NULL, group_models = NULL, tryhard = TRUE, + intervals = TRUE, Addmat = NULL, Nucmat = NULL, Extmat = NULL, @@ -387,10 +394,10 @@ fitPedigreeModel <- function( if (is.null(data)) { stop("Either 'group_models' or 'data' must be provided.") } - ytemp <- colnames(data) + obs_ids <- colnames(data) group_models <- buildFamilyGroups( dat = data, - ytemp = ytemp, + obs_ids = obs_ids, Addmat = Addmat, Nucmat = Nucmat, Extmat = Extmat, @@ -405,10 +412,10 @@ fitPedigreeModel <- function( vars = vars, group_models = group_models ) - if (tryhard) { - fitted_model <- OpenMx::mxTryHard(pedigree_model, silent = TRUE, extraTries = 10, intervals = TRUE) + if (tryhard == TRUE) { + fitted_model <- OpenMx::mxTryHard(pedigree_model, silent = TRUE, extraTries = 10, intervals = intervals) } else { - fitted_model <- OpenMx::mxRun(pedigree_model) + fitted_model <- OpenMx::mxRun(pedigree_model, intervals = intervals) } fitted_model } diff --git a/R/trimPedigree.R b/R/trimPedigree.R new file mode 100644 index 00000000..0cbf9319 --- /dev/null +++ b/R/trimPedigree.R @@ -0,0 +1,244 @@ +#' Find Leaf Nodes in a Pedigree +#' +#' Identifies individuals who are structural "leaves" in the pedigree network --- +#' those who can potentially be removed without substantially altering the +#' connectivity of the larger tree. +#' +#' Two types of leaves are identified: +#' \itemize{ +#' \item \strong{Terminal nodes}: individuals with outdegree 0, meaning they +#' have no children recorded in the pedigree. Controlled by +#' \code{include_terminal}. +#' \item \strong{Founder singletons}: individuals with indegree 0 \emph{and} +#' outdegree 1, meaning they are founders (no recorded parents) who appear +#' as a parent of exactly one child. Controlled by +#' \code{include_founder_singletons}. +#' } +#' +#' In the directed pedigree graph used by \code{\link{ped2graph}}, edges run +#' from \strong{parent to child}. Consequently, indegree reflects the number of +#' recorded parents and outdegree reflects the number of recorded children. +#' +#' @inheritParams ped2fam +#' @param include_terminal Logical. If \code{TRUE} (default), flag individuals +#' with no children (outdegree 0) as leaves. +#' @param include_founder_singletons Logical. If \code{TRUE} (default), also +#' flag founders with exactly one child (indegree 0, outdegree 1) as leaves. +#' @param keep_var Character. Optional column name of a phenotypic variable. +#' When supplied, individuals are protected from removal based on their value +#' in this column (see \code{keep_vals}). +#' @param keep_vals Optional vector of values in \code{keep_var} that protect an +#' individual from being flagged as a leaf. If \code{NULL} (default) and +#' \code{keep_var} is supplied, any individual with a \emph{non-missing} value +#' is protected. To protect individuals with missing data instead, pass +#' \code{keep_vals = NA}. +#' @param verbose Logical. If \code{TRUE}, print counts of each leaf type. +#' +#' @return A character vector of person IDs that are leaf nodes. +#' +#' @seealso \code{\link{trimPedigree}} to iteratively remove the identified leaves. +#' +#' @examples +#' \dontrun{ +#' ped <- data.frame( +#' ID = 1:6, +#' dadID = c(NA, NA, 1, 1, 3, NA), +#' momID = c(NA, NA, 2, 2, 4, NA) +#' ) +#' findLeaves(ped) +#' } +#' @export +findLeaves <- function(ped, + personID = "ID", + momID = "momID", + dadID = "dadID", + include_terminal = TRUE, + include_founder_singletons = TRUE, + keep_var = NULL, + keep_vals = NULL, + verbose = FALSE) { + if (!include_terminal && !include_founder_singletons) { + stop("At least one of include_terminal or include_founder_singletons must be TRUE.") + } + + if (!is.null(keep_var) && !keep_var %in% names(ped)) { + stop("keep_var '", keep_var, "' not found in pedigree column names.") + } + + pg <- ped2graph(ped, personID = personID, momID = momID, dadID = dadID) + + indeg <- igraph::degree(pg, mode = "in") + outdeg <- igraph::degree(pg, mode = "out") + + terminal_ids <- if (include_terminal) names(outdeg)[outdeg == 0] else character(0) + founder_singleton_ids <- if (include_founder_singletons) names(indeg)[indeg == 0 & outdeg == 1] else character(0) + + leaf_ids <- union(terminal_ids, founder_singleton_ids) + + # Restrict to IDs that are actual rows in the pedigree (ped2graph may add + # phantom nodes for parents listed in momID/dadID but absent as rows) + leaf_ids <- leaf_ids[leaf_ids %in% as.character(ped[[personID]])] + + # Protect individuals based on phenotype values + if (!is.null(keep_var)) { + phenotype_vals <- ped[[keep_var]] + names(phenotype_vals) <- as.character(ped[[personID]]) + + if (is.null(keep_vals)) { + # Protect anyone with non-missing phenotype data + protected_ids <- names(phenotype_vals)[!is.na(phenotype_vals)] + } else if (anyNA(keep_vals)) { + # Protect anyone with missing phenotype data (and any other keep_vals) + non_na_vals <- keep_vals[!is.na(keep_vals)] + protected_ids <- names(phenotype_vals)[ + is.na(phenotype_vals) | phenotype_vals %in% non_na_vals + ] + } else { + # Protect anyone whose phenotype matches keep_vals + protected_ids <- names(phenotype_vals)[phenotype_vals %in% keep_vals] + } + + n_before <- length(leaf_ids) + leaf_ids <- leaf_ids[!leaf_ids %in% protected_ids] + + if (verbose == TRUE) { + message(n_before - length(leaf_ids), " leaf node(s) protected by keep_var '", keep_var, "'.") + } + } + + if (verbose == TRUE) { + if (include_terminal) { + message(length(terminal_ids), " terminal nodes (outdegree == 0).") + } + if (include_founder_singletons) { + message(length(founder_singleton_ids), " founder singletons (indegree == 0 & outdegree == 1).") + } + message(length(leaf_ids), " total leaf nodes identified.") + } + + return(leaf_ids) +} + + +#' Iteratively Trim Leaf Nodes from a Pedigree +#' +#' Repeatedly removes structural leaf nodes from a pedigree until no further +#' trimming is possible or a stopping condition is reached. After each removal +#' pass, parent ID columns are updated so that references to removed individuals +#' are set to \code{NA}. +#' +#' The trimming process peels the pedigree from the outside in: first removing +#' the outermost leaves, then re-evaluating the remaining structure so that +#' newly exposed leaves can be removed in subsequent iterations. +#' +#' Iteration stops when any of the following conditions is met: +#' \itemize{ +#' \item No leaf nodes remain. +#' \item The number of iterations reaches \code{max_iter}. +#' \item Removing the next batch of leaves would reduce the pedigree below +#' \code{min_size} rows. +#' } +#' +#' @inheritParams findLeaves +#' @param max_iter Integer or \code{Inf}. Maximum number of trimming iterations. +#' Defaults to \code{Inf}, which trims until no other stopping condition applies. +#' @param min_size Integer. Minimum number of individuals to retain. Trimming +#' stops before any removal that would reduce the pedigree below this size. +#' Defaults to \code{0L}. +#' @param remove_ids Character vector of additional individual IDs to remove +#' before any leaf-based trimming. Defaults to \code{NULL}. +#' @param keep_var Character. Optional column name of a phenotypic variable. +#' Passed to \code{\link{findLeaves}} at every iteration so that individuals +#' with protected phenotype values are never removed. +#' @param keep_vals Optional vector of phenotype values that protect an +#' individual from removal. See \code{\link{findLeaves}} for full details. +#' +#' @return A trimmed pedigree \code{data.frame} with the same columns as the +#' input. Parent ID columns (\code{momID}, \code{dadID}) are updated to +#' \code{NA} for any references to removed individuals. +#' +#' @seealso \code{\link{findLeaves}} to preview which individuals would be removed. +#' +#' @examples +#' \dontrun{ +#' ped <- data.frame( +#' ID = 1:6, +#' dadID = c(NA, NA, 1, 1, 3, NA), +#' momID = c(NA, NA, 2, 2, 4, NA) +#' ) +#' trimPedigree(ped, verbose = TRUE) +#' trimPedigree(ped, min_size = 2, verbose = TRUE) +#' } +#' @export +trimPedigree <- function(ped, + personID = "ID", + momID = "momID", + dadID = "dadID", + include_terminal = TRUE, + include_founder_singletons = TRUE, + max_iter = Inf, + min_size = 0L, + remove_ids = NULL, + keep_var = NULL, + keep_vals = NULL, + verbose = FALSE) { + # Apply any user-supplied forced removals first + if (!is.null(remove_ids)) { + ped <- ped[!as.character(ped[[personID]]) %in% as.character(remove_ids), ] + if (verbose == TRUE) { + message("Removed ", length(remove_ids), " user-specified IDs. ", nrow(ped), " individuals remain.") + } + } + + iter <- 0L + + repeat { + leaf_ids <- findLeaves( + ped, + personID = personID, + momID = momID, + dadID = dadID, + include_terminal = include_terminal, + include_founder_singletons = include_founder_singletons, + keep_var = keep_var, + keep_vals = keep_vals, + verbose = FALSE + ) + + if (length(leaf_ids) == 0L) break + + # Enforce min_size: do not remove if doing so would drop below the threshold + if (nrow(ped) - length(leaf_ids) < min_size) { + if (verbose == TRUE) { + message("Stopping: removing ", length(leaf_ids), " leaf node(s) would drop below min_size (", min_size, ").") + } + break + } + + ped <- ped[!as.character(ped[[personID]]) %in% leaf_ids, ] + iter <- iter + 1L + + if (verbose == TRUE) { + message( + "Iteration ", iter, ": removed ", length(leaf_ids), + " leaf node(s). ", nrow(ped), " individuals remain." + ) + } + + if (iter >= max_iter) break + } + + # Nullify dangling parent references introduced by removals + if (momID %in% names(ped)) { + mom_vec <- ped[[momID]] + mom_idx <- !is.na(mom_vec) & !as.character(mom_vec) %in% as.character(ped[[personID]]) + ped[[momID]][mom_idx] <- NA + } + if (dadID %in% names(ped)) { + dad_vec <- ped[[dadID]] + dad_idx <- !is.na(dad_vec) & !as.character(dad_vec) %in% as.character(ped[[personID]]) + ped[[dadID]][dad_idx] <- NA + } + + return(ped) +} diff --git a/_pkgdown.yml b/_pkgdown.yml index cc6460d3..4fe5fd95 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,4 +1,3 @@ +url: https://r-computing-lab.github.io/BGmisc/ template: - bootstrap: 5 - light-switch: false -url: https://R-Computing-Lab.github.io/BGmisc + bootstrap: 5 \ No newline at end of file diff --git a/data-raw/benchged.R b/data-raw/benchged.R index 18a9fc47..3d27cf07 100644 --- a/data-raw/benchged.R +++ b/data-raw/benchged.R @@ -5,7 +5,6 @@ library(Matrix) library(tidyverse) - # Run benchmarking for "loop" and "indexed" methods in ped2com() benchmark_results <- microbenchmark( reg = { diff --git a/inst/extdata/fitted_multi.rds b/inst/extdata/fitted_multi.rds new file mode 100644 index 00000000..fb2cb188 Binary files /dev/null and b/inst/extdata/fitted_multi.rds differ diff --git a/man/buildFamilyGroups.Rd b/man/buildFamilyGroups.Rd index 731914ea..dd54c7a8 100644 --- a/man/buildFamilyGroups.Rd +++ b/man/buildFamilyGroups.Rd @@ -6,7 +6,7 @@ \usage{ buildFamilyGroups( dat, - ytemp, + obs_ids, Addmat = NULL, Nucmat = NULL, Extmat = NULL, @@ -19,7 +19,8 @@ buildFamilyGroups( \arguments{ \item{dat}{A data frame where each row represents a family group and columns correspond to observed variables.} -\item{ytemp}{A vector of variable names corresponding to the observed data.} +\item{obs_ids}{A character vector of individual IDs corresponding to the columns of \code{dat} +and the rows/columns of the relatedness matrices.} \item{Addmat}{Additive genetic relatedness matrix.} diff --git a/man/buildOneFamilyGroup.Rd b/man/buildOneFamilyGroup.Rd index eaf5da63..437479e3 100644 --- a/man/buildOneFamilyGroup.Rd +++ b/man/buildOneFamilyGroup.Rd @@ -13,7 +13,7 @@ buildOneFamilyGroup( Amimat = NULL, Dmgmat = NULL, full_df_row, - ytemp + obs_ids ) } \arguments{ @@ -32,9 +32,11 @@ a common-extended-environment term using a unit matrix is included.} \item{Dmgmat}{Dominance genetic relatedness matrix.} -\item{full_df_row}{A 1-row matrix of observed data with column names matching \code{ytemp}.} +\item{full_df_row}{A 1-row matrix of observed data with column names matching \code{obs_ids}.} -\item{ytemp}{A character vector of variable names corresponding to the observed data columns.} +\item{obs_ids}{A character vector of individual IDs corresponding to the columns of +\code{full_df_row} and the rows/columns of the relatedness matrices. Must be in the +same order as the relatedness matrix rows.} } \value{ An OpenMx model for the specified family group. diff --git a/man/findLeaves.Rd b/man/findLeaves.Rd new file mode 100644 index 00000000..5868fda5 --- /dev/null +++ b/man/findLeaves.Rd @@ -0,0 +1,82 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/trimPedigree.R +\name{findLeaves} +\alias{findLeaves} +\title{Find Leaf Nodes in a Pedigree} +\usage{ +findLeaves( + ped, + personID = "ID", + momID = "momID", + dadID = "dadID", + include_terminal = TRUE, + include_founder_singletons = TRUE, + keep_var = NULL, + keep_vals = NULL, + verbose = FALSE +) +} +\arguments{ +\item{ped}{a pedigree dataset. Needs ID, momID, and dadID columns} + +\item{personID}{character. Name of the column in ped for the person ID variable} + +\item{momID}{character. Name of the column in ped for the mother ID variable} + +\item{dadID}{character. Name of the column in ped for the father ID variable} + +\item{include_terminal}{Logical. If \code{TRUE} (default), flag individuals +with no children (outdegree 0) as leaves.} + +\item{include_founder_singletons}{Logical. If \code{TRUE} (default), also +flag founders with exactly one child (indegree 0, outdegree 1) as leaves.} + +\item{keep_var}{Character. Optional column name of a phenotypic variable. +When supplied, individuals are protected from removal based on their value +in this column (see \code{keep_vals}).} + +\item{keep_vals}{Optional vector of values in \code{keep_var} that protect an +individual from being flagged as a leaf. If \code{NULL} (default) and +\code{keep_var} is supplied, any individual with a \emph{non-missing} value +is protected. To protect individuals with missing data instead, pass +\code{keep_vals = NA}.} + +\item{verbose}{Logical. If \code{TRUE}, print counts of each leaf type.} +} +\value{ +A character vector of person IDs that are leaf nodes. +} +\description{ +Identifies individuals who are structural "leaves" in the pedigree network --- +those who can potentially be removed without substantially altering the +connectivity of the larger tree. +} +\details{ +Two types of leaves are identified: +\itemize{ + \item \strong{Terminal nodes}: individuals with outdegree 0, meaning they + have no children recorded in the pedigree. Controlled by + \code{include_terminal}. + \item \strong{Founder singletons}: individuals with indegree 0 \emph{and} + outdegree 1, meaning they are founders (no recorded parents) who appear + as a parent of exactly one child. Controlled by + \code{include_founder_singletons}. +} + +In the directed pedigree graph used by \code{\link{ped2graph}}, edges run +from \strong{parent to child}. Consequently, indegree reflects the number of +recorded parents and outdegree reflects the number of recorded children. +} +\examples{ +\dontrun{ +ped <- data.frame( + ID = 1:6, + dadID = c(NA, NA, 1, 1, 3, NA), + momID = c(NA, NA, 2, 2, 4, NA) +) +findLeaves(ped) +} +} +\seealso{ +\code{\link{trimPedigree}} to iteratively remove the identified leaves. +} diff --git a/man/fitPedigreeModel.Rd b/man/fitPedigreeModel.Rd index e8b93f02..91f8e44b 100644 --- a/man/fitPedigreeModel.Rd +++ b/man/fitPedigreeModel.Rd @@ -11,6 +11,7 @@ fitPedigreeModel( data = NULL, group_models = NULL, tryhard = TRUE, + intervals = TRUE, Addmat = NULL, Nucmat = NULL, Extmat = NULL, @@ -34,6 +35,8 @@ using the provided relatedness matrices.} \item{tryhard}{Logical. If TRUE (default), use \code{mxTryHard} for robust optimization; if FALSE, use \code{mxRun}.} +\item{intervals}{Logical. If TRUE (default), compute confidence intervals for the parameters using \code{mxSE} and \code{mxCI}.} + \item{Addmat}{Additive genetic relatedness matrix. Required when \code{group_models} is NULL.} \item{Nucmat}{Common nuclear environment relatedness matrix. Optional.} diff --git a/man/trimPedigree.Rd b/man/trimPedigree.Rd new file mode 100644 index 00000000..eff48207 --- /dev/null +++ b/man/trimPedigree.Rd @@ -0,0 +1,93 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/trimPedigree.R +\name{trimPedigree} +\alias{trimPedigree} +\title{Iteratively Trim Leaf Nodes from a Pedigree} +\usage{ +trimPedigree( + ped, + personID = "ID", + momID = "momID", + dadID = "dadID", + include_terminal = TRUE, + include_founder_singletons = TRUE, + max_iter = Inf, + min_size = 0L, + remove_ids = NULL, + keep_var = NULL, + keep_vals = NULL, + verbose = FALSE +) +} +\arguments{ +\item{ped}{a pedigree dataset. Needs ID, momID, and dadID columns} + +\item{personID}{character. Name of the column in ped for the person ID variable} + +\item{momID}{character. Name of the column in ped for the mother ID variable} + +\item{dadID}{character. Name of the column in ped for the father ID variable} + +\item{include_terminal}{Logical. If \code{TRUE} (default), flag individuals +with no children (outdegree 0) as leaves.} + +\item{include_founder_singletons}{Logical. If \code{TRUE} (default), also +flag founders with exactly one child (indegree 0, outdegree 1) as leaves.} + +\item{max_iter}{Integer or \code{Inf}. Maximum number of trimming iterations. +Defaults to \code{Inf}, which trims until no other stopping condition applies.} + +\item{min_size}{Integer. Minimum number of individuals to retain. Trimming +stops before any removal that would reduce the pedigree below this size. +Defaults to \code{0L}.} + +\item{remove_ids}{Character vector of additional individual IDs to remove +before any leaf-based trimming. Defaults to \code{NULL}.} + +\item{keep_var}{Character. Optional column name of a phenotypic variable. +Passed to \code{\link{findLeaves}} at every iteration so that individuals +with protected phenotype values are never removed.} + +\item{keep_vals}{Optional vector of phenotype values that protect an +individual from removal. See \code{\link{findLeaves}} for full details.} + +\item{verbose}{Logical. If \code{TRUE}, print counts of each leaf type.} +} +\value{ +A trimmed pedigree \code{data.frame} with the same columns as the + input. Parent ID columns (\code{momID}, \code{dadID}) are updated to + \code{NA} for any references to removed individuals. +} +\description{ +Repeatedly removes structural leaf nodes from a pedigree until no further +trimming is possible or a stopping condition is reached. After each removal +pass, parent ID columns are updated so that references to removed individuals +are set to \code{NA}. +} +\details{ +The trimming process peels the pedigree from the outside in: first removing +the outermost leaves, then re-evaluating the remaining structure so that +newly exposed leaves can be removed in subsequent iterations. + +Iteration stops when any of the following conditions is met: +\itemize{ + \item No leaf nodes remain. + \item The number of iterations reaches \code{max_iter}. + \item Removing the next batch of leaves would reduce the pedigree below + \code{min_size} rows. +} +} +\examples{ +\dontrun{ +ped <- data.frame( + ID = 1:6, + dadID = c(NA, NA, 1, 1, 3, NA), + momID = c(NA, NA, 2, 2, 4, NA) +) +trimPedigree(ped, verbose = TRUE) +trimPedigree(ped, min_size = 2, verbose = TRUE) +} +} +\seealso{ +\code{\link{findLeaves}} to preview which individuals would be removed. +} diff --git a/tests/testthat/test-buildComponent.R b/tests/testthat/test-buildComponent.R index ca733e9c..1d2c70bf 100644 --- a/tests/testthat/test-buildComponent.R +++ b/tests/testthat/test-buildComponent.R @@ -486,7 +486,6 @@ test_that("keep_ids subset produces correct relatedness values across all famili }) - test_that("keep_ids with unknown IDs warns and drops missing entries", { data(hazard) keep <- c(as.character(hazard$ID[1:3]), "BOGUS_ID") diff --git a/tests/testthat/test-buildmxPedigrees.R b/tests/testthat/test-buildmxPedigrees.R index ed1eb00e..16264f3b 100644 --- a/tests/testthat/test-buildmxPedigrees.R +++ b/tests/testthat/test-buildmxPedigrees.R @@ -6,8 +6,8 @@ make_add2 <- function() matrix(c(1, 0.5, 0.5, 1), nrow = 2) # Helper: a 2-person observed data row -make_dat2 <- function(ytemp = c("y1", "y2")) { - matrix(c(1.5, 2.5), nrow = 1, dimnames = list(NULL, ytemp)) +make_dat2 <- function(obs_ids = c("y1", "y2")) { + matrix(c(1.5, 2.5), nrow = 1, dimnames = list(NULL, obs_ids)) } # ─── buildPedigreeModelCovariance ──────────────────────────────────────────── @@ -98,11 +98,11 @@ test_that("buildOneFamilyGroup errors when no relatedness matrix is provided", { dat <- make_dat2() expect_error( buildOneFamilyGroup( - group_name = "fam1", - Addmat = NULL, Nucmat = NULL, Extmat = NULL, - Mtdmat = NULL, Amimat = NULL, Dmgmat = NULL, + group_name = "fam1", + Addmat = NULL, Nucmat = NULL, Extmat = NULL, + Mtdmat = NULL, Amimat = NULL, Dmgmat = NULL, full_df_row = dat, - ytemp = c("y1", "y2") + obs_ids = c("y1", "y2") ), regexp = "At least one relatedness matrix must be provided" ) @@ -114,10 +114,10 @@ test_that("buildOneFamilyGroup returns an mxModel with an additive matrix", { dat <- make_dat2() mod <- expect_no_error( buildOneFamilyGroup( - group_name = "fam1", - Addmat = Addmat, + group_name = "fam1", + Addmat = Addmat, full_df_row = dat, - ytemp = c("y1", "y2") + obs_ids = c("y1", "y2") ) ) expect_true(inherits(mod, "MxModel")) @@ -131,10 +131,10 @@ test_that("buildOneFamilyGroup returns an mxModel with nuclear family matrix", { dat <- make_dat2() mod <- expect_no_error( buildOneFamilyGroup( - group_name = "fam2", - Nucmat = Nucmat, + group_name = "fam2", + Nucmat = Nucmat, full_df_row = dat, - ytemp = c("y1", "y2") + obs_ids = c("y1", "y2") ) ) expect_true(inherits(mod, "MxModel")) @@ -148,10 +148,10 @@ test_that("buildOneFamilyGroup determines family size from any provided matrix", dat <- make_dat2() mod <- expect_no_error( buildOneFamilyGroup( - group_name = "famExt", - Extmat = Extmat, + group_name = "famExt", + Extmat = Extmat, full_df_row = dat, - ytemp = c("y1", "y2") + obs_ids = c("y1", "y2") ) ) # # Extmat signals "include Vce"; the algebra always uses U (unit matrix) @@ -169,7 +169,7 @@ test_that("buildFamilyGroups returns one group model per row of data", { dimnames = list(NULL, c("y1", "y2")) ) groups <- expect_no_error( - buildFamilyGroups(dat = dat, ytemp = c("y1", "y2"), Addmat = Addmat) + buildFamilyGroups(dat = dat, obs_ids = c("y1", "y2"), Addmat = Addmat) ) expect_true(is.list(groups)) expect_equal(length(groups), nrow(dat)) @@ -180,7 +180,7 @@ test_that("buildFamilyGroups names group models with supplied prefix", { Addmat <- make_add2() dat <- matrix(c(1.0, 2.0), nrow = 1, dimnames = list(NULL, c("y1", "y2"))) groups <- buildFamilyGroups( - dat = dat, ytemp = c("y1", "y2"), + dat = dat, obs_ids = c("y1", "y2"), Addmat = Addmat, prefix = "family" ) expect_equal(groups[[1]]$name, "family1") @@ -191,7 +191,7 @@ test_that("buildFamilyGroups default prefix is 'fam'", { Addmat <- make_add2() dat <- matrix(c(1.0, 2.0), nrow = 1, dimnames = list(NULL, c("y1", "y2"))) groups <- buildFamilyGroups( - dat = dat, ytemp = c("y1", "y2"), Addmat = Addmat + dat = dat, obs_ids = c("y1", "y2"), Addmat = Addmat ) expect_equal(groups[[1]]$name, "fam1") }) @@ -210,7 +210,7 @@ test_that("buildPedigreeMx returns a multigroup mxModel", { dimnames = list(NULL, c("y1", "y2")) ) group_models <- buildFamilyGroups( - dat = dat, ytemp = c("y1", "y2"), Addmat = Addmat + dat = dat, obs_ids = c("y1", "y2"), Addmat = Addmat ) mod <- expect_no_error( buildPedigreeMx( @@ -249,7 +249,7 @@ test_that("fitPedigreeModel runs end-to-end with a trivial dataset", { ) Addmat <- make_add2() group_models <- buildFamilyGroups( - dat = dat, ytemp = c("y1", "y2"), Addmat = Addmat + dat = dat, obs_ids = c("y1", "y2"), Addmat = Addmat ) vars <- list( ad2 = 0.4, dd2 = 0.1, cn2 = 0.1, ce2 = 0.1, diff --git a/tests/testthat/test-trimPedigree.R b/tests/testthat/test-trimPedigree.R new file mode 100644 index 00000000..97137a34 --- /dev/null +++ b/tests/testthat/test-trimPedigree.R @@ -0,0 +1,369 @@ +# Test Case 1: findLeaves returns a character vector for the hazard dataset +test_that("findLeaves returns a character vector of leaf IDs for hazard dataset", { + data(hazard) + leaves <- findLeaves(hazard, include_terminal = TRUE, include_founder_singletons = FALSE) + + + expect_true(is.character(leaves)) + expect_true(length(leaves) > 0) + + expect_true(all(leaves %in% as.character(hazard$ID))) +}) + + +# Test Case 2: findLeaves identifies terminal nodes correctly in the hazard dataset +test_that("findLeaves correctly identifies terminal nodes in hazard dataset", { + data(hazard) + leaves_terminal <- findLeaves(hazard, include_terminal = TRUE, include_founder_singletons = FALSE) + leaves_all <- findLeaves(hazard, include_founder_singletons = TRUE, include_terminal = TRUE) + # Terminal-only should be a subset of all leaves + expect_true(all(leaves_terminal %in% leaves_all)) + # All returned IDs must be present in the pedigree rows + expect_true(all(leaves_all %in% as.character(hazard$ID))) + + + # find all terminal nodes (outdegree == 0) in the hazard dataset + hazard$dadID0 <- hazard$dadID + hazard$momID0 <- hazard$momID + hazard$dadID0[is.na(hazard$dadID)] <- 0 + hazard$momID0[is.na(hazard$momID)] <- 0 + parent_ids <- unique(c(hazard$dadID0, hazard$momID0)) + terminal_nodes <- setdiff(as.character(hazard$ID), as.character(parent_ids)) + expect_true(all(leaves_terminal %in% terminal_nodes)) + + leaves_founder <- findLeaves(hazard, include_terminal = F, include_founder_singletons = T) + # find all founder singletons (indegree == 0, outdegree == 1) in the hazard dataset + outdegree <- table(c(hazard$dadID0, hazard$momID0)) + + + viable <- names(outdegree)[outdegree == 1] + # needs to have no parents + parentless <- as.character(hazard$ID[hazard$dadID0 == 0 & hazard$momID0 == 0]) + founder_singletons <- parentless[parentless %in% viable] + + expect_true(all(leaves_founder %in% founder_singletons)) + + leaves_all <- findLeaves(hazard, include_founder_singletons = TRUE, include_terminal = TRUE) + expect_true(all(leaves_all %in% c(terminal_nodes, founder_singletons))) +}) + +# Test Case 3: findLeaves include_founder_singletons flag adds more leaves in hazard dataset +test_that("findLeaves finds more leaves with include_founder_singletons = TRUE in hazard dataset", { + data(hazard) + leaves_terminal <- findLeaves(hazard, include_founder_singletons = FALSE) + leaves_all <- findLeaves(hazard, include_founder_singletons = TRUE) + expect_true(length(leaves_all) >= length(leaves_terminal)) +}) + +# Test Case 4: findLeaves verbose prints messages for hazard dataset +test_that("findLeaves verbose prints messages for hazard dataset", { + data(hazard) + expect_message(findLeaves(hazard, verbose = TRUE), "terminal nodes") + expect_message(findLeaves(hazard, verbose = TRUE), "total leaf nodes") +}) + +# Test Case 5: findLeaves works with non-default column names using potter dataset +test_that("findLeaves works with non-default personID column in potter dataset", { + data(potter) + potter$ID <- potter$personID + leaves <- findLeaves(potter, personID = "ID", momID = "momID", dadID = "dadID") + expect_true(is.character(leaves)) + expect_true(all(leaves %in% as.character(potter$ID))) +}) + +# Test Case 6: trimPedigree reduces the size of the hazard dataset +test_that("trimPedigree reduces pedigree size for hazard dataset", { + data(hazard) + trimmed <- trimPedigree(hazard) + expect_true(nrow(trimmed) < nrow(hazard)) + expect_true(nrow(trimmed) >= 0) +}) + +# Test Case 7: trimPedigree produces no dangling parent references in hazard dataset +test_that("trimPedigree leaves no dangling parent references in hazard dataset", { + data(hazard) + trimmed <- trimPedigree(hazard) + remaining_ids <- as.character(trimmed$ID) + dad_refs <- as.character(trimmed$dadID[!is.na(trimmed$dadID)]) + mom_refs <- as.character(trimmed$momID[!is.na(trimmed$momID)]) + expect_true(all(dad_refs %in% remaining_ids)) + expect_true(all(mom_refs %in% remaining_ids)) +}) + +# Test Case 8: trimPedigree with max_iter = 1 removes only one layer of leaves in hazard dataset +test_that("trimPedigree with max_iter = 1 removes fewer individuals than full trim in hazard dataset", { + data(hazard) + trimmed_one <- trimPedigree(hazard, max_iter = 1) + trimmed_full <- trimPedigree(hazard, max_iter = Inf) + expect_true(nrow(trimmed_one) >= nrow(trimmed_full)) +}) + +# Test Case 9: trimPedigree verbose prints iteration messages for hazard dataset +test_that("trimPedigree verbose prints iteration messages for hazard dataset", { + data(hazard) + expect_message(trimPedigree(hazard, verbose = TRUE), "Iteration") +}) + +# Test Case 10: trimPedigree with remove_ids removes specified individuals before trimming in hazard dataset +test_that("trimPedigree remove_ids removes specified individuals before leaf trimming in hazard dataset", { + data(hazard) + target_id <- hazard$ID[1] + trimmed <- trimPedigree(hazard, remove_ids = target_id) + expect_false(target_id %in% trimmed$ID) +}) + +# Test Case 11: trimPedigree returns a data.frame with the same columns as input for hazard dataset +test_that("trimPedigree preserves input column structure for hazard dataset", { + data(hazard) + trimmed <- trimPedigree(hazard) + expect_true(inherits(trimmed, "data.frame")) + expect_true(all(names(hazard) %in% names(trimmed))) +}) + +# Test Case 12: trimPedigree with include_founder_singletons = FALSE trims fewer in hazard dataset +test_that("trimPedigree trims more with founder singletons than without in hazard dataset", { + data(hazard) + trimmed_no_founders <- trimPedigree(hazard, include_founder_singletons = FALSE) + trimmed_with_founders <- trimPedigree(hazard, include_founder_singletons = TRUE) + expect_true(nrow(trimmed_with_founders) <= nrow(trimmed_no_founders)) +}) + +# Test Case 13: findLeaves returns a character vector for the inbreeding dataset +test_that("findLeaves returns a character vector of leaf IDs for inbreeding dataset", { + data(inbreeding) + leaves <- findLeaves(inbreeding) + expect_true(is.character(leaves)) + expect_true(length(leaves) > 0) + expect_true(all(leaves %in% as.character(inbreeding$ID))) +}) + +# Test Case 14: trimPedigree reduces pedigree size and preserves column structure for inbreeding dataset +test_that("trimPedigree reduces pedigree size and preserves columns for inbreeding dataset", { + data(inbreeding) + trimmed <- trimPedigree(inbreeding) + expect_true(nrow(trimmed) < nrow(inbreeding)) + expect_true(inherits(trimmed, "data.frame")) + expect_true(all(names(inbreeding) %in% names(trimmed))) +}) + +# Test Case 15: trimPedigree leaves no dangling parent references in inbreeding dataset +test_that("trimPedigree leaves no dangling parent references in inbreeding dataset", { + data(inbreeding) + trimmed <- trimPedigree(inbreeding) + remaining_ids <- as.character(trimmed$ID) + dad_refs <- as.character(trimmed$dadID[!is.na(trimmed$dadID)]) + mom_refs <- as.character(trimmed$momID[!is.na(trimmed$momID)]) + expect_true(all(dad_refs %in% remaining_ids)) + expect_true(all(mom_refs %in% remaining_ids)) +}) + +# Test Case 16: findLeaves identifies terminal nodes (outdegree == 0) in a known minimal pedigree +test_that("findLeaves identifies terminal nodes correctly in a minimal pedigree", { + ped <- data.frame( + ID = c(1, 2, 3), + dadID = c(NA, 1, 2), + momID = c(NA, NA, NA) + ) + leaves <- findLeaves(ped, include_founder_singletons = FALSE) + expect_true("3" %in% leaves) + expect_false("1" %in% leaves) +}) + +# Test Case 17: findLeaves identifies founder singletons (indegree == 0, outdegree == 1) in a minimal pedigree +test_that("findLeaves identifies founder singletons correctly in a minimal pedigree", { + ped <- data.frame( + ID = c(1, 2, 3), + dadID = c(NA, NA, 1), + momID = c(NA, NA, 2) + ) + leaves_all <- findLeaves(ped, include_founder_singletons = TRUE) + leaves_terminal <- findLeaves(ped, include_founder_singletons = FALSE) + expect_true("1" %in% leaves_all) + expect_true("2" %in% leaves_all) + expect_false("1" %in% leaves_terminal) + expect_false("2" %in% leaves_terminal) +}) + +# Test Case 18: trimPedigree respects max_iter step-by-step on a known chain +test_that("trimPedigree respects max_iter on a known linear chain", { + ped <- data.frame( + ID = c(1, 2, 3, 4), + dadID = c(NA, 1, 2, 3), + momID = c(NA, NA, NA, NA) + ) + trimmed_1 <- trimPedigree(ped, include_founder_singletons = FALSE, max_iter = 1) + expect_false(4 %in% trimmed_1$ID) + expect_true(3 %in% trimmed_1$ID) + + trimmed_2 <- trimPedigree(ped, include_founder_singletons = FALSE, max_iter = 2) + expect_false(4 %in% trimmed_2$ID) + expect_false(3 %in% trimmed_2$ID) + expect_true(2 %in% trimmed_2$ID) +}) + +# Test Case 19: trimPedigree remove_ids forces removal before leaf trimming in a minimal pedigree +test_that("trimPedigree remove_ids forces removal before leaf trimming in a minimal pedigree", { + ped <- data.frame( + ID = c(1, 2, 3, 4), + dadID = c(NA, NA, 1, 3), + momID = c(NA, NA, 2, NA) + ) + trimmed <- trimPedigree(ped, + remove_ids = 4, + include_founder_singletons = FALSE, max_iter = 1 + ) + expect_false(4 %in% trimmed$ID) + expect_false(3 %in% trimmed$ID) +}) + +# Test Case 20: trimPedigree with full iteration removes all nodes from a fully trimable pedigree +test_that("trimPedigree with full iteration removes all nodes from a fully trimmable pedigree", { + ped <- data.frame( + ID = c(1, 2, 3), + dadID = c(NA, NA, 1), + momID = c(NA, NA, 2) + ) + trimmed <- trimPedigree(ped, include_founder_singletons = TRUE, max_iter = Inf) + expect_equal(nrow(trimmed), 0L) +}) + +# Test Case 21: findLeaves with include_terminal = FALSE excludes terminal nodes in a minimal pedigree +test_that("findLeaves with include_terminal = FALSE excludes terminal nodes in a minimal pedigree", { + ped <- data.frame( + ID = c(1, 2, 3), + dadID = c(NA, NA, 1), + momID = c(NA, NA, 2) + ) + # With include_terminal = FALSE: node 3 (outdegree 0) should not appear + leaves <- findLeaves(ped, include_terminal = FALSE, include_founder_singletons = TRUE) + expect_false("3" %in% leaves) + # Founder singletons 1 and 2 should still appear + expect_true("1" %in% leaves) + expect_true("2" %in% leaves) +}) + +# Test Case 22: findLeaves errors when both include flags are FALSE +test_that("findLeaves errors when both include_terminal and include_founder_singletons are FALSE", { + data(hazard) + expect_error( + findLeaves(hazard, include_terminal = FALSE, include_founder_singletons = FALSE), + "At least one" + ) +}) + +# Test Case 23: trimPedigree with include_terminal = FALSE does not remove terminal nodes in hazard dataset +test_that("trimPedigree with include_terminal = FALSE does not remove terminal nodes in hazard dataset", { + data(hazard) + # Identify terminal IDs before trimming + terminal_ids <- findLeaves(hazard, include_terminal = TRUE, include_founder_singletons = FALSE) + trimmed <- trimPedigree(hazard, include_terminal = FALSE, include_founder_singletons = TRUE) + # None of the original terminal nodes should have been removed + expect_true(all(terminal_ids %in% as.character(trimmed$ID))) +}) + +# Test Case 24: trimPedigree min_size prevents trimming below threshold in hazard dataset +test_that("trimPedigree min_size prevents pedigree from falling below threshold in hazard dataset", { + data(hazard) + min_n <- nrow(hazard) - 5L + trimmed <- trimPedigree(hazard, min_size = min_n) + expect_true(nrow(trimmed) >= min_n) +}) + +# Test Case 25: trimPedigree min_size = 0 behaves identically to default in hazard dataset +test_that("trimPedigree min_size = 0 behaves identically to default in hazard dataset", { + data(hazard) + trimmed_default <- trimPedigree(hazard) + trimmed_min_zero <- trimPedigree(hazard, min_size = 0L) + expect_equal(trimmed_default, trimmed_min_zero) +}) + +# Test Case 26: trimPedigree min_size verbose message fires when threshold would be breached +test_that("trimPedigree min_size prints stopping message when threshold would be breached", { + ped <- data.frame( + ID = c(1, 2, 3), + dadID = c(NA, NA, 1), + momID = c(NA, NA, 2) + ) + expect_message( + trimPedigree(ped, include_founder_singletons = TRUE, min_size = 2L, verbose = TRUE), + "min_size" + ) +}) + +# Test Case 27: findLeaves errors when keep_var is not a column in the pedigree +test_that("findLeaves errors when keep_var column does not exist", { + data(hazard) + expect_error(findLeaves(hazard, keep_var = "not_a_column"), "not found") +}) + +# Test Case 28: findLeaves with keep_var protects individuals with non-missing phenotype in a minimal pedigree +test_that("findLeaves with keep_var protects individuals with non-missing phenotype in a minimal pedigree", { + ped <- data.frame( + ID = c(1, 2, 3, 4), + dadID = c(NA, NA, 1, 1), + momID = c(NA, NA, 2, 2), + affected = c(NA, NA, 1, NA) + ) + # Node 3 and 4 are terminal; node 3 has affected = 1 so should be protected + leaves <- findLeaves(ped, keep_var = "affected") + expect_false("3" %in% leaves) + expect_true("4" %in% leaves) +}) + +# Test Case 29: findLeaves with keep_var and keep_vals protects only matching values in a minimal pedigree +test_that("findLeaves with keep_var and keep_vals protects only specified values in a minimal pedigree", { + ped <- data.frame( + ID = c(1, 2, 3, 4), + dadID = c(NA, NA, 1, 1), + momID = c(NA, NA, 2, 2), + affected = c(NA, NA, 1, 0) + ) + # Protect only affected == 1; node 4 (affected = 0) remains removable + leaves_protect_1 <- findLeaves(ped, keep_var = "affected", keep_vals = 1) + expect_false("3" %in% leaves_protect_1) + expect_true("4" %in% leaves_protect_1) + + # Protect only affected == 0; node 3 (affected = 1) remains removable + leaves_protect_0 <- findLeaves(ped, keep_var = "affected", keep_vals = 0) + expect_true("3" %in% leaves_protect_0) + expect_false("4" %in% leaves_protect_0) +}) + +# Test Case 30: findLeaves with keep_vals = NA protects individuals with missing phenotype in a minimal pedigree +test_that("findLeaves with keep_vals = NA protects individuals with missing phenotype in a minimal pedigree", { + ped <- data.frame( + ID = c(1, 2, 3, 4), + dadID = c(NA, NA, 1, 1), + momID = c(NA, NA, 2, 2), + affected = c(NA, NA, 1, NA) + ) + # Protect those with missing phenotype; node 4 (NA) protected, node 3 (value=1) removable + leaves <- findLeaves(ped, keep_var = "affected", keep_vals = NA) + expect_true("3" %in% leaves) + expect_false("4" %in% leaves) +}) + +# Test Case 31: trimPedigree with keep_var never removes protected individuals from hazard dataset +test_that("trimPedigree with keep_var never removes individuals with non-missing phenotype in hazard dataset", { + data(hazard) + # Add a synthetic phenotype: mark the first 10 individuals as affected + hazard$affected <- NA + hazard$affected[1:10] <- 1 + protected_ids <- as.character(hazard$ID[1:10]) + + trimmed <- trimPedigree(hazard, keep_var = "affected") + # All protected individuals must still be present + expect_true(all(protected_ids %in% as.character(trimmed$ID))) +}) + +# Test Case 32: trimPedigree with keep_var removes more individuals than without in hazard dataset +test_that("trimPedigree with keep_var removes fewer individuals than without in hazard dataset", { + data(hazard) + hazard$affected <- NA + hazard$affected[1:10] <- 1 + + trimmed_no_keep <- trimPedigree(hazard) + trimmed_keep <- trimPedigree(hazard, keep_var = "affected") + # Protecting phenotyped individuals means fewer are removed overall + expect_true(nrow(trimmed_keep) >= nrow(trimmed_no_keep)) +}) diff --git a/vignettes/articles/v61_pedigree_model_fitting.Rmd b/vignettes/articles/v61_pedigree_model_fitting.Rmd new file mode 100644 index 00000000..a23815be --- /dev/null +++ b/vignettes/articles/v61_pedigree_model_fitting.Rmd @@ -0,0 +1,175 @@ +--- +title: "Extended: Fitting Pedigree-Based Variance Component Models" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Extended: Fitting Pedigree-Based Variance Component Models} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +options(rmarkdown.html_vignette.check_title = FALSE) +#source('https://vipbg.vcu.edu/vipbg/OpenMx2/software/getOpenMx.R') + +library(BGmisc) + +has_openmx <- requireNamespace("OpenMx", quietly = TRUE) + +if (has_openmx) { + library(OpenMx) + mxOption(key="Number of Threads", value=imxGetNumThreads()) + +} else { + message( + "OpenMx is not installed. Code will be shown but not executed.\n", + "Install with: install.packages('OpenMx')" + ) +} + +bgmisc_testing_env <- Sys.getenv("BGMISC_TESTING", unset = "") +bgmisc_testing <- tolower(bgmisc_testing_env) %in% c("1", "true", "yes") +run_models <- has_openmx && interactive() && !bgmisc_testing +rds_dir <- file.path( "inst/extdata/") +rds_file <- file.path(rds_dir, "fitted_multi.rds") +``` + +# Introduction + +This vignette extends the example from `vignette("v60_pedigree_model_fitting", package = "BGmisc")` to show how to fit models to multiple families simultaneously. The key functions are `buildOneFamilyGroup()` and `buildPedigreeMx()`, which translate pedigree data into OpenMx model specifications. + +# Scaling Up to Many Families + + +```{r krsp-prep} +library(ggpedigree) # for pedigree data) +library(tidyverse) +data("redsquirrels_full") + +ped_krsp <- redsquirrels_full |> + transmute( + ID = as.integer(personID), + momID = as.integer(momID), + dadID = as.integer(dadID), + sex = sex, + famID = as.integer(famID), + lrs = lrs + ) + +cat( + "KRSP pedigree:", nrow(ped_krsp), "individuals,", + n_distinct(ped_krsp$famID), "grids\n" +) +summarizeFamilies(ped_krsp, famID = "famID")$family_summary |> arrange(desc(count)) +``` + +```{r} +minim_family_size <- 10 + +ped_krsp_subset <- ped_krsp |> + group_by(famID) |> + filter(sum(!is.na(lrs)) >= minim_family_size) |> + ungroup() + +id_families <- unique(ped_krsp_subset$famID) +n_families <- length(id_families) + +# Pre-allocate storage +add_list <- vector("list", length(n_families)) +cn_list <- vector("list", length(n_families)) +mt_list <- vector("list", length(n_families)) +obs_ids_list <- vector("list", length(n_families)) +pheno_list <- vector("list", length(n_families)) + + +# Starting values for variance components +start_vars <- list( + ad2 = 0.3, # additive genetic + cn2 = 0.1, # common nuclear environment + ce2 = 0, # common extended (not estimated here) + mt2 = 0.1, # mitochondrial + dd2 = 0, # dominance (not estimated here) + am2 = 0, # A x Mt interaction (not estimated here) + ee2 = 0.5 # unique environment +) +``` +```{r, eval = has_openmx} +for (i in seq_len(n_families)) { + ped_i <- subset(ped_krsp_subset, famID == id_families[i]) + phenotypic_ids <- ped_i$ID[!is.na(ped_i$lrs)] + A_i <- as.matrix(ped2add(ped_i, sparse = FALSE, keep_ids = phenotypic_ids)) + Cn_i <- as.matrix(ped2cn(ped_i, sparse = FALSE, keep_ids = phenotypic_ids)) + Mt_i <- as.matrix(ped2mit(ped_i, sparse = FALSE, keep_ids = phenotypic_ids)) + + n_i <- nrow(A_i) + id_order_i <- rownames(A_i) + + pheno_vals <- ped_i$lrs[match(id_order_i, as.character(ped_i$ID))] + + obs_ids_i <- make.names(id_order_i[!is.na(pheno_vals)]) + pheno_row_i <- matrix(as.double(pheno_vals[!is.na(pheno_vals)]), + nrow = 1, + dimnames = list(NULL, obs_ids_i) + ) + + rownames(A_i) <- colnames(A_i) <- obs_ids_i + rownames(Cn_i) <- colnames(Cn_i) <- obs_ids_i + rownames(Mt_i) <- colnames(Mt_i) <- obs_ids_i + add_list[[i]] <- A_i + cn_list[[i]] <- Cn_i + mt_list[[i]] <- Mt_i + obs_ids_list[[i]] <- obs_ids_i + pheno_list[[i]] <- pheno_row_i +} + + +group_models <- lapply(seq_len(n_families), function(i) { + buildOneFamilyGroup( + group_name = paste0("ped", i), + Addmat = add_list[[i]], + Nucmat = cn_list[[i]], + Mtdmat = mt_list[[i]], + full_df_row = pheno_list[[i]], + obs_ids = obs_ids_list[[i]] + ) +}) + +multi_model <- buildPedigreeMx( + model_name = "MultiPedigreeModel", + vars = start_vars, + group_models = group_models +) +``` +```{r, include = FALSE} +fitted_multi <- NULL # ensure variable exists for later use +``` +```{r, eval = run_models} +fitted_multi <- mxRun(multi_model, intervals= FALSE) + +saveRDS(fitted_multi, "inst/extdata/fitted_multi.rds") +``` + +``` +fitted_multi <- mxRun(multi_model) +``` +```{r, include = FALSE, eval = file.exists(rds_file) && !run_models} +fitted_multi <- readRDS(rds_file) +``` +```{r, eval = has_openmx && !is.null(fitted_multi)} +summary(fitted_multi) + +total_var <- sum(fitted_multi$ModelOne$Vad$values, fitted_multi$ModelOne$Vcn$values, fitted_multi$ModelOne$Vmt$values, fitted_multi$ModelOne$Ver$values) +``` + + +```{r single-fam-results, eval = has_openmx && !is.null(fitted_multi)} +cat("Additive genetic (Vad):", fitted_multi$ModelOne$Vad$values/total_var, "\n") +cat("Common nuclear (Vcn):", fitted_multi$ModelOne$Vcn$values/total_var, "\n") +cat("Mitochondrial (Vmt):", fitted_multi$ModelOne$Vmt$values/total_var, "\n") +cat("Unique environ. (Ver):", fitted_multi$ModelOne$Ver$values/total_var, "\n") +``` + +As you can see, we have fit a multigroup pedigree model using `r n_families` of squirrels. diff --git a/vignettes/v60_pedigree_model_fitting.Rmd b/vignettes/v60_pedigree_model_fitting.Rmd new file mode 100644 index 00000000..6258b476 --- /dev/null +++ b/vignettes/v60_pedigree_model_fitting.Rmd @@ -0,0 +1,520 @@ +--- +title: "Fitting Pedigree-Based Variance Component Models" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Fitting Pedigree-Based Variance Component Models} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +options(rmarkdown.html_vignette.check_title = FALSE) +``` + +# Introduction + +A core goal of behavior genetics is to decompose observed phenotypic variance into genetic and environmental components. Extended pedigrees — multi-generational families with known parentage — provide richer information about relatedness than twin studies alone and allow researchers to estimate a wider array of variance components. + +The `BGmisc` package provides a complete pipeline for pedigree-based variance component modeling: + +1. **Compute** relatedness matrices with `ped2add()`, `ped2cn()`, `ped2mit()`, and `ped2ce()` +2. **Check identification** with `identifyComponentModel()` +3. **Build** OpenMx group models with `buildOneFamilyGroup()` +4. **Fit** the combined model with `buildPedigreeMx()` and `mxRun()` (or the wrapper `fitPedigreeModel()`) + +This vignette builds up in three parts: a single-family model, a two-family multi-group model, and a scaled-up simulation study. An extension vignette `vignette("v61_pedigree_model_fitting", package = "BGmisc")` covers more complex models using squirrel data from the Kluane Red Squirrel Project. + +## Prerequisites + +```{r setup} +library(BGmisc) + +has_openmx <- requireNamespace("OpenMx", quietly = TRUE) +has_mvtnorm <- requireNamespace("mvtnorm", quietly = TRUE) + +if (has_openmx) { + library(OpenMx) +} else { + message( + "OpenMx is not installed. Code will be shown but not executed.\n", + "Install with: install.packages('OpenMx')" + ) +} + +run_models <- has_openmx +``` + +--- + +# Data Requirements at a Glance + +Before diving in, here is a concise reference for what your data must look like. Return to this section whenever something is unclear. + +## Pedigree data frame + +One row per individual, at minimum: + +| Column | Type | Description | +|--------|------|-------------| +| `ID` | integer/character | Unique individual identifier | +| `dadID` | same as `ID` | Father's ID; `NA` or `0` if unknown | +| `momID` | same as `ID` | Mother's ID; `NA` or `0` if unknown | +| `sex` | integer | `0` = male, `1` = female | +| `famID` | integer/character | Extended family identifier | + +Column names are flexible — pass `personID = "myID"`, `famID = "famID"`, etc. to the `ped2*` functions. + +> **Tip:** Use `checkIDs()`, `checkSex()`, and `checkParents()` to validate your pedigree before computing relatedness matrices. + +## Relatedness matrices + +Each matrix returned by `ped2add()`, `ped2cn()`, etc. is square ($n \times n$), symmetric, and **labeled**: `rownames` and `colnames` are the individual IDs. These labels are the link between phenotype data and the model. + +## Phenotype data format + +The model-fitting functions expect phenotype data as a **one-row matrix**, where: + +- Each **column** is one individual. +- **Column names** must exactly match `rownames(add_matrix)` for that family — same IDs, same order. +- The vector of those column names is called `obs_ids` throughout this vignette. + +If your raw data is in long format (one row per person), you need to pivot it wide — this is shown step by step in Part 1 below. + +## Missing phenotypes + +Individuals present in the pedigree only to trace relatedness (e.g., deceased ancestors) must be **removed from the matrices** before fitting. The `ped2*` functions include every individual; you subset them explicitly to those with observed data. + +--- + +# Part 1: Single-Family Model + +We start with one family from the built-in `hazard` dataset. Every step is written out explicitly — no helper functions, no loops — so you can see exactly what happens. + +```{r load-hazard, eval = run_models} +data(hazard) + +# Two families: famID 1 and famID 2 +table(hazard$famID) +``` + +## Step 1: Subset to one family and inspect + +```{r single-fam-subset, eval = run_models} +fam1 <- subset(hazard, famID == 1) + +# What does the pedigree look like? +head(fam1[, c("famID", "ID", "sex", "dadID", "momID", "DA2")]) +``` + +`DA2` is the phenotype we will model. It is binary in the actual dataset; for a real continuous-trait analysis you would substitute your own measure. + +## Step 2: Compute relatedness matrices + +```{r single-fam-matrices, eval = run_models} +# Additive genetic relatedness (proportion of genome shared IBD) +add_mat1 <- ped2add(fam1, + sparse = FALSE, + famID = "famID", personID = "ID", + momID = "momID", dadID = "dadID", sex = "sex", + keep_ids = fam1$ID +) # keep all individuals for now + + +# Common nuclear environment (1 if same biological parents, 0 otherwise) +cn_mat1 <- ped2cn(fam1, + sparse = FALSE, + famID = "famID", personID = "ID", + momID = "momID", dadID = "dadID", sex = "sex", + keep_ids = fam1$ID +) + +mt_mat1 <- ped2mit(fam1, + sparse = FALSE, + famID = "famID", personID = "ID", + momID = "momID", dadID = "dadID", sex = "sex", + keep_ids = fam1$ID +) + +# The matrices are square and labeled with individual IDs +dim(add_mat1) +rownames(add_mat1) +``` + +## Step 3: Prepare the phenotype data + +The model needs a **one-row matrix** with one column per individual, in the same order as the relatedness matrix rows. + +```{r single-fam-pheno, eval = run_models} +# Individual IDs in the order the matrices use +id_order1 <- rownames(add_mat1) + +# Pull phenotype values, reordered to match the matrix +pheno_vals1 <- fam1$DA2[match(id_order1, as.character(fam1$ID))] +names(pheno_vals1) <- id_order1 + +# Who actually has an observed phenotype? +observed1 <- !is.na(pheno_vals1) +cat("Individuals in pedigree:", length(id_order1), "\n") +cat("Individuals with observed phenotype:", sum(observed1), "\n") + +# obs_ids: the IDs of observed individuals — these will be column names +# They must be syntactically valid R names (OpenMx requirement) +obs_ids1 <- make.names(id_order1[observed1]) + +# One-row matrix: one column per observed individual. +# as.double() is required — OpenMx will error if the matrix storage mode +# is integer rather than double, even if the values look numeric. +pheno_row1 <- matrix( + as.double(pheno_vals1[observed1]), + nrow = 1, + dimnames = list(NULL, obs_ids1) +) +pheno_row1[1, 1:6] # first few values +``` + +## Step 4: Subset matrices to observed individuals + +Individuals without a phenotype must be removed from the matrices — their rows and columns are dropped. + +```{r single-fam-subset-mat, eval = run_models} +# Use the original (non-make.names) IDs to index the matrix +raw_obs_ids1 <- id_order1[observed1] + +add_mat1_obs <- add_mat1[raw_obs_ids1, raw_obs_ids1] +cn_mat1_obs <- cn_mat1[raw_obs_ids1, raw_obs_ids1] +mt_mat1_obs <- mt_mat1[raw_obs_ids1, raw_obs_ids1] + +# Rename rows/cols to match obs_ids1 (the make.names versions) +rownames(add_mat1_obs) <- colnames(add_mat1_obs) <- obs_ids1 +rownames(cn_mat1_obs) <- colnames(cn_mat1_obs) <- obs_ids1 +rownames(mt_mat1_obs) <- colnames(mt_mat1_obs) <- obs_ids1 + +dim(add_mat1_obs) +``` + +## Step 5: Check model identification + +Before fitting, verify that the variance components you want to estimate are statistically identifiable given this pedigree's structure. + +```{r single-fam-identify, eval = run_models} +id_check1 <- identifyComponentModel( + A = add_mat1_obs, + Cn = cn_mat1_obs, + Mt = mt_mat1_obs, + E = diag(1, nrow(add_mat1_obs)) +) +id_check1 +``` + +## Step 6: Build and fit the model + +```{r single-fam-fit, eval = run_models} +# Starting values for variance components +start_vars <- list( + ad2 = 0.3, # additive genetic + cn2 = 0.1, # common nuclear environment + ce2 = 0, # common extended (not estimated here) + mt2 = 0.1, # mitochondrial + dd2 = 0, # dominance (not estimated here) + am2 = 0, # A x Mt interaction (not estimated here) + ee2 = 0.5 # unique environment +) + +# Build the group model for family 1 +group1 <- buildOneFamilyGroup( + group_name = "family1", + Addmat = add_mat1_obs, + Nucmat = cn_mat1_obs, + Mtdmat = mt_mat1_obs, + full_df_row = pheno_row1, + obs_ids = obs_ids1 +) + +# Wrap in a full pedigree model and fit +model1 <- buildPedigreeMx( + model_name = "SingleFamilyModel", + vars = start_vars, + group_models = list(group1) +) + +fitted1 <- mxRun(model1) +summary(fitted1) +``` + +The estimated variance components are in `fitted1$ModelOne`: + +```{r single-fam-results, eval = run_models} +cat("Additive genetic (Vad):", fitted1$ModelOne$Vad$values, "\n") +cat("Common nuclear (Vcn):", fitted1$ModelOne$Vcn$values, "\n") +cat("Mitochondrial (Vmt):", fitted1$ModelOne$Vmt$values, "\n") +cat("Unique environ. (Ver):", fitted1$ModelOne$Ver$values, "\n") +``` + +With a single family, estimates have wide uncertainty. Part 2 adds a second family to improve precision. + +--- + +# Part 2: Two-Family Multi-Group Model + +Multi-group modeling means that **variance component parameters are shared** across families, but each family has its own relatedness matrix and phenotype data. We write out the steps for family 2 explicitly — the same steps as Part 1, just on different data — so you can see exactly what repeats and what changes. + +## Family 2: matrices + +```{r two-fam-matrices, eval = run_models} +fam2 <- subset(hazard, famID == 2) + +add_mat2 <- ped2add(fam2, + sparse = FALSE, + famID = "famID", personID = "ID", + momID = "momID", dadID = "dadID", sex = "sex", + keep_ids = fam2$ID +) + +cn_mat2 <- ped2cn(fam2, + sparse = FALSE, + famID = "famID", personID = "ID", + momID = "momID", dadID = "dadID", sex = "sex", + keep_ids = fam2$ID +) + +mt_mat2 <- ped2mit(fam2, + sparse = FALSE, + famID = "famID", personID = "ID", + momID = "momID", dadID = "dadID", sex = "sex", + keep_ids = fam2$ID +) + +dim(add_mat2) +``` + +## Family 2: phenotype data + +```{r two-fam-pheno, eval = run_models} +id_order2 <- rownames(add_mat2) + +pheno_vals2 <- fam2$DA2[match(id_order2, as.character(fam2$ID))] +names(pheno_vals2) <- id_order2 + +observed2 <- !is.na(pheno_vals2) +obs_ids2 <- make.names(id_order2[observed2]) +raw_obs_ids2 <- id_order2[observed2] + +pheno_row2 <- matrix( + as.double(pheno_vals2[observed2]), + nrow = 1, + dimnames = list(NULL, obs_ids2) +) + +cat("Family 2 observed:", sum(observed2), "of", length(id_order2), "\n") +``` + +## Family 2: subset matrices + +```{r two-fam-subset-mat, eval = run_models} +add_mat2_obs <- add_mat2[raw_obs_ids2, raw_obs_ids2] +cn_mat2_obs <- cn_mat2[raw_obs_ids2, raw_obs_ids2] +mt_mat2_obs <- mt_mat2[raw_obs_ids2, raw_obs_ids2] + +rownames(add_mat2_obs) <- colnames(add_mat2_obs) <- obs_ids2 +rownames(cn_mat2_obs) <- colnames(cn_mat2_obs) <- obs_ids2 +rownames(mt_mat2_obs) <- colnames(mt_mat2_obs) <- obs_ids2 +``` + +## Check identification across both families + +```{r two-fam-identify, eval = run_models} +n_total <- nrow(add_mat1_obs) + nrow(add_mat2_obs) + +id_check2 <- identifyComponentModel( + A = list(add_mat1_obs, add_mat2_obs), + Cn = list(cn_mat1_obs, cn_mat2_obs), + Mt = list(mt_mat1_obs, mt_mat2_obs), + E = diag(1, n_total) +) +id_check2 +``` + +## Build and fit the two-family model + +Each family gets its own group model. The key difference from Part 1: `buildPedigreeMx()` now receives both groups, and `mxFitFunctionMultigroup()` combines their likelihoods. The variance component parameters (`Vad`, `Vcn`, `Ver`) are estimated jointly. + +```{r two-fam-fit, eval = run_models} +# Family 2 group model (identical call structure to family 1) +group2 <- buildOneFamilyGroup( + group_name = "family2", + Addmat = add_mat2_obs, + Nucmat = cn_mat2_obs, + Mtdmat = mt_mat2_obs, + full_df_row = pheno_row2, + obs_ids = obs_ids2 +) + +# Both groups share the same variance component parameters +model2 <- buildPedigreeMx( + model_name = "TwoFamilyModel", + vars = start_vars, + group_models = list(group1, group2) +) + +fitted2 <- mxRun(model2) +summary(fitted2) +``` + +```{r two-fam-results, eval = run_models} +cat("Additive genetic (Vad):", fitted2$ModelOne$Vad$values, "\n") +cat("Common nuclear (Vcn):", fitted2$ModelOne$Vcn$values, "\n") +cat("Mitochondrial (Vmt):", fitted2$ModelOne$Vmt$values, "\n") +cat("Unique environ. (Ver):", fitted2$ModelOne$Ver$values, "\n") +``` + +The estimates from both families together are generally more stable than those from either family alone. + +--- + + +# Part 3: Scaling Up — Many Families and Parameter Recovery + +Once you understand the pattern from Parts 1 and 2, you can scale to many families using a loop. We switch to **simulated data** here so we have known true parameter values to check our estimates against. + +## Simulate pedigrees and data + +```{r simulate-many, eval = has_mvtnorm && run_models} +library(mvtnorm) +set.seed(2024) + +# True variance components +true_var <- list( + ad2 = 0.40, cn2 = 0.10, ee2 = 0.40, + ce2 = 0, mt2 = 0.10, dd2 = 0, am2 = 0 +) + +n_families <- 15 + +# Pre-allocate storage +add_list <- vector("list", n_families) +cn_list <- vector("list", n_families) +mt_list <- vector("list", n_families) +obs_ids_list <- vector("list", n_families) +pheno_list <- vector("list", n_families) + +for (i in seq_len(n_families)) { + ped_i <- simulatePedigree(kpc = 3, Ngen = 4, marR = 0.6) + + A_i <- as.matrix(ped2add(ped_i, sparse = FALSE)) + Cn_i <- as.matrix(ped2cn(ped_i, sparse = FALSE)) + Mt_i <- as.matrix(ped2mit(ped_i, sparse = FALSE)) + n_i <- nrow(A_i) + + # Implied covariance from true parameters + V_i <- true_var$ad2 * A_i + + true_var$cn2 * Cn_i + + true_var$mt2 * Mt_i + + true_var$ee2 * diag(1, n_i) + + y_i <- rmvnorm(1, sigma = V_i) + + ids_i <- make.names(rownames(A_i)) + rownames(A_i) <- colnames(A_i) <- ids_i + rownames(Cn_i) <- colnames(Cn_i) <- ids_i + rownames(Mt_i) <- colnames(Mt_i) <- ids_i + + add_list[[i]] <- A_i + cn_list[[i]] <- Cn_i + mt_list[[i]] <- Mt_i + obs_ids_list[[i]] <- ids_i + pheno_list[[i]] <- matrix(y_i, nrow = 1, dimnames = list(NULL, ids_i)) +} + +cat("Family sizes:", vapply(add_list, nrow, integer(1)), "\n") +``` + +## Build and fit the multi-family model + +```{r fit-many, eval = has_mvtnorm && run_models} +group_models <- lapply(seq_len(n_families), function(i) { + buildOneFamilyGroup( + group_name = paste0("ped", i), + Addmat = add_list[[i]], + Nucmat = cn_list[[i]], + Mtdmat = mt_list[[i]], + full_df_row = pheno_list[[i]], + obs_ids = obs_ids_list[[i]] + ) +}) + +multi_model <- buildPedigreeMx( + model_name = "MultiPedigreeModel", + vars = start_vars, + group_models = group_models +) + +fitted_multi <- mxRun(multi_model) +``` + +## Compare estimates to true values + +```{r compare-many, eval = has_mvtnorm && run_models} +data.frame( + Component = c( + "Additive genetic (Vad)", + "Common nuclear (Vcn)", + "Mitochondrial (Vmt)", + "Unique environ. (Ver)" + ), + True = c( + true_var$ad2, true_var$cn2, + true_var$mt2, + true_var$ee2 + ), + Estimated = round(c( + fitted_multi$ModelOne$Vad$values, + fitted_multi$ModelOne$Vcn$values, + fitted_multi$ModelOne$Vmt$values, + fitted_multi$ModelOne$Ver$values + ), 4) +) +``` + +With ten families, estimates should be substantially closer to the true values than with a single family. + +## Using the high-level wrapper + +For convenience, `fitPedigreeModel()` wraps the build and fit steps together, and uses `mxTryHard()` for robust optimization: + +```{r fit-wrapper, eval = has_mvtnorm && run_models} +fitted_easy <- fitPedigreeModel( + model_name = "EasyFit", + vars = start_vars, + group_models = group_models, + tryhard = TRUE +) +summary(fitted_easy) +``` + +--- + +# Understanding the Covariance Structure + +The model estimated above is defined by: + +$$V = \sigma^2_a \mathbf{A} + \sigma^2_{cn} \mathbf{C}_n + \sigma^2_e \mathbf{I}$$ + +where $\mathbf{A}$ is the additive genetic relatedness matrix, $\mathbf{C}_n$ is the common nuclear environment matrix, $\mathbf{I}$ is the identity matrix (unique environment), and the $\sigma^2$ terms are the variance components estimated by the model. + +This is an extension of the classical twin model to arbitrary pedigree structures. Additional components can be added by passing `Extmat` (common extended environment), `Mtdmat` (mitochondrial), or `Dmgmat` (dominance) to `buildOneFamilyGroup()`. See `vignette("v1_modelingvariancecomponents", package = "BGmisc")` for background on identification. + +--- + +# Summary + +| Part | Data | Key point | +|------|------|-----------| +| 1 — Single family | `hazard`, family 1 | Full pipeline, one group model | +| 2 — Two families | `hazard`, families 1 & 2 | Same steps twice; `buildPedigreeMx()` combines them | +| 3 — Many families | Simulated | Loop over families; verify parameter recovery | + +The helper functions (`buildOneFamilyGroup()`, `buildPedigreeMx()`, `fitPedigreeModel()`) handle the translation from pedigree relatedness matrices into OpenMx model specifications. The pattern is always the same: one call to `buildOneFamilyGroup()` per family, then one call to `buildPedigreeMx()` to combine them. diff --git a/vignettes/v6_pedigree_model_fitting.Rmd b/vignettes/v6_pedigree_model_fitting.Rmd deleted file mode 100644 index 5cd82e01..00000000 --- a/vignettes/v6_pedigree_model_fitting.Rmd +++ /dev/null @@ -1,558 +0,0 @@ ---- -title: "Fitting Pedigree-Based Variance Component Models" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Fitting Pedigree-Based Variance Component Models} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" -) -options(rmarkdown.html_vignette.check_title = FALSE) -``` - -# Introduction - -A core goal of behavior genetics is to decompose observed phenotypic variance into genetic and environmental components. Traditionally, this has been done using twin studies, which compare monozygotic (MZ) and dizygotic (DZ) twin pairs. However, extended pedigrees -- multi-generational families with known parentage -- provide richer information about relatedness and allow researchers to estimate a wider array of variance components. - -The `BGmisc` package provides a complete pipeline for pedigree-based variance component modeling: - -1. **Simulate** multi-generational pedigrees with `simulatePedigree()` -2. **Compute** relatedness matrices with `ped2add()`, `ped2cn()`, `ped2mit()`, and `ped2ce()` -3. **Check identification** with `identifyComponentModel()` and `comp2vech()` -4. **Build and fit** structural equation models with `buildOneFamilyGroup()`, `buildPedigreeMx()`, and `fitPedigreeModel()` - -This vignette walks through each step, from generating a pedigree to fitting a variance component model and interpreting the results. - -## Prerequisites - -This vignette requires the `OpenMx` package for structural equation modeling. If you don't have it installed, you can install it from CRAN or from the OpenMx website. - -```{r setup} -library(BGmisc) - -has_openmx <- requireNamespace("OpenMx", quietly = TRUE) -has_mvtnorm <- requireNamespace("mvtnorm", quietly = TRUE) - -if (has_openmx) { - library(OpenMx) -} else { - message( - "OpenMx is not installed. Code will be shown but not executed.\n", - "Install OpenMx with: install.packages('OpenMx')" - ) -} - -if (!has_mvtnorm) { - message( - "mvtnorm is not installed. Data simulation examples will not run.\n", - "Install mvtnorm with: install.packages('mvtnorm')" - ) -} else { - library(mvtnorm) -} - -run_models <- has_openmx && has_mvtnorm -``` - - -# Step 1: Simulate a Pedigree - -We begin by simulating a multi-generational pedigree using `simulatePedigree()`. This creates a balanced family structure with a specified number of generations and children per couple. - -```{r simulate-pedigree} -set.seed(421) - -ped <- simulatePedigree( - kpc = 3, # 3 children per couple - Ngen = 4, # 4 generations - sexR = 0.5, # equal sex ratio - marR = 0.6 # 60% mating rate -) - -head(ped) -``` - -The resulting data frame contains one row per individual with columns for family ID (`fam`), personal ID (`ID`), generation (`gen`), parent IDs (`dadID`, `momID`), spouse ID (`spID`), and biological sex (`sex`). - -- Number of individuals: ``r nrow(ped)`` -- Number of generations: ``r length(unique(ped$gen))`` - -```{r ped-summary} -summarizeFamilies(ped, famID = "fam")$family_summary -``` - - -# Step 2: Compute Relatedness Matrices - -With a pedigree in hand, we compute the various relatedness component matrices. Each matrix is square and symmetric, with rows and columns corresponding to individuals in the pedigree. The entry in row *i* and column *j* quantifies the relatedness between person *i* and person *j* along a specific pathway. - -## Additive Genetic Relatedness - -The additive genetic relatedness matrix captures the expected proportion of nuclear DNA shared identical by descent (IBD) between two individuals. For example, parent-offspring pairs share 0.5, full siblings share 0.5 on average, half-siblings share 0.25, and so on. - -```{r compute-additive} -add_matrix <- ped2add(ped, sparse = FALSE) -add_matrix[1:5, 1:5] -``` - -## Common Nuclear Environment - -The common nuclear environment matrix captures whether two individuals shared both biological parents (i.e., were raised in the same nuclear family). Full siblings who share the same mother and father have a value of 1; all others have 0. - -```{r compute-nuclear} -cn_matrix <- ped2cn(ped, sparse = FALSE) -cn_matrix[1:5, 1:5] -``` - -## Mitochondrial Relatedness - -The mitochondrial relatedness matrix captures shared maternal lineage. Individuals who share the same maternal line (mother, maternal grandmother, etc.) share mitochondrial DNA. - -```{r compute-mito} -mt_matrix <- ped2mit(ped, sparse = FALSE) -mt_matrix[1:5, 1:5] -``` - -## Common Extended Family Environment - -The common extended family environment matrix indicates whether individuals belong to the same extended family. For a single pedigree, this is simply a matrix of ones. - -```{r compute-extended} -ce_matrix <- ped2ce(ped) -ce_matrix[1:5, 1:5] -``` - - -# Step 3: Check Model Identification - -Before fitting a model, we need to verify that the variance components are *identified* -- that is, the data provide enough information to uniquely estimate each parameter. If components are not identified, their estimates can trade off against each other, leading to unstable or meaningless results. - -The `identifyComponentModel()` function checks identification by vectorizing each relatedness component matrix (via `comp2vech()`) and testing whether the resulting design matrix has full column rank. Each component matrix becomes a column in this design matrix. If the rank equals the number of components, the model is identified. - -For more background on identification in variance component models, see `vignette("v1_modelingvariancecomponents", package = "BGmisc")`. - -## Checking Our Full Model - -We plan to fit a 5-component model with additive genetic (A), common nuclear environment (Cn), common extended environment (Ce), mitochondrial (Mt), and unique environment (E) components. Let's check whether these five components are simultaneously identified using the relatedness matrices we just computed: - -```{r identify-full-model} -id_full <- identifyComponentModel( - A = add_matrix, - Cn = cn_matrix, - Ce = ce_matrix, - Mt = mt_matrix, - E = diag(1, nrow(add_matrix)) -) -id_full -``` -```{r, include=FALSE} -identified <- id_full$identified - -identified_text <- "The full model is identified. We can proceed to fit it. However, to illustrate the process of checking identification and refining the model, we will also show how to interpret the details of the identification check. Below, I have provided an unidentified model to demonstrate how to use the `nidp` element of the result to understand which components are confounded." - -not_identified_text <- "The full model is NOT identified. We will need to refine the model by dropping or fixing some components." -``` - -`r if (identified) paste(identified_text) else not_identified_text` - -```{r identify-full-model-details, include = identified} -# show if model is identified - -identifyComponentModel( - A = add_matrix, - A2 = add_matrix, - Cn = cn_matrix, - Ce = ce_matrix, - Mt = mt_matrix, - E = diag(1, nrow(add_matrix)) -) -``` - -With a single pedigree, the 5-component model *may* not be fully identified. The `nidp` element in the result tells us which components are confounded. This is because some relatedness matrices can be linearly dependent -- for example, `ce_matrix` is a matrix of all ones for a single family, and the identity matrix (E) plus other components may span a similar space. In this case, our model is identified, but if it were not, we would see a message like "Variance components are not all identified. (And even if they were, we might not have enough data to estimate them all.) - -## Narrowing Down to an Identified Model - -Based on the identification check above, we can drop or fix the non-identified components. A natural choice is to remove the components flagged by `identifyComponentModel()` and re-check: - -```{r identify-reduced} -# A simpler model: A + Cn + E -id_ace <- identifyComponentModel( - A = list(add_matrix), - Cn = list(cn_matrix), - E = diag(1, nrow(add_matrix)) -) -id_ace -``` - -A single extended pedigree typically provides enough variation in relatedness coefficients (parent-offspring = 0.5, siblings = 0.5, grandparent-grandchild = 0.25, cousins = 0.125, etc.) to identify the A + Cn + E model. - -## Recovering Identification with Multiple Families - -When a component is not identified with one family, adding families with different structures can help. Each family contributes additional rows to the design matrix. Let's check whether the full 5-component model becomes identified when we combine two pedigrees: - -```{r identify-multi} -set.seed(99) -ped2 <- simulatePedigree(kpc = 4, Ngen = 3, marR = 0.5) |> makeTwins() -add2 <- ped2add(ped2, sparse = FALSE) -cn2 <- ped2cn(ped2, sparse = FALSE) -ce2 <- ped2ce(ped2) -mt2 <- ped2mit(ped2, sparse = FALSE) - -# Check the full model across two families -n_combined <- nrow(add_matrix) + nrow(add2) -id_two_fam <- identifyComponentModel( - A = list(add_matrix, add2), - Cn = list(cn_matrix, cn2), - Ce = list(ce_matrix, ce2), - Mt = list(mt_matrix, mt2), - E = diag(1, n_combined) -) -id_two_fam -``` - -This result guides our modeling decisions in the steps that follow. When fitting the model below, we set the non-identified components' true values to zero so that we have a known ground truth to recover. - - -# Step 4: Simulate Phenotypic Data - -Before fitting a model, we need observed data. In practice, this would be measured phenotypes (e.g., height, cognitive ability, disease status). Here, we simulate phenotypic data from a known variance component structure so we can verify that our model recovers the true parameters. - -We define "true" variance components and use the relatedness matrices to construct the population covariance matrix, then sample from it. - -```{r simulate-phenotype, eval = has_mvtnorm} -# True variance components (proportions of total variance) -true_var <- list( - ad2 = 0.50, # additive genetic - cn2 = 0.10, # common nuclear environment - ce2 = 0.00, # common extended environment (set to 0 for identifiability) - mt2 = 0.00, # mitochondrial (set to 0 for simplicity) - ee2 = 0.40 # unique environment (residual) -) - -# Build the implied covariance matrix -# V = ad2*A + cn2*Cn + ce2*U + mt2*Mt + ee2*I -n <- nrow(add_matrix) -I_mat <- diag(1, n) -U_mat <- matrix(1, n, n) - -V_true <- true_var$ad2 * add_matrix + - true_var$cn2 * cn_matrix + - true_var$ce2 * U_mat + - true_var$mt2 * mt_matrix + - true_var$ee2 * I_mat - -# Simulate one realization of the phenotype vector -set.seed(123) -y <- mvtnorm::rmvnorm(1, sigma = V_true) - - -# Create named variable labels (required by OpenMx) -ytemp <- paste("S", rownames(add_matrix)) -``` - -```{r show-phenotype} -if (!exists("y")) { - y <- rep(NA, nrow(add_matrix)) -} -``` - - -We simulated phenotypic data for`r ncol(y)` individuals, with a mean of `r round(mean(y), 3)` and a standard deviation of `r round(sd(y), 3)`. The variance in this simulated phenotype arises from the specified genetic and environmental components according to the covariance structure we defined. - -In practice, you would have data from multiple independently ascertained families. Here we simulate data from a single pedigree for simplicity, but the model-fitting functions support multiple pedigrees (shown in a later section). - - -# Step 5: Build the OpenMx Model - -The `BGmisc` package provides helper functions that construct the OpenMx model in three layers: - -1. **`buildPedigreeModelCovariance()`** -- Creates the variance component parameters (free parameters to be estimated) -2. **`buildOneFamilyGroup()`** -- Creates the model for a single family, embedding the relatedness matrices and observed data -3. **`buildPedigreeMx()`** -- Combines the variance components with one or more family groups into a multi-group model - -## Building a Single-Family Model - -Let's walk through building the model step by step. - -### Variance Component Parameters - -The `buildPedigreeModelCovariance()` function creates the OpenMx sub-model that holds the free variance component parameters. You can control which components to include: - -```{r build-covariance, eval = run_models} -# Starting values for the optimizer -start_vars <- list( - ad2 = 0.3, dd2 = 0, cn2 = 0.1, - ce2 = 0.1, mt2 = 0.1, am2 = 0, - ee2 = 0.4 -) - -# Build variance component sub-model -vc_model <- buildPedigreeModelCovariance( - vars = start_vars, - Vad = TRUE, # estimate additive genetic variance - Vdd = FALSE, # do not estimate dominance - Vcn = TRUE, # estimate common nuclear environment - Vce = TRUE, # estimate common extended environment - Vmt = TRUE, # estimate mitochondrial - Vam = FALSE, # do not estimate A x Mt interaction - Ver = TRUE # estimate unique environment -) -vc_model - -summary(vc_model) -``` - -### Family Group Model - -The `buildOneFamilyGroup()` function constructs the model for one family. It takes the relatedness matrices and the observed data for that family: - -```{r build-group, eval = run_models} -# Format the observed data as a 1-row matrix with named columns -obs_data <- matrix(y, nrow = 1, dimnames = list(NULL, ytemp)) - -# Build the family group model -family_group <- buildOneFamilyGroup( - group_name = "family1", - Addmat = add_matrix, - Nucmat = cn_matrix, - Extmat = ce_matrix, - Mtdmat = mt_matrix, - full_df_row = obs_data, - ytemp = ytemp -) -``` - -The family group model contains: - -- **Identity matrix (I)** and **unit matrix (U)** for the unique and extended environment components -- **Relatedness matrices** (A, Cn, Mt, etc.) as fixed data matrices -- **An mxAlgebra** that computes the implied covariance: $V = \sigma^2_a A + \sigma^2_{cn} C_n + \sigma^2_{ce} U + \sigma^2_{mt} Mt + \sigma^2_e I$ -- **Normal expectation** with the covariance and a free mean - - -### Assembling the Full Model - -The `buildPedigreeMx()` function combines the variance component parameters (shared across all families) with the family group model(s): - -```{r build-full, eval = run_models} -full_model <- buildPedigreeMx( - model_name = "PedigreeVCModel", - vars = start_vars, - group_models = list(family_group) -) -full_model$submodels -``` - - -# Step 6: Fit the Model - -With the model assembled, we fit it using OpenMx's optimizer. The `mxRun()` function performs maximum likelihood estimation: - -```{r fit-model, eval = run_models} -fitted_model <- mxRun(full_model) -smr <- summary(fitted_model) -``` - -```{r show-results, eval = run_models} -# Extract variance component estimates -estimates <- c( - Vad = fitted_model$ModelOne$Vad$values[1, 1], - Vcn = fitted_model$ModelOne$Vcn$values[1, 1], - Vce = fitted_model$ModelOne$Vce$values[1, 1], - Vmt = fitted_model$ModelOne$Vmt$values[1, 1], - Ver = fitted_model$ModelOne$Ver$values[1, 1] -) - -# Compare estimates to true values -truth <- c( - Vad = true_var$ad2, - Vcn = true_var$cn2, - Vce = true_var$ce2, - Vmt = true_var$mt2, - Ver = true_var$ee2 -) - -comparison <- data.frame( - Component = names(truth), - True = truth, - Estimated = round(estimates, 4) -) -comparison -``` - -```{r show-fit-stats, eval = run_models} -cat("-2 Log Likelihood:", smr$Minus2LogLikelihood, "\n") -cat("Converged:", fitted_model$output$status$code == 0, "\n") -``` - -With a single pedigree realization, estimates will vary from the true values due to sampling variability. Estimation improves substantially with multiple families, as shown next. - - -# Step 7: Multi-Pedigree Model - -In practice, researchers have data from multiple families. The BGmisc helpers are designed for this multi-group scenario, where variance component parameters are shared across families but each family has its own relatedness structure and data. - -## Simulating Multiple Families - -```{r multi-ped-simulate, eval = run_models} -set.seed(2024) -n_families <- 5 - -# Storage -ped_list <- vector("list", n_families) -add_list <- vector("list", n_families) -cn_list <- vector("list", n_families) -mt_list <- vector("list", n_families) -ce_list <- vector("list", n_families) -y_list <- vector("list", n_families) -ytemp_list <- vector("list", n_families) - -for (i in seq_len(n_families)) { - # Simulate each family with slightly different structure - ped_i <- simulatePedigree(kpc = 3, Ngen = 4, marR = 0.6) - ped_list[[i]] <- ped_i - - # Compute relatedness matrices - A_i <- as.matrix(ped2add(ped_i)) - Cn_i <- as.matrix(ped2cn(ped_i)) - Mt_i <- as.matrix(ped2mit(ped_i)) - Ce_i <- ped2ce(ped_i) - n_i <- nrow(A_i) - - add_list[[i]] <- A_i - cn_list[[i]] <- Cn_i - mt_list[[i]] <- Mt_i - ce_list[[i]] <- Ce_i - - # Build implied covariance and simulate data - I_i <- diag(1, n_i) - U_i <- matrix(1, n_i, n_i) - V_i <- true_var$ad2 * A_i + - true_var$cn2 * Cn_i + - true_var$ce2 * U_i + - true_var$mt2 * Mt_i + - true_var$ee2 * I_i - - y_list[[i]] <- mvtnorm::rmvnorm(1, sigma = V_i) - ytemp_list[[i]] <- paste("S", rownames(A_i)) -} - -cat("Simulated", n_families, "families\n") -cat("Family sizes:", vapply(ped_list, nrow, integer(1)), "\n") -``` - -## Building and Fitting the Multi-Group Model - -We build a group model for each family and then combine them: - -```{r multi-ped-fit, eval = run_models} -# Build group models for each family -group_models <- lapply(seq_len(n_families), function(i) { - obs_data_i <- matrix(y_list[[i]], nrow = 1, dimnames = list(NULL, ytemp_list[[i]])) - - buildOneFamilyGroup( - group_name = paste0("ped", i), - Addmat = add_list[[i]], - Nucmat = cn_list[[i]], - Extmat = ce_list[[i]], - Mtdmat = mt_list[[i]], - full_df_row = obs_data_i, - ytemp = ytemp_list[[i]] - ) -}) - -# Build the multi-group model -multi_model <- buildPedigreeMx( - model_name = "MultiPedigreeModel", - vars = start_vars, - group_models = group_models -) - -# Fit the model -fitted_multi <- mxRun(multi_model) -smr_multi <- summary(fitted_multi) -``` - -```{r multi-ped-results, eval = run_models} -# Extract and compare estimates -estimates_multi <- c( - Vad = fitted_multi$ModelOne$Vad$values[1, 1], - Vcn = fitted_multi$ModelOne$Vcn$values[1, 1], - Vce = fitted_multi$ModelOne$Vce$values[1, 1], - Vmt = fitted_multi$ModelOne$Vmt$values[1, 1], - Ver = fitted_multi$ModelOne$Ver$values[1, 1] -) - -comparison_multi <- data.frame( - Component = c( - "Additive genetic (Vad)", "Common nuclear (Vcn)", - "Common extended (Vce)", "Mitochondrial (Vmt)", - "Unique environment (Ver)" - ), - True = truth, - Estimated = round(estimates_multi, 4) -) -comparison_multi - -cat("\n-2 Log Likelihood:", smr_multi$Minus2LogLikelihood, "\n") -cat("Converged:", fitted_multi$output$status$code == 0, "\n") -``` - -With multiple families providing more information, the estimates should more closely approximate the true generating values. - - -# Using the High-Level `fitPedigreeModel()` Wrapper - -For convenience, `fitPedigreeModel()` wraps the build and fit steps together. It accepts pre-built group models and uses `mxTryHard()` for robust optimization with multiple starts: - -```{r fit-highlevel, eval = run_models} -fitted_easy <- fitPedigreeModel( - model_name = "EasyFit", - vars = start_vars, - data = NULL, - group_models = group_models, - tryhard = TRUE -) - -summary(fitted_easy) -``` - - -# Understanding the Covariance Structure - -The key equation underlying the model is: - -$$V = \sigma^2_a \mathbf{A} + \sigma^2_{cn} \mathbf{C}_n + \sigma^2_{ce} \mathbf{U} + \sigma^2_{mt} \mathbf{M} + \sigma^2_e \mathbf{I}$$ - -where: - -- $\mathbf{A}$ is the additive genetic relatedness matrix (from `ped2add()`) -- $\mathbf{C}_n$ is the common nuclear environment matrix (from `ped2cn()`) -- $\mathbf{U}$ is a matrix of ones representing shared extended family environment (from `ped2ce()`) -- $\mathbf{M}$ is the mitochondrial relatedness matrix (from `ped2mit()`) -- $\mathbf{I}$ is the identity matrix (unique environment, one per person) -- $\sigma^2_a, \sigma^2_{cn}, \sigma^2_{ce}, \sigma^2_{mt}, \sigma^2_e$ are the variance components to be estimated - -This is an extension of the classical twin model to arbitrary pedigree structures. The additive genetic relatedness matrix generalizes the concept of MZ twins sharing 100% of genes and DZ twins sharing 50% -- in a pedigree, every pair of relatives has a specific coefficient of relatedness determined by their genealogical connection. - - -# Summary - -This vignette demonstrated the full workflow for pedigree-based variance component modeling: - -| Step | Function | Purpose | -|------|----------|---------| -| 1 | `simulatePedigree()` | Generate a multi-generational pedigree | -| 2 | `ped2add()`, `ped2cn()`, `ped2mit()`, `ped2ce()` | Compute relatedness matrices | -| 3 | `identifyComponentModel()`| Check model identification | -| 4 | Simulate or prepare phenotypic data | Observed data for model fitting | -| 5 | `buildOneFamilyGroup()`, `buildPedigreeModelCovariance()` | Build OpenMx model components | -| 6 | `buildPedigreeMx()`, `mxRun()` or `fitPedigreeModel()` | Assemble and fit the model | -| 7 | Multiple families | Scale to multi-group pedigree models | - -The helper functions (`buildPedigreeModelCovariance()`, `buildOneFamilyGroup()`, `buildFamilyGroups()`, `buildPedigreeMx()`, `fitPedigreeModel()`) handle the mechanics of translating pedigree relatedness matrices into proper OpenMx model specifications, allowing researchers to focus on the substantive questions rather than the modeling boilerplate. diff --git a/vignettes/v7_trimPedigree.Xmd b/vignettes/v7_trimPedigree.Xmd new file mode 100644 index 00000000..f355ea12 --- /dev/null +++ b/vignettes/v7_trimPedigree.Xmd @@ -0,0 +1,341 @@ +--- +title: "Trimming Large Pedigrees with BGmisc" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Trimming Large Pedigrees} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +options(rmarkdown.html_vignette.check_title = FALSE) +``` + +# Introduction + +Pedigree data collected in genetic epidemiology studies are rarely tidy. A +registry may include thousands of individuals, but the people who actually +carry phenotypic measurements --- disease diagnoses, biomarker values, survey +responses --- are often a small subset. The rest are relatives recruited to +establish family structure: parents of probands, spouses who appear in only +one mating, distant founders entered solely to connect two branches of a tree. + +These peripheral individuals matter for defining genetic relationships, but +once those relationships are established they may add little to an analysis. +Keeping them inflates relatedness matrices, slows model fitting, and increases +memory use, all without improving estimates. The practical question is: *which +individuals can be safely removed without disrupting the connectivity of the +core network?* + +The answer depends on graph structure. In the directed pedigree graph used by +`BGmisc` (edges run from parent to child), two kinds of node are structurally +dispensable: + +- **Terminal nodes** have no children (outdegree 0). They sit at the tips of + the tree. Removing them severs no path between any other pair of + individuals. +- **Founder singletons** have no recorded parents (indegree 0) and exactly + one child (outdegree 1). They are attached to the network by a single edge. + Removing them leaves every other connection intact. + +`BGmisc` provides two functions that operationalise this idea: + +- `findLeaves()` identifies which individuals currently qualify as leaves. +- `trimPedigree()` removes them iteratively, peeling the pedigree from the + outside in until a stopping condition is met. + +Crucially, topology alone should not decide who gets removed. An individual +who is a structural leaf but carries a disease diagnosis should stay. Both +functions therefore accept a `keep_var` argument that protects individuals +based on their phenotypic data, regardless of their position in the graph. + +## Data + +```{r setup, message = FALSE, warning = FALSE} +library(BGmisc) +library(ggpedigree) +library(ggplot2) +data(hazard) +``` + +The `hazard` dataset contains `r nrow(hazard)` individuals across two extended +families, simulated with an age-related disease hazard. It includes an +`affected` column (logical disease status) alongside standard pedigree +variables. The two families are shown below, with filled shapes indicating +affected individuals. + +```{r fig-full, fig.width = 10, fig.height = 5, warning = FALSE, message = FALSE, fig.cap = "Full hazard pedigree. Filled shapes indicate affected individuals."} +ggpedigree( + hazard, + personID = "ID", + famID = "FamID", + overlay_column = "affected", + config = list( + overlay_include = TRUE, + code_male = 0, + sex_color_include = FALSE + ) +) + + facet_wrap(~FamID, scales = "free") +``` + +# Finding Leaves + +`findLeaves()` returns the IDs of all current leaf nodes. Both types are +enabled by default. Use `verbose = TRUE` to see the breakdown by type: + +```{r, message = TRUE} +leaves <- findLeaves(hazard, verbose = TRUE) +leaves +``` + +To make the candidates concrete, we can mark them in the dataset and plot +them directly. In the figure below, filled shapes are leaf candidates --- +nodes that could be removed without severing any connection between the +remaining individuals. + +```{r fig-leaves, fig.width = 10, fig.height = 5, warning = FALSE, message = FALSE, fig.cap = "Leaf candidates (filled). These individuals can be removed without disconnecting the rest of the network."} +hazard_marked <- hazard +hazard_marked$is_leaf <- hazard_marked$ID %in% leaves + +ggpedigree( + hazard_marked, + personID = "ID", + famID = "FamID", + overlay_column = "is_leaf", + config = list( + overlay_include = TRUE, + code_male = 0, + sex_color_include = FALSE + ) +) + + facet_wrap(~FamID, scales = "free") +``` + +You can restrict to one leaf type if needed: + +```{r} +terminal_only <- findLeaves(hazard, include_founder_singletons = FALSE) +singletons_only <- findLeaves(hazard, include_terminal = FALSE) + +cat("Terminal only: ", length(terminal_only), "\n") +cat("Singletons only: ", length(singletons_only), "\n") +cat("Both combined: ", length(leaves), "\n") +``` + +# Iterative Trimming + +Identifying leaves once is rarely enough. After the outermost tips are removed, +what were previously internal nodes may themselves become leaves. `trimPedigree()` +handles this by calling `findLeaves()` in a loop, removing one layer at a time +until nothing more can be pruned or a stopping condition is reached. + +```{r, message = TRUE} +trimPedigree(hazard, verbose = TRUE) + +trimmed <- trimPedigree(hazard, verbose = TRUE,max_iter =1) + +cat("Before:", nrow(hazard), "individuals\n") +cat("After: ", nrow(trimmed), "individuals\n") +``` + +Parent ID columns are updated automatically: any `momID` or `dadID` that +pointed to a removed individual is set to `NA`. + +```{r} +removed_ids <- setdiff(as.character(hazard$ID), as.character(trimmed$ID)) +sum(as.character(trimmed$dadID) %in% removed_ids, na.rm = TRUE) +sum(as.character(trimmed$momID) %in% removed_ids, na.rm = TRUE) +``` + +The resulting tree retains only the structurally essential core --- the +individuals who connect others and could not be peeled away without breaking +the network. + +```{r fig-trimmed, fig.width = 10, fig.height = 5, warning = FALSE, message = FALSE, fig.cap = "Pedigree after full trimming. Filled shapes indicate affected individuals."} +ggpedigree( + trimmed, + personID = "ID", +# famID = "FamID", +# status_column = "affected", # not working with overlays ``shrug + overlays = list( + list(column = "affected", code_affected = TRUE, shape = "cross", color = "black"), + list(column = "affected", code_affected = TRUE, shape = "slash", color = "red", stroke = 2) + ), + config = list( + overlay_include = TRUE, + status_include = TRUE, + code_male = 0, + sex_color_include = TRUE, + overlay_mode = "shape" + ) +) +``` + +# Stopping Rules + +## `max_iter`: limit the number of passes + +`max_iter` trims only a fixed number of outer shells, leaving the broader +structure intact: + +```{r, message = TRUE} +trimmed_1 <- trimPedigree(hazard, max_iter = 1L, verbose = TRUE) +trimmed_2 <- trimPedigree(hazard, max_iter = 2L, verbose = TRUE) + +cat("After 1 pass: ", nrow(trimmed_1), "individuals\n") +cat("After 2 passes:", nrow(trimmed_2), "individuals\n") +cat("Full trim: ", nrow(trimmed), "individuals\n") +``` + +## `min_size`: retain a minimum number of individuals + +`min_size` halts trimming before any pass that would drop below the threshold: + +```{r, message = TRUE} +trimmed_min <- trimPedigree(hazard, min_size = 20L, verbose = TRUE) +cat("Retained:", nrow(trimmed_min), "individuals\n") +``` + +The two rules can be combined; trimming stops as soon as either is met: + +```{r, message = TRUE} +trimmed_both <- trimPedigree(hazard, max_iter = 2L, min_size = 20L, verbose = TRUE) +cat("Retained:", nrow(trimmed_both), "individuals\n") +``` + +# Phenotype-Guided Trimming + +Consider a proband-ascertained study: families were recruited because at least +one member was diagnosed with a condition of interest. Many relatives were then +enrolled simply to fill out the pedigree. Those relatives may be structural +leaves --- but if they themselves carry a diagnosis, removing them would discard +the very observations that motivated data collection. + +The `keep_var` and `keep_vals` arguments solve this by protecting individuals +based on their phenotypic values. The protection is re-applied at every +iteration, so an individual who becomes a leaf only after several rounds of +trimming is still safe if their phenotype qualifies. + +In `hazard`, disease status is recorded in `affected`: + +```{r} +table(hazard$affected, useNA = "ifany") +``` + +## Protect anyone with a recorded phenotype + +Omitting `keep_vals` protects every individual whose `keep_var` value is +non-missing: + +```{r, message = TRUE} +trimmed_pheno <- trimPedigree(hazard, keep_var = "affected", verbose = TRUE) + +cat("Without protection:", nrow(trimmed), "individuals\n") +cat("With protection: ", nrow(trimmed_pheno), "individuals\n") + +# Confirm no phenotyped individual was removed +all(hazard$ID[!is.na(hazard$affected)] %in% trimmed_pheno$ID) +``` + +The resulting tree is larger than the basic trim because affected individuals +are held in place even when they sit at the tips of the graph. + +```{r fig-pheno, fig.width = 10, fig.height = 5, warning = FALSE, message = FALSE, fig.cap = "Pedigree after phenotype-protected trimming. Filled shapes are affected individuals, all retained regardless of their structural position."} +ggpedigree( + trimmed_pheno, + personID = "ID", + famID = "FamID", + overlay_column = "affected", + config = list( + overlay_include = TRUE, + code_male = 0, + sex_color_include = FALSE + ) +) + + facet_wrap(~FamID, scales = "free") +``` + +## Protect only individuals with a specific value + +Pass `keep_vals` to narrow the protection to particular values. Here we keep +only affected individuals and allow removal of unaffected leaves: + +```{r, message = TRUE} +trimmed_affected <- trimPedigree( + hazard, + keep_var = "affected", + keep_vals = TRUE, + verbose = TRUE +) +cat("Retained:", nrow(trimmed_affected), "individuals\n") +``` + +## Protect individuals with missing phenotype data + +`keep_vals = NA` reverses the default: it protects individuals whose status +has *not* yet been recorded, which is useful when follow-up data collection +is planned: + +```{r, message = TRUE} +trimmed_keep_na <- trimPedigree( + hazard, + keep_var = "affected", + keep_vals = NA, + verbose = TRUE +) +cat("Retained:", nrow(trimmed_keep_na), "individuals\n") +``` + +`keep_vals` is a vector, so multiple criteria can be combined --- for example, +protect both affected individuals and those with missing status: + +```{r, message = TRUE} +trimmed_combined <- trimPedigree( + hazard, + keep_var = "affected", + keep_vals = c(TRUE, NA), + verbose = TRUE +) +cat("Retained:", nrow(trimmed_combined), "individuals\n") +``` + +# Forced Removal + +`remove_ids` removes specific individuals unconditionally before any +leaf-based trimming begins --- useful for excluding individuals due to data +quality issues or consent withdrawal: + +```{r, message = TRUE} +force_out <- head(hazard$ID, 2) + +trimmed_forced <- trimPedigree(hazard, remove_ids = force_out, verbose = TRUE) +cat("Force-removed:", paste(force_out, collapse = ", "), "\n") +cat("Retained:", nrow(trimmed_forced), "individuals\n") +``` + +# Quick Reference + +| Parameter | Controls | +|---|---| +| `include_terminal` | Flag outdegree-0 nodes as leaves | +| `include_founder_singletons` | Flag indegree-0 & outdegree-1 nodes as leaves | +| `max_iter` | Maximum number of trimming passes | +| `min_size` | Minimum retained pedigree size | +| `keep_var` | Phenotype column used for protection | +| `keep_vals` | Values (or `NA`) that shield individuals from removal | +| `remove_ids` | IDs to remove unconditionally before leaf trimming | + +Use `findLeaves()` to preview candidates before committing to a trim: + +```{r} +candidates <- findLeaves(hazard, keep_var = "affected") +cat(length(candidates), "candidates with phenotype protection\n") + +result <- trimPedigree(hazard, keep_var = "affected") +cat(nrow(hazard) - nrow(result), "removed,", nrow(result), "retained\n") +```