diff --git a/R/loo-glossary.R b/R/loo-glossary.R index 06ba5ce6..cce5ac68 100644 --- a/R/loo-glossary.R +++ b/R/loo-glossary.R @@ -153,4 +153,98 @@ #' individual models due to correlation (i.e., if some observations are easier #' and some more difficult to predict for all models). #' +#' @section `p_worse` (probability of worse predictive performance): +#' +#' `p_worse` is the estimated probability that a model has worse predictive +#' performance than the best-ranked model in the comparison, based on the normal +#' approximation to the uncertainty in `elpd_diff`. It is computed as +#' +#' p_worse = pnorm(0, elpd_diff, se_diff). +#' +#' The best-ranked model (the first row in the `loo_compare()` output, where +#' `elpd_diff = 0`) always receives `NA`, since the comparison is defined +#' relative to that model. +#' +#' Because models are ordered by `elpd_loo` before computing `p_worse`, all +#' reported values are at least 0.5 by construction. A value close to 0.5 +#' indicates that the models are nearly indistinguishable in predictive +#' performance and that the ranking could easily be reversed with different +#' data. A value close to 1 indicates that the lower-ranked model is almost +#' certainly worse. `p_worse` inherits all the limitations of `se_diff` and the +#' normal approximation on which it is based. In particular, when `se_diff` is +#' underestimated, `p_worse` will be estimated too close to 1, making a model +#' appear more clearly worse than the data actually support. Conversely, when +#' `elpd_diff` is biased due to an unreliable LOO approximation, `p_worse` can +#' point in the wrong direction entirely. When any of these conditions are +#' present, `diag_diff` or `diag_elpd` will be flagged in the `loo_compare()` +#' output. See those sections below for further guidance. +#' +#' @section `diag_diff` (pairwise comparison diagnostics): +#' +#' `diag_diff` is a diagnostic column in the `loo_compare()` output for each +#' model comparison against the current reference model. It flags conditions +#' under which the normal approximation behind `se_diff` and `p_worse` is likely +#' to be poorly calibrated. The column contains a short label when a condition +#' is detected, and is empty otherwise. +#' +#' The column `diag_diff` currently flags two problems: +#' +#' ### `N < 100` +#' +#' When the number of observations is small, we may assume `se_diff` to be +#' underestimated. As a rough heuristic one can multiply `se_diff` by 2 to +#' make a more conservative estimate. +#' +#' ### `|elpd_diff| < 4` +#' +#' When `|elpd_diff|` is below 4, the models have very similar predictive +#' performance. In this setting, Sivula et al. (2025) show that skewness in +#' the error distribution can make the normal approximation for `se_diff` +#' and `p_worse` miscalibrated, even for large N. In practice, this usually +#' supports treating the models as predictively similar. +#' +#' ### Relation between `N < 100` and `|elpd_diff| < 4` +#' +#' The conditions flagged by `diag_diff` are not independent: they tend to +#' co-occur, and when they do, some flags carry more information than others. +#' `loo_compare()` therefore follows a priority hierarchy and shows only the +#' most critical flag in the table output. +#' +#' The hierarchy is as follows: +#' +#' * **`N < 100` takes highest priority.** A small sample size undermines the +#' reliability of `se_diff` by underestimating uncertainty. Because of this, +#' even if `|elpd_diff| < 4` is also true for a comparison, the table will only +#' show `N < 100`. The small sample size renders the `|elpd_diff| < 4` +#' diagnostic less meaningful. +#' +#' * **`|elpd_diff| < 4` takes second priority.** When N >= 100 and the +#' difference is small, the normal approximation is miscalibrated due to the +#' skewness of the error distribution (Sivula et al., 2025). In this +#' situation, `se_diff` exists and is not heavily biased in scale, but the +#' shape of the approximation is wrong, making `p_worse` unreliable. +#' +#' @section `diag_elpd`: +#' +#' `diag_elpd` is a diagnostic column in the `loo_compare()` output that flags +#' when the PSIS-LOO approximation for an individual model is unreliable. Unlike +#' `diag_diff`, which concerns the *comparison* between models, `diag_elpd` +#' concerns the quality of the `elpd_loo` estimate for each model individually. +#' It contains a short text label when a problem is detected, and is empty +#' otherwise. +#' +#' ### `K k_psis > t` (K observations with Pareto-k values > t) +#' +#' This label indicates that K observations for this model have Pareto-k values +#' above the PSIS reliability threshold `t` used by `loo` for that fit. The +#' threshold is sample-size dependent, and in many practical cases close to +#' 0.7. When this flag appears, the PSIS approximation can be unreliable for +#' those observations, and the resulting `elpd_loo` may be biased. Because +#' `elpd_diff` is a direct difference of two models' `elpd_loo` values, bias in +#' either model's estimate propagates directly into `elpd_diff` and `p_worse`. +#' This is qualitatively different from the calibration issues flagged by +#' `diag_diff`: here the estimate itself may be wrong, not just uncertain. +#' +#' See for further information on Pareto-k values the "Pareto k estimates" +#' section. NULL diff --git a/R/loo_compare.R b/R/loo_compare.R index fdf0e368..8cf7c472 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -2,9 +2,6 @@ #' #' @description Compare fitted models based on [ELPD][loo-glossary]. #' -#' By default the print method shows only the most important information. Use -#' `print(..., simplify=FALSE)` to print a more detailed summary. -#' #' @export #' @param x An object of class `"loo"` or a list of such objects. If a list is #' used then the list names will be used as the model names in the output. See @@ -12,8 +9,8 @@ #' @param ... Additional objects of class `"loo"`, if not passed in as a single #' list. #' -#' @return A matrix with class `"compare.loo"` that has its own -#' print method. See the **Details** section. +#' @return A data frame with class `"compare.loo"` that has its own +#' print method. See the **Details** and **Examples** sections. #' #' @details #' When comparing two fitted models, we can estimate the difference in their @@ -21,14 +18,14 @@ #' [`elpd_loo`][loo-glossary] or `elpd_waic` (or multiplied by \eqn{-2}, if #' desired, to be on the deviance scale). #' -#' When using `loo_compare()`, the returned matrix will have one row per model -#' and several columns of estimates. The values in the -#' [`elpd_diff`][loo-glossary] and [`se_diff`][loo-glossary] columns of the -#' returned matrix are computed by making pairwise comparisons between each -#' model and the model with the largest ELPD (the model in the first row). For -#' this reason the `elpd_diff` column will always have the value `0` in the -#' first row (i.e., the difference between the preferred model and itself) and -#' negative values in subsequent rows for the remaining models. +#' ## `elpd_diff` and `se_diff` +#' When using `loo_compare()`, the returned data frame will have one row per +#' model and several columns of estimates. The values of +#' [`elpd_diff`][loo-glossary] and [`se_diff`][loo-glossary] are computed by +#' making pairwise comparisons between each model and the model with the +#' largest ELPD (the model listed first). Therefore, the first `elpd_diff` +#' value will always be `0` (i.e., the difference between the preferred model +#' and itself) and the rest of the values will be negative. #' #' To compute the standard error of the difference in [ELPD][loo-glossary] --- #' which should not be expected to equal the difference of the standard errors @@ -41,9 +38,36 @@ #' standard approach of comparing differences of deviances to a Chi-squared #' distribution, a practice derived for Gaussian linear models or #' asymptotically, and which only applies to nested models in any case. -#' Sivula et al. (2022) discuss the conditions when the normal -#' approximation used for SE and `se_diff` is good. #' +#' ## `p_worse`, `diag_diff`, and `diag_elpd` +#' The values in the `p_worse` column show the probability of each model +#' having worse ELPD than the best model. These probabilities are computed +#' with a normal approximation using the values from `elpd_diff` and +#' `se_diff`. Sivula et al. (2025) present the conditions when the normal +#' approximation used for SE and `se_diff` is good, and the column +#' `diag_diff` contains possible diagnostic messages: +#' +#' * `N < 100` (small data) +#' * `|elpd_diff| < 4` (models make similar predictions) +#' +#' If either of these diagnostic messages is shown, the error distribution is +#' skewed or thick tailed and the normal approximation based on `elpd_diff` +#' and `se_diff` is not well calibrated. In that case, the probabilities +#' `p_worse` are likely to be too large. However, `elpd_diff` and `se_diff` +#' will still be indicative of the differences and uncertainties (for example, +#' if `|elpd_diff|` is many times larger than `se_diff` the difference is quite +#' certain). In addition, if the model is not well specificed and there are +#' outliers, the error distribution can also be skewed or thick tailed and the +#' normal approximation is not well calibrated. Possible model misspecification +#' and outliers can be diagnosed with usual predictive checking methods. +#' +#' The column `diag_elpd` shows the PSIS-LOO Pareto k diagnostic for the +#' pointwise ELPD computations for each model. If `K k_psis > 0.7` is shown, +#' where `K` is the number of high Pareto k values in the PSIS +#' computation, then there may be significant bias in `elpd_diff` favoring +#' models with a large number of high Pareto k values. +#' +#' ## Warnings for many model comparisons #' If more than \eqn{11} models are compared, we internally recompute the model #' differences using the median model by ELPD as the baseline model. We then #' estimate whether the differences in predictive performance are potentially @@ -52,7 +76,7 @@ #' selection process. In that case users are recommended to avoid model #' selection based on LOO-CV, and instead to favor model averaging/stacking or #' projection predictive inference. -#' +#' #' @seealso #' * The [FAQ page](https://mc-stan.org/loo/articles/online-only/faq.html) on #' the __loo__ website for answers to frequently asked questions. @@ -68,12 +92,8 @@ #' comp <- loo_compare(loo1, loo2, loo3) #' print(comp, digits = 2) #' -#' # show more details with simplify=FALSE -#' # (will be the same for all models in this artificial example) -#' print(comp, simplify = FALSE, digits = 3) -#' #' # can use a list of objects with custom names -#' # will use apple, banana, and cherry, as the names in the output +#' # the names will be used in the output #' loo_compare(list("apple" = loo1, "banana" = loo2, "cherry" = loo3)) #' #' \dontrun{ @@ -101,54 +121,94 @@ loo_compare.default <- function(x, ...) { loos <- x } - # If subsampling is used + # if subsampling is used if (any(sapply(loos, inherits, "psis_loo_ss"))) { return(loo_compare.psis_loo_ss_list(loos)) } + # run pre-comparison checks loo_compare_checks(loos) + # compute elpd_diff and se_elpd_diff relative to best model comp <- loo_compare_matrix(loos) ord <- loo_compare_order(loos) - - # compute elpd_diff and se_elpd_diff relative to best model rnms <- rownames(comp) diffs <- mapply(FUN = elpd_diffs, loos[ord[1]], loos[ord]) + colnames(diffs) <- rnms elpd_diff <- apply(diffs, 2, sum) se_diff <- apply(diffs, 2, se_elpd_diff) - comp <- cbind(elpd_diff = elpd_diff, se_diff = se_diff, comp) - rownames(comp) <- rnms - # run order statistics-based checks on models + # compute probabilities that a model has worse elpd than the best model + # using a normal approximation (Sivula et al., 2025) + p_worse <- stats::pnorm(0, elpd_diff, se_diff) + p_worse[elpd_diff == 0] <- NA + + comp <- cbind( + data.frame( + model = rnms, + elpd_diff = elpd_diff, + se_diff = se_diff, + p_worse = p_worse, + diag_diff = diag_diff(nrow(diffs), elpd_diff), + diag_elpd = diag_elpd(loos[ord]) + ), + as.data.frame(comp) + ) + rownames(comp) <- NULL + + # run order statistics-based checks for many model comparisons loo_order_stat_check(loos, ord) class(comp) <- c("compare.loo", class(comp)) - return(comp) + comp } #' @rdname loo_compare #' @export #' @param digits For the print method only, the number of digits to use when #' printing. -#' @param simplify For the print method only, should only the essential columns -#' of the summary matrix be printed? The entire matrix is always returned, but -#' by default only the most important columns are printed. -print.compare.loo <- function(x, ..., digits = 1, simplify = TRUE) { - xcopy <- x - if (inherits(xcopy, "old_compare.loo")) { - if (NCOL(xcopy) >= 2 && simplify) { - patts <- "^elpd_|^se_diff|^p_|^waic$|^looic$" - xcopy <- xcopy[, grepl(patts, colnames(xcopy))] - } - } else if (NCOL(xcopy) >= 2 && simplify) { - xcopy <- xcopy[, c("elpd_diff", "se_diff")] +#' @param p_worse For the print method only, should we include the normal +#' approximation based probability of each model having worse performance than +#' the best model? The default is `TRUE`. +print.compare.loo <- function(x, ..., digits = 1, p_worse = TRUE) { + if (inherits(x, "old_compare.loo")) { + return(unclass(x)) + } + if (!inherits(x, "data.frame")) { + class(x) <- c(class(x), "data.frame") + } + if (!all(c("model", "elpd_diff", "se_diff") %in% colnames(x))) { + print(as.data.frame(x)) + return(x) + } + x2 <- cbind( + model = x$model, + .fr(x[, c("elpd_diff", "se_diff")], digits) + ) + if (p_worse && "p_worse" %in% colnames(x)) { + x2 <- cbind( + x2, + p_worse = .fr(x[, "p_worse"], digits = 2), + diag_diff = x[, "diag_diff"], + diag_elpd = x[, "diag_elpd"] + ) + } + print(x2, quote = FALSE, row.names = FALSE) + + # show glossary for diagnostic flags + has_diag <- any(nzchar(x[["diag_diff"]], keepNA = FALSE), na.rm = TRUE) || + any(nzchar(x[["diag_elpd"]], keepNA = FALSE), na.rm = TRUE) + if (has_diag && p_worse) { + message( + "\nDiagnostic flags present. ", + "See ?`loo-glossary` (sections `diag_diff` and `diag_elpd`) ", + "or https://mc-stan.org/loo/reference/loo-glossary.html." + ) } - print(.fr(xcopy, digits), quote = FALSE) invisible(x) } - # internal ---------------------------------------------------------------- #' Compute pointwise elpd differences @@ -172,7 +232,6 @@ se_elpd_diff <- function(diffs) { sqrt(N) * sd(diffs) } - #' Perform checks on `"loo"` objects before comparison #' @noRd #' @param loos List of `"loo"` objects. @@ -227,7 +286,6 @@ loo_compare_checks <- function(loos) { #' Find the model names associated with `"loo"` objects #' #' @export -#' @keywords internal #' @param x List of `"loo"` objects. #' @return Character vector of model names the same length as `x.` #' @@ -256,7 +314,6 @@ find_model_names <- function(x) { #' Compute the loo_compare matrix -#' @keywords internal #' @noRd #' @param loos List of `"loo"` objects. loo_compare_matrix <- function(loos){ @@ -278,7 +335,6 @@ loo_compare_matrix <- function(loos){ #' Computes the order of loos for comparison #' @noRd -#' @keywords internal #' @param loos List of `"loo"` objects. loo_compare_order <- function(loos){ tmp <- sapply(loos, function(x) { @@ -293,7 +349,6 @@ loo_compare_order <- function(loos){ #' Perform checks on `"loo"` objects __after__ comparison #' @noRd -#' @keywords internal #' @param loos List of `"loo"` objects. #' @param ord List of `"loo"` object orderings. #' @return Nothing, just possibly throws errors/warnings. @@ -335,14 +390,12 @@ loo_order_stat_check <- function(loos, ord) { #' Returns the middle index of a vector #' @noRd -#' @keywords internal #' @param vec A vector. #' @return Integer index value. middle_idx <- function(vec) floor(length(vec) / 2) #' Computes maximum order statistic from K Gaussians #' @noRd -#' @keywords internal #' @param K Number of Gaussians. #' @param c Scaling of the order statistic. #' @return Numeric expected maximum from K samples from a Gaussian with mean @@ -350,3 +403,38 @@ middle_idx <- function(vec) floor(length(vec) / 2) order_stat_heuristic <- function(K, c) { qnorm(p = 1 - 1 / (K * 2), mean = 0, sd = c) } + +#' Count number of high Pareto k values in PSIS-LOO and create diagnostic message +#' @noRd +#' @param loos Ordered list of loo objects. +#' @return Character vector of diagnostic messages. +diag_elpd <- function(loos) { + sapply(loos, function(loo) { + k <- loo$diagnostics[["pareto_k"]] + if (is.null(k)) { + out <- "" + } else { + S <- dim(loo)[1] + khat_threshold <- ps_khat_threshold(S) + K <- sum(k > khat_threshold) + out <- ifelse(K == 0, "", paste0(K, " k_psis > ", round(khat_threshold, 2))) + } + out + }) +} + +#' Create diagnostic for elpd differences +#' @noRd +#' @param N Number of data points. +#' @param elpd_diff Vector of elpd differences. +#' @return Character vector of diagnostic messages. +diag_diff <- function(N, elpd_diff) { + if (N < 100) { + diag_diff <- rep("N < 100", length(elpd_diff)) + diag_diff[elpd_diff == 0] <- "" + } else { + diag_diff <- rep("", length(elpd_diff)) + diag_diff[elpd_diff > -4 & elpd_diff != 0] <- "|elpd_diff| < 4" + } + diag_diff +} diff --git a/R/loo_compare.psis_loo_ss_list.R b/R/loo_compare.psis_loo_ss_list.R index acd0690b..9c0db564 100644 --- a/R/loo_compare.psis_loo_ss_list.R +++ b/R/loo_compare.psis_loo_ss_list.R @@ -173,14 +173,9 @@ loo_compare_checks.psis_loo_ss_list <- function(loos) { #' @rdname loo_compare #' @export -print.compare.loo_ss <- function(x, ..., digits = 1, simplify = TRUE) { +print.compare.loo_ss <- function(x, ..., digits = 1) { xcopy <- x - if (inherits(xcopy, "old_compare.loo")) { - if (NCOL(xcopy) >= 2 && simplify) { - patts <- "^elpd_|^se_diff|^p_|^waic$|^looic$" - xcopy <- xcopy[, grepl(patts, colnames(xcopy))] - } - } else if (NCOL(xcopy) >= 2 && simplify) { + if (NCOL(xcopy) >= 2) { xcopy <- xcopy[, c("elpd_diff", "se_diff", "subsampling_se_diff")] } print(.fr(xcopy, digits), quote = FALSE) diff --git a/man/find_model_names.Rd b/man/find_model_names.Rd index cdb76b80..70a79d58 100644 --- a/man/find_model_names.Rd +++ b/man/find_model_names.Rd @@ -15,4 +15,3 @@ Character vector of model names the same length as \code{x.} \description{ Find the model names associated with \code{"loo"} objects } -\keyword{internal} diff --git a/man/loo-glossary.Rd b/man/loo-glossary.Rd index 7aa19639..a7ff026c 100644 --- a/man/loo-glossary.Rd +++ b/man/loo-glossary.Rd @@ -161,6 +161,110 @@ individual models due to correlation (i.e., if some observations are easier and some more difficult to predict for all models). } +\section{\code{p_worse} (probability of worse predictive performance)}{ + + +\code{p_worse} is the estimated probability that a model has worse predictive +performance than the best-ranked model in the comparison, based on the normal +approximation to the uncertainty in \code{elpd_diff}. It is computed as + +\if{html}{\out{
}}\preformatted{p_worse = pnorm(0, elpd_diff, se_diff). +}\if{html}{\out{
}} + +The best-ranked model (the first row in the \code{loo_compare()} output, where +\code{elpd_diff = 0}) always receives \code{NA}, since the comparison is defined +relative to that model. + +Because models are ordered by \code{elpd_loo} before computing \code{p_worse}, all +reported values are at least 0.5 by construction. A value close to 0.5 +indicates that the models are nearly indistinguishable in predictive +performance and that the ranking could easily be reversed with different +data. A value close to 1 indicates that the lower-ranked model is almost +certainly worse. \code{p_worse} inherits all the limitations of \code{se_diff} and the +normal approximation on which it is based. In particular, when \code{se_diff} is +underestimated, \code{p_worse} will be estimated too close to 1, making a model +appear more clearly worse than the data actually support. Conversely, when +\code{elpd_diff} is biased due to an unreliable LOO approximation, \code{p_worse} can +point in the wrong direction entirely. When any of these conditions are +present, \code{diag_diff} or \code{diag_elpd} will be flagged in the \code{loo_compare()} +output. See those sections below for further guidance. +} + +\section{\code{diag_diff} (pairwise comparison diagnostics)}{ + + +\code{diag_diff} is a diagnostic column in the \code{loo_compare()} output for each +model comparison against the current reference model. It flags conditions +under which the normal approximation behind \code{se_diff} and \code{p_worse} is likely +to be poorly calibrated. The column contains a short label when a condition +is detected, and is empty otherwise. + +The column \code{diag_diff} currently flags two problems: +\subsection{\code{N < 100}}{ + +When the number of observations is small, we may assume \code{se_diff} to be +underestimated. As a rough heuristic one can multiply \code{se_diff} by 2 to +make a more conservative estimate. +} + +\subsection{\verb{|elpd_diff| < 4}}{ + +When \verb{|elpd_diff|} is below 4, the models have very similar predictive +performance. In this setting, Sivula et al. (2025) show that skewness in +the error distribution can make the normal approximation for \code{se_diff} +and \code{p_worse} miscalibrated, even for large N. In practice, this usually +supports treating the models as predictively similar. +} + +\subsection{Relation between \code{N < 100} and \verb{|elpd_diff| < 4}}{ + +The conditions flagged by \code{diag_diff} are not independent: they tend to +co-occur, and when they do, some flags carry more information than others. +\code{loo_compare()} therefore follows a priority hierarchy and shows only the +most critical flag in the table output. + +The hierarchy is as follows: +\itemize{ +\item \strong{\code{N < 100} takes highest priority.} A small sample size undermines the +reliability of \code{se_diff} by underestimating uncertainty. Because of this, +even if \verb{|elpd_diff| < 4} is also true for a comparison, the table will only +show \code{N < 100}. The small sample size renders the \verb{|elpd_diff| < 4} +diagnostic less meaningful. +\item \strong{\verb{|elpd_diff| < 4} takes second priority.} When N >= 100 and the +difference is small, the normal approximation is miscalibrated due to the +skewness of the error distribution (Sivula et al., 2025). In this +situation, \code{se_diff} exists and is not heavily biased in scale, but the +shape of the approximation is wrong, making \code{p_worse} unreliable. +} +} +} + +\section{\code{diag_elpd}}{ + + +\code{diag_elpd} is a diagnostic column in the \code{loo_compare()} output that flags +when the PSIS-LOO approximation for an individual model is unreliable. Unlike +\code{diag_diff}, which concerns the \emph{comparison} between models, \code{diag_elpd} +concerns the quality of the \code{elpd_loo} estimate for each model individually. +It contains a short text label when a problem is detected, and is empty +otherwise. +\subsection{\verb{K k_psis > t} (K observations with Pareto-k values > t)}{ + +This label indicates that K observations for this model have Pareto-k values +above the PSIS reliability threshold \code{t} used by \code{loo} for that fit. The +threshold is sample-size dependent, and in many practical cases close to +0.7. When this flag appears, the PSIS approximation can be unreliable for +those observations, and the resulting \code{elpd_loo} may be biased. Because +\code{elpd_diff} is a direct difference of two models' \code{elpd_loo} values, bias in +either model's estimate propagates directly into \code{elpd_diff} and \code{p_worse}. +This is qualitatively different from the calibration issues flagged by +\code{diag_diff}: here the estimate itself may be wrong, not just uncertain. + +See for further information on Pareto-k values the "Pareto k estimates" +section. +} +} + \references{ Vehtari, A., Gelman, A., and Gabry, J. (2017). Practical Bayesian model evaluation using leave-one-out cross-validation and WAIC. diff --git a/man/loo_compare.Rd b/man/loo_compare.Rd index 7b3514a6..02a4d2c2 100644 --- a/man/loo_compare.Rd +++ b/man/loo_compare.Rd @@ -11,9 +11,9 @@ loo_compare(x, ...) \method{loo_compare}{default}(x, ...) -\method{print}{compare.loo}(x, ..., digits = 1, simplify = TRUE) +\method{print}{compare.loo}(x, ..., digits = 1, p_worse = TRUE) -\method{print}{compare.loo_ss}(x, ..., digits = 1, simplify = TRUE) +\method{print}{compare.loo_ss}(x, ..., digits = 1) } \arguments{ \item{x}{An object of class \code{"loo"} or a list of such objects. If a list is @@ -26,34 +26,31 @@ list.} \item{digits}{For the print method only, the number of digits to use when printing.} -\item{simplify}{For the print method only, should only the essential columns -of the summary matrix be printed? The entire matrix is always returned, but -by default only the most important columns are printed.} +\item{p_worse}{For the print method only, should we include the normal +approximation based probability of each model having worse performance than +the best model? The default is \code{TRUE}.} } \value{ -A matrix with class \code{"compare.loo"} that has its own -print method. See the \strong{Details} section. +A data frame with class \code{"compare.loo"} that has its own +print method. See the \strong{Details} and \strong{Examples} sections. } \description{ Compare fitted models based on \link[=loo-glossary]{ELPD}. - -By default the print method shows only the most important information. Use -\code{print(..., simplify=FALSE)} to print a more detailed summary. } \details{ When comparing two fitted models, we can estimate the difference in their expected predictive accuracy by the difference in \code{\link[=loo-glossary]{elpd_loo}} or \code{elpd_waic} (or multiplied by \eqn{-2}, if desired, to be on the deviance scale). +\subsection{\code{elpd_diff} and \code{se_diff}}{ -When using \code{loo_compare()}, the returned matrix will have one row per model -and several columns of estimates. The values in the -\code{\link[=loo-glossary]{elpd_diff}} and \code{\link[=loo-glossary]{se_diff}} columns of the -returned matrix are computed by making pairwise comparisons between each -model and the model with the largest ELPD (the model in the first row). For -this reason the \code{elpd_diff} column will always have the value \code{0} in the -first row (i.e., the difference between the preferred model and itself) and -negative values in subsequent rows for the remaining models. +When using \code{loo_compare()}, the returned data frame will have one row per +model and several columns of estimates. The values of +\code{\link[=loo-glossary]{elpd_diff}} and \code{\link[=loo-glossary]{se_diff}} are computed by +making pairwise comparisons between each model and the model with the +largest ELPD (the model listed first). Therefore, the first \code{elpd_diff} +value will always be \code{0} (i.e., the difference between the preferred model +and itself) and the rest of the values will be negative. To compute the standard error of the difference in \link[=loo-glossary]{ELPD} --- which should not be expected to equal the difference of the standard errors @@ -66,8 +63,40 @@ better sense of uncertainty than what is obtained using the current standard approach of comparing differences of deviances to a Chi-squared distribution, a practice derived for Gaussian linear models or asymptotically, and which only applies to nested models in any case. -Sivula et al. (2022) discuss the conditions when the normal -approximation used for SE and \code{se_diff} is good. +} + +\subsection{\code{p_worse}, \code{diag_diff}, and \code{diag_elpd}}{ + +The values in the \code{p_worse} column show the probability of each model +having worse ELPD than the best model. These probabilities are computed +with a normal approximation using the values from \code{elpd_diff} and +\code{se_diff}. Sivula et al. (2025) present the conditions when the normal +approximation used for SE and \code{se_diff} is good, and the column +\code{diag_diff} contains possible diagnostic messages: +\itemize{ +\item \code{N < 100} (small data) +\item \verb{|elpd_diff| < 4} (models make similar predictions) +} + +If either of these diagnostic messages is shown, the error distribution is +skewed or thick tailed and the normal approximation based on \code{elpd_diff} +and \code{se_diff} is not well calibrated. In that case, the probabilities +\code{p_worse} are likely to be too large. However, \code{elpd_diff} and \code{se_diff} +will still be indicative of the differences and uncertainties (for example, +if \verb{|elpd_diff|} is many times larger than \code{se_diff} the difference is quite +certain). In addition, if the model is not well specificed and there are +outliers, the error distribution can also be skewed or thick tailed and the +normal approximation is not well calibrated. Possible model misspecification +and outliers can be diagnosed with usual predictive checking methods. + +The column \code{diag_elpd} shows the PSIS-LOO Pareto k diagnostic for the +pointwise ELPD computations for each model. If \verb{K k_psis > 0.7} is shown, +where \code{K} is the number of high Pareto k values in the PSIS +computation, then there may be significant bias in \code{elpd_diff} favoring +models with a large number of high Pareto k values. +} + +\subsection{Warnings for many model comparisons}{ If more than \eqn{11} models are compared, we internally recompute the model differences using the median model by ELPD as the baseline model. We then @@ -78,6 +107,7 @@ selection process. In that case users are recommended to avoid model selection based on LOO-CV, and instead to favor model averaging/stacking or projection predictive inference. } +} \examples{ # very artificial example, just for demonstration! LL <- example_loglik_array() @@ -88,12 +118,8 @@ loo3 <- loo(LL + 2) # should be best model when compared comp <- loo_compare(loo1, loo2, loo3) print(comp, digits = 2) -# show more details with simplify=FALSE -# (will be the same for all models in this artificial example) -print(comp, simplify = FALSE, digits = 3) - # can use a list of objects with custom names -# will use apple, banana, and cherry, as the names in the output +# the names will be used in the output loo_compare(list("apple" = loo1, "banana" = loo2, "cherry" = loo3)) \dontrun{ diff --git a/man/tis.Rd b/man/tis.Rd index 46cf30a8..df03482c 100644 --- a/man/tis.Rd +++ b/man/tis.Rd @@ -50,44 +50,42 @@ setting \code{mc.cores} interactively or in a script is fine). }} } \value{ -The \code{tis()} methods return an object of class \code{"tis"}, -which is a named list with the following components: +The `tis()` methods return an object of class `"tis"`, + which is a named list with the following components: \describe{ -\item{\code{log_weights}}{ -Vector or matrix of smoothed (and truncated) but \emph{unnormalized} log -weights. To get normalized weights use the -\code{\link[=weights.importance_sampling]{weights()}} method provided for objects of -class \code{tis}. -} -\item{\code{diagnostics}}{ -A named list containing one vector: -\itemize{ -\item \code{pareto_k}: Not used in \code{tis}, all set to 0. -\item \code{n_eff}: Effective sample size estimates. -} -} + \item{`log_weights`}{ + Vector or matrix of smoothed (and truncated) but *unnormalized* log + weights. To get normalized weights use the + [`weights()`][weights.importance_sampling] method provided for objects of + class `tis`. + } + \item{`diagnostics`}{ + A named list containing one vector: + * `pareto_k`: Not used in `tis`, all set to 0. + * `n_eff`: Effective sample size estimates. + } } -Objects of class \code{"tis"} also have the following \link[=attributes]{attributes}: +Objects of class `"tis"` also have the following [attributes][attributes()]: \describe{ -\item{\code{norm_const_log}}{ -Vector of precomputed values of \code{colLogSumExps(log_weights)} that are -used internally by the \code{\link[=weights]{weights()}}method to normalize the log weights. -} -\item{\code{r_eff}}{ -If specified, the user's \code{r_eff} argument. -} -\item{\code{tail_len}}{ -Not used for \code{tis}. -} -\item{\code{dims}}{ -Integer vector of length 2 containing \code{S} (posterior sample size) -and \code{N} (number of observations). -} -\item{\code{method}}{ -Method used for importance sampling, here \code{tis}. -} + \item{`norm_const_log`}{ + Vector of precomputed values of `colLogSumExps(log_weights)` that are + used internally by the [weights()]method to normalize the log weights. + } + \item{`r_eff`}{ + If specified, the user's `r_eff` argument. + } + \item{`tail_len`}{ + Not used for `tis`. + } + \item{`dims`}{ + Integer vector of length 2 containing `S` (posterior sample size) + and `N` (number of observations). + } + \item{`method`}{ + Method used for importance sampling, here `tis`. + } } } \description{ diff --git a/tests/testthat/_snaps/compare.md b/tests/testthat/_snaps/compare.md index cf27900e..27bad419 100644 --- a/tests/testthat/_snaps/compare.md +++ b/tests/testthat/_snaps/compare.md @@ -1,44 +1,95 @@ # loo_compare returns expected results (2 models) - WAoAAAACAAQFAAACAwAAAAMOAAAAEAAAAAAAAAAAwBA6U1+cRe4AAAAAAAAAAD+2ake0LxMB - wFTh8N3JQljAVeWWE8MGuUARCD2zEXBfQBEalRIN2T9ACijAYdW5U0AmZ5XrANCKP/H9Zexy - 814/8ZtgnG1nx0Bk4fDdyUJYQGXllhPDBrlAIQg9sxFwX0AhGpUSDdk/AAAEAgAAAAEABAAJ - AAAAA2RpbQAAAA0AAAACAAAAAgAAAAgAAAQCAAAAAQAEAAkAAAAIZGltbmFtZXMAAAATAAAA - AgAAABAAAAACAAQACQAAAAZtb2RlbDEABAAJAAAABm1vZGVsMgAAABAAAAAIAAQACQAAAAll - bHBkX2RpZmYABAAJAAAAB3NlX2RpZmYABAAJAAAACWVscGRfd2FpYwAEAAkAAAAMc2VfZWxw - ZF93YWljAAQACQAAAAZwX3dhaWMABAAJAAAACXNlX3Bfd2FpYwAEAAkAAAAEd2FpYwAEAAkA - AAAHc2Vfd2FpYwAABAIAAAABAAQACQAAAAVjbGFzcwAAABAAAAADAAQACQAAAAtjb21wYXJl - LmxvbwAEAAkAAAAGbWF0cml4AAQACQAAAAVhcnJheQAAAP4= + WAoAAAACAAQEAgACAwAAAAMTAAAADAAAABAAAAACAAQACQAAAAZtb2RlbDEABAAJAAAABm1v + ZGVsMgAAAA4AAAACAAAAAAAAAAAAAAAAAAAAAAAAAA4AAAACAAAAAAAAAAAAAAAAAAAAAAAA + AA4AAAACf/AAAAAAB6J/8AAAAAAHogAAABAAAAACAAQACQAAAAAABAAJAAAAAAAAABAAAAAC + AAQACQAAAAAABAAJAAAAAAAAAA4AAAACwFTh8N3JQljAVOHw3clCWAAAAA4AAAACQBEIPbMR + cF9AEQg9sxFwXwAAAA4AAAACQAoowGHVuVNACijAYdW5UwAAAA4AAAACP/H9Zexy814/8f1l + 7HLzXgAAAA4AAAACQGTh8N3JQlhAZOHw3clCWAAAAA4AAAACQCEIPbMRcF9AIQg9sxFwXwAA + BAIAAAABAAQACQAAAAVuYW1lcwAAABAAAAAMAAQACQAAAAVtb2RlbAAEAAkAAAAJZWxwZF9k + aWZmAAQACQAAAAdzZV9kaWZmAAQACQAAAAdwX3dvcnNlAAQACQAAAAlkaWFnX2RpZmYABAAJ + AAAACWRpYWdfZWxwZAAEAAkAAAAJZWxwZF93YWljAAQACQAAAAxzZV9lbHBkX3dhaWMABAAJ + AAAABnBfd2FpYwAEAAkAAAAJc2VfcF93YWljAAQACQAAAAR3YWljAAQACQAAAAdzZV93YWlj + AAAEAgAAAAEABAAJAAAABWNsYXNzAAAAEAAAAAIABAAJAAAAC2NvbXBhcmUubG9vAAQACQAA + AApkYXRhLmZyYW1lAAAEAgAAAAEABAAJAAAACXJvdy5uYW1lcwAAAA0AAAACgAAAAP////4A + AAD+ -# loo_compare returns expected result (3 models) +--- - WAoAAAACAAQFAAACAwAAAAMOAAAAGAAAAAAAAAAAwBA6U1+cRe7AMA3KkbYEGAAAAAAAAAAA - P7ZqR7QvEwE/y6/t4TTtXsBU4fDdyUJYwFXllhPDBrnAWOVjgjbDYkARCD2zEXBfQBEalRIN - 2T9AEPIF3GigE0AKKMBh1blTQCZnlesA0IpAQcjYUhrdCj/x/WXscvNeP/GbYJxtZ8c/8YDQ - kmfJX0Bk4fDdyUJYQGXllhPDBrlAaOVjgjbDYkAhCD2zEXBfQCEalRIN2T9AIPIF3GigEwAA - BAIAAAABAAQACQAAAANkaW0AAAANAAAAAgAAAAMAAAAIAAAEAgAAAAEABAAJAAAACGRpbW5h - bWVzAAAAEwAAAAIAAAAQAAAAAwAEAAkAAAAGbW9kZWwxAAQACQAAAAZtb2RlbDIABAAJAAAA - Bm1vZGVsMwAAABAAAAAIAAQACQAAAAllbHBkX2RpZmYABAAJAAAAB3NlX2RpZmYABAAJAAAA - CWVscGRfd2FpYwAEAAkAAAAMc2VfZWxwZF93YWljAAQACQAAAAZwX3dhaWMABAAJAAAACXNl - X3Bfd2FpYwAEAAkAAAAEd2FpYwAEAAkAAAAHc2Vfd2FpYwAABAIAAAABAAQACQAAAAVjbGFz - cwAAABAAAAADAAQACQAAAAtjb21wYXJlLmxvbwAEAAkAAAAGbWF0cml4AAQACQAAAAVhcnJh - eQAAAP4= + Code + print(comp1) + Output + model elpd_diff se_diff p_worse diag_diff diag_elpd + model1 0.0 0.0 NA + model2 0.0 0.0 NA + +--- -# compare returns expected result (2 models) + WAoAAAACAAQEAgACAwAAAAMTAAAADAAAABAAAAACAAQACQAAAAZtb2RlbDEABAAJAAAABm1v + ZGVsMgAAAA4AAAACAAAAAAAAAADAEDpTX5xF7gAAAA4AAAACAAAAAAAAAAA/tmpHtC8TAQAA + AA4AAAACf/AAAAAAB6I/8AAAAAAAAAAAABAAAAACAAQACQAAAAAABAAJAAAAB04gPCAxMDAA + AAAQAAAAAgAEAAkAAAAAAAQACQAAAAAAAAAOAAAAAsBU4fDdyUJYwFXllhPDBrkAAAAOAAAA + AkARCD2zEXBfQBEalRIN2T8AAAAOAAAAAkAKKMBh1blTQCZnlesA0IoAAAAOAAAAAj/x/WXs + cvNeP/GbYJxtZ8cAAAAOAAAAAkBk4fDdyUJYQGXllhPDBrkAAAAOAAAAAkAhCD2zEXBfQCEa + lRIN2T8AAAQCAAAAAQAEAAkAAAAFbmFtZXMAAAAQAAAADAAEAAkAAAAFbW9kZWwABAAJAAAA + CWVscGRfZGlmZgAEAAkAAAAHc2VfZGlmZgAEAAkAAAAHcF93b3JzZQAEAAkAAAAJZGlhZ19k + aWZmAAQACQAAAAlkaWFnX2VscGQABAAJAAAACWVscGRfd2FpYwAEAAkAAAAMc2VfZWxwZF93 + YWljAAQACQAAAAZwX3dhaWMABAAJAAAACXNlX3Bfd2FpYwAEAAkAAAAEd2FpYwAEAAkAAAAH + c2Vfd2FpYwAABAIAAAABAAQACQAAAAVjbGFzcwAAABAAAAACAAQACQAAAAtjb21wYXJlLmxv + bwAEAAkAAAAKZGF0YS5mcmFtZQAABAIAAAABAAQACQAAAAlyb3cubmFtZXMAAAANAAAAAoAA + AAD////+AAAA/g== + +--- + + Code + print(comp2) + Output + model elpd_diff se_diff p_worse diag_diff diag_elpd + model1 0.0 0.0 NA + model2 -4.1 0.1 1.00 N < 100 + Message + + Diagnostic flags present. See ?`loo-glossary` (sections `diag_diff` and `diag_elpd`) or https://mc-stan.org/loo/reference/loo-glossary.html. + +--- Code - comp1 + print(comp2, p_worse = FALSE) Output - elpd_diff se - 0.0 0.0 + model elpd_diff se_diff + model1 0.0 0.0 + model2 -4.1 0.1 + +# loo_compare returns expected result (3 models) + + WAoAAAACAAQEAgACAwAAAAMTAAAADAAAABAAAAADAAQACQAAAAZtb2RlbDEABAAJAAAABm1v + ZGVsMgAEAAkAAAAGbW9kZWwzAAAADgAAAAMAAAAAAAAAAMAQOlNfnEXuwDANypG2BBgAAAAO + AAAAAwAAAAAAAAAAP7ZqR7QvEwE/y6/t4TTtXgAAAA4AAAADf/AAAAAAB6I/8AAAAAAAAD/w + AAAAAAAAAAAAEAAAAAMABAAJAAAAAAAEAAkAAAAHTiA8IDEwMAAEAAkAAAAHTiA8IDEwMAAA + ABAAAAADAAQACQAAAAAABAAJAAAAAAAEAAkAAAAAAAAADgAAAAPAVOHw3clCWMBV5ZYTwwa5 + wFjlY4I2w2IAAAAOAAAAA0ARCD2zEXBfQBEalRIN2T9AEPIF3GigEwAAAA4AAAADQAoowGHV + uVNAJmeV6wDQikBByNhSGt0KAAAADgAAAAM/8f1l7HLzXj/xm2CcbWfHP/GA0JJnyV8AAAAO + AAAAA0Bk4fDdyUJYQGXllhPDBrlAaOVjgjbDYgAAAA4AAAADQCEIPbMRcF9AIRqVEg3ZP0Ag + 8gXcaKATAAAEAgAAAAEABAAJAAAABW5hbWVzAAAAEAAAAAwABAAJAAAABW1vZGVsAAQACQAA + AAllbHBkX2RpZmYABAAJAAAAB3NlX2RpZmYABAAJAAAAB3Bfd29yc2UABAAJAAAACWRpYWdf + ZGlmZgAEAAkAAAAJZGlhZ19lbHBkAAQACQAAAAllbHBkX3dhaWMABAAJAAAADHNlX2VscGRf + d2FpYwAEAAkAAAAGcF93YWljAAQACQAAAAlzZV9wX3dhaWMABAAJAAAABHdhaWMABAAJAAAA + B3NlX3dhaWMAAAQCAAAAAQAEAAkAAAAFY2xhc3MAAAAQAAAAAgAEAAkAAAALY29tcGFyZS5s + b28ABAAJAAAACmRhdGEuZnJhbWUAAAQCAAAAAQAEAAkAAAAJcm93Lm5hbWVzAAAADQAAAAKA + AAAA/////QAAAP4= --- Code - comp2 + print(comp1) Output - elpd_diff se - -4.1 0.1 + model elpd_diff se_diff p_worse diag_diff diag_elpd + model1 0.0 0.0 NA + model2 -4.1 0.1 1.00 N < 100 + model3 -16.1 0.2 1.00 N < 100 + Message + + Diagnostic flags present. See ?`loo-glossary` (sections `diag_diff` and `diag_elpd`) or https://mc-stan.org/loo/reference/loo-glossary.html. # compare returns expected result (3 models) diff --git a/tests/testthat/test_compare.R b/tests/testthat/test_compare.R index 7f720b22..aba34209 100644 --- a/tests/testthat/test_compare.R +++ b/tests/testthat/test_compare.R @@ -73,8 +73,12 @@ test_that("loo_compare throws appropriate warnings", { comp_colnames <- c( + "model", "elpd_diff", "se_diff", + "p_worse", + "diag_diff", + "diag_elpd", "elpd_waic", "se_elpd_waic", "p_waic", @@ -86,20 +90,31 @@ comp_colnames <- c( test_that("loo_compare returns expected results (2 models)", { comp1 <- loo_compare(w1, w1) expect_s3_class(comp1, "compare.loo") + expect_s3_class(comp1, "data.frame") expect_equal(colnames(comp1), comp_colnames) - expect_equal(rownames(comp1), c("model1", "model2")) - expect_output(print(comp1), "elpd_diff") - expect_equal(comp1[1:2, 1], c(0, 0), ignore_attr = TRUE) - expect_equal(comp1[1:2, 2], c(0, 0), ignore_attr = TRUE) + expect_equal(comp1$model, c("model1", "model2")) + expect_equal(comp1$elpd_diff, c(0, 0), ignore_attr = TRUE) + expect_equal(comp1$se_diff, c(0, 0), ignore_attr = TRUE) + expect_equal(comp1$p_worse, c(NA_real_, NA_real_), ignore_attr = TRUE) + expect_snapshot_value(comp1, style = "serialize") + expect_snapshot(print(comp1)) comp2 <- loo_compare(w1, w2) expect_s3_class(comp2, "compare.loo") expect_equal(colnames(comp2), comp_colnames) - + expect_equal(comp2$p_worse, c(NA, 1)) + expect_equal(comp2$diag_diff, c("", "N < 100")) + expect_equal(comp2$diag_elpd, c("", "")) expect_snapshot_value(comp2, style = "serialize") + expect_snapshot(print(comp2)) + expect_snapshot(print(comp2, p_worse = FALSE)) # specifying objects via ... and via arg x gives equal results expect_equal(comp2, loo_compare(x = list(w1, w2))) + + # custom naming works + comp3 <- loo_compare(x = list("A" = w2, "B" = w1)) + expect_equal(comp3$model, c("B", "A")) }) @@ -108,12 +123,13 @@ test_that("loo_compare returns expected result (3 models)", { comp1 <- loo_compare(w1, w2, w3) expect_equal(colnames(comp1), comp_colnames) - expect_equal(rownames(comp1), c("model1", "model2", "model3")) - expect_equal(comp1[1, 1], 0) + expect_equal(comp1$model, c("model1", "model2", "model3")) + expect_equal(comp1$p_worse, c(NA, 1, 1)) + expect_equal(comp1$diag_diff, c("", "N < 100", "N < 100")) expect_s3_class(comp1, "compare.loo") - expect_s3_class(comp1, "matrix") - + expect_s3_class(comp1, "data.frame") expect_snapshot_value(comp1, style = "serialize") + expect_snapshot(print(comp1)) # specifying objects via '...' gives equivalent results (equal # except rownames) to using 'x' argument @@ -129,13 +145,11 @@ test_that("compare throws deprecation warnings", { test_that("compare returns expected result (2 models)", { expect_warning(comp1 <- loo::compare(w1, w1), "Deprecated") - expect_snapshot(comp1) expect_equal(comp1[1:2], c(elpd_diff = 0, se = 0)) expect_warning(comp2 <- loo::compare(w1, w2), "Deprecated") - expect_snapshot(comp2) - expect_named(comp2, c("elpd_diff", "se")) - expect_s3_class(comp2, "compare.loo") + expect_equal(round(comp2[1:2], 3), c(elpd_diff = -4.057, se = 0.088)) + expect_s3_class(comp2, "old_compare.loo") # specifying objects via ... and via arg x gives equal results expect_warning(comp_via_list <- loo::compare(x = list(w1, w2)), "Deprecated")