% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/svem_significance_test_parallel.R
\name{svem_significance_test_parallel}
\alias{svem_significance_test_parallel}
\title{SVEM whole-model significance test with mixture support (parallel)}
\usage{
svem_significance_test_parallel(
  formula,
  data,
  mixture_groups = NULL,
  nPoint = 2000,
  nSVEM = 10,
  nPerm = 150,
  percent = 90,
  nBoot = 100,
  glmnet_alpha = c(1),
  weight_scheme = c("SVEM"),
  objective = c("wAIC", "wBIC", "wSSE", "auto"),
  relaxed = FALSE,
  verbose = TRUE,
  nCore = parallel::detectCores() - 2,
  seed = NULL,
  spec = NULL,
  response = NULL,
  use_spec_contrasts = TRUE,
  ...
)
}
\arguments{
\item{formula}{A model formula. If \code{spec} is provided, the right-hand
side is ignored and replaced by the locked expansion in \code{spec}.}

\item{data}{A data frame containing the variables in the model.}

\item{mixture_groups}{Optional list describing one or more mixture-factor
groups. Each element should be a list with components:
\itemize{
\item \code{vars}: character vector of column names;
\item \code{lower}: numeric vector of lower bounds (same length as \code{vars});
\item \code{upper}: numeric vector of upper bounds (same length as \code{vars});
\item \code{total}: scalar specifying the sum of the mixture variables.
}
All mixture variables must appear in exactly one group. Defaults to \code{NULL}.}

\item{nPoint}{Number of random evaluation points in the factor space
(default \code{2000}).}

\item{nSVEM}{Number of SVEM fits on the original (unpermuted) data used to
summarize the observed surface (default \code{10}).}

\item{nPerm}{Number of SVEM fits on permuted responses used to build the null
reference distribution (default \code{150}).}

\item{percent}{Percentage of variance to capture in the SVD of the permutation
surfaces (default \code{90}).}

\item{nBoot}{Number of bootstrap iterations within each inner SVEM fit
(default \code{100}).}

\item{glmnet_alpha}{Numeric vector of \code{glmnet} alpha values
(default \code{c(1)}).}

\item{weight_scheme}{Weighting scheme for SVEM (default \code{"SVEM"}).
Passed to \code{SVEMnet()}.}

\item{objective}{Objective used inside \code{SVEMnet()} to pick the bootstrap
path solution. One of \code{"wAIC"}, \code{"wBIC"}, or
\code{"wSSE"} (default \code{"wAIC"}).}

\item{relaxed}{Logical; default \code{FALSE}. When \code{TRUE}, inner
\code{SVEMnet()} fits use \code{glmnet}'s relaxed elastic-net path and
select both \code{lambda} and relaxed \code{gamma} on each bootstrap.
When \code{FALSE}, the standard \code{glmnet} path is used. If
\code{relaxed = TRUE} and \code{glmnet_alpha} includes \code{0}, ridge
(\code{alpha = 0}) is dropped by \code{SVEMnet()} for relaxed fits.}

\item{verbose}{Logical; if \code{TRUE}, display progress messages
(default \code{TRUE}).}

\item{nCore}{Number of CPU cores for parallel processing. Default is
\code{parallel::detectCores() - 2}, with a floor of \code{1}.}

\item{seed}{Optional integer seed for reproducible RNG (default \code{NULL}).
When supplied, the master RNG kind is set to \code{"L'Ecuyer-CMRG"} (with
\code{sample.kind = "Rounding"} when supported), and deterministic
per-iteration seeds are generated on the master and applied inside each
parallel \code{\%dopar\%} iteration via \code{set.seed()}.
This yields reproducibility regardless of parallel scheduling and core
count.}

\item{spec}{Optional \code{bigexp_spec} created by \code{bigexp_terms()}.
If provided, the test reuses its locked expansion. The working formula
becomes \code{bigexp_formula(spec, response_name)}, where
\code{response_name} is taken from \code{response} if supplied, otherwise
from the left-hand side of \code{formula}. Categorical sampling uses
\code{spec$levels}, and numeric sampling prefers \code{spec$num_range}
when available.
Discrete numeric predictors recorded by \code{bigexp_terms()}
(\code{spec$settings$discrete_numeric} + \code{spec$settings$discrete_levels})
are sampled only from their recorded allowed levels when building the
evaluation grid.}

\item{response}{Optional character name for the response variable to use when
\code{spec} is supplied. If omitted, the response is taken from the
left-hand side of \code{formula}.}

\item{use_spec_contrasts}{Logical; default \code{TRUE}. When \code{spec} is
supplied and \code{use_spec_contrasts = TRUE}, the function replays
\code{spec$settings$contrasts_options} on the parallel workers for
deterministic factor coding.}

\item{...}{Additional arguments passed to \code{SVEMnet()} and then to
\code{glmnet()} (for example: \code{penalty.factor}, \code{offset},
\code{lower.limits}, \code{upper.limits}, \code{standardize.response}, etc.).
The \code{relaxed} setting is controlled by the \code{relaxed} argument of
this function and any \code{relaxed} value passed via \code{...} is ignored
with a warning.}
}
\value{
An object of class \code{"svem_significance_test"}, a list with
components:
\itemize{
\item \code{p_value}: median whole-model \eqn{p}-value across the
\code{nSVEM} original SVEM fits.
\item \code{p_values}: numeric vector of length \code{nSVEM} with the
per-fit \eqn{p}-values.
\item \code{d_Y}: numeric vector of distances for the original SVEM fits.
\item \code{d_pi_Y}: numeric vector of distances for the permutation fits.
\item \code{distribution_fit}: fitted SHASHo distribution object.
\item \code{data_d}: data frame of distances and source labels
(original vs permutation), suitable for plotting.
}
}
\description{
Perform a permutation-based whole-model significance test for a continuous
(Gaussian) SVEM fit, with optional mixture-factor groups and parallel SVEM
refits.
}
\details{
The procedure follows Karl (2024): it generates a space-filling evaluation
grid in the factor space, fits multiple SVEM models on the original data and
on permuted responses, standardizes grid predictions, reduces them via an
SVD-based low-rank representation, and summarizes each fit by a
Mahalanobis-type distance in the reduced space. A flexible SHASHo
distribution is then fit to the permutation distances and used to obtain a
whole-model \eqn{p}-value for the observed surface.

Because the test is based on a finite number of permutations and a fitted
null distribution, the reported \eqn{p}-values are approximate and are
intended as a diagnostic measure of global factor signal, not as exact
hypothesis tests.

All SVEM refits (for the original and permuted responses) are run in
parallel using \code{foreach} + \code{doParallel}.

Reproducible parallel RNG (Windows/macOS/Linux): when \code{seed} is supplied,
the function sets the master RNG to \code{RNGkind("L'Ecuyer-CMRG",
sample.kind = "Rounding")} (falling back to \code{"L'Ecuyer-CMRG"} on older R),
and generates a deterministic, per-iteration seed schedule on the master.
Each parallel \code{foreach} iteration then calls \code{set.seed()} with its
assigned seed before performing any random draws (including permutations and
the bootstrap randomness inside \code{SVEMnet()}).
This makes results reproducible regardless of worker scheduling (including
\code{preschedule = FALSE}) and independent of the number of cores.

The function can optionally reuse a deterministic, locked expansion built
with \code{bigexp_terms()}. Supply \code{spec} (and optionally
\code{response}) to ensure that categorical levels, contrasts, and the
polynomial/interaction structure are identical across repeated calls and
across multiple responses sharing the same factor space.

Although the implementation calls \code{SVEMnet()} internally and will
technically run for any supported \code{family}, the significance test is
\emph{designed} for continuous (Gaussian) responses and should be interpreted
in that setting.
}
\section{Acknowledgments}{

OpenAI's GPT models (o1-preview through GPT-5 Pro) were
used to assist with coding and roxygen documentation; all
content was reviewed and finalized by the author.
}

\examples{
\donttest{
  set.seed(1)

  # Small toy data with a 3-component mixture A, B, C
  n <- 40
  sample_trunc_dirichlet <- function(n, lower, upper, total) {
    k <- length(lower)
    stopifnot(length(upper) == k, total >= sum(lower), total <= sum(upper))
    avail <- total - sum(lower)
    if (avail <= 0) return(matrix(rep(lower, each = n), nrow = n))
    out <- matrix(NA_real_, n, k)
    i <- 1L
    while (i <= n) {
      g <- rgamma(k, 1, 1)
      w <- g / sum(g)
      x <- lower + avail * w
      if (all(x <= upper + 1e-12)) { out[i, ] <- x; i <- i + 1L }
    }
    out
  }

  lower <- c(0.10, 0.20, 0.05)
  upper <- c(0.60, 0.70, 0.50)
  total <- 1.0
  ABC   <- sample_trunc_dirichlet(n, lower, upper, total)
  A <- ABC[, 1]; B <- ABC[, 2]; C <- ABC[, 3]
  X <- runif(n)
  F <- factor(sample(c("red", "blue"), n, replace = TRUE))
  y <- 2 + 3*A + 1.5*B + 1.2*C + 0.5*X + 1*(F == "red") + rnorm(n, sd = 0.3)
  dat <- data.frame(y = y, A = A, B = B, C = C, X = X, F = F)

  mix_spec <- list(list(
    vars  = c("A", "B", "C"),
    lower = lower,
    upper = upper,
    total = total
  ))

  ## Example 1: direct formula interface (no locked expansion spec)
  res1 <- svem_significance_test_parallel(
    y ~ A + B + C + X + F,
    data           = dat,
    mixture_groups = mix_spec,
    glmnet_alpha   = 1,
    weight_scheme  = "SVEM",
    objective      = "auto",
    relaxed        = FALSE,   # default, shown for clarity
    nCore          = 2,
    seed           = 123,
    verbose        = FALSE
  )
  res1$p_value

  ## Example 2: using a deterministic bigexp expansion spec
  ## Build a wide expansion once and reuse it via `spec`
  spec <- bigexp_terms(
    y ~ A + B + C + X + F,
    data             = dat,
    factorial_order  = 2,  # up to 2-way interactions
    polynomial_order = 2   # up to quadratic terms in continuous vars
  )

  ## Run the same significance test, but with the locked expansion:
  ## - `formula` is still required, but its RHS is ignored when `spec` is given
  ## - `response` tells the helper which LHS to use with `spec`
  res2 <- svem_significance_test_parallel(
    y ~ A + B + C + X + F,
    data               = dat,
    mixture_groups     = mix_spec,
    glmnet_alpha       = 1,
    weight_scheme      = "SVEM",
    objective          = "auto",
    relaxed            = FALSE,
    nCore              = 2,
    seed               = 123,
    spec               = spec,
    response           = "y",
    use_spec_contrasts = TRUE,
    verbose            = FALSE
  )
  res2$p_value
}
}
\references{
Gotwalt, C., & Ramsey, P. (2018). Model Validation Strategies for Designed Experiments Using Bootstrapping Techniques With Applications to Biopharmaceuticals. \emph{JMP Discovery Conference}. \url{https://community.jmp.com/t5/Abstracts/Model-Validation-Strategies-for-Designed-Experiments-Using/ev-p/849873/redirect_from_archived_page/true}

Karl, A. T. (2024). A randomized permutation whole-model test heuristic for Self-Validated Ensemble Models (SVEM). \emph{Chemometrics and Intelligent Laboratory Systems}, \emph{249}, 105122. \doi{10.1016/j.chemolab.2024.105122}

Karl, A., Wisnowski, J., & Rushing, H. (2022). JMP Pro 17 Remedies for Practical Struggles with Mixture Experiments. JMP Discovery Conference. \doi{10.13140/RG.2.2.34598.40003/1}

Lemkus, T., Gotwalt, C., Ramsey, P., & Weese, M. L. (2021). Self-Validated Ensemble Models for Design of Experiments. \emph{Chemometrics and Intelligent Laboratory Systems}, 219, 104439. \doi{10.1016/j.chemolab.2021.104439}

Xu, L., Gotwalt, C., Hong, Y., King, C. B., & Meeker, W. Q. (2020). Applications of the Fractional-Random-Weight Bootstrap. \emph{The American Statistician}, 74(4), 345–358. \doi{10.1080/00031305.2020.1731599}

Ramsey, P., Gaudard, M., & Levin, W. (2021). Accelerating Innovation with Space Filling Mixture Designs, Neural Networks and SVEM. \emph{JMP Discovery Conference}. \url{https://community.jmp.com/t5/Abstracts/Accelerating-Innovation-with-Space-Filling-Mixture-Designs/ev-p/756841}

Ramsey, P., & Gotwalt, C. (2018). Model Validation Strategies for Designed Experiments Using Bootstrapping Techniques With Applications to Biopharmaceuticals. \emph{JMP Discovery Conference - Europe}. \url{https://community.jmp.com/t5/Abstracts/Model-Validation-Strategies-for-Designed-Experiments-Using/ev-p/849647/redirect_from_archived_page/true}

Ramsey, P., Levin, W., Lemkus, T., & Gotwalt, C. (2021). SVEM: A Paradigm Shift in Design and Analysis of Experiments. \emph{JMP Discovery Conference - Europe}. \url{https://community.jmp.com/t5/Abstracts/SVEM-A-Paradigm-Shift-in-Design-and-Analysis-of-Experiments-2021/ev-p/756634}

Ramsey, P., & McNeill, P. (2023). CMC, SVEM, Neural Networks, DOE, and Complexity: It's All About Prediction. \emph{JMP Discovery Conference}.

Friedman, J. H., Hastie, T., and Tibshirani, R. (2010).
Regularization Paths for Generalized Linear Models via Coordinate Descent.
Journal of Statistical Software, 33(1), 1-22.

Meinshausen, N. (2007).
Relaxed Lasso. Computational Statistics & Data Analysis, 52(1), 374-393.

Kish, L. (1965). \emph{Survey Sampling}. Wiley.

Lumley, T. (2004). Analysis of complex survey samples.
\emph{Journal of Statistical Software}, 9(1), 1–19.

Lumley, T. and Scott, A. (2015). AIC and BIC for modelling with complex survey data.
\emph{Journal of Survey Statistics and Methodology}, 3(1), 1–18.
}
\seealso{
\code{\link{SVEMnet}}, \code{\link{bigexp_terms}},
\code{\link{bigexp_formula}}
}
