% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/ale_core.R
\name{ale}
\alias{ale}
\title{Create and return ALE data, statistics, and plots}
\usage{
ale(
  data,
  model,
  x_cols = NULL,
  y_col = NULL,
  ...,
  parallel = parallel::detectCores(logical = FALSE) - 1,
  model_packages = as.character(NA),
  output = c("plots", "data", "stats", "conf_regions"),
  pred_fun = function(object, newdata, type = pred_type) {
     stats::predict(object =
    object, newdata = newdata, type = type)
 },
  pred_type = "response",
  p_values = NULL,
  p_alpha = c(0.01, 0.05),
  x_intervals = 100,
  boot_it = 0,
  seed = 0,
  boot_alpha = 0.05,
  boot_centre = "mean",
  relative_y = "median",
  y_type = NULL,
  median_band_pct = c(0.05, 0.5),
  rug_sample_size = 500,
  min_rug_per_interval = 1,
  ale_xs = NULL,
  ale_ns = NULL,
  compact_plots = FALSE,
  silent = FALSE
)
}
\arguments{
\item{data}{dataframe. Dataset from which to create predictions for the ALE.}

\item{model}{model object. Model for which ALE should be calculated.
May be any kind of R object that can make predictions from data.}

\item{x_cols}{character. Vector of column names from \code{data} for which
one-way ALE data is to be calculated (that is, simple ALE without interactions).
If not provided, ALE will be created for all columns in \code{data} except \code{y_col}.}

\item{y_col}{character length 1. Name of the outcome target label (y) variable.
If not provided, \code{ale()} will try to detect it automatically. For non-standard
models, \code{y_col} should be provided. For survival models, set \code{y_col} to the
name of the binary event column; in that case, \code{pred_type} should also be specified.}

\item{...}{not used. Inserted to require explicit naming of subsequent arguments.}

\item{parallel}{non-negative integer length 1. Number of parallel threads
(workers or tasks) for parallel execution of the function. See the
'Parallel processing' section below.}

\item{model_packages}{character. Character vector of names of
packages that \code{model} depends on that might not be obvious.
The \code{{ale}} package should be able to automatically recognize and load most
packages that are needed, but with parallel processing enabled (which is the
default), some packages might not be properly loaded. If you get a strange error
message that mentions something somewhere about 'future', try adding the
package for your model to this vector, especially if you see such errors after
the progress bars begin displaying (assuming you did not disable progress bars
with \code{silent = TRUE}).}

\item{output}{character in c('plots', 'data', 'stats', 'conf_regions'). Vector of types of results to return.
'plots' will return an ALE plot; 'data' will return the source ALE data;
'stats' will return ALE statistics; 'conf_regions' will return summaries of
the confidence regions. Each option must be listed to return the
specified component. By default, all are returned.}

\item{pred_fun, pred_type}{function, character length 1. \code{pred_fun} is a function that
returns a vector of predicted values of type \code{pred_type} from \code{model} on \code{data}.
See the 'Custom predict function' section below.}

\item{p_values}{instructions for calculating p-values and to determine the
median band. If \code{NULL} (default), no p-values are calculated and
\code{median_band_pct} is used to determine the median band.
To calculate p-values, an object generated by the
\code{\link[=create_p_funs]{create_p_funs()}} function must be provided here. If \code{p_values} is set to 'auto',
this \code{ale()} function will try to automatically create the p-values function;
this only works with standard R model types. An error message will be given
if p-values cannot be generated. Any other input provided to this argument
will result in an error. For more details about creating p-values,
see documentation for \code{\link[=create_p_funs]{create_p_funs()}}. Note that p-values will not be
generated if 'stats' are not included as an option in the \code{output} argument.}

\item{p_alpha}{numeric length 2 from 0 to 1. Alpha for "confidence interval" ranges
for printing bands around the median for single-variable plots. These are the
default values used if \code{p_values} are provided. If \code{p_values} are not provided,
then \code{median_band_pct} is used instead.
The inner band range will be the median value of y ± \code{p_alpha[2]} of the relevant
ALE statistic (usually ALE range or normalized ALE range).
For plots with a second outer band, its range will be the median ± \code{p_alpha[1]}.
For example, in the ALE plots, for the default \code{p_alpha = c(0.01, 0.05)},
the inner band will be the median ± ALE minimum or maximum at p = 0.05 and
the outer band will be the median ± ALE minimum or maximum at p = 0.01.}

\item{x_intervals}{positive integer length 1. Maximum number of intervals on the x-axis
for the ALE data for each column in \code{x_cols}. The number of intervals that the algorithm generates
might eventually be fewer than what the user specifies if the data values for
a given x value do not support that many intervals.}

\item{boot_it}{non-negative integer length 1. Number of bootstrap iterations for the
ALE values. If \code{boot_it = 0} (default), then ALE will be calculated on the entire dataset
with no bootstrapping.}

\item{seed}{integer length 1. Random seed. Supply this between runs to ensure that
identical random ALE data is generated each time.}

\item{boot_alpha}{numeric length 1 from 0 to 1. Alpha for percentile-based confidence
interval range for the bootstrap intervals; the bootstrap confidence intervals
will range from the \code{boot_alpha / 2} to the \code{1 - boot_alpha / 2} quantiles. For example,
if \code{boot_alpha = 0.05} (default), the intervals will be from the 2.5th to the 97.5th
percentiles.}

\item{boot_centre}{character length 1 in c('mean', 'median'). When bootstrapping, the
main estimate for \code{ale_y} is considered to be \code{boot_centre}. Regardless of the
value specified here, both the mean and median will be available.}

\item{relative_y}{character length 1 in c('median', 'mean', 'zero'). The ale_y values will
be adjusted relative to this value. 'median' is the default. 'zero' will maintain the
default of \code{\link[ALEPlot:ALEPlot]{ALEPlot::ALEPlot()}}, which is not shifted.}

\item{y_type}{character length 1. Datatype of the y (outcome) variable.
Must be one of c('binary', 'numeric', 'multinomial', 'ordinal'). Normally
determined automatically; only provide for complex non-standard models that
require it.}

\item{median_band_pct}{numeric length 2 from 0 to 1. Alpha for "confidence interval" ranges
for printing bands around the median for single-variable plots. These are the
default values used if \code{p_values} are not provided. If \code{p_values} are provided,
then \code{median_band_pct} is ignored.
The inner band range will be the median value of y ± \code{median_band_pct[1]/2}.
For plots with a second outer band, its range will be the median ± \code{median_band_pct[2]/2}.
For example, for the default \code{median_band_pct = c(0.05, 0.5)}, the inner band
will be the median ± 2.5\% and the outer band will be the median ± 25\%.}

\item{rug_sample_size, min_rug_per_interval}{non-negative integer length 1.
Rug plots are normally
down-sampled because otherwise they are too slow. \code{rug_sample_size} specifies the size
of this sample. To prevent down-sampling, set to \code{Inf}. To suppress rug plots,
set to 0. When down-sampling, the rug plots maintain representativeness of the
data by guaranteeing that each of the \code{x_intervals} intervals will retain at least
\code{min_rug_per_interval} elements; usually set to just 1 or 2.}

\item{ale_xs, ale_ns}{list of ale_x and ale_n vectors. If provided, these vectors will be used to
set the intervals of the ALE x axis for each variable. By default (NULL), the
function automatically calculates the ale_x intervals. \code{ale_xs} is normally used
in advanced analyses where the ale_x intervals from a previous analysis are
reused for subsequent analyses (for example, for full model bootstrapping;
see the \code{\link[=model_bootstrap]{model_bootstrap()}} function).}

\item{compact_plots}{logical length 1, default \code{FALSE}. When \code{output} includes
'plots', the returned \code{ggplot} objects each include the environments of the plots.
This lets the user modify the plots with all the flexibility of \code{ggplot}, but it
can result in very large return objects (sometimes even hundreds of megabytes
large). To compact the plots to their bare minimum, set \code{compact_plots = TRUE}.
However, returned plots will not be easily modifiable, so this should only be
used if you do not want to subsequently modify the plots.}

\item{silent}{logical length 1, default \code{FALSE}. If \code{TRUE}, do not display any
non-essential messages during execution (such as progress bars).
Regardless, any warnings and errors will always display. See the
'Progress bars' section below for how to enable progress bars.}
}
\value{
list with the following elements:
\itemize{
\item \code{data}: a list whose elements, named by each requested x variable, are each
a tibble with the following columns:
\itemize{
\item \code{ale_x}: the values of each of the ALE x intervals or categories.
\item \code{ale_n}: the number of rows of data in each \code{ale_x} interval or category.
\item \code{ale_y}: the ALE function value calculated for that interval or category.
For bootstrapped ALE, this is the same as \code{ale_y_mean} by default
or \code{ale_y_median} if the \code{boot_centre = 'median'} argument is specified.
Regardless, both \code{ale_y_mean} and \code{ale_y_median} are returned as columns here.
\item \code{ale_y_lo}, \code{ale_y_hi}: the lower and upper confidence intervals, respectively,
for the bootstrapped \code{ale_y} value.
Note: regardless of what options are requested in the \code{output} argument, this
\code{data} element is always returned.
}
\item \code{stats}: if \code{stats} are requested in the \code{output} argument (as is the default),
returns a list. If not requested, returns \code{NULL}. The returned list provides
ALE statistics of the \code{data} element duplicated and presented from various
perspectives in the following elements:
\itemize{
\item \code{by_term}: a list named by each requested x variable, each of whose elements
is a tibble with the following columns:
\itemize{
\item \code{statistic}: the ALE statistic specified in the row (see
the \code{by_statistic} element below).
\item \code{estimate}: the bootstrapped \code{mean} or \code{median} of the \code{statistic},
depending on the \code{boot_centre} argument to the \code{\link[=ale]{ale()}} function.
Regardless, both \code{mean} and \code{median} are returned as columns here.
\item \code{conf.low}, \code{conf.high}: the lower and upper confidence intervals,
respectively, for the bootstrapped \code{estimate}.
}
\item \code{by_statistic}: list named by each of the following ALE statistics:
\code{aled}, \code{aler_min}, \code{aler_max}, \code{naled}, \code{naler_min}, \code{naler_max}. See
\code{vignette('ale-statistics')} for details.
\item \code{estimate}: a tibble whose data consists of the \code{estimate} values from the
\code{by_term} element above. The columns are \code{term} (the variable name) and the
statistic for which the estimate is given:
\code{aled}, \code{aler_min}, \code{aler_max}, \code{naled}, \code{naler_min}, \code{naler_max}.
\item \code{effects_plot}: a \code{ggplot} object which is the ALE effects plot for all the
x variables.
}
\item \code{plots}: if \code{plots} are requested in the \code{output} argument (as is the default),
returns a list whose elements, named by each requested x variable, are each
a \code{ggplot} object of the ALE y values plotted against the x variable intervals.
If \code{plots} is not included in \code{output}, this element is \code{NULL}.
\item \code{conf_regions}: if \code{conf_regions} are requested in the \code{output} argument (as is the default),
returns a list. If not requested, returns \code{NULL}. The returned list provides
summaries of the confidence regions of the relevant ALE statistics of the \code{data}
element.
The list has the following elements:
\itemize{
\item \code{by_term}: a list named by each requested x variable, each of whose elements
is a tibble with the relevant data for the confidence regions.
(See \code{vignette('ale-statistics')} for details about confidence regions.)
\item \code{significant}: a tibble that summarizes the \code{by_term} to only show confidence
regions that are statistically significant. Its columns are those from
\code{by_term} plus a \code{term} column to specify which x variable is indicated
by the respective row.
\item \code{sig_criterion}: a length-one character vector that reports which values
were used to determine statistical significance: if \code{p_values} was
provided to the \code{\link[=ale]{ale()}} function, it will be used; otherwise,
\code{median_band_pct} will be used.
}
\item Various values echoed from the original call to the \code{\link[=ale]{ale()}} function, provided
to document the key elements used to calculate the ALE data, statistics, and plots:
\code{y_col}, \code{x_cols}, \code{boot_it}, \code{seed}, \code{boot_alpha}, \code{boot_centre}, \code{relative_y},
\code{y_type}, \code{median_band_pct}, \code{rug_sample_size}. These are either the values
provided by the user or used by default if the user did not change them.
\item \code{y_summary}: summary statistics of y values used for the ALE calculation.
These statistics are based on the actual values of \code{y_col} unless \code{y_type} is a
probability or other value that is constrained in the \verb{[0, 1]} range. In that
case, \code{y_summary} is based on the predicted values of \code{y_col} by applying
\code{model} to the \code{data}. \code{y_summary} is a named numeric vector. Most of the
elements are the percentile of the y values. E.g., the '5\%' element is the
5th percentile of y values. The following elements have special meanings:
\itemize{
\item The first element is named either \code{p} or \code{q} and its value is always 0.
The value is not used; only the name of the element is meaningful.
\code{p} means that the following special \code{y_summary} elements are based on
the provided \code{p_values} object. \code{q} means that quantiles were calculated
based on \code{median_band_pct} because \code{p_values} was not provided.
\item \code{min}, \code{mean}, \code{max}: the minimum, mean, and maximum y values, respectively.
Note that the median is \verb{50\%}, the 50th percentile.
\item \code{med_lo_2}, \code{med_lo}, \code{med_hi}, \code{med_hi_2}: \code{med_lo} and \code{med_hi} are the
inner lower and upper confidence intervals of y values with respect to
the median (\verb{50\%}); \code{med_lo_2} and \code{med_hi_2} are the outer confidence
intervals. See the documentation for the \code{p_alpha} and \code{median_band_pct}
arguments to understand how these are determined.
}
}
}
\description{
\code{ale()} is the central function that manages the creation of ALE data and plots
for one-way ALE. For two-way interactions, see \code{\link[=ale_ixn]{ale_ixn()}}. This function calls
\code{ale_core} (a non-exported function) that manages the ALE data and plot creation in detail. For details, see
the introductory vignette for this package or the details and examples below.
}
\details{
ale_core.R

Core functions for the ale package: ale, ale_ixn, and ale_core
}
\section{Custom predict function}{

The calculation of ALE requires modifying several values of the original
\code{data}. Thus, \code{ale()} needs direct access to a \code{predict} function that works on
\code{model}. By default, \code{ale()} uses a generic default \code{predict} function of the form
\code{predict(object, newdata, type)} with the default prediction type of 'response'.
If, however, the desired prediction values are not generated with that format,
the user must specify what they want. Most of the time, the only modification needed is
to change the prediction type to some other value by setting the \code{pred_type} argument
(e.g., to 'prob' to generate classification probabilities). But if the desired
predictions need a different function signature, then the user must create a
custom prediction function and pass it to \code{pred_fun}. The requirements for this
custom function are:
\itemize{
\item It must take three required arguments and nothing else:
\itemize{
\item \code{object}: a model
\item \code{newdata}: a dataframe or compatible table type
\item \code{type}: a string; it should usually be specified as \code{type = pred_type}.
These argument names follow the R convention for the
generic \code{stats::predict} function.
}
\item It must return a vector of numeric values as the prediction.
}

You can see an example below of a custom prediction function.

\strong{Note:} \code{survival} models probably do not need a custom prediction function
but \code{y_col} must be set to the name of the binary event column and
\code{pred_type} must be set to the desired prediction type.
}

\section{ALE statistics}{

For details about the ALE-based statistics (ALED, ALER, NALED, and NALER), see
\code{vignette('ale-statistics')}.
}

\section{Parallel processing}{

Parallel processing using the \code{{furrr}} library is enabled by default. By default,
it will use all the available physical
CPU cores (minus the core being used for the current R session) with the setting
\code{parallel = parallel::detectCores(logical = FALSE) - 1}. Note that only
physical cores are used (not logical cores or "hyperthreading") because
machine learning can only take advantage of the floating point processors on
physical cores, which are absent from logical cores. Trying to use logical
cores will not speed up processing and might actually slow it down with useless
data transfer. If you will dedicate
the entire computer to running this function (and you don't mind everything
else becoming very slow while it runs), you may use all cores by setting
\code{parallel = parallel::detectCores(logical = FALSE)}. To disable parallel
processing, set \code{parallel = 0}.
}

\section{Progress bars}{

Progress bars are implemented with the \code{{progressr}} package, which lets
the user fully control progress bars. \strong{To disable progress bars, set \code{silent = TRUE}.}
The first time a function is called in
the \code{{ale}} package that requires progress bars, it checks if the user has
activated the necessary \code{{progressr}} settings. If not, the \code{{ale}} package
automatically enables \code{{progressr}} progress bars with the \code{cli} handler and
prints a message notifying the user.

If you like the default progress bars and you want to make them permanent, then you
can \href{https://support.posit.co/hc/en-us/articles/360047157094-Managing-R-with-Rprofile-Renviron-Rprofile-site-Renviron-site-rsession-conf-and-repos-conf}{add the following lines of code to your .Rprofile configuration file}
and they will become your defaults for every R session; you will not see the
message again:

\if{html}{\out{<div class="sourceCode R">}}\preformatted{progressr::handlers(global = TRUE)
progressr::handlers('cli')
}\if{html}{\out{</div>}}

For more details on formatting progress bars to your liking, see the introduction
to the \href{https://progressr.futureverse.org/articles/progressr-intro.html}{\code{{progressr}} package}.
}

\examples{
set.seed(0)
diamonds_sample <- ggplot2::diamonds[sample(nrow(ggplot2::diamonds), 1000), ]

# Create a GAM model with flexible curves to predict diamond price
# Smooth all numeric variables and include all other variables
gam_diamonds <- mgcv::gam(
  price ~ s(carat) + s(depth) + s(table) + s(x) + s(y) + s(z) +
    cut + color + clarity,
  data = diamonds_sample
)
summary(gam_diamonds)


\donttest{

# Simple ALE without bootstrapping
ale_gam_diamonds <- ale(
  diamonds_sample, gam_diamonds,
  parallel = 2  # CRAN limit (delete this line on your own computer)
)

# Plot the ALE data
ale_gam_diamonds$plots |>
  patchwork::wrap_plots()

# Bootstrapped ALE
# This can be slow, since bootstrapping runs the algorithm boot_it times

# Create ALE with 100 bootstrap samples
ale_gam_diamonds_boot <- ale(
  diamonds_sample, gam_diamonds, boot_it = 100,
  parallel = 2  # CRAN limit (delete this line on your own computer)
)

# Bootstrapped ALEs print with confidence intervals
ale_gam_diamonds_boot$plots |>
  patchwork::wrap_plots()


# If the predict function you want is non-standard, you may define a
# custom predict function. It must return a single numeric vector.
custom_predict <- function(object, newdata, type = pred_type) {
  predict(object, newdata, type = type, se.fit = TRUE)$fit
}

ale_gam_diamonds_custom <- ale(
  diamonds_sample, gam_diamonds,
  pred_fun = custom_predict, pred_type = 'link',
  parallel = 2  # CRAN limit (delete this line on your own computer)
)

# Plot the ALE data
ale_gam_diamonds_custom$plots |>
  patchwork::wrap_plots()

}


}
\references{
Okoli, Chitu. 2023.
“Statistical Inference Using Machine Learning and Classical Techniques Based
on Accumulated Local Effects (ALE).” arXiv. \url{https://arxiv.org/abs/2310.09877}.
}
