% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/rf.R
\name{rf}
\alias{rf}
\title{Random forest models with Moran's I test of the residuals}
\usage{
rf(
  data = NULL,
  dependent.variable.name = NULL,
  predictor.variable.names = NULL,
  distance.matrix = NULL,
  distance.thresholds = NULL,
  xy = NULL,
  ranger.arguments = NULL,
  scaled.importance = FALSE,
  seed = 1,
  verbose = TRUE,
  n.cores = parallel::detectCores() - 1,
  cluster = NULL
)
}
\arguments{
\item{data}{Data frame with a response variable and a set of predictors. Default: \code{NULL}}

\item{dependent.variable.name}{Character string with the name of the response variable. Must be a column name in \code{data}. For binary response variables (0/1), case weights are automatically computed using \code{\link[=case_weights]{case_weights()}} to balance classes. Default: \code{NULL}}

\item{predictor.variable.names}{Character vector with predictor variable names. All names must be columns in \code{data}. Alternatively, accepts the output of \code{\link[=auto_cor]{auto_cor()}} or \code{\link[=auto_vif]{auto_vif()}} for automated variable selection. Default: \code{NULL}}

\item{distance.matrix}{Square matrix with pairwise distances between observations in \code{data}. Must have the same number of rows as \code{data}. If \code{NULL}, spatial autocorrelation of residuals is not computed. Default: \code{NULL}}

\item{distance.thresholds}{Numeric vector of distance thresholds for spatial autocorrelation analysis. For each threshold, distances below that value are set to zero when computing Moran's I. If \code{NULL}, defaults to \code{seq(0, max(distance.matrix), length.out = 4)}. Default: \code{NULL}}

\item{xy}{Data frame or matrix with two columns containing coordinates, named \code{"x"} and \code{"y"}. Not used by this function but stored in the model for use by \code{\link[=rf_evaluate]{rf_evaluate()}} and \code{\link[=rf_tuning]{rf_tuning()}}. Default: \code{NULL}}

\item{ranger.arguments}{Named list with \link[ranger]{ranger} arguments. Arguments for this function can also be passed here. The default importance method is 'permutation' instead of ranger's default 'none'. The \code{x}, \code{y}, and \code{formula} arguments are not supported. See \link[ranger]{ranger} help for available arguments. Default: \code{NULL}}

\item{scaled.importance}{If \code{TRUE}, variable importance is computed on scaled data using \link[base]{scale}, making importance scores comparable across models with different predictor units. Default: \code{FALSE}}

\item{seed}{Random seed for reproducibility. Default: \code{1}}

\item{verbose}{If \code{TRUE}, display messages and plots during execution. Default: \code{TRUE}}

\item{n.cores}{Number of cores for parallel execution. Default: \code{parallel::detectCores() - 1}}

\item{cluster}{Cluster object from \code{parallel::makeCluster()}. Not used by this function but stored in the model for use in downstream functions. Default: \code{NULL}}
}
\value{
A ranger model object with additional slots:
\itemize{
\item \code{ranger.arguments}: Arguments used to fit the model.
\item \code{importance}: List with global importance data frame (predictors ranked by importance), importance plot, and local importance scores (per-observation difference in accuracy between permuted and non-permuted predictors, based on out-of-bag data).
\item \code{performance}: Model performance metrics including R-squared (out-of-bag and standard), pseudo R-squared, RMSE, and NRMSE.
\item \code{residuals}: Model residuals with normality diagnostics (\code{\link[=residuals_diagnostics]{residuals_diagnostics()}}) and spatial autocorrelation (\code{\link[=moran_multithreshold]{moran_multithreshold()}}).
}
}
\description{
Fits a random forest model using \link[ranger]{ranger} and extends it with additional diagnostics: residual spatial autocorrelation (Moran's I) at multiple distance thresholds, performance metrics (RMSE, NRMSE via \code{\link[=root_mean_squared_error]{root_mean_squared_error()}}), and variable importance scores, optionally computed on scaled data (via \link[base]{scale}) when \code{scaled.importance = TRUE}.
}
\details{
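% NOTE(review): the ranger.arguments entry in the arguments section states that formula is not supported, while this paragraph states that it is; confirm against R/rf.R and reconcile the two statements.
See \link[ranger]{ranger} documentation for additional details. The \code{formula} interface is supported via \code{ranger.arguments}, but variable interactions are not permitted. For feature engineering including interactions, see \code{\link[=the_feature_engineer]{the_feature_engineer()}}.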
}
\examples{

# load the example datasets used below
data(
  plants_df,
  plants_response,
  plants_predictors,
  plants_distance
)

# fit a small model (50 trees) on a single core
m <- rf(
  data = plants_df,
  dependent.variable.name = plants_response,
  predictor.variable.names = plants_predictors,
  distance.matrix = plants_distance,
  distance.thresholds = c(100, 1000, 2000),
  ranger.arguments = list(
    num.trees = 50,
    min.node.size = 20
  ),
  verbose = FALSE,
  n.cores = 1
)

# class of the fitted model
class(m)

# variable importance
m$importance$per.variable
m$importance$per.variable.plot

# model performance
m$performance

# autocorrelation of residuals
m$residuals$autocorrelation$per.distance
m$residuals$autocorrelation$plot

# model predictions
m$predictions$values

# predictions for new data (using stats::predict)
y <- stats::predict(
  object = m,
  data = plants_df[1:5, ],
  type = "response"
)$predictions

# alternative: pass arguments via ranger.arguments list
args <- list(
  data = plants_df,
  dependent.variable.name = plants_response,
  predictor.variable.names = plants_predictors,
  distance.matrix = plants_distance,
  distance.thresholds = c(100, 1000, 2000),
  num.trees = 50,
  min.node.size = 20,
  num.threads = 1
)

# n.cores is set explicitly so this call does not fall back to the
# default of parallel::detectCores() - 1
m <- rf(
  ranger.arguments = args,
  verbose = FALSE,
  n.cores = 1
)

}
\seealso{
Other main_models: 
\code{\link{rf_spatial}()}
}
\concept{main_models}
