% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/hmda.best.models.R
\name{hmda.best.models}
\alias{hmda.best.models}
\title{Select Best Models Across All Models in HMDA Grid}
\usage{
hmda.best.models(
  df,
  n_models = NULL,
  distance_percentage = NULL,
  metrics = c("logloss", "mae", "mse", "rmse", "rmsle", "mean_per_class_error", "auc",
    "aucpr", "r2", "accuracy", "f1", "mcc", "f2"),
  hyperparam = FALSE
)
}
\arguments{
\item{df}{A data frame of class \code{"hmda.grid.analysis"} containing
model performance results. It must include a column named
\code{model_ids}.}

\item{n_models}{Integer. The number of top models to select per metric.
If both \code{n_models} and \code{distance_percentage} are \code{NULL},
defaults to 1.}

\item{distance_percentage}{Numeric in (0, 1). Alternative to \code{n_models}. Selects all models within
a given percentage distance of the best value for each metric. Specify either
\code{n_models} or \code{distance_percentage}, not both. The selection is direction-aware.
For example, when the metric is AUC (higher is better) and \code{distance_percentage} is set to
0.01 (1\%), it selects models whose AUC is at least 99\% of the highest AUC. For a metric
where lower values mean better performance, such as logloss, a
\code{distance_percentage} of 0.01 (1\%) selects models whose logloss is at most 1\% higher than
the lowest logloss.}

\item{metrics}{Character vector of performance metric column names to consider. Supported metrics
are "logloss", "mae", "mse", "rmse", "rmsle", "mean_per_class_error", "auc", "aucpr",
"r2", "accuracy", "f1", "mcc", "f2".}

\item{hyperparam}{Logical. If \code{TRUE}, returns all columns for the selected models (including
hyperparameters). If \code{FALSE}, returns only \code{model_ids} plus the selected metric columns.}
}
\value{
A data frame containing the union of selected models across all considered metrics.
  If \code{hyperparam = FALSE}, the output includes \code{model_ids} and the metric columns found in \code{df}.
  If \code{hyperparam = TRUE}, the output includes all columns from \code{df} for the selected models.
}
\description{
Scans an HMDA grid analysis data frame for performance metric columns and, for each metric,
selects the best-performing models according to the correct optimization direction
(lower is better for some metrics; higher is better for others). The function returns a
subset of the input data frame containing the union of selected model IDs.
}
\details{
The function uses a predefined set of H2O performance metrics along with
  their desired optimization directions:
  \describe{
    \item{logloss, mae, mse, rmse, rmsle, mean_per_class_error}{Lower values
          are better.}
    \item{auc, aucpr, r2, accuracy, f1, mcc, f2}{Higher values are better.}
  }
}
\examples{
\dontrun{
  library(HMDA)
  library(h2o)
  hmda.init()

  # Import a sample binary outcome dataset into H2O
  train <- h2o.importFile(
  "https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")
  test <- h2o.importFile(
  "https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv")

  # Identify predictors and response
  y <- "response"
  x <- setdiff(names(train), y)

  # For binary classification, response should be a factor
  train[, y] <- as.factor(train[, y])
  test[, y] <- as.factor(test[, y])

  params <- list(learn_rate = c(0.01, 0.1),
                 max_depth = c(3, 5, 9),
                 sample_rate = c(0.8, 1.0)
  )

  # Train and validate a cartesian grid of GBMs
  hmda_grid1 <- hmda.grid(algorithm = "gbm", x = x, y = y,
                          grid_id = "hmda_grid1",
                          training_frame = train,
                          nfolds = 10,
                          ntrees = 100,
                          seed = 1,
                          hyper_params = params)

  # Assess the performances of the models
  grid_performance <- hmda.grid.analysis(hmda_grid1)

  # Return the best 2 models according to each metric
  hmda.best.models(grid_performance, n_models = 2)

  # Return all models whose performance is within 2\% of the best model, for each metric
  # (e.g., for metrics where higher is better, models reaching at least 98\% of the
  # best value)
  hmda.best.models(grid_performance, distance_percentage = 0.02)
}

}
\author{
E. F. Haghish
}
