library(tidylearn)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(ggplot2)
#> Warning: package 'ggplot2' was built under R version 4.5.2
Automated Machine Learning (AutoML) streamlines the model development
process by automatically trying multiple approaches and selecting the
best one. tidylearn’s tl_auto_ml() function explores
various modeling strategies including dimensionality reduction,
clustering, and different supervised methods.
Note: AutoML orchestrates the wrapped packages
(glmnet, randomForest, xgboost, etc.) rather than implementing new
algorithms. Each model in the leaderboard wraps an established package,
and you can access the raw model objects via model$fit.
The tl_auto_ml() function follows a systematic
approach:
# AutoML with all features enabled
result_full <- tl_auto_ml(
data = iris,
formula = Species ~ .,
task = "auto", # Automatically detect task type
use_reduction = TRUE, # Try PCA preprocessing
use_clustering = TRUE, # Add cluster features
time_budget = 120, # 2 minutes
cv_folds = 5, # Cross-validation folds
metric = NULL # Auto-select metric
)
AutoML automatically detects the task type:
# Variant 1: dimensionality reduction switched off (60-second budget)
no_reduction <- tl_auto_ml(
  data = iris,
  formula = Species ~ .,
  use_reduction = FALSE,
  time_budget = 60
)

# Variant 2: cluster features switched off (60-second budget)
no_clustering <- tl_auto_ml(
  data = iris,
  formula = Species ~ .,
  use_clustering = FALSE,
  time_budget = 60
)

# Variant 3: baseline models only -- both options off, 30-second budget
baseline_only <- tl_auto_ml(
  data = iris,
  formula = Species ~ .,
  use_reduction = FALSE,
  use_clustering = FALSE,
  time_budget = 30
)

# Split data for evaluation
split <- tl_split(iris, prop = 0.7, stratify = "Species", seed = 123)

# Search for the best model using the training partition only
automl_iris <- tl_auto_ml(
  data = split$train,
  formula = Species ~ .,
  time_budget = 90,
  cv_folds = 5
)

# Score the selected model on the test partition: share of matching labels
test_preds <- predict(automl_iris$best_model, new_data = split$test)
test_accuracy <- mean(split$test$Species == test_preds$.pred)
cat("AutoML Test Accuracy:", round(100 * test_accuracy, 1), "%\n")

# Split mtcars data
split_mtcars <- tl_split(mtcars, prop = 0.7, seed = 42)

# Run AutoML as an explicit regression task on the training partition
automl_mpg <- tl_auto_ml(
  data = split_mtcars$train,
  formula = mpg ~ .,
  task = "regression",
  time_budget = 90
)

# Evaluate: root mean squared error of the predictions on the test partition
test_preds_mpg <- predict(automl_mpg$best_model, new_data = split_mtcars$test)
rmse <- local({
  errors <- test_preds_mpg$.pred - split_mtcars$test$mpg
  sqrt(mean(errors^2))
})
cat("AutoML Test RMSE:", round(rmse, 2), "\n")

# Preprocess data first
# Prepare the training partition: standardize scaling plus removal of
# correlated predictors, then run AutoML on the prepared data.
processed <- tl_prepare_data(
split$train,
Species ~ .,
scale_method = "standardize",
remove_correlated = TRUE
)
# Run AutoML on preprocessed data
automl_processed <- tl_auto_ml(processed$data, Species ~ .,
time_budget = 60)
# Note: Need to apply same preprocessing to test data
# NOTE(review): the test partition is prepared by a separate call that does
# not pass remove_correlated = TRUE, so its columns may not match the training
# data; presumably it is also scaled with its own statistics rather than the
# training ones -- TODO confirm whether tidylearn can reuse the training-time
# preprocessing for new data.
test_processed <- tl_prepare_data(
split$test,
Species ~ .,
scale_method = "standardize"
)
# Predict on the separately prepared test data with the selected model.
test_preds_proc <- predict(automl_processed$best_model,
new_data = test_processed$data)# Manual approach: choose one model
manual_model <- tl_model(split$train, Species ~ ., method = "forest")
manual_preds <- predict(manual_model, new_data = split$test)
manual_acc <- mean(split$test$Species == manual_preds$.pred)

# AutoML approach: let the search pick the model within a 60-second budget
automl_model <- tl_auto_ml(
  data = split$train,
  formula = Species ~ .,
  time_budget = 60
)
automl_preds <- predict(automl_model$best_model, new_data = split$test)
automl_acc <- mean(split$test$Species == automl_preds$.pred)

# Report both test-partition accuracies as percentages
cat("Manual Selection:", round(100 * manual_acc, 1), "%\n")
cat("AutoML:", round(100 * automl_acc, 1), "%\n")

# First pass: quick exploration
# Short 30-second run with reduction enabled and clustering disabled.
quick_automl <- tl_auto_ml(split$train, Species ~ .,
time_budget = 30,
use_reduction = TRUE,
use_clustering = FALSE)
# Analyze what worked
# NOTE(review): best_approach is assigned here but is not read anywhere in the
# visible code.
best_approach <- quick_automl$best_model$spec$method
# Second pass: focus on promising approaches
# NOTE(review): the condition inspects the name of the first component of the
# best-model object, not best_approach or a leaderboard entry; presumably it
# was meant to test whether the winning approach used PCA -- TODO confirm.
# refined_automl is only created when the condition is TRUE.
if (grepl("pca", names(quick_automl$best_model)[1])) {
# If PCA worked well, focus on dimensionality reduction
refined_automl <- tl_auto_ml(split$train, Species ~ .,
time_budget = 60,
use_reduction = TRUE,
use_clustering = TRUE)
}# Get top 3 models
# Keep the three leaderboard rows with the highest `performance` value.
top_models <- automl_iris$leaderboard %>%
  arrange(desc(performance)) %>%
  head(3)

# Make predictions with each selected model.
# - as.character() on the model names guards against a factor column, which
#   would otherwise index `models` by integer code instead of by name.
# - as.character() on the predictions guards against factor output: cbind()
#   replaces factors by their internal codes, so the voted labels would never
#   match the Species labels.
# - lapply() over the names (instead of 1:nrow()) does not iterate over
#   c(1, 0) when the leaderboard is empty.
ensemble_preds <- lapply(as.character(top_models$model), function(model_name) {
  model <- automl_iris$models[[model_name]]
  as.character(predict(model, new_data = split$test)$.pred)
})

# Majority vote for classification: one column per model, one row per test
# observation. Ties resolve to the first label in table() order.
vote_matrix <- do.call(cbind, ensemble_preds)
final_pred <- apply(vote_matrix, 1, function(x) {
  names(which.max(table(x)))
})
ensemble_acc <- mean(final_pred == split$test$Species)
cat("Ensemble Accuracy:", round(ensemble_acc * 100, 1), "%\n")
Good use cases:
Consider manual selection when:
tidylearn’s AutoML provides:
# End-to-end AutoML workflow: split, search, evaluate
workflow_split <- tl_split(iris, prop = 0.7, stratify = "Species", seed = 123)

# Search with task detection, PCA preprocessing and cluster features enabled
automl_result <- tl_auto_ml(
  data = workflow_split$train,
  formula = Species ~ .,
  task = "auto",
  use_reduction = TRUE,
  use_clustering = TRUE,
  time_budget = 120,
  cv_folds = 5
)

# Evaluate the selected model on the test partition
final_preds <- predict(automl_result$best_model, new_data = workflow_split$test)
final_accuracy <- mean(workflow_split$test$Species == final_preds$.pred)
cat("Final AutoML Accuracy:", round(100 * final_accuracy, 1), "%\n")
cat("Best approach:", automl_result$best_model$spec$method, "\n")
AutoML makes machine learning accessible and efficient, allowing you to quickly find good solutions while learning which approaches work best for your data.