Automated Machine Learning with tidylearn

library(tidylearn)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(ggplot2)
#> Warning: package 'ggplot2' was built under R version 4.5.2

Introduction

Automated Machine Learning (AutoML) streamlines the model development process by automatically trying multiple approaches and selecting the best one. tidylearn’s tl_auto_ml() function explores various modeling strategies including dimensionality reduction, clustering, and different supervised methods.

Note: AutoML orchestrates the wrapped packages (glmnet, randomForest, xgboost, etc.) rather than implementing new algorithms. Each model in the leaderboard wraps an established package, and you can access the raw model objects via model$fit.

Basic Usage

Classification Task

# Search for a Species classifier on iris with a 60-second budget
result <- tl_auto_ml(
  data = iris,
  formula = Species ~ .,
  task = "classification",
  time_budget = 60
)

# Show the winning model
print(result$best_model)

# Names of every model that was fitted during the search
names(result$models)

# Comparison table of the fitted models
result$leaderboard

Regression Task

# Search for an mpg regressor on mtcars with a 60-second budget
result_reg <- tl_auto_ml(
  data = mtcars,
  formula = mpg ~ .,
  task = "regression",
  time_budget = 60
)

# Show the winning model
print(result_reg$best_model)

How AutoML Works

The tl_auto_ml() function follows a systematic approach:

1. Baseline Models: Trains standard models (logistic, tree, forest for classification; linear, tree, forest for regression)
2. Dimensionality Reduction: Applies PCA and trains models on reduced features
3. Cluster Features: Adds cluster assignments as features
4. Advanced Models: If time allows, tries additional methods

# Every search option spelled out explicitly
result_full <- tl_auto_ml(
  data = iris,
  formula = Species ~ .,
  task = "auto",          # infer the task type from the response
  metric = NULL,          # let AutoML pick the metric
  cv_folds = 5,           # number of cross-validation folds
  time_budget = 120,      # seconds (2 minutes)
  use_reduction = TRUE,   # also try PCA preprocessing
  use_clustering = TRUE   # also add cluster features
)

Task Type Detection

AutoML automatically detects the task type:

# Factor response (Species): detected as classification
result_auto <- tl_auto_ml(
  data = iris,
  formula = Species ~ .,
  task = "auto"
)

# Numeric response (mpg): detected as regression
result_auto_reg <- tl_auto_ml(
  data = mtcars,
  formula = mpg ~ .,
  task = "auto"
)

Controlling the Search

Time Budget

# Fast pass: cap the search at 30 seconds
quick_result <- tl_auto_ml(
  data = iris,
  formula = Species ~ .,
  time_budget = 30
)

# Long pass: allow up to 600 seconds (10 minutes)
thorough_result <- tl_auto_ml(
  data = iris,
  formula = Species ~ .,
  time_budget = 600
)

Feature Engineering Options

# Turn off the dimensionality-reduction candidates
no_reduction <- tl_auto_ml(
  data = iris,
  formula = Species ~ .,
  use_reduction = FALSE,
  time_budget = 60
)

# Turn off the cluster-feature candidates
no_clustering <- tl_auto_ml(
  data = iris,
  formula = Species ~ .,
  use_clustering = FALSE,
  time_budget = 60
)

# Both switches off: only the baseline models remain
baseline_only <- tl_auto_ml(
  data = iris,
  formula = Species ~ .,
  use_reduction = FALSE,
  use_clustering = FALSE,
  time_budget = 30
)

Cross-Validation Settings

# 10 folds: better performance estimate, but slower
result_cv <- tl_auto_ml(
  data = iris,
  formula = Species ~ .,
  cv_folds = 10,
  time_budget = 120
)

# 3 folds: faster evaluation
result_fast <- tl_auto_ml(
  data = iris,
  formula = Species ~ .,
  cv_folds = 3,
  time_budget = 60
)

Understanding Results

Accessing Models

result <- tl_auto_ml(
  data = iris,
  formula = Species ~ .,
  time_budget = 60
)

# The single best performer
best_model <- result$best_model

# The full named list of fitted models
all_models <- result$models

# Individual entries are picked out of that list by name
baseline_logistic <- all_models$baseline_logistic
pca_forest <- all_models$pca_forest

Leaderboard

# Take the comparison table and order it by descending performance
leaderboard <- arrange(result$leaderboard, desc(performance))

print(leaderboard)

Making Predictions

# Illustrative rows to score: one observation from each iris species.
# `new_data` was previously used without being defined; replace it with your
# own data frame containing the same predictor columns.
new_data <- iris[c(1, 51, 101), ]

# Use best model for predictions
predictions <- predict(result$best_model, new_data = new_data)

# Or use a specific model
# (the pca_forest entry exists only if that candidate was fitted in this run)
predictions_pca <- predict(result$models$pca_forest, new_data = new_data)

Practical Examples

Example 1: Iris Classification

# Train/test split of iris (prop = 0.7, stratified by Species, fixed seed)
split <- tl_split(iris, prop = 0.7, stratify = "Species", seed = 123)

# Search on the training portion only
automl_iris <- tl_auto_ml(
  data = split$train,
  formula = Species ~ .,
  time_budget = 90,
  cv_folds = 5
)

# Score the winning model on the held-out rows
test_preds <- predict(automl_iris$best_model, new_data = split$test)
test_hits <- test_preds$.pred == split$test$Species
test_accuracy <- mean(test_hits)

cat("AutoML Test Accuracy:", round(100 * test_accuracy, 1), "%\n")

# Report held-out accuracy for every fitted candidate
for (candidate in names(automl_iris$models)) {
  candidate_preds <- predict(
    automl_iris$models[[candidate]],
    new_data = split$test
  )
  candidate_acc <- mean(candidate_preds$.pred == split$test$Species)
  cat(candidate, ":", round(100 * candidate_acc, 1), "%\n")
}

Example 2: MPG Prediction

# Train/test split of mtcars (prop = 0.7, fixed seed)
split_mtcars <- tl_split(mtcars, prop = 0.7, seed = 42)

# Regression search on the training portion
automl_mpg <- tl_auto_ml(
  data = split_mtcars$train,
  formula = mpg ~ .,
  task = "regression",
  time_budget = 90
)

# Root-mean-squared error of the winning model on the held-out rows
test_preds_mpg <- predict(automl_mpg$best_model, new_data = split_mtcars$test)
mpg_errors <- test_preds_mpg$.pred - split_mtcars$test$mpg
rmse <- sqrt(mean(mpg_errors^2))

cat("AutoML Test RMSE:", round(rmse, 2), "\n")

Example 3: Custom Preprocessing + AutoML

# Preprocess data first
# The prepared training frame is read from processed$data below.
processed <- tl_prepare_data(
  split$train,
  Species ~ .,
  scale_method = "standardize",
  remove_correlated = TRUE
)

# Run AutoML on preprocessed data
automl_processed <- tl_auto_ml(processed$data, Species ~ .,
                              time_budget = 60)

# Note: Need to apply same preprocessing to test data
# NOTE(review): this call prepares split$test on its own and does not pass
# remove_correlated = TRUE, unlike the training call above. Presumably the
# scaling statistics are then derived from the test rows and the resulting
# columns may differ from those the model was trained on -- confirm against
# the tl_prepare_data() documentation, and prefer reusing the training-set
# transformation if the package offers a way to do so.
test_processed <- tl_prepare_data(
  split$test,
  Species ~ .,
  scale_method = "standardize"
)

# Predict with the model trained on the preprocessed training data
test_preds_proc <- predict(automl_processed$best_model,
                           new_data = test_processed$data)

Comparing AutoML with Manual Selection

# Hand-picked baseline: fit a single forest model and score it on the test set
manual_model <- tl_model(split$train, Species ~ ., method = "forest")
manual_preds <- predict(manual_model, new_data = split$test)
manual_hits <- manual_preds$.pred == split$test$Species
manual_acc <- mean(manual_hits)

# AutoML: let the search choose the model, then score it the same way
automl_model <- tl_auto_ml(
  data = split$train,
  formula = Species ~ .,
  time_budget = 60
)
automl_preds <- predict(automl_model$best_model, new_data = split$test)
automl_hits <- automl_preds$.pred == split$test$Species
automl_acc <- mean(automl_hits)

cat("Manual Selection:", round(100 * manual_acc, 1), "%\n")
cat("AutoML:", round(100 * automl_acc, 1), "%\n")

Advanced AutoML Strategies

Strategy 1: Iterative AutoML

# First pass: quick exploration
quick_automl <- tl_auto_ml(split$train, Species ~ .,
                          time_budget = 30,
                          use_reduction = TRUE,
                          use_clustering = FALSE)

# Analyze what worked
best_approach <- quick_automl$best_model$spec$method

# Name of the top leaderboard entry. The leaderboard's `model` column holds
# the names used in quick_automl$models (e.g. "pca_forest"). Testing
# names(quick_automl$best_model) instead only inspects the model object's own
# field names, so it can never match "pca".
# Assumes a higher `performance` is better, as for classification accuracy.
quick_ranking <- quick_automl$leaderboard %>%
  arrange(desc(performance))
best_name <- as.character(quick_ranking$model)[1]

# Second pass: focus on promising approaches
# isTRUE() keeps the condition a single TRUE/FALSE even when the leaderboard
# is empty or has no `model` column.
if (isTRUE(grepl("pca", best_name, fixed = TRUE))) {
  # If PCA worked well, focus on dimensionality reduction
  refined_automl <- tl_auto_ml(split$train, Species ~ .,
                              time_budget = 60,
                              use_reduction = TRUE,
                              use_clustering = TRUE)
} else {
  # Otherwise keep the first-pass result so refined_automl is always defined
  refined_automl <- quick_automl
}

Strategy 2: Ensemble of AutoML Models

# Get top 3 models
top_models <- automl_iris$leaderboard %>%
  arrange(desc(performance)) %>%
  head(3)

# Model names as plain strings: a factor used inside `[[` would index the
# model list by integer code rather than by name.
top_names <- as.character(top_models$model)
if (length(top_names) == 0) {
  stop("The leaderboard is empty; there are no models to ensemble.",
       call. = FALSE)
}

# Make predictions with each
# Labels are converted to character because cbind() on factors keeps only the
# integer codes, which would make the votes "1", "2", ... instead of class
# names and break the comparison with split$test$Species.
ensemble_preds <- lapply(top_names, function(model_name) {
  model <- automl_iris$models[[model_name]]
  as.character(predict(model, new_data = split$test)$.pred)
})

# Majority vote for classification
# One row per test observation, one column per model. Ties go to the label
# that sorts first, because which.max() returns the first maximum of table().
vote_matrix <- do.call(cbind, ensemble_preds)
final_pred <- apply(vote_matrix, 1, function(x) {
  names(which.max(table(x)))
})

ensemble_acc <- mean(final_pred == split$test$Species)
cat("Ensemble Accuracy:", round(ensemble_acc * 100, 1), "%\n")

Performance Metrics

Classification Metrics

# Select models by accuracy, stated explicitly through `metric`
result_class <- tl_auto_ml(
  data = iris,
  formula = Species ~ .,
  metric = "accuracy",
  time_budget = 60
)

Regression Metrics

# Select models by RMSE, stated explicitly through `metric`
result_reg <- tl_auto_ml(
  data = mtcars,
  formula = mpg ~ .,
  metric = "rmse",
  time_budget = 60
)

Best Practices

Set appropriate time budget: Start with 60-120 seconds for initial exploration
Use cross-validation: More folds give better estimates but take longer
Preprocess when needed: Handle missing values before AutoML
Split your data: Always evaluate on held-out test data
Examine multiple models: The “best” model may not always be robust
Consider ensemble approaches: Combine top models for better performance
Monitor time vs performance: Longer searches don’t always yield better results

When to Use AutoML

Good use cases:

Quick prototyping and baseline establishment
When you’re unsure which algorithm to use
Feature engineering exploration
Benchmark for manual approaches
Limited ML expertise

Consider manual selection when:

You have domain knowledge about the best approach
Interpretability is critical
You need fine-grained control over hyperparameters
Computational resources are very limited

Troubleshooting

AutoML runs too slowly

# Template: `data` and `formula` stand for your own data frame and formula.

# Option 1: shrink the time budget
quick_result <- tl_auto_ml(data = data, formula = formula, time_budget = 30)

# Option 2: use fewer cross-validation folds
fast_result <- tl_auto_ml(data = data, formula = formula, cv_folds = 3)

# Option 3: switch off the PCA and cluster-feature candidates
baseline_result <- tl_auto_ml(
  data = data,
  formula = formula,
  use_reduction = FALSE,
  use_clustering = FALSE
)

Not enough models tried

# Template: `data` and `formula` stand for your own data frame and formula.

# Give the search more time (300 seconds)
thorough_result <- tl_auto_ml(data = data, formula = formula, time_budget = 300)

# Make sure the PCA and cluster-feature candidates are switched on
full_result <- tl_auto_ml(
  data = data,
  formula = formula,
  use_reduction = TRUE,
  use_clustering = TRUE
)

Summary

tidylearn’s AutoML provides:

Automated model selection across multiple algorithms
Feature engineering with PCA and clustering
Cross-validation for robust performance estimates
Easy model comparison through the leaderboard
Flexible configuration for different scenarios
Integrated workflows that combine supervised and unsupervised learning

# End-to-end run: split, search, then score on the held-out rows
workflow_split <- tl_split(iris, prop = 0.7, stratify = "Species", seed = 123)

automl_result <- tl_auto_ml(
  data = workflow_split$train,
  formula = Species ~ .,
  task = "auto",
  cv_folds = 5,
  time_budget = 120,
  use_reduction = TRUE,
  use_clustering = TRUE
)

# Share of held-out rows the winning model labels correctly
final_preds <- predict(automl_result$best_model, new_data = workflow_split$test)
final_hits <- final_preds$.pred == workflow_split$test$Species
final_accuracy <- mean(final_hits)

cat("Final AutoML Accuracy:", round(100 * final_accuracy, 1), "%\n")
cat("Best approach:", automl_result$best_model$spec$method, "\n")

AutoML makes machine learning accessible and efficient, allowing you to quickly find good solutions while learning which approaches work best for your data.