I am trying to stack multiple models for a binary classification case. Some models require factors to be turned into dummies while other do not, hence me using different recipes during preprocessing.
Despite the models fitting just fine, I can not seem to stack them due to their differing recipes, as stacks raises an error when trying to use add_candidates()
whenever candidates are to be added with different recipes:
Error:
! It seems like the new candidate member 'tune_list$lightgbm$tuning_results' doesn't make use of the same resampling object as the existing candidates.
Is this expected behaviour? I would expect such a stack to be possible. I tried replicating my code as closely as possible with this minimal example below:
library(tidymodels)
library(stacks)
library(bonsai)
library(lightgbm)
#> Loading required package: R6
#>
#> Attaching package: 'lightgbm'
#> The following object is masked from 'package:dplyr':
#>
#> slice
df <- credit_data |>
as_tibble() |>
tibble::rowid_to_column("id")
tune_wflow <- function(model_data,
model_type) {
# train/test split
train_test_split <- rsample::initial_split(model_data, strata = "Marital")
train_set <- rsample::training(train_test_split)
# set up recipe
base_rec <- recipes::recipe(Status ~ ., data = train_set) |>
recipes::update_role(id, new_role = "id") |>
recipes::update_role(Marital, new_role = "strata") |>
# remove NA
recipes::step_naomit(recipes::all_predictors())
rec <- if (model_type == "xgboost") {
base_rec |>
recipes::step_dummy(recipes::all_nominal_predictors()) |>
# convert remaining non-numeric predictors
recipes::step_mutate(
dplyr::across(tidyselect:::where(is.logical), as.double)
)
} else if (model_type == "lightgbm") {
base_rec
}
# set up model parameters
mod <- if (model_type == "xgboost") {
parsnip::boost_tree(
min_n = tune::tune(),
mtry = tune::tune(),
tree_depth = tune::tune(),
learn_rate = tune::tune(),
sample_size = tune::tune()
) |>
parsnip::set_engine("xgboost") |>
parsnip::set_mode("classification")
} else if (model_type == "lightgbm") {
parsnip::boost_tree(
min_n = tune::tune(),
mtry = tune::tune(),
tree_depth = tune::tune(),
learn_rate = tune::tune(),
sample_size = tune::tune()
) |>
parsnip::set_engine("lightgbm") |>
parsnip::set_mode("classification")
}
# bind recipe and model specs to workflow
wflow <- workflows::workflow() |>
workflows::add_recipe(rec) |>
workflows::add_model(mod)
# set up classification metrics
metrics <- yardstick::metric_set(
yardstick::accuracy,
yardstick::roc_auc)
# set up parallelization
cl <- parallel::makePSOCKcluster(parallel::detectCores())
doParallel::registerDoParallel(cl)
tune <- tune::tune_grid(
wflow,
resamples = rsample::vfold_cv(
train_set,
v = 5,
strata = "Marital"),
grid = 5,
metrics = metrics,
control = stacks::control_stack_grid()
)
closeAllConnections()
# return tuned results
return(
list(tuning_results = tune)
)
}
# fit ----
models <- c("xgboost", "lightgbm")
tune_list <- list()
# fitting loop
for (model in models) {
tuning_results <- tune_wflow(df, model_type = model)
tune_list[[model]] <- tuning_results
}
# stacking ----
mystack <-
stacks() |>
add_candidates(tune_list$xgboost$tuning_results) |>
add_candidates(tune_list$lightgbm$tuning_results)
#> Error:
#> ! It seems like the new candidate member 'tune_list$lightgbm$tuning_results' doesn't make use of the same resampling object as the existing candidates.
#> Run `rlang::last_error()` to see where the error occurred.
Created on 2022-11-21 with reprex v2.0.2