Can someone please explain how to conduct Bayesian optimization using nested cross-validation with the tidymodels packages?
library(mlbench)
library(dplyr)
library(magrittr)
library(tidyverse)
library(tidymodels)
sim_data <- function(n) {
  tmp <- mlbench.friedman1(n, sd = 1)
  tmp <- cbind(tmp$x, tmp$y)
  tmp <- as.data.frame(tmp)
  names(tmp)[ncol(tmp)] <- "y"
  tmp
}
set.seed(9815)
train_dat <- sim_data(100)
large_dat <- sim_data(10^5)
## NESTED CV
set.seed(14)
nested_cv <- nested_cv(train_dat,
                       outside = vfold_cv(v = 10, repeats = 100),
                       inside = bootstraps(times = 25))
## inner loop
xgb_inner <-
  boost_tree(
    trees = tune(),
    learn_rate = tune(),
    tree_depth = tune(),
    min_n = tune(),
    loss_reduction = tune(),
    sample_size = tune(),
    mtry = tune()
  ) %>%
  set_mode("regression") %>%  # mlbench.friedman1() has a continuous outcome
  set_engine("xgboost")
## recipe
model_recipe <- recipe(y ~ ., data = train_dat)
## fit an xgboost model on an analysis set using one row of hyperparameters
xgb_FUN <- function(params, analysis_set) {
  trees <- params$trees[[1]]
  learn_rate <- params$learn_rate[[1]]
  tree_depth <- params$tree_depth[[1]]
  min_n <- params$min_n[[1]]
  loss_reduction <- params$loss_reduction[[1]]
  sample_size <- params$sample_size[[1]]
  mtry <- params$mtry[[1]]
  boost_tree(mode = "regression", trees = trees, learn_rate = learn_rate,
             tree_depth = tree_depth, min_n = min_n, loss_reduction = loss_reduction,
             sample_size = sample_size, mtry = mtry) %>%
    set_engine("xgboost") %>%
    fit(y ~ ., data = analysis_set)
}
## parameter set (mtry has to be finalized against the predictors);
## the number of candidates is set later, e.g. via the initial/iter arguments of tune_bayes()
xgb_params <- parameters(
  trees(range = c(500, 1500)),
  learn_rate(),
  tree_depth(),
  min_n(),
  loss_reduction(),
  sample_size = sample_prop(),
  finalize(mtry(), select(train_dat, -y))
)
params_list <- list(xgb = xgb_params)
mod_FUN_list <- list(xgb = xgb_FUN)
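As a sanity check, I think the individual pieces can be exercised on a single inner bootstrap like this (my own test code with a size-1 latin hypercube draw, not taken from the tutorial):
## fit one candidate on the analysis set of the first inner bootstrap
one_candidate <- grid_latin_hypercube(xgb_params, size = 1)
one_split <- nested_cv$inner_resamples[[1]]$splits[[1]]
xgb_FUN(one_candidate, analysis(one_split))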
What do I need to do next? How do I create a function to run Bayesian optimization on the inner resamples?
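From reading the tune_bayes() documentation, my guess is that the inner loop should pass each outer split's 25 bootstraps to tune_bayes() and keep the best hyperparameters, roughly like the sketch below. The function inner_bayes() and the initial/iter values are placeholders of my own, and I have not been able to confirm that this is the intended pattern:
## rough, untested idea of the inner Bayesian optimization loop
xgb_wflow <- workflow() %>%
  add_recipe(model_recipe) %>%
  add_model(xgb_inner)

inner_bayes <- function(inner_resamples) {
  res <- tune_bayes(
    xgb_wflow,
    resamples = inner_resamples,     # 25 bootstraps of one outer analysis set
    param_info = xgb_params,
    metrics = metric_set(rmse),
    initial = 10,                    # placeholder: size of the initial design
    iter = 25,                       # placeholder: number of search iterations
    control = control_bayes(no_improve = 10, verbose = FALSE)
  )
  select_best(res, metric = "rmse")  # best hyperparameters for this outer split
}

## would this give one tuned parameter set per outer split?
tuned_params <- purrr::map(nested_cv$inner_resamples, inner_bayes)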
Session info:
R version 4.1.2 (2021-11-01)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 22.04.4 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/atlas/libblas.so.3.10.3
LAPACK: /usr/lib/x86_64-linux-gnu/atlas/liblapack.so.3.10.3
locale:
[1] LC_CTYPE=en_IE.UTF-8 LC_NUMERIC=C LC_TIME=en_IE.UTF-8 LC_COLLATE=en_IE.UTF-8 LC_MONETARY=en_IE.UTF-8
[6] LC_MESSAGES=en_IE.UTF-8 LC_PAPER=en_IE.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_IE.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] themis_1.0.2 yardstick_1.3.1 workflowsets_1.1.0 workflows_1.1.4 tune_1.2.1 rsample_1.2.1 recipes_1.0.10 parsnip_1.2.1
[9] modeldata_1.3.0 infer_1.0.7 dials_1.2.1 scales_1.3.0 broom_1.0.6 tidymodels_1.2.0 lubridate_1.9.3 forcats_1.0.0
[17] stringr_1.5.1 purrr_1.0.2 readr_2.1.5 tidyr_1.3.1 tibble_3.2.1 ggplot2_3.5.1 tidyverse_2.0.0 magrittr_2.0.3
[25] dplyr_1.1.4 mlbench_2.1-5
loaded via a namespace (and not attached):
[1] minqa_1.2.7 colorspace_2.1-0 class_7.3-22 htmlTable_2.4.2 base64enc_0.1-3 rstudioapi_0.16.0 mice_3.16.0
[8] listenv_0.9.1 furrr_0.3.1 prodlim_2023.08.28 fansi_1.0.6 codetools_0.2-18 splines_4.1.2 mnormt_2.1.1
[15] knitr_1.47 Formula_1.2-5 jsonlite_1.8.8 nloptr_2.0.3 cluster_2.1.2 finalfit_1.0.7 compiler_4.1.2
[22] backports_1.5.0 Matrix_1.6-5 fastmap_1.2.0 cli_3.6.2 htmltools_0.5.8.1 tools_4.1.2 gtable_0.3.5
[29] glue_1.7.0 Rcpp_1.0.12 DiceDesign_1.10 vctrs_0.6.5 nlme_3.1-155 iterators_1.0.14 psych_2.4.3
[36] timeDate_4032.109 gower_1.0.1 xfun_0.44 globals_0.16.3 lme4_1.1-35.3 timechange_0.3.0 lifecycle_1.0.4
[43] shapviz_0.9.3 future_1.33.2 pan_1.9 MASS_7.3-55 ipred_0.9-14 hms_1.1.3 parallel_4.1.2
[50] yaml_2.3.8 gridExtra_2.3 rpart_4.1.23 stringi_1.8.4 foreach_1.5.2 checkmate_2.3.1 lhs_1.1.6
[57] hardhat_1.4.0 boot_1.3-28 lava_1.8.0 shape_1.4.6.1 rlang_1.1.4 pkgconfig_2.0.3 evaluate_0.23
[64] lattice_0.22-6 htmlwidgets_1.6.4 tidyselect_1.2.1 parallelly_1.37.1 R6_2.5.1 generics_0.1.3 Hmisc_5.1-3
[71] mitml_0.4-5 pillar_1.9.0 foreign_0.8-86 withr_3.0.0 survival_3.2-13 nnet_7.3-17 ROSE_0.0-4
[78] future.apply_1.11.2 jomo_2.7-6 xgboost_1.7.7.1 utf8_1.2.4 tzdb_0.4.0 rmarkdown_2.27 grid_4.1.2
[85] data.table_1.15.4 digest_0.6.35 GPfit_1.0-8 munsell_0.5.1 glmnet_4.1-8
I followed the example provided on the official website and the scripts provided here, but I am completely lost!
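In case it clarifies what I am aiming for, this is how I imagined the outer loop consuming the tuned parameters, refitting on each outer analysis set and scoring the assessment set (again untested, and outer_rmse() is just my own placeholder):
## guess at the outer loop: refit with the tuned parameters, score each assessment set
outer_rmse <- function(split, best_params) {
  final_wf <- finalize_workflow(xgb_wflow, best_params)
  fit_outer <- fit(final_wf, data = analysis(split))
  preds <- predict(fit_outer, new_data = assessment(split))
  rmse_vec(truth = assessment(split)$y, estimate = preds$.pred)
}
outer_scores <- purrr::map2_dbl(nested_cv$splits, tuned_params, outer_rmse)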