XGBoost: Bayesian optimisation using nested cross-validation (tidymodels)

Can someone please explain how to conduct Bayesian optimization using nested cross-validation with the tidymodels packages?

``````r
library(mlbench)
library(dplyr)
library(magrittr)
library(tidyverse)
library(tidymodels)

#' Simulate a Friedman 1 benchmark data set
#'
#' Calls `mlbench.friedman1()` with noise `sd = 1`, binds the predictor
#' matrix and the response side by side, and returns them as a data frame.
#'
#' @param n Number of observations to simulate.
#' @return A data frame with `n` rows: the predictor columns followed by the
#'   outcome, which is stored in the last column and named `y`.
sim_data <- function(n) {
  sim <- mlbench.friedman1(n, sd = 1)
  # `[[` extraction is used instead of the Markdown-escaped `\$`, which is
  # not valid R syntax.
  out <- as.data.frame(cbind(sim[["x"]], sim[["y"]]))
  names(out)[ncol(out)] <- "y"
  out
}

# Simulated data: a small training set and a much larger set drawn from the
# same generator.
set.seed(9815)
train_dat <- sim_data(n = 100)
large_dat <- sim_data(n = 10^5)

## NESTED CV
# Outer loop: 10-fold cross-validation repeated 100 times.
# Inner loop: 25 bootstrap resamples.
# The result is bound to the same name as the rsample function that built it.
set.seed(14)
nested_cv <- nested_cv(
  data = train_dat,
  outside = vfold_cv(v = 10, repeats = 100),
  inside = bootstraps(times = 25)
)

## inner loop
# XGBoost specification with every listed hyperparameter marked for tuning.
# The outcome `y` produced by sim_data() is numeric, so the model mode must
# be "regression"; a classification spec cannot be fitted to this outcome.
xgb_inner <-
  boost_tree(
    trees = tune(),
    learn_rate = tune(),
    tree_depth = tune(),
    min_n = tune(),
    loss_reduction = tune(),
    sample_size = tune(),
    mtry = tune()
  ) %>%
  set_mode("regression") %>%
  set_engine("xgboost")

## recipe
# No preprocessing steps: `y` is the outcome, every other column a predictor.
model_recipe <- recipe(y ~ ., data = train_dat)

#' Fit one XGBoost model for a single hyperparameter combination
#'
#' @param params A data frame or list with elements `trees`, `learn_rate`,
#'   `tree_depth`, `min_n`, `loss_reduction`, `sample_size` and `mtry`.
#'   Only the first value of each element is used.
#' @param analysis_set A data frame holding the outcome `y` and the
#'   predictors the model is fitted on.
#' @return The object returned by `fit()` for the `boost_tree()`
#'   specification with the "xgboost" engine.
xgb_FUN <- function(params, analysis_set) {
  # `[[` extraction replaces the Markdown-escaped `\$`, which does not parse.
  trees <- params[["trees"]][[1]]
  learn_rate <- params[["learn_rate"]][[1]]
  tree_depth <- params[["tree_depth"]][[1]]
  min_n <- params[["min_n"]][[1]]
  loss_reduction <- params[["loss_reduction"]][[1]]
  sample_size <- params[["sample_size"]][[1]]
  mtry <- params[["mtry"]][[1]]

  # `y` is a numeric outcome, so the model is fitted in regression mode.
  boost_tree(
    mode = "regression",
    trees = trees,
    learn_rate = learn_rate,
    tree_depth = tree_depth,
    min_n = min_n,
    loss_reduction = loss_reduction,
    sample_size = sample_size,
    mtry = mtry
  ) %>%
    set_engine("xgboost") %>%
    fit(y ~ ., data = analysis_set)
}

# Parameter set describing the search space for the tuned hyperparameters.
# `parameters()` accepts only `param` objects, so no `size` argument is
# passed here.
xgb_grid <- parameters(
  trees(range = c(500L, 1500L)),
  learn_rate(),
  tree_depth(),
  min_n(),
  loss_reduction(),
  # Named so the parameter id is `sample_size`, matching the model argument.
  sample_size = sample_prop(),
  # The upper bound of mtry depends on the number of predictors, so it is
  # finalized against the predictor columns of the training data
  # (every column except the outcome `y`).
  finalize(mtry(), train_dat[setdiff(names(train_dat), "y")])
)

# Named lists keyed by model label, pairing each parameter set with the
# function that fits that model.
params_list <- list(xgb = xgb_grid)
mod_FUN_list <- list(xgb = xgb_FUN)
``````

What do I need to do next? How do I create a function to run Bayesian optimisation?

Session info:

``````
R version 4.1.2 (2021-11-01)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 22.04.4 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/atlas/libblas.so.3.10.3
LAPACK: /usr/lib/x86_64-linux-gnu/atlas/liblapack.so.3.10.3

locale:
[1] LC_CTYPE=en_IE.UTF-8       LC_NUMERIC=C               LC_TIME=en_IE.UTF-8        LC_COLLATE=en_IE.UTF-8     LC_MONETARY=en_IE.UTF-8
[6] LC_MESSAGES=en_IE.UTF-8    LC_PAPER=en_IE.UTF-8       LC_NAME=C                  LC_ADDRESS=C               LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_IE.UTF-8 LC_IDENTIFICATION=C

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base

other attached packages:
[1] themis_1.0.2       yardstick_1.3.1    workflowsets_1.1.0 workflows_1.1.4    tune_1.2.1         rsample_1.2.1      recipes_1.0.10     parsnip_1.2.1
[9] modeldata_1.3.0    infer_1.0.7        dials_1.2.1        scales_1.3.0       broom_1.0.6        tidymodels_1.2.0   lubridate_1.9.3    forcats_1.0.0
[17] stringr_1.5.1      purrr_1.0.2        readr_2.1.5        tidyr_1.3.1        tibble_3.2.1       ggplot2_3.5.1      tidyverse_2.0.0    magrittr_2.0.3
[25] dplyr_1.1.4        mlbench_2.1-5

loaded via a namespace (and not attached):
[1] minqa_1.2.7         colorspace_2.1-0    class_7.3-22        htmlTable_2.4.2     base64enc_0.1-3     rstudioapi_0.16.0   mice_3.16.0
[8] listenv_0.9.1       furrr_0.3.1         prodlim_2023.08.28  fansi_1.0.6         codetools_0.2-18    splines_4.1.2       mnormt_2.1.1
[15] knitr_1.47          Formula_1.2-5       jsonlite_1.8.8      nloptr_2.0.3        cluster_2.1.2       finalfit_1.0.7      compiler_4.1.2
[22] backports_1.5.0     Matrix_1.6-5        fastmap_1.2.0       cli_3.6.2           htmltools_0.5.8.1   tools_4.1.2         gtable_0.3.5
[29] glue_1.7.0          Rcpp_1.0.12         DiceDesign_1.10     vctrs_0.6.5         nlme_3.1-155        iterators_1.0.14    psych_2.4.3
[36] timeDate_4032.109   gower_1.0.1         xfun_0.44           globals_0.16.3      lme4_1.1-35.3       timechange_0.3.0    lifecycle_1.0.4
[43] shapviz_0.9.3       future_1.33.2       pan_1.9             MASS_7.3-55         ipred_0.9-14        hms_1.1.3           parallel_4.1.2
[50] yaml_2.3.8          gridExtra_2.3       rpart_4.1.23        stringi_1.8.4       foreach_1.5.2       checkmate_2.3.1     lhs_1.1.6
[57] hardhat_1.4.0       boot_1.3-28         lava_1.8.0          shape_1.4.6.1       rlang_1.1.4         pkgconfig_2.0.3     evaluate_0.23
[64] lattice_0.22-6      htmlwidgets_1.6.4   tidyselect_1.2.1    parallelly_1.37.1   R6_2.5.1            generics_0.1.3      Hmisc_5.1-3
[71] mitml_0.4-5         pillar_1.9.0        foreign_0.8-86      withr_3.0.0         survival_3.2-13     nnet_7.3-17         ROSE_0.0-4
[78] future.apply_1.11.2 jomo_2.7-6          xgboost_1.7.7.1     utf8_1.2.4          tzdb_0.4.0          rmarkdown_2.27      grid_4.1.2
[85] data.table_1.15.4   digest_0.6.35       GPfit_1.0-8         munsell_0.5.1       glmnet_4.1-8

``````

I followed the example provided on the official website (link missing) and the scripts provided here (link missing), but I am completely lost!