XGBoost: Bayesian optimisation using nested cross-validation (tidymodels)

Can someone please explain how to conduct Bayesian optimization using nested cross-validation with the tidymodels packages?

library(mlbench)
library(dplyr)
library(magrittr)
library(tidyverse)
library(tidymodels)

sim_data <- function(n) {
  tmp <- mlbench.friedman1(n, sd = 1)
  tmp <- cbind(tmp$x, tmp$y)
  tmp <- as.data.frame(tmp)
  names(tmp)[ncol(tmp)] <- "y"
  tmp
}

set.seed(9815)
train_dat <- sim_data(100)
large_dat <- sim_data(10^5)

## NESTED CV
set.seed(14)
nested_cv <- nested_cv(train_dat, 
                       outside = vfold_cv(v = 10, repeats = 100),
                       inside = bootstraps(times = 25)) 
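## Each row of nested_cv is one outer fold; its 25 inner bootstraps sit in the
## inner_resamples list-column (my reading of ?nested_cv), e.g.:
nested_cv$splits[[1]]          # first outer analysis/assessment split
nested_cv$inner_resamples[[1]] # bootstraps of that split's analysis set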

## inner loop
## y from mlbench.friedman1 is continuous, so this is a regression problem
xgb_inner <-
  boost_tree(
    trees = tune(),
    learn_rate = tune(),
    tree_depth = tune(),
    min_n = tune(),
    loss_reduction = tune(),
    sample_size = tune(),
    mtry = tune()
  ) %>%
  set_mode("regression") %>%
  set_engine("xgboost")

## recipe
model_recipe <- recipe(y ~ ., data = train_dat)
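## My assumption is that tune_bayes() wants the recipe and the model spec
## bundled into a workflow, so I also built one (xgb_wflow is my own name):
xgb_wflow <-
  workflow() %>%
  add_recipe(model_recipe) %>%
  add_model(xgb_inner)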

xgb_FUN <- function(params, analysis_set) {
  trees <- params$trees[[1]]
  learn_rate <- params$learn_rate[[1]]
  tree_depth <- params$tree_depth[[1]]
  min_n <- params$min_n[[1]]
  loss_reduction <- params$loss_reduction[[1]]
  sample_size <- params$sample_size[[1]]
  mtry <- params$mtry[[1]]
  boost_tree(mode = "regression", trees = trees, learn_rate = learn_rate,
             tree_depth = tree_depth, min_n = min_n,
             loss_reduction = loss_reduction, sample_size = sample_size,
             mtry = mtry) %>%
    set_engine("xgboost") %>%
    fit(y ~ ., data = analysis_set)
}
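## Quick sanity check of xgb_FUN on the first inner bootstrap, with one
## hand-made row of parameter values (numbers picked arbitrarily by me):
test_params <- tibble(trees = 1000, learn_rate = 0.05, tree_depth = 6,
                      min_n = 10, loss_reduction = 0, sample_size = 0.8,
                      mtry = 5)
test_fit <- xgb_FUN(test_params,
                    analysis(nested_cv$inner_resamples[[1]]$splits[[1]]))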

## dials parameter set for Bayesian optimisation; mtry has to be finalised
## against the predictors, and size belongs to the grid_*() helpers (or to
## tune_bayes(initial = ...)) rather than to parameters(), so it is dropped
xgb_grid <- parameters(
  trees(range = c(500, 1500)),
  learn_rate(),
  tree_depth(),
  min_n(),
  loss_reduction(),
  sample_size = sample_prop(),
  finalize(mtry(), train_dat %>% select(-y))
)

params_list <- list(xgb = xgb_grid)
mod_FUN_list <- list(xgb = xgb_FUN)

What do I need to do next? How do I create a function to run Bayesian optimisation?
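My rough understanding is that, for each outer fold, I should run tune_bayes() on that fold's inner bootstraps, pick the best hyperparameters, refit on the outer analysis set, and score the outer assessment set. This is the sketch I have in mind (completely untested; bayes_one_fold and outer_rmse are my own names, and it relies on the xgb_wflow and xgb_grid objects above), but I have no idea whether it is even close to right:

bayes_one_fold <- function(outer_split, inner_resamples) {
  ## Bayesian optimisation on the inner bootstraps of this outer fold
  bayes_res <- tune_bayes(
    xgb_wflow,
    resamples  = inner_resamples,
    param_info = xgb_grid,
    initial    = 10,
    iter       = 25,
    metrics    = metric_set(rmse),
    control    = control_bayes(no_improve = 10, verbose = FALSE)
  )
  ## refit the best candidate on the outer analysis set ...
  best_params <- select_best(bayes_res, metric = "rmse")
  final_fit <- finalize_workflow(xgb_wflow, best_params) %>%
    fit(data = analysis(outer_split))
  ## ... and score it on the outer assessment set
  preds <- predict(final_fit, new_data = assessment(outer_split))
  rmse_vec(truth = assessment(outer_split)$y, estimate = preds$.pred)
}

outer_rmse <- map2_dbl(nested_cv$splits, nested_cv$inner_resamples, bayes_one_fold)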

Session info:

R version 4.1.2 (2021-11-01)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 22.04.4 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/atlas/libblas.so.3.10.3
LAPACK: /usr/lib/x86_64-linux-gnu/atlas/liblapack.so.3.10.3

locale:
 [1] LC_CTYPE=en_IE.UTF-8       LC_NUMERIC=C               LC_TIME=en_IE.UTF-8        LC_COLLATE=en_IE.UTF-8     LC_MONETARY=en_IE.UTF-8   
 [6] LC_MESSAGES=en_IE.UTF-8    LC_PAPER=en_IE.UTF-8       LC_NAME=C                  LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_IE.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] themis_1.0.2       yardstick_1.3.1    workflowsets_1.1.0 workflows_1.1.4    tune_1.2.1         rsample_1.2.1      recipes_1.0.10     parsnip_1.2.1     
 [9] modeldata_1.3.0    infer_1.0.7        dials_1.2.1        scales_1.3.0       broom_1.0.6        tidymodels_1.2.0   lubridate_1.9.3    forcats_1.0.0     
[17] stringr_1.5.1      purrr_1.0.2        readr_2.1.5        tidyr_1.3.1        tibble_3.2.1       ggplot2_3.5.1      tidyverse_2.0.0    magrittr_2.0.3    
[25] dplyr_1.1.4        mlbench_2.1-5     

loaded via a namespace (and not attached):
 [1] minqa_1.2.7         colorspace_2.1-0    class_7.3-22        htmlTable_2.4.2     base64enc_0.1-3     rstudioapi_0.16.0   mice_3.16.0        
 [8] listenv_0.9.1       furrr_0.3.1         prodlim_2023.08.28  fansi_1.0.6         codetools_0.2-18    splines_4.1.2       mnormt_2.1.1       
[15] knitr_1.47          Formula_1.2-5       jsonlite_1.8.8      nloptr_2.0.3        cluster_2.1.2       finalfit_1.0.7      compiler_4.1.2     
[22] backports_1.5.0     Matrix_1.6-5        fastmap_1.2.0       cli_3.6.2           htmltools_0.5.8.1   tools_4.1.2         gtable_0.3.5       
[29] glue_1.7.0          Rcpp_1.0.12         DiceDesign_1.10     vctrs_0.6.5         nlme_3.1-155        iterators_1.0.14    psych_2.4.3        
[36] timeDate_4032.109   gower_1.0.1         xfun_0.44           globals_0.16.3      lme4_1.1-35.3       timechange_0.3.0    lifecycle_1.0.4    
[43] shapviz_0.9.3       future_1.33.2       pan_1.9             MASS_7.3-55         ipred_0.9-14        hms_1.1.3           parallel_4.1.2     
[50] yaml_2.3.8          gridExtra_2.3       rpart_4.1.23        stringi_1.8.4       foreach_1.5.2       checkmate_2.3.1     lhs_1.1.6          
[57] hardhat_1.4.0       boot_1.3-28         lava_1.8.0          shape_1.4.6.1       rlang_1.1.4         pkgconfig_2.0.3     evaluate_0.23      
[64] lattice_0.22-6      htmlwidgets_1.6.4   tidyselect_1.2.1    parallelly_1.37.1   R6_2.5.1            generics_0.1.3      Hmisc_5.1-3        
[71] mitml_0.4-5         pillar_1.9.0        foreign_0.8-86      withr_3.0.0         survival_3.2-13     nnet_7.3-17         ROSE_0.0-4         
[78] future.apply_1.11.2 jomo_2.7-6          xgboost_1.7.7.1     utf8_1.2.4          tzdb_0.4.0          rmarkdown_2.27      grid_4.1.2         
[85] data.table_1.15.4   digest_0.6.35       GPfit_1.0-8         munsell_0.5.1       glmnet_4.1-8 
     

I followed the example provided on the official website:

and the scripts provided here:

but I am completely lost.