Tidymodels Case Weights

michaelgloven · March 10, 2024, 3:50am

I'm trying to use case weights (importance) and get an error during resampling (res) which I cannot resolve:

→ A | error: Can't subset columns with case_wts.
case_wts must be numeric or character, not a <hardhat_importance_weights/hardhat_case_weights/vctrs_vctr> object.
There were issues with some computations A: x9
Warning: All models failed. Run show_notes(.Last.tune.result) for more information.

Any guidance is appreciated

library(tidyverse)
library(tidymodels)
library(yardstick)

# Reprex that produces the "Can't subset columns with case_wts" error quoted
# above. Toy data: outcome y1, nominal predictor y2, numeric predictors x3-x5
# and a raw numeric weight per row.
y1  <- c("T", "T", "T", "T", "T", "T", "F", "F", "F", "T")
y2  <- c("A", "B", "A", "B", "A", "B", "A", "B", "C", "D")
x3  <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
x4  <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
x5  <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
wts <- c(1,1,1,1,1,1,.1,.1,.1,1)
d   <- data.frame(y1, y2, x3, x4, x5, wts)

# case_wts is a standalone importance-weights vector; it is never stored in d,
# whose wts column stays a plain numeric.
case_wts <- importance_weights(as.numeric(d$wts))

# Preprocessing: dummy-encode nominal predictors, then center and scale the
# numeric predictors.
rec <- recipe(y1~., data = d) %>%                                      
      step_dummy(all_nominal_predictors(), one_hot = T, trained = F, skip = F) %>%
      step_center(all_numeric_predictors(), skip = F) %>%
      step_scale(all_numeric_predictors(), skip = F)

spec <- boost_tree() %>% set_mode("classification") %>% set_engine("xgboost")

# Names of the model's "main" tunable parameters.
param_mod   <- tunable(spec) %>% filter(component_id == "main") %>% pull(name)
# NOTE(review): param_mod1 (the reduced parameter list) is computed but never
# used below; param_tune is built from the full param_mod.
param_mod1  <- param_mod[! param_mod %in% c("learn_rate", "mtry" , "min_n" , "loss_reduction" ,"sample_size" , "stop_iter" )]  
# Named list of the form list(<param> = tune(), ...), one entry per parameter.
param_tune <- purrr::map(param_mod, ~ set_names(list(tune()), .x)) %>% reduce(c)

update_spec  <- spec %>% update(param_tune)                
p            <- extract_parameter_set_dials(update_spec)  
grid         <- grid_regular(finalize(p, d), levels = 2, original = T) 

# NOTE(review): add_case_weights() is given the standalone case_wts vector here,
# not a column of d. The quoted error ("Can't subset columns with case_wts")
# is raised during resampling and refers to this object.
wf      <- workflow() %>% add_model(update_spec) %>% add_recipe(rec) %>% add_case_weights(case_wts)
cv      <- vfold_cv(d, v = 3, repeats = 3, strata = NULL, breaks = 1)
metrics <- metric_set(yardstick::roc_auc, yardstick::accuracy, yardstick::sensitivity)
ctr     <- control_grid(verbose = FALSE, save_pred = TRUE, event_level = "second")
res     <- wf %>% tune_grid(resamples = cv, grid = grid, metrics = metrics, control = ctr)

# Pick the best candidate from the tuning results and refit on the full data.
b   <- select_best(res)                           
m   <- finalize_workflow(wf, b) %>% fit(d)

nirgrahamuk · March 11, 2024, 12:32pm

Some changes I made:

I added libraries that I seemed to need when working through this:

library(lightgbm)
library(bonsai)

Before making the recipe from d, convert the weights column to case weights:

d$wts <- importance_weights(as.numeric(d$wts))

rec <- recipe(y1~., data = d) %>%      #  .... etc.

The wf creation becomes:

wf      <- workflow() %>% add_model(update_spec) %>% add_recipe(rec) %>% add_case_weights(wts)

However, we then reach a point that we can't get past: lightgbm currently does not support case weights.
Case weights are not enabled by the underlying model implementation.

michaelgloven · March 11, 2024, 2:09pm

Thanks. I moved the case weights argument and changed the engine to "xgboost". The updated code above now makes it to "res", but I get the same error when it attempts to complete the resamples:

→ A | error: Can't subset columns with case_wts.
case_wts must be numeric or character, not a <hardhat_importance_weights/hardhat_case_weights/vctrs_vctr> object.
There were issues with some computations A: x9
Warning: All models failed. Run show_notes(.Last.tune.result) for more information.

nirgrahamuk · March 11, 2024, 3:23pm

Try this?

library(tidyverse)
library(tidymodels)
library(yardstick)
library(xgboost)

# Reprex: tune an xgboost classifier with importance case weights.
# Toy data: outcome y1, nominal predictor y2, numeric predictors x3-x5 and a
# raw numeric weight per row.
y1  <- c("T", "T", "T", "T", "T", "T", "F", "F", "F", "T")
y2  <- c("A", "B", "A", "B", "A", "B", "A", "B", "C", "D")
x3  <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
x4  <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
x5  <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
wts <- c(1, 1, 1, 1, 1, 1, .1, .1, .1, 1)
d   <- data.frame(y1, y2, x3, x4, x5, wts)

# Convert the weights column *inside* d to importance weights, so the workflow
# can refer to it by column name in add_case_weights(wts) below.
d$wts <- importance_weights(as.numeric(d$wts))

# Preprocessing: dummy-encode nominal predictors, then center and scale the
# numeric predictors. TRUE/FALSE are spelled out because T and F are ordinary
# variables that can be reassigned.
rec <- recipe(y1 ~ ., data = d) %>%
  step_dummy(
    all_nominal_predictors(),
    one_hot = TRUE, trained = FALSE, skip = FALSE
  ) %>%
  step_center(all_numeric_predictors(), skip = FALSE) %>%
  step_scale(all_numeric_predictors(), skip = FALSE)

spec <- boost_tree() %>%
  set_mode("classification") %>%
  set_engine("xgboost")

# Names of the model's "main" tunable parameters.
param_mod <- tunable(spec) %>%
  filter(component_id == "main") %>%
  pull(name)

# NOTE(review): param_mod1 (the reduced parameter list) is computed but never
# used below; param_tune is built from the full param_mod. Kept as-is so the
# tuning grid is unchanged.
param_mod1 <- param_mod[
  !param_mod %in% c(
    "learn_rate", "mtry", "min_n", "loss_reduction", "sample_size", "stop_iter"
  )
]

# Named list of the form list(<param> = tune(), ...), one entry per parameter.
param_tune <- purrr::map(param_mod, ~ set_names(list(tune()), .x)) %>%
  reduce(c)

update_spec <- spec %>% update(param_tune)
p           <- extract_parameter_set_dials(update_spec)
grid        <- grid_regular(finalize(p, d), levels = 2, original = TRUE)

# The case weights are referenced by the bare column name `wts` from d.
wf <- workflow() %>%
  add_model(update_spec) %>%
  add_recipe(rec) %>%
  add_case_weights(wts)

cv      <- vfold_cv(d, v = 3, repeats = 3, strata = NULL, breaks = 1)
metrics <- metric_set(
  yardstick::roc_auc, yardstick::accuracy, yardstick::sensitivity
)
ctr     <- control_grid(
  verbose = FALSE, save_pred = TRUE, event_level = "second"
)

res <- wf %>%
  tune_grid(resamples = cv, grid = grid, metrics = metrics, control = ctr)

# Pick the best candidate from the tuning results and refit on the full data.
b <- select_best(res)
m <- finalize_workflow(wf, b) %>% fit(d)

michaelgloven · March 11, 2024, 3:58pm

With your help I think I have a solution that works. I still need to do more testing with my real use case (not the reprex), but it looks like it is enough to prepare my data.frame with wts <- importance_weights(c(10,10,10,10,10,10,.1,.1,.1,10)), pass it into the recipe as usual, and then use add_case_weights(wts) in the workflow; this gives me a working model that takes the case weights into account. Here is the code that works for me:

# Working reprex: tune an xgboost classifier with importance case weights.
# Toy data: outcome y1, nominal predictor y2, numeric predictors x3-x5. The
# weights are created as importance weights up front, so the wts column of d
# already has the case-weights class.
y1  <- c("T", "T", "T", "T", "T", "T", "F", "F", "F", "T")
y2  <- c("A", "B", "A", "B", "A", "B", "A", "B", "C", "D")
x3  <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
x4  <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
x5  <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
wts <- importance_weights(c(10, 10, 10, 10, 10, 10, .1, .1, .1, 10))
d   <- data.frame(y1, y2, x3, x4, x5, wts)

# Preprocessing: dummy-encode nominal predictors, then center and scale the
# numeric predictors. TRUE/FALSE are spelled out because T and F are ordinary
# variables that can be reassigned.
rec <- recipe(y1 ~ ., data = d) %>%
  step_dummy(
    all_nominal_predictors(),
    one_hot = TRUE, trained = FALSE, skip = FALSE
  ) %>%
  step_center(all_numeric_predictors(), skip = FALSE) %>%
  step_scale(all_numeric_predictors(), skip = FALSE)

spec <- boost_tree() %>%
  set_mode("classification") %>%
  set_engine("xgboost")

# Names of the model's "main" tunable parameters.
param_mod <- tunable(spec) %>%
  filter(component_id == "main") %>%
  pull(name)

# NOTE(review): param_mod1 (the reduced parameter list) is computed but never
# used below; param_tune is built from the full param_mod. Kept as-is so the
# tuning grid is unchanged.
param_mod1 <- param_mod[
  !param_mod %in% c(
    "learn_rate", "mtry", "min_n", "loss_reduction", "sample_size", "stop_iter"
  )
]

# Named list of the form list(<param> = tune(), ...), one entry per parameter.
param_tune <- purrr::map(param_mod, ~ set_names(list(tune()), .x)) %>%
  reduce(c)

update_spec <- spec %>% update(param_tune)
p           <- extract_parameter_set_dials(update_spec)
grid        <- grid_regular(finalize(p, d), levels = 2, original = TRUE)

# The case weights are referenced by the bare column name `wts` from d.
wf <- workflow() %>%
  add_model(update_spec) %>%
  add_recipe(rec) %>%
  add_case_weights(wts)

cv      <- vfold_cv(d, v = 3, repeats = 1, strata = NULL, breaks = 1)
metrics <- metric_set(
  yardstick::roc_auc, yardstick::accuracy, yardstick::sensitivity
)
ctr     <- control_grid(verbose = TRUE, save_pred = TRUE)

res <- wf %>%
  tune_grid(resamples = cv, grid = grid, metrics = metrics, control = ctr)

# Pick the best candidate from the tuning results and refit on the full data.
b <- select_best(res)
m <- finalize_workflow(wf, b) %>% fit(d)

# Class probabilities for the training rows. Note that this overwrites `p`,
# which previously held the parameter set used to build the grid.
p <- predict(m, d, type = "prob")

system · March 18, 2024, 3:58pm

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.