Hi,
I'm trying to get information about a prediction for a single observation from an XGBoost model developed with tidymodels, using the DALEX package.
The workflow I've developed is somewhat convoluted, but in summary I'm having issues calling predict_parts()
with my model explainer and am unsure what the error refers to. I've tried converting the single new_observation
to a matrix (the exact call I tried is shown at the end of the code below), but this doesn't remove the error:
Error in xgb.DMatrix(newdata, missing = missing, nthread = NVL(object$params[["nthread"]], :
REAL() can only be applied to a 'numeric', not a 'list'
Any suggestions on a fix would be greatly appreciated. Below is the code I'm using, based on the Ames housing data:
# INSTALL AND LOAD PACKAGES -----------------------------------------------
if (!require("pacman")) install.packages("pacman")
pacman::p_load(
  tidymodels,
  tidyverse,
  doParallel,
  janitor,
  AmesHousing,
  vip,
  randomForest,
  baguette,
  bonsai,
  finetune,
  lightgbm,
  DALEXtra,
  SHAPforxgboost
)
# LOAD THE HOUSING DATA AND CLEAN NAMES
ames_data <- make_ames() %>%
  janitor::clean_names() %>%
  mutate(sale_price_log = log10(sale_price))
# SPLIT INTO TRAINING AND TESTING DATA, STRATIFIED BY SALE PRICE
ames_split <- rsample::initial_split(
  ames_data %>% select(-sale_price),
  prop = 0.8,
  strata = sale_price_log
)
# CREATE TRAINING AND TESTING OBJECTS FROM THE SPLIT OBJECT
ames_train <- training(ames_split)
ames_test <- testing(ames_split)
# CREATE RESAMPLES TO CHOOSE AND COMPARE MODELS
set.seed(234)
ames_folds <- vfold_cv(ames_train, strata = sale_price_log, v = 10)
base_rec <- recipe(sale_price_log ~ ., data = ames_train) %>%
  # LOG-TRANSFORM GR_LIV_AREA TO ADDRESS SKEWNESS (SALE_PRICE WAS LOGGED ABOVE)
  step_log(gr_liv_area, base = 10) %>%
  # CREATE DUMMY VARIABLES FROM FACTOR COLUMNS
  step_dummy(all_nominal_predictors(), one_hot = TRUE)
normalise_rec <- recipe(sale_price_log ~ ., data = ames_train) %>%
  # REMOVE NOMINAL PREDICTORS WITH NEAR-ZERO VARIANCE
  step_nzv(all_nominal_predictors()) %>%
  # POOL RARE FACTOR LEVELS INTO "OTHER" TO IMPROVE MODEL ROBUSTNESS
  step_other(all_nominal_predictors(), threshold = 0.05, other = "OTHER") %>%
  # STABILIZE VARIANCE AND NORMALIZE DISTRIBUTIONS OF ALL NUMERIC PREDICTORS
  step_YeoJohnson(all_numeric_predictors()) %>%
  # SCALE ALL NUMERIC PREDICTORS SO THEY ARE ON A SIMILAR SCALE
  step_normalize(all_numeric_predictors()) %>%
  # CREATE DUMMY VARIABLES FROM FACTOR COLUMNS
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  # REMOVE ANY COLUMNS WITH A SINGLE UNIQUE VALUE
  step_zv(all_predictors()) # %>%
  # OPTIONAL: SPLINES ON LATITUDE AND LONGITUDE TO CAPTURE NON-LINEAR RELATIONSHIPS
  # step_ns(latitude, longitude, deg_free = tune())
# PCA RECIPE
pca_rec <- recipe(sale_price_log ~ ., data = ames_train) %>%
  # ASSIGN UNSEEN FACTOR LEVELS TO A NEW LEVEL CALLED "NEW"
  step_novel(all_nominal_predictors()) %>%
  # CREATE DUMMY VARIABLES FROM FACTOR COLUMNS
  step_dummy(all_nominal_predictors()) %>%
  # REMOVE ANY COLUMNS WITH A SINGLE UNIQUE VALUE
  step_zv(all_predictors()) %>%
  # NORMALIZE ALL NUMERIC PREDICTORS SO THEY ARE ON A SIMILAR SCALE
  step_normalize(all_numeric_predictors()) %>%
  # CONVERT PREDICTORS TO PRINCIPAL COMPONENTS RETAINING 95% OF VARIANCE
  step_pca(all_predictors(), threshold = 0.95)
# DEFINE A BAGGED DECISION TREE MODEL
bagged_spec <- bag_tree(
  tree_depth = tune(),
  min_n = tune(),
  cost_complexity = tune()
) %>%
  set_mode("regression") %>%
  set_engine("rpart", times = 25L)
# DEFINE A RANGER RANDOM FOREST MODEL
rf_spec <-
  rand_forest(
    mtry = tune(),
    min_n = tune(),
    trees = 500
  ) %>%
  set_engine("ranger") %>%
  set_mode("regression")
# DEFINE AN XGBOOST MODEL
xgb_spec <- boost_tree(
  trees = 500,
  tree_depth = tune(),
  min_n = tune(),
  loss_reduction = tune(),
  sample_size = tune(),
  mtry = tune(),
  learn_rate = tune()
) %>%
  set_engine("xgboost", importance = TRUE) %>%
  set_mode("regression")
# DEFINE A LIGHTGBM BOOSTED TREE MODEL
bt_spec <-
  boost_tree(
    learn_rate = tune(),
    stop_iter = tune(),
    trees = 500
  ) %>%
  set_engine("lightgbm", num_leaves = tune()) %>%
  set_mode("regression")
# BUILD A WORKFLOW SET CROSSING EVERY RECIPE WITH EVERY MODEL
wflw_set <-
  workflow_set(
    preproc = list(base = base_rec, normalise = normalise_rec, pca = pca_rec),
    models = list(xgb = xgb_spec, bagged = bagged_spec, rf = rf_spec, bt = bt_spec),
    cross = TRUE
  )
# UPDATE MTRY PARAMETER FOR THE BASE XGB MODEL
base_xgb_param <- wflw_set %>%
  extract_workflow(id = "base_xgb") %>%
  hardhat::extract_parameter_set_dials() %>%
  update(mtry = mtry(c(1, 308)))
# UPDATE MTRY PARAMETER FOR THE BASE RF MODEL
base_rf_param <- wflw_set %>%
  extract_workflow(id = "base_rf") %>%
  hardhat::extract_parameter_set_dials() %>%
  update(mtry = mtry(c(1, 308)))
# UPDATE MTRY PARAMETER FOR THE NORMALISED XGB MODEL
normalise_xgb_param <- wflw_set %>%
  extract_workflow(id = "normalise_xgb") %>%
  hardhat::extract_parameter_set_dials() %>%
  update(mtry = mtry(c(1, 284)))
# UPDATE MTRY PARAMETER FOR THE NORMALISED RF MODEL
normalise_rf_param <- wflw_set %>%
  extract_workflow(id = "normalise_rf") %>%
  hardhat::extract_parameter_set_dials() %>%
  update(mtry = mtry(c(1, 284)))
# UPDATE MTRY PARAMETER FOR THE PCA XGB MODEL
pca_xgb_param <- wflw_set %>%
  extract_workflow(id = "pca_xgb") %>%
  hardhat::extract_parameter_set_dials() %>%
  update(mtry = mtry(c(1, 5)))
# UPDATE MTRY PARAMETER FOR THE PCA RF MODEL
pca_rf_param <- wflw_set %>%
  extract_workflow(id = "pca_rf") %>%
  hardhat::extract_parameter_set_dials() %>%
  update(mtry = mtry(c(1, 5)))
# UPDATE THE WORKFLOW SET WITH THE NEW PARAMETERS
wf_set_tune_list_finalize <- wflw_set %>%
  option_add(param_info = base_xgb_param, id = "base_xgb") %>%
  option_add(param_info = base_rf_param, id = "base_rf") %>%
  option_add(param_info = normalise_xgb_param, id = "normalise_xgb") %>%
  option_add(param_info = normalise_rf_param, id = "normalise_rf") %>%
  option_add(param_info = pca_xgb_param, id = "pca_xgb") %>%
  option_add(param_info = pca_rf_param, id = "pca_rf")
# SET THE CONTROL OPTIONS FOR RACE TUNING
race_ctrl <-
  control_race(
    save_pred = TRUE,
    parallel_over = "everything",
    save_workflow = TRUE
  )
# APPLY RACE ANOVA TUNING TO EACH WORKFLOW IN THE WORKFLOW SET
tictoc::tic()
race_results <- wf_set_tune_list_finalize %>%
  workflow_map(
    "tune_race_anova",
    seed = 123,
    resamples = ames_folds,
    grid = 30,
    control = race_ctrl,
    verbose = TRUE
  )
tictoc::toc()
# EXTRACT THE BEST RESULTS FOR THE BASE XGB WORKFLOW
best_results <-
  race_results %>%
  extract_workflow_set_result("base_xgb") %>%
  select_best(metric = "rmse")
# FINALIZE THE WORKFLOW AND FIT IT ON THE FULL TRAINING SET
final_fit <-
  race_results %>%
  extract_workflow("base_xgb") %>%
  finalize_workflow(best_results) %>%
  last_fit(split = ames_split, metrics = metric_set(rmse, rsq))
# FEATURES TO PASS TO THE EXPLAINER
vip_features <- c(
  "gr_liv_area", "total_bsmt_sf", "year_built",
  "fireplaces", "bldg_type", "year_remod_add"
)
vip_train <- ames_train %>%
  select(all_of(vip_features))
# BUILD A DALEX EXPLAINER FOR THE FITTED XGB MODEL
explainer_xgb <- explain_tidymodels(
  model = final_fit %>% extract_fit_engine(),
  data = vip_train,
  y = ames_train$sale_price_log,
  label = "xgb",
  verbose = FALSE
)
# PICK A SINGLE OBSERVATION TO EXPLAIN
one_fam <- vip_train[200, ]
xgb_breakdown <- predict_parts(
  explainer = explainer_xgb,
  new_observation = one_fam,
  type = "shap",
  B = 5
)
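For completeness, this is the matrix conversion mentioned above that I also tried. It's just a sketch of the same predict_parts() call with the single row coerced via as.matrix(), and it produces the same error:
# ATTEMPTED WORKAROUND: COERCE THE SINGLE ROW TO A MATRIX FIRST
# NOTE: as.matrix() ON A DATA FRAME WITH FACTOR COLUMNS (E.G. BLDG_TYPE)
# PRODUCES A CHARACTER MATRIX, SO THE ERROR PERSISTS
one_fam_matrix <- as.matrix(one_fam)
xgb_breakdown <- predict_parts(
  explainer = explainer_xgb,
  new_observation = one_fam_matrix,
  type = "shap",
  B = 5
)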