Hi,
I'm trying to get information about a prediction for a single observation from an XGBoost model developed with tidymodels, using the DALEX package.
The workflow I've developed is somewhat convoluted, but in summary I'm having issues calling predict_parts()
with my model explainer and am unsure what the error refers to. I've tried converting the single new_observation
to a matrix (the exact call I tried is shown at the end of the code below), but this doesn't remove the error:
Error in xgb.DMatrix(newdata, missing = missing, nthread = NVL(object$params[["nthread"]], :
REAL() can only be applied to a 'numeric', not a 'list'
Any suggestions on a fix would be greatly appreciated. Below is the code I'm using, based on the Ames housing data:
# INSTALL AND LOAD PACKAGES -----------------------------------------------
if (!require("pacman")) install.packages("pacman")
pacman::p_load(
  tidymodels,
  tidyverse,
  doParallel,
  janitor,
  AmesHousing,
  vip,
  randomForest,
  baguette,
  bonsai,
  finetune,
  lightgbm,
  DALEXtra,
  SHAPforxgboost
)
# LOAD THE HOUSING DATA AND CLEAN NAMES
ames_data <- make_ames() %>%
  janitor::clean_names() %>%
  mutate(sale_price_log = log10(sale_price))
# SPLIT INTO TRAINING AND TESTING DATA, STRATIFIED BY SALE PRICE
ames_split <- rsample::initial_split(
  ames_data %>% select(-sale_price),
  prop = 0.8,
  strata = sale_price_log
)
# CREATE TRAINING AND TESTING OBJECTS FROM THE SPLIT OBJECT
ames_train <- training(ames_split)
ames_test <- testing(ames_split)
# CREATE RESAMPLES TO CHOOSE AND COMPARE MODELS
set.seed(234)
ames_folds <- vfold_cv(ames_train, strata = sale_price_log, v = 10)
base_rec <- recipe(sale_price_log ~ ., data = ames_train) %>%
  # LOG-TRANSFORM GR_LIV_AREA TO ADDRESS SKEWNESS (SALE_PRICE WAS LOGGED ABOVE)
  step_log(gr_liv_area, base = 10) %>%
  # CREATE DUMMY VARIABLES FROM FACTOR COLUMNS
  step_dummy(all_nominal_predictors(), one_hot = TRUE)
normalise_rec <- recipe(sale_price_log ~ ., data = ames_train) %>%
  # REMOVE NOMINAL PREDICTORS WITH NEAR-ZERO VARIANCE
  step_nzv(all_nominal_predictors()) %>%
  # POOL RARE FACTOR LEVELS INTO "OTHER" TO IMPROVE MODEL ROBUSTNESS
  step_other(all_nominal_predictors(), threshold = 0.05, other = "OTHER") %>%
  # STABILIZE VARIANCE AND NORMALIZE DISTRIBUTIONS OF ALL NUMERIC PREDICTORS
  step_YeoJohnson(all_numeric_predictors()) %>%
  # SCALE ALL NUMERIC PREDICTORS SO THEY ARE ON A SIMILAR SCALE
  step_normalize(all_numeric_predictors()) %>%
  # CREATE DUMMY VARIABLES FROM FACTOR COLUMNS
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  # REMOVE ANY COLUMNS WITH A SINGLE UNIQUE VALUE
  step_zv(all_predictors()) # %>%
  # OPTIONAL: SPLINES ON LATITUDE AND LONGITUDE TO CAPTURE NON-LINEAR RELATIONSHIPS
  # step_ns(latitude, longitude, deg_free = tune())
# PCA RECIPE
pca_rec <- recipe(sale_price_log ~ ., data = ames_train) %>%
  # ASSIGN UNSEEN FACTOR LEVELS TO A NEW LEVEL CALLED "NEW"
  step_novel(all_nominal_predictors()) %>%
  # CREATE DUMMY VARIABLES FROM FACTOR COLUMNS
  step_dummy(all_nominal_predictors()) %>%
  # REMOVE ANY COLUMNS WITH A SINGLE UNIQUE VALUE
  step_zv(all_predictors()) %>%
  # NORMALIZE ALL NUMERIC PREDICTORS SO THEY ARE ON A SIMILAR SCALE
  step_normalize(all_numeric_predictors()) %>%
  # CONVERT PREDICTORS TO PRINCIPAL COMPONENTS RETAINING 95% OF VARIANCE
  step_pca(all_predictors(), threshold = 0.95)
# DEFINE A BAGGED DECISION TREE MODEL
bagged_spec <- bag_tree(
  tree_depth = tune(),
  min_n = tune(),
  cost_complexity = tune()
) %>%
  set_mode("regression") %>%
  set_engine("rpart", times = 25L)
# DEFINE A RANGER RANDOM FOREST MODEL
rf_spec <-
  rand_forest(
    mtry = tune(),
    min_n = tune(),
    trees = 500
  ) %>%
  set_engine("ranger") %>%
  set_mode("regression")
# DEFINE AN XGBOOST MODEL
xgb_spec <- boost_tree(
  trees = 500,
  tree_depth = tune(),
  min_n = tune(),
  loss_reduction = tune(),
  sample_size = tune(),
  mtry = tune(),
  learn_rate = tune()
) %>%
  set_engine("xgboost", importance = TRUE) %>%
  set_mode("regression")
# DEFINE A LIGHTGBM BOOSTED TREE MODEL
bt_spec <-
  boost_tree(
    learn_rate = tune(),
    stop_iter = tune(),
    trees = 500
  ) %>%
  set_engine("lightgbm", num_leaves = tune()) %>%
  set_mode("regression")
# BUILD A WORKFLOW SET CROSSING EVERY RECIPE WITH EVERY MODEL
wflw_set <-
  workflow_set(
    preproc = list(base = base_rec, normalise = normalise_rec, pca = pca_rec),
    models = list(xgb = xgb_spec, bagged = bagged_spec, rf = rf_spec, bt = bt_spec),
    cross = TRUE
  )
# UPDATE MTRY PARAMETER FOR THE BASE XGB MODEL
base_xgb_param <- wflw_set %>%
  extract_workflow(id = "base_xgb") %>%
  hardhat::extract_parameter_set_dials() %>%
  update(mtry = mtry(c(1, 308)))
# UPDATE MTRY PARAMETER FOR THE BASE RF MODEL
base_rf_param <- wflw_set %>%
  extract_workflow(id = "base_rf") %>%
  hardhat::extract_parameter_set_dials() %>%
  update(mtry = mtry(c(1, 308)))
# UPDATE MTRY PARAMETER FOR THE NORMALISED XGB MODEL
normalise_xgb_param <- wflw_set %>%
  extract_workflow(id = "normalise_xgb") %>%
  hardhat::extract_parameter_set_dials() %>%
  update(mtry = mtry(c(1, 284)))
# UPDATE MTRY PARAMETER FOR THE NORMALISED RF MODEL
normalise_rf_param <- wflw_set %>%
  extract_workflow(id = "normalise_rf") %>%
  hardhat::extract_parameter_set_dials() %>%
  update(mtry = mtry(c(1, 284)))
# UPDATE MTRY PARAMETER FOR THE PCA XGB MODEL
pca_xgb_param <- wflw_set %>%
  extract_workflow(id = "pca_xgb") %>%
  hardhat::extract_parameter_set_dials() %>%
  update(mtry = mtry(c(1, 5)))
# UPDATE MTRY PARAMETER FOR THE PCA RF MODEL
pca_rf_param <- wflw_set %>%
  extract_workflow(id = "pca_rf") %>%
  hardhat::extract_parameter_set_dials() %>%
  update(mtry = mtry(c(1, 5)))
# UPDATE THE WORKFLOW SET WITH THE NEW PARAMETERS
wf_set_tune_list_finalize <- wflw_set %>%
  option_add(param_info = base_xgb_param, id = "base_xgb") %>%
  option_add(param_info = base_rf_param, id = "base_rf") %>%
  option_add(param_info = normalise_xgb_param, id = "normalise_xgb") %>%
  option_add(param_info = normalise_rf_param, id = "normalise_rf") %>%
  option_add(param_info = pca_xgb_param, id = "pca_xgb") %>%
  option_add(param_info = pca_rf_param, id = "pca_rf")
# SET THE CONTROL OPTIONS FOR RACE TUNING
race_ctrl <-
  control_race(
    save_pred = TRUE,
    parallel_over = "everything",
    save_workflow = TRUE
  )
# APPLY RACE ANOVA TUNING TO EACH WORKFLOW IN THE WORKFLOW SET
tictoc::tic()
race_results <- wf_set_tune_list_finalize %>%
  workflow_map(
    "tune_race_anova",
    seed = 123,
    resamples = ames_folds,
    grid = 30,
    control = race_ctrl,
    verbose = TRUE
  )
tictoc::toc()
# EXTRACT THE BEST RESULTS FOR THE BASE XGB WORKFLOW
best_results <-
  race_results %>%
  extract_workflow_set_result("base_xgb") %>%
  select_best(metric = "rmse")
# FINALIZE THE WORKFLOW AND FIT IT ON THE FULL TRAINING SET
final_fit <-
  race_results %>%
  extract_workflow("base_xgb") %>%
  finalize_workflow(best_results) %>%
  last_fit(split = ames_split, metrics = metric_set(rmse, rsq))
# FEATURES TO PASS TO THE EXPLAINER
vip_features <- c(
  "gr_liv_area", "total_bsmt_sf", "year_built",
  "fireplaces", "bldg_type", "year_remod_add"
)
vip_train <- ames_train %>%
  select(all_of(vip_features))
# BUILD A DALEX EXPLAINER FOR THE FITTED XGB MODEL
explainer_xgb <- explain_tidymodels(
  model = final_fit %>% extract_fit_engine(),
  data = vip_train,
  y = ames_train$sale_price_log,
  label = "xgb",
  verbose = FALSE
)
# PICK A SINGLE OBSERVATION TO EXPLAIN
one_fam <- vip_train[200, ]
xgb_breakdown <- predict_parts(
  explainer = explainer_xgb,
  new_observation = one_fam,
  type = "shap",
  B = 5
)
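For completeness, this is the matrix conversion mentioned above that I also tried. It's just a sketch of the same predict_parts() call with the single row coerced via as.matrix(), and it produces the same error:
# ATTEMPTED WORKAROUND: COERCE THE SINGLE ROW TO A MATRIX FIRST
# NOTE: as.matrix() ON A DATA FRAME WITH FACTOR COLUMNS (E.G. BLDG_TYPE)
# PRODUCES A CHARACTER MATRIX, SO THE ERROR PERSISTS
one_fam_matrix <- as.matrix(one_fam)
xgb_breakdown <- predict_parts(
  explainer = explainer_xgb,
  new_observation = one_fam_matrix,
  type = "shap",
  B = 5
)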