tidymodels: error when predicting on new data with xgboost model

julia · July 25, 2020, 5:05pm

If you have used one of the tune_* functions to find the best parameters for your model and then finalized your workflow with those parameters, the next step is to train or fit that workflow one more time on the whole training set. Let's walk through an example.

library(tidymodels)
#> ── Attaching packages ────────────────────────────────────── tidymodels 0.1.1 ──
#> ✓ broom     0.7.0      ✓ recipes   0.1.13
#> ✓ dials     0.0.8      ✓ rsample   0.0.7 
#> ✓ dplyr     1.0.0      ✓ tibble    3.0.3 
#> ✓ ggplot2   3.3.2      ✓ tidyr     1.1.0 
#> ✓ infer     0.5.3      ✓ tune      0.1.1 
#> ✓ modeldata 0.0.2      ✓ workflows 0.1.2 
#> ✓ parsnip   0.1.2      ✓ yardstick 0.0.7 
#> ✓ purrr     0.3.4
#> ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
#> x purrr::discard() masks scales::discard()
#> x dplyr::filter()  masks stats::filter()
#> x dplyr::lag()     masks stats::lag()
#> x recipes::step()  masks stats::step()

## stand-in for your own training data
data("hpc_data")

## boosted-tree classifier: the number of trees is fixed, while tree depth
## and minimum node size are flagged with tune() so they can be tuned later
xgb_spec <- boost_tree(
  mode = "classification",
  trees = 1000,
  tree_depth = tune(),
  min_n = tune()
) %>%
  set_engine("xgboost")

## cross-validation folds, stratified on the outcome
hpc_folds <- hpc_data %>%
  vfold_cv(strata = class)

## five candidate combinations of the two tuning parameters
xgb_grid <- grid_latin_hypercube(tree_depth(), min_n(), size = 5)

## bundle the model specification with the formula preprocessor
xgb_wf <- workflow() %>%
  add_model(xgb_spec) %>%
  add_formula(class ~ .)

xgb_wf
#> ══ Workflow ════════════════════════════════════════════════════════════════════
#> Preprocessor: Formula
#> Model: boost_tree()
#> 
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> class ~ .
#> 
#> ── Model ───────────────────────────────────────────────────────────────────────
#> Boosted Tree Model Specification (classification)
#> 
#> Main Arguments:
#>   trees = 1000
#>   min_n = tune()
#>   tree_depth = tune()
#> 
#> Computational engine: xgboost

## register a parallel backend, then fix the RNG seed before tuning
doParallel::registerDoParallel()
set.seed(123)

## evaluate every grid candidate on every resample
xgb_res <- xgb_wf %>%
  tune_grid(resamples = hpc_folds, grid = xgb_grid)

xgb_res
#> # Tuning results
#> # 10-fold cross-validation using stratification 
#> # A tibble: 10 x 4
#>    splits             id     .metrics          .notes          
#>    <list>             <chr>  <list>            <list>          
#>  1 <split [3.9K/434]> Fold01 <tibble [10 × 6]> <tibble [0 × 1]>
#>  2 <split [3.9K/434]> Fold02 <tibble [10 × 6]> <tibble [0 × 1]>
#>  3 <split [3.9K/434]> Fold03 <tibble [10 × 6]> <tibble [0 × 1]>
#>  4 <split [3.9K/434]> Fold04 <tibble [10 × 6]> <tibble [0 × 1]>
#>  5 <split [3.9K/434]> Fold05 <tibble [10 × 6]> <tibble [0 × 1]>
#>  6 <split [3.9K/434]> Fold06 <tibble [10 × 6]> <tibble [0 × 1]>
#>  7 <split [3.9K/433]> Fold07 <tibble [10 × 6]> <tibble [0 × 1]>
#>  8 <split [3.9K/432]> Fold08 <tibble [10 × 6]> <tibble [0 × 1]>
#>  9 <split [3.9K/431]> Fold09 <tibble [10 × 6]> <tibble [0 × 1]>
#> 10 <split [3.9K/431]> Fold10 <tibble [10 × 6]> <tibble [0 × 1]>

Next, let's finalize this workflow and then fit() it to the training data. (The tuning process used the training data, but that was to find the best model parameters, not to train the model itself.)

## Pick the best candidate by ROC AUC. `metric` is passed by name because
## newer tune releases no longer accept it as a positional argument.
best_params <- select_best(xgb_res, metric = "roc_auc")

## Plug the chosen parameters into the workflow, then fit the finalized
## workflow on the training data.
trained_wf <- xgb_wf %>%
  finalize_workflow(best_params) %>%
  fit(data = hpc_data)

trained_wf
#> ══ Workflow [trained] ══════════════════════════════════════════════════════════
#> Preprocessor: Formula
#> Model: boost_tree()
#> 
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> class ~ .
#> 
#> ── Model ───────────────────────────────────────────────────────────────────────
#> ##### xgb.Booster
#> raw: 2 Mb 
#> call:
#>   xgboost::xgb.train(params = list(eta = 0.3, max_depth = 3L, gamma = 0, 
#>     colsample_bytree = 1, min_child_weight = 9L, subsample = 1), 
#>     data = x, nrounds = 1000, watchlist = wlist, verbose = 0, 
#>     objective = "multi:softprob", num_class = 4L, nthread = 1)
#> params (as set within xgb.train):
#>   eta = "0.3", max_depth = "3", gamma = "0", colsample_bytree = "1", min_child_weight = "9", subsample = "1", objective = "multi:softprob", num_class = "4", nthread = "1", validate_parameters = "TRUE"
#> xgb.attributes:
#>   niter
#> callbacks:
#>   cb.evaluation.log()
#> # of features: 26 
#> niter: 1000
#> nfeatures : 26 
#> evaluation_log:
#>     iter training_merror
#>        1        0.320942
#>        2        0.301778
#> ---                     
#>      999        0.010390
#>     1000        0.010390

Now let's say we have some brand new data. You can predict() with new data on the trained workflow.

## take a single row and drop the outcome column to mimic unseen data
brand_new_data <- hpc_data %>%
  slice(5) %>%
  select(-class)
brand_new_data
#> # A tibble: 1 x 7
#>   protocol compounds input_fields iterations num_pending  hour day  
#>   <fct>        <dbl>        <dbl>      <dbl>       <dbl> <dbl> <fct>
#> 1 E              100           82         20           0  10.4 Fri

## predict with the fitted workflow (not the bare model)
trained_wf %>%
  predict(new_data = brand_new_data)
#> # A tibble: 1 x 1
#>   .pred_class
#>   <fct>      
#> 1 VF

Created on 2020-07-17 by the reprex package (v0.3.0)

The thing to remember is that if you fit the workflow, then you need to predict on the workflow. If you want to predict on a model (vs. a workflow), you can instead tune and fit a model specification on its own.