rfe error - logistic classification - "undefined columns selected" and "'match' requires vector arguments"

Max · February 28, 2019, 9:29pm

I had to tweak the recipe a bit:

library(caret)
#> Loading required package: lattice
#> Loading required package: ggplot2
library(recipes)
#> Loading required package: dplyr
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
#> 
#> Attaching package: 'recipes'
#> The following object is masked from 'package:stats':
#> 
#>     step

# Seed the random number generator so the train/test partition is repeatable.
set.seed(2624)
# Fraction of rows assigned to the training set.
percent <- 0.80

# Recode the outcome `am` as a factor before splitting.
mtcars <- mutate(mtcars, am = as.factor(am))

# `in_train` holds the rows chosen for training; the remaining rows are test.
in_train <- createDataPartition(mtcars[["am"]], p = percent, list = FALSE)
train_data <- mtcars[in_train, ]
test_data <- mtcars[-in_train, ]

# Preprocessing recipe for the logistic model. Steps from the original recipe
# that cannot work on this data are left commented out, in their original
# position, each with the reason it was disabled.
log_recipe <- recipe(am ~ ., data = train_data) %>%
  # Disabled: the selector matches nothing here, so the step fails.
  # step_other(all_nominal(), -all_outcomes(), threshold = 0.02, other = "other_assigned ") %>%
  #
  # Center and scale the numeric columns, then replace them with principal
  # components (the outcome is excluded from the PCA).
  step_center(all_numeric()) %>%
  step_scale(all_numeric()) %>%
  step_pca(all_numeric(), -all_outcomes(), num_comp = nrow(train_data)) %>%
  # Disabled: there are no variables to turn into dummies.
  # step_dummy(all_nominal(), -all_outcomes()) %>%
  # step_nzv(all_predictors()) %>%
  # Disabled: after step_pca(), step_corr() would not select anything.
  # step_corr(all_numeric()) %>%
  # step_lincomb(all_numeric()) %>%
  #
  # Drop rows with missing predictor values.
  step_naomit(all_predictors()) %>%
  # Convert the outcome to a factor inside the recipe; skip = TRUE so that
  # the step does not fail when predicting.
  step_mutate(am = as.factor(am), skip = TRUE)

# Estimate the recipe on the training set, then pull out the processed
# training data that rfe() is fit on below.
train_prepped <- juice(prep(log_recipe, train_data))

# Re-seed so the resampling carried out by rfe() starts from a known state.
set.seed(2624)

# Start from the `lrFuncs` helper list and swap in twoClassSummary as the
# summary function, so resamples are scored by ROC, Sens and Spec.
log.glmRFE <- lrFuncs
log.glmRFE[["summary"]] <- twoClassSummary

# 10-fold cross-validation repeated 5 times, keeping per-resample details.
log_ctrl <- rfeControl(
  functions = log.glmRFE,
  method = "repeatedcv",
  number = 10,
  repeats = 5,
  saveDetails = TRUE,
  verbose = FALSE
)


# Run recursive feature selection on the pre-processed training data.
# Predictors are every column of `train_prepped` except `am`; the outcome is
# `am`; subset sizes 1 through 10 are evaluated under `log_ctrl`.
# No metric is requested, so rfe() warns that 'Accuracy' is not produced by
# the summary function and uses 'ROC' instead (see the output below).
log_model <- 
  rfe(x = train_prepped %>% dplyr::select(-am),
      y = train_prepped$am,
      sizes = 1:10,
      rfeControl = log_ctrl)
#> Warning in rfe.default(x = train_prepped %>% dplyr::select(-am), y =
#> train_prepped$am, : Metric 'Accuracy' is not created by the summary
#> function; 'ROC' will be used instead
#> Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# <snip>

# Print the fitted object: resampled performance for each subset size, with
# `*` in the Selected column marking the chosen size.
log_model
#> 
#> Recursive feature selection
#> 
#> Outer resampling method: Cross-Validated (10 fold, repeated 5 times) 
#> 
#> Resampling performance over subset size:
#> 
#>  Variables    ROC Sens Spec   ROCSD SensSD  SpecSD Selected
#>          1 0.8150 0.74 0.50 0.33047 0.3676 0.49487         
#>          2 0.9650 0.97 0.94 0.17504 0.1568 0.23990         
#>          3 0.9650 0.97 0.96 0.17504 0.1568 0.19795         
#>          4 0.9650 0.92 0.96 0.17504 0.2548 0.19795         
#>          5 0.9650 0.90 0.97 0.15980 0.2673 0.15682         
#>          6 0.9625 0.87 0.97 0.15205 0.2998 0.15682         
#>          7 0.9900 0.90 0.99 0.07071 0.2673 0.07071        *
#>          8 0.9800 0.84 0.98 0.11112 0.3264 0.14142         
#>          9 0.9700 0.79 0.97 0.14846 0.3655 0.15682         
#>         10 0.9700 0.79 0.97 0.14846 0.3655 0.15682         
#> 
#> The top 5 variables (out of 7):
#>    PC02, PC01, PC03, PC10, PC07

Created on 2019-02-28 by the reprex package (v0.2.1)

If you want to be adventurous though... there is a branch of caret that has recipe integration with the feature selection routines. I'll probably release it to CRAN at the end of March.

It's a little tricky with steps that select or filter variables: the recipe is remade within each resample (as it should be), so one resample might not end up with the same set of variables as the others.

The new code would be:

library(caret)
#> Loading required package: lattice
#> Loading required package: ggplot2
library(recipes)
#> Loading required package: dplyr
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
#> 
#> Attaching package: 'recipes'
#> The following object is masked from 'package:stats':
#> 
#>     step

# Seed the random number generator so the train/test partition is repeatable.
set.seed(2624)
# Fraction of rows assigned to the training set.
percent <- 0.80

# Recode the outcome `am` as a factor before splitting.
mtcars <- mutate(mtcars, am = as.factor(am))

# `in_train` holds the rows chosen for training; the remaining rows are test.
in_train <- createDataPartition(mtcars[["am"]], p = percent, list = FALSE)
train_data <- mtcars[in_train, ]
test_data <- mtcars[-in_train, ]

# Preprocessing recipe for the logistic model. Steps from the original recipe
# that cannot work on this data are left commented out, in their original
# position, each with the reason it was disabled.
log_recipe <- recipe(am ~ ., data = train_data) %>%
  # Disabled: the selector matches nothing here, so the step fails.
  # step_other(all_nominal(), -all_outcomes(), threshold = 0.02, other = "other_assigned ") %>%
  #
  # Center and scale the numeric columns, then replace them with principal
  # components (the outcome is excluded from the PCA).
  step_center(all_numeric()) %>%
  step_scale(all_numeric()) %>%
  step_pca(all_numeric(), -all_outcomes(), num_comp = nrow(train_data)) %>%
  # Disabled: there are no variables to turn into dummies.
  # step_dummy(all_nominal(), -all_outcomes()) %>%
  # step_nzv(all_predictors()) %>%
  # Disabled: after step_pca(), step_corr() would not select anything.
  # step_corr(all_numeric()) %>%
  # step_lincomb(all_numeric()) %>%
  #
  # Drop rows with missing predictor values.
  step_naomit(all_predictors()) %>%
  # Convert the outcome to a factor inside the recipe; skip = TRUE so that
  # the step does not fail when predicting.
  step_mutate(am = as.factor(am), skip = TRUE)

# Re-seed so the resampling carried out by rfe() starts from a known state.
set.seed(2624)

# Start from the `lrFuncs` helper list and swap in twoClassSummary as the
# summary function, so resamples are scored by ROC, Sens and Spec.
log.glmRFE <- lrFuncs
log.glmRFE[["summary"]] <- twoClassSummary

# 10-fold cross-validation repeated 5 times, keeping per-resample details.
log_ctrl <- rfeControl(
  functions = log.glmRFE,
  method = "repeatedcv",
  number = 10,
  repeats = 5,
  saveDetails = TRUE,
  verbose = FALSE
)

# Recipe-based interface: pass the un-prepped recipe together with the raw
# training data instead of pre-processed x/y, so the recipe is remade within
# each resample. Subset sizes 1 through 10 are evaluated under `log_ctrl`.
# No metric is requested, so rfe() warns that 'Accuracy' is not produced by
# the summary function and uses 'ROC' instead (see the output below).
log_model <- 
  rfe(log_recipe, 
      data = train_data,
      sizes = 1:10,
      rfeControl = log_ctrl)
#> Warning in rfe.recipe(log_recipe, data = train_data, sizes = 1:10,
#> rfeControl = log_ctrl): Metric 'Accuracy' is not created by the summary
#> function; 'ROC' will be used instead
#> Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

#> Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# <snip>

# Print the fitted object: resampled performance for each subset size, with
# `*` in the Selected column marking the chosen size.
log_model
#> 
#> Recursive feature selection
#> 
#> Outer resampling method: Cross-Validated (10 fold, repeated 5 times) 
#> 
#> Resampling performance over subset size:
#> 
#>  Variables    ROC Sens Spec  ROCSD  SensSD SpecSD Num_Resamples Selected
#>          1 0.8250 0.74 0.57 0.3283 0.36756 0.4950            50         
#>          2 0.9800 0.99 0.92 0.1414 0.07071 0.2740            50         
#>          3 1.0000 0.97 1.00 0.0000 0.15682 0.0000            50        *
#>          4 0.9650 0.89 1.00 0.1598 0.27274 0.0000            50         
#>          5 0.9575 0.93 0.95 0.1779 0.22610 0.2082            50         
#>          6 0.9750 0.86 0.97 0.1263 0.30372 0.1568            50         
#>          7 0.9650 0.88 0.97 0.1750 0.25873 0.1568            50         
#>          8 0.9650 0.83 0.97 0.1750 0.32904 0.1568            50         
#>          9 0.9500 0.79 0.97 0.2020 0.36547 0.1568            50         
#>         10 0.9700 0.79 0.97 0.1485 0.36547 0.1568            50         
#> 
#> The top 3 variables (out of 3):
#>    PC02, PC01, PC03

Created on 2019-02-28 by the reprex package (v0.2.1)