I am following the tutorial here and applying that on my own data. I am building a recipe
which has only one step, converting string to factor. The data I am working with dflog
has three variables, Height
, Weight
and Gender
and Gender
is what I am trying to predict. Here's a sample of the data:
dflog <- structure(list(Gender = c("Male", "Male", "Female", "Male", "Male",
"Male", "Male", "Female", "Female", "Female", "Female", "Male",
"Male", "Female", "Female", "Male", "Female", "Female", "Female",
"Male", "Male", "Male", "Male", "Female", "Male", "Male", "Female",
"Female", "Female", "Female", "Male", "Male", "Male", "Male",
"Male", "Male", "Male", "Female", "Male", "Male", "Female", "Female",
"Female", "Female", "Female", "Male", "Female", "Female", "Female",
"Female"), Height = c(69.6916891296127, 65.1884775622368, 58.2253970159585,
70.8668694250787, 73.0719769487447, 67.3421407592253, 67.0405897586907,
64.9250143571263, 67.8118152454112, 65.5325102349844, 63.1140543077552,
67.1104474203962, 71.9384959492937, 64.7432660120597, 64.6784085432949,
67.4242401728615, 65.1259560448078, 60.8782407546953, 65.6783060394966,
70.0742964754103, 71.4713872338541, 73.298338885144, 70.144874073255,
68.2595974675301, 70.8874866086991, 73.5692978094574, 61.3511138475123,
60.5856377146781, 63.4931430001388, 67.1820800126776, 68.5613711736471,
68.3141883278045, 69.9297057558467, 71.8725840567383, 63.0000474918155,
66.7034796444077, 64.5987969022366, 61.2575726538818, 65.0054061792742,
76.0270818630512, 61.7890437911651, 61.6830285758629, 62.5498457985719,
62.1876806542036, 63.3198176682499, 74.5178760319385, 67.0449228375836,
59.0455310013641, 64.5902856291067, 66.2874422639476), Weight = c(190.145864881414,
151.886942249695, 114.961707275311, 199.657175172799, 206.79014203395,
197.532280979268, 188.605099476104, 142.118166114949, 146.259558415089,
120.847428543239, 145.26879625192, 160.050285251246, 217.55978401917,
141.434997471028, 123.010507026547, 176.485090451344, 144.532731183071,
128.031249745851, 146.51337666912, 205.387909895344, 192.929107075105,
194.948577135866, 190.585376753341, 174.142421017912, 182.887947720361,
216.61777777797, 121.966343592203, 116.430074671494, 160.554603918248,
164.779375641609, 175.041945711247, 174.243488335751, 199.721623719246,
193.13807676563, 165.555965906858, 189.430028578934, 173.992094268605,
131.46928157665, 152.801310681396, 232.313470969478, 123.140730916332,
116.423510279721, 118.90320396155, 124.606873604888, 119.315988261806,
201.957971666922, 140.815517479934, 105.380905400503, 140.176081684439,
168.905571923085)), class = c("spec_tbl_df", "tbl_df", "tbl",
"data.frame"), row.names = c(NA, -50L), spec = structure(list(
cols = list(Gender = structure(list(), class = c("collector_character",
"collector")), Height = structure(list(), class = c("collector_double",
"collector")), Weight = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))
Here's the code for what I am trying to do:
library(tidyverse)
library(tidymodels)
#Split the data
set.seed(666)
dflog_split <- initial_split(dflog, prop = 3/4)
dflog_train <- training(dflog_split)
dflog_test <- testing(dflog_split)
#Model spec
dflog_mod <- logistic_reg() %>%
set_engine("glm") %>%
set_mode("classification")
#Build a recipe
dflog_rec <- recipe(Gender ~ ., data = dflog_train) %>%
step_string2factor(all_nominal())
#Build a workflow
dflog_workflow <- workflow() %>%
add_model(dflog_mod) %>%
add_recipe(dflog_rec)
#Fit
dflog_fit <- dflog_workflow %>%
fit(data = dflog_train)
dflog_fit %>% pull_workflow_fit() %>% tidy()
#Predict on test data
predict(dflog_fit, dflog_test)
Running the predict
function gives me this error:
Error: Can't subset columns that don't exist.
x Column `Gender` doesn't exist.
Run `rlang::last_error()` to see where the error occurred.
This is the last_error()
:
Can't subset columns that don't exist.
x Column `Gender` doesn't exist.
Backtrace:
1. stats::predict(dflog_fit, dflog_test)
24. vctrs:::stop_subscript_oob(...)
25. vctrs:::stop_subscript(...)
Run `rlang::last_trace()` to see the full context.
And this is the last_trace()
:
<error/vctrs_error_subscript_oob>
Can't subset columns that don't exist.
x Column `Gender` doesn't exist.
Backtrace:
█
1. ├─stats::predict(dflog_fit, dflog_test)
2. ├─workflows:::predict.workflow(dflog_fit, dflog_test)
3. │ ├─hardhat::forge(new_data, blueprint)
4. │ └─hardhat:::forge.data.frame(new_data, blueprint)
5. │ └─blueprint$forge$process(...)
6. │ ├─recipes::bake(object = rec, new_data = new_data)
7. │ └─recipes:::bake.recipe(object = rec, new_data = new_data)
8. │ ├─recipes::bake(object$steps[[i]], new_data = new_data)
9. │ └─recipes:::bake.step_string2factor(object$steps[[i]], new_data = new_data)
10. │ ├─purrr::map2_df(...)
11. │ │ └─purrr::map2(.x, .y, .f, ...)
12. │ ├─new_data[, col_names]
13. │ └─tibble:::`[.tbl_df`(new_data, , col_names)
14. │ └─tibble:::tbl_subset_col(x, j = j, j_arg)
15. │ └─tibble:::vectbl_as_col_index(j, x, j_arg = j_arg)
16. │ └─tibble:::vectbl_as_col_location(...)
17. │ ├─tibble:::subclass_col_index_errors(...)
18. │ │ ├─base::tryCatch(...)
19. │ │ │ └─base:::tryCatchList(expr, classes, parentenv, handlers)
20. │ │ │ └─base:::tryCatchOne(expr, names, parentenv, handlers[[1L]])
21. │ │ │ └─base:::doTryCatch(return(expr), name, parentenv, handler)
22. │ │ └─base::force(expr)
23. │ └─vctrs::vec_as_location(j, n, names, arg = as_label(j_arg))
24. └─vctrs:::stop_subscript_oob(...)
25. └─vctrs:::stop_subscript(...)
Not sure why this doesn't work.