Thanks for the response! Sorry if my question wasn't entirely clear; let me try again, this time with a reprex. After preprocessing `flight_data` and converting its character columns to factors, the tutorial adds a recipe:
library(tidymodels) # for the recipes package, along with the rest of tidy-models
# Helper packages
library(nycflights13) # for flight data
library(skimr) # for variable summaries
set.seed(123)

# Build the modelling data: a two-level factor outcome (`arr_delay`) plus a
# plain calendar `date` that the recipe below derives features from.
flight_data <-
  flights %>%
  mutate(
    # Arrival delays of 30 or more become "late", everything else "on_time"
    arr_delay = factor(ifelse(arr_delay >= 30, "late", "on_time")),
    # We will use the date (not date-time) in the recipe below
    date = as.Date(time_hour)
  ) %>%
  # Include the weather data
  inner_join(weather, by = c("origin", "time_hour")) %>%
  # Only retain the specific columns we will use
  select(
    dep_time, flight, origin, dest, air_time, distance,
    carrier, date, arr_delay, time_hour
  ) %>%
  # Exclude missing data
  na.omit() %>%
  # For creating models, it is better to have qualitative columns encoded as
  # factors (instead of character strings). `across()` + `where()` is the
  # current replacement for the superseded `mutate_if()`.
  mutate(across(where(is.character), as.factor))
# Seed the random number generator so the random split below is reproducible
set.seed(555)

# Reserve three quarters of the rows for the training set
data_split <- flight_data %>% initial_split(prop = 3/4)

# Extract the two partitions as data frames
train_data <- data_split %>% training()
test_data <- data_split %>% testing()
# Preprocessing recipe: `arr_delay` is the outcome and every other column
# starts out as a predictor.
flights_rec <-
  recipe(arr_delay ~ ., data = train_data) %>%
  # Keep `flight` and `time_hour` in the data, but under the "ID" role
  update_role(flight, time_hour, new_role = "ID") %>%
  # Derive day-of-week and month features from `date`
  step_date(date, features = c("dow", "month")) %>%
  # Add holiday features for the holidays timeDate lists for "US"
  step_holiday(date, holidays = timeDate::listHolidays("US")) %>%
  # Drop the raw `date` column once the features exist
  step_rm(date) %>%
  # Dummy-encode the nominal columns, leaving the outcome untouched
  step_dummy(all_nominal(), -all_outcomes()) %>%
  # Remove predictors that have zero variance
  step_zv(all_predictors())

# Estimate the recipe on the training set, then print the trained recipe
trained_flights_rec <- prep(flights_rec, training = train_data)
trained_flights_rec
#> Data Recipe
#>
#> Inputs:
#>
#> role #variables
#> ID 2
#> outcome 1
#> predictor 7
#>
#> Training data contained 244365 data points and no missing data.
#>
#> Operations:
#>
#> Date features from date [trained]
#> Holiday features from date [trained]
#> Variables removed date [trained]
#> Dummy variables from origin, dest, carrier, date_dow, date_month [trained]
#> Zero variance filter removed dest_LEX [trained]
# Apply the trained recipe to the held-out test set
test_bake <- bake(trained_flights_rec, new_data = test_data)

# Show the remaining dummy columns whose names start with "dest_L"
select(test_bake, starts_with("dest_L"))
#> # A tibble: 81,454 x 3
#> dest_LAS dest_LAX dest_LGB
#> <dbl> <dbl> <dbl>
#> 1 0 0 0
#> 2 0 0 0
#> 3 0 0 0
#> 4 0 0 0
#> 5 1 0 0
#> 6 0 0 0
#> 7 0 0 0
#> 8 0 0 0
#> 9 0 0 0
#> 10 0 0 0
#> # … with 81,444 more rows
As you can see, `step_zv` removes the column `dest_LEX` from both the training data and the test data. This is crucial for training the model (an unregularized logistic regression) because LEX doesn't appear as a destination in the training data, so the estimated $\beta$ for `dest_LEX` is unconstrained. But given that, it is unclear from the tutorial text what recipes does with `dest == "LEX"` in the test data.
# Isolate the test-set rows whose destination is LEX
LEX_test_data <- test_data %>%
  filter(dest == "LEX")

# Run those rows through the trained recipe
test_LEX_bake <- trained_flights_rec %>%
  bake(new_data = LEX_test_data)

# Keep every non-numeric column plus the numeric columns with a non-zero sum,
# so that only the "active" dummy variables are displayed.
# `select(where(...))` is the current replacement for the superseded
# `select_if()`.
test_LEX_bake %>%
  select(where(~ !is.numeric(.x) || sum(.x) != 0))
#> # A tibble: 1 x 9
#> dep_time flight air_time distance time_hour arr_delay origin_LGA
#> <int> <int> <dbl> <dbl> <dttm> <fct> <dbl>
#> 1 2026 3669 90 604 2013-11-24 20:00:00 on_time 1
#> # … with 2 more variables: date_dow_Mon <dbl>, date_month_Nov <dbl>
We can see here that all of the `dest_` columns are 0 when we bake the recipe on test data where `dest == "LEX"`.
But because we used `step_dummy` without one-hot encoding, all of the `dest_` columns will also be 0 when `dest == "ABQ"`.
# Copy the LEX rows but relabel the destination as ABQ, reusing the levels of
# the existing `dest` factor.
new_ABQ_test_data <- LEX_test_data %>%
  mutate(dest = factor("ABQ", levels = levels(LEX_test_data$dest)))

# Run the relabelled rows through the trained recipe
test_ABQ_bake <- trained_flights_rec %>%
  bake(new_data = new_ABQ_test_data)

# Keep every non-numeric column plus the numeric columns with a non-zero sum.
# `select(where(...))` is the current replacement for the superseded
# `select_if()`.
test_ABQ_bake %>%
  select(where(~ !is.numeric(.x) || sum(.x) != 0))
#> # A tibble: 1 x 9
#> dep_time flight air_time distance time_hour arr_delay origin_LGA
#> <int> <int> <dbl> <dbl> <dttm> <fct> <dbl>
#> 1 2026 3669 90 604 2013-11-24 20:00:00 on_time 1
#> # … with 2 more variables: date_dow_Mon <dbl>, date_month_Nov <dbl>
Just to show that this is unique to ABQ (the chosen baseline contrast), we can try the same thing with `dest == "PHL"`:
# Copy the LEX rows but relabel the destination as PHL, reusing the levels of
# the existing `dest` factor.
new_PHL_test_data <- LEX_test_data %>%
  mutate(dest = factor("PHL", levels = levels(LEX_test_data$dest)))

# Run the relabelled rows through the trained recipe
test_PHL_bake <- trained_flights_rec %>%
  bake(new_data = new_PHL_test_data)

# Keep every non-numeric column plus the numeric columns with a non-zero sum.
# `select(where(...))` is the current replacement for the superseded
# `select_if()`.
test_PHL_bake %>%
  select(where(~ !is.numeric(.x) || sum(.x) != 0))
#> # A tibble: 1 x 10
#> dep_time flight air_time distance time_hour arr_delay origin_LGA
#> <int> <int> <dbl> <dbl> <dttm> <fct> <dbl>
#> 1 2026 3669 90 604 2013-11-24 20:00:00 on_time 1
#> # … with 3 more variables: dest_PHL <dbl>, date_dow_Mon <dbl>,
#> # date_month_Nov <dbl>
Here there is a nonzero `dest_PHL` column. So recipes is treating `LEX` and `ABQ` the same, even though the choice of `ABQ` as a baseline was essentially arbitrary (because it came first in the factor). It's possible that when we predict on test data, the model does something smart with the dropped zero-variance columns, so we can try that out.
# Logistic regression specification using the "glm" engine
lr_mod <- set_engine(logistic_reg(), "glm")

# Bundle the model specification with the (untrained) recipe
flights_wflow <-
  workflow() %>%
  add_model(lr_mod) %>%
  add_recipe(flights_rec)

# Fit the workflow on the training set
flights_fit <- fit(flights_wflow, data = train_data)

# Predicted class probabilities for the LEX rows
predict(flights_fit, new_data = LEX_test_data, type = "prob")
#> # A tibble: 1 x 2
#> .pred_late .pred_on_time
#> <dbl> <dbl>
#> 1 0.376 0.624
# Predicted class probabilities for the same rows relabelled as ABQ
flights_fit %>%
  predict(new_data = new_ABQ_test_data, type = "prob")
#> # A tibble: 1 x 2
#> .pred_late .pred_on_time
#> <dbl> <dbl>
#> 1 0.376 0.624
# Predicted class probabilities for the same rows relabelled as PHL
flights_fit %>%
  predict(new_data = new_PHL_test_data, type = "prob")
#> # A tibble: 1 x 2
#> .pred_late .pred_on_time
#> <dbl> <dbl>
#> 1 0.132 0.868
The model makes the same prediction for LEX and ABQ but not for PHL, as you might have predicted based on how recipes "baked" the test data above. This seems like potentially bad model behavior to me, unless there is something I'm missing?