Here is an outline of my code. I left this overnight and the tune_grid()
part was hanging. It never finished. What am I doing wrong?
library(glmnet)
library(tidymodels)
library(tidyverse)
# Load the raw train/test CSVs and clean up the column names.
# clean_names() comes from janitor, which is not attached by the library()
# calls above, so it is called with an explicit namespace here.
train_df <- read_csv("train.csv") |>
  janitor::clean_names()
test_df <- read_csv("test.csv") |>
  janitor::clean_names()
# recipe
# Profile the training data once; both removal lists below are read from the
# same summary table instead of re-running skim() for each of them.
# skim() comes from skimr, which is not attached by the library() calls above,
# so it is called with an explicit namespace here.
train_profile <- skimr::skim(train_df) |>
  as_tibble()

# Columns in which fewer than half of the values are present.
rm_cols_missing <- train_profile |>
  arrange(complete_rate) |>
  filter(complete_rate < 0.5) |>
  pull(skim_variable)

# Columns whose character.n_unique statistic is below three. filter() drops
# rows where that statistic is NA.
rm_n_unique <- train_profile |>
  arrange(character.n_unique) |>
  filter(character.n_unique < 3) |>
  pull(skim_variable)

# Take every name matching "alley" or "central_air" off the low-cardinality
# removal list (str_detect() is vectorised, so no per-element lambda is needed).
keep_pattern <- "alley|central_air"
rm_n_unique <- rm_n_unique[!stringr::str_detect(rm_n_unique, keep_pattern)]
# Put the outcome on the log(1 + x) scale before any modelling; this is done
# on the data itself rather than inside the recipe.
train_df <- mutate(train_df, sale_price = log(1 + sale_price))
# Preprocessing recipe: sale_price is the outcome, every other column starts
# out as a predictor.
target_recipe <- recipe(sale_price ~ ., data = train_df) |>
  # Drop the row id together with the mostly-missing and low-cardinality
  # columns in one step. any_of() tolerates a name that appears in both
  # removal lists; two sequential step_rm(all_of(...)) calls abort prep() in
  # that case because the second one asks for a column that is already gone.
  step_rm(id, any_of(c(rm_cols_missing, rm_n_unique))) |>
  step_log(all_numeric_predictors(), offset = 1) |> # log(x + 1)
  step_normalize(all_numeric_predictors()) |>
  # Pool levels seen in fewer than 3% of rows into "other".
  step_other(all_nominal_predictors(), threshold = 0.03) |>
  # Give factor levels not seen during training a dedicated level.
  step_novel(all_nominal_predictors()) |>
  # NOTE(review): KNN imputation of every predictor from every other
  # predictor is the most expensive step in this recipe and is re-estimated
  # for each resample. If tune_grid() appears to hang, time
  # prep(target_recipe) on its own first; step_impute_median() /
  # step_impute_mode() are much cheaper alternatives -- confirm the
  # modelling trade-off before switching.
  step_impute_knn(all_predictors()) |>
  # One indicator column per remaining factor level.
  step_dummy(all_nominal_predictors())
# bootstrap
# Seed the RNG before resampling: bootstraps() draws rows at random, so
# without a seed the resamples (and every metric computed on them) differ
# from run to run.
set.seed(42)
train_folds <- bootstraps(train_df, times = 5)
# Lasso model: glmnet linear regression with mixture fixed at 1 and the
# penalty left as a tuning parameter.
spec_lasso <- linear_reg(penalty = tune(), mixture = 1) |>
  set_mode("regression") |>
  set_engine("glmnet")

# Bundle the preprocessing recipe and the model specification so they are
# fitted together.
wf_lasso <- workflow() |>
  add_recipe(target_recipe) |>
  add_model(spec_lasso)
set.seed(42)

# Ten evenly spaced penalty candidates. penalty() ranges are given on the
# log10 scale, so c(-4, -2) spans 1e-4 to 1e-2.
grid_values <- penalty(range = c(-4, -2)) |>
  grid_regular(levels = 10)

# Evaluate every candidate penalty on each resample, scoring by RMSE.
race_lasso <- wf_lasso |>
  tune_grid(
    resamples = train_folds,
    grid = grid_values,
    metrics = metric_set(rmse)
  )