My tuning runs for so long that it never finishes. I have tried both tune_bayes() and tune_grid(). Here is my script:
library(tidyverse)
library(scales)
library(skimr)
library(tidymodels)
library(bonsai)
library(lightgbm)
library(janitor)
library(doParallel)
options(width = 120)
# read the csv data with readr and clean the column names
train_df <- read_csv("raw_data/train.csv") |>
  clean_names()
test_df <- read_csv("raw_data/test.csv") |>
  clean_names()
# quick EDA with skimr: overall summary, then completion rate per column
skim(train_df)
skim(train_df) |>
  as_tibble() |>
  arrange(complete_rate) |>
  select(skim_variable, complete_rate)
# columns that are more than half missing get dropped
rm_cols_missing <- skim(train_df) |>
  as_tibble() |>
  arrange(complete_rate) |>
  select(skim_variable, complete_rate) |>
  filter(complete_rate < 0.5) |>
  pull(skim_variable)
# near-constant character columns (fewer than 3 unique values) get dropped too
rm_n_unique <- skim(train_df) |>
  as_tibble() |>
  arrange(character.n_unique) |>
  select(skim_variable, character.n_unique) |>
  filter(character.n_unique < 3) |>
  pull(skim_variable)
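# worth printing what these two rules actually drop before committing to them
rm_cols_missing
rm_n_unique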
rm_n_unique <- purrr::discard(rm_n_unique, .p = ~ stringr::str_detect(.x, "alley")) # keep alley despite its few levels
# ecdf of the target variable vs the normal cdf
# transform the target to the log scale
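# sketch of the check described above: empirical cdf of log(sale_price)
# against a normal cdf with matching mean and sd (plot only, not part of the fit)
train_df |>
  ggplot(aes(log(sale_price))) +
  stat_ecdf() +
  stat_function(fun = pnorm,
                args = list(mean = mean(log(train_df$sale_price)),
                            sd = sd(log(train_df$sale_price))),
                colour = "red") +
  labs(x = "log(sale_price)", y = "CDF")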
target_recipe <- recipe(sale_price ~ ., data = train_df) %>%
  step_rm(id) %>%
  step_rm(all_of(rm_cols_missing)) %>%
  step_rm(all_of(rm_n_unique)) %>% # maybe don't remove central_air
  step_log(all_numeric(), offset = 1) %>% # log(x + 1), including the outcome
  step_normalize(all_numeric(), -all_outcomes()) %>%
  step_other(all_nominal(), -all_outcomes(), threshold = 0.03) %>% # pool rare levels into "other"
  step_novel(all_predictors(), -all_numeric()) %>% # map previously unseen factor levels to a new level
  step_impute_knn(all_predictors()) %>% # knn imputation; comparatively slow, and it re-runs on every resample during tuning
  step_dummy(all_nominal(), -all_outcomes()) # dummy variables for the categoricals
prep(target_recipe, training = train_df) %>%
  juice() %>%
  glimpse()
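# optional sanity check (a sketch): bake the test set through the same recipe;
# recent recipes versions do not require the outcome column at bake time
prep(target_recipe, training = train_df) %>%
  bake(new_data = test_df) %>%
  glimpse()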
# lightgbm model spec with the hyperparameters to tune
model_lgbm <- boost_tree(
  trees = tune(), learn_rate = tune(),
  tree_depth = tune(), min_n = tune(),
  loss_reduction = tune(),
  sample_size = tune(), mtry = tune()
) %>%
  set_mode("regression") %>%
  set_engine("lightgbm", nthread = 1) # one thread per engine fit; doParallel supplies the workers
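# threading sanity check (a sketch): the effective thread count during tuning
# is roughly doParallel workers x engine threads, so 10 workers with
# nthread = 10 would ask for ~100 threads and oversubscribe most machines
parallel::detectCores()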
# hyperparameter ranges; mtry has to be finalized against the data
hyperparams_lgbm <- parameters(
  trees(), learn_rate(),
  tree_depth(), min_n(),
  loss_reduction(),
  sample_size = sample_prop(), finalize(mtry(), train_df)
)
# cap the number of trees to keep the tuning time in check
hyperparams_lgbm <- hyperparams_lgbm %>% update(trees = trees(c(100, 500)))
set.seed(321)
folds_sale_price <- vfold_cv(train_df, v = 5, strata = sale_price)
# assemble the tuning workflow
workflow_SalePrice_xgb_model <-
  workflow() %>%
  add_model(model_lgbm) %>%
  add_recipe(target_recipe)
set.seed(42)
doParallel::registerDoParallel(10) # 10 workers; engine threads stay at 1 to avoid oversubscription
lgbm_tune <-
  workflow_SalePrice_xgb_model %>%
  tune_bayes(
    resamples = folds_sale_price,
    param_info = hyperparams_lgbm,
    initial = 10,
    iter = 30,
    metrics = metric_set(rmse, mape),
    control = control_bayes(no_improve = 5,
                            save_pred = TRUE, verbose = TRUE)
  )
doParallel::stopImplicitCluster()
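# the tune_grid() attempt mentioned above looked roughly like this
# (a sketch; the grid size is illustrative, not my exact setting)
set.seed(42)
lgbm_grid_tune <-
  workflow_SalePrice_xgb_model %>%
  tune_grid(
    resamples = folds_sale_price,
    param_info = hyperparams_lgbm,
    grid = 20,
    metrics = metric_set(rmse, mape),
    control = control_grid(save_pred = TRUE, verbose = TRUE)
  )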
show_notes(lgbm_tune)
SalePrice_best_model <- select_best(lgbm_tune, metric = "rmse")
print(SalePrice_best_model)
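# optional: inspect the top few candidates rather than only the single best
show_best(lgbm_tune, metric = "rmse", n = 5)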
# finalize the workflow with the best hyperparameters and refit on the full training set
SalePrice_final_workflow <- finalize_workflow(workflow_SalePrice_xgb_model, SalePrice_best_model)
SalePrice_xgb_fit <- fit(SalePrice_final_workflow, data = train_df)
# predict on the test set; these predictions are on the log(x + 1) scale
# because the recipe transformed the outcome
pred <- predict(SalePrice_xgb_fit, test_df)
readr::write_csv(pred, "pred.csv")
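# for a Kaggle submission (a sketch, assuming the usual Id/SalePrice format):
# undo the log(x + 1) applied by the recipe before writing the file
submission <- tibble(Id = test_df$id,
                     SalePrice = exp(pred$.pred) - 1)
readr::write_csv(submission, "submission.csv")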
The data come from the Kaggle House Prices - Advanced Regression Techniques competition.