Correctly Specifying Hyperparameters in `workflow_map`

prdm0 · August 6, 2024, 4:44pm

Dear all,

In this example, I need to tune one hyperparameter in the recipe and several others in the model. The code appears to run without problems; however, `rank_results()` fails on the object returned by `workflow_map()`.

Is there a more consistent way to specify which hyperparameters belong to the model and which belong to the recipe that I am not considering in the code?

Apparently, the tuning was successful:

> tunagem
# A workflow set/tibble: 2 × 4
  wflow_id  info             option    result   
  <chr>     <list>           <list>    <list>   
1 modelo_lm <tibble [1 × 4]> <opts[4]> <tune[+]>
2 modelo_rf <tibble [1 × 4]> <opts[4]> <tune[+]>

I believe the problem is that the hyperparameter sets are not specified correctly for each of the models.

library(tidymodels)
library(parallelly)

tidymodels::tidymodels_prefer()

# Parallel backend: build a fork cluster sized by
# `parallelly::availableCores(logical = FALSE)` and register it with
# doParallel. Fork clusters are not available on Windows.
cores <- parallelly::availableCores(logical = FALSE)
cl <- parallel::makeForkCluster(nnodes = cores)
doParallel::registerDoParallel(cl = cl)

#' Simulate data from a noisy, shifted tanh curve.
#'
#' Draws `x` uniformly on [8, 18] and sets
#' `y = 45 * tanh(x / 1.7 - 7) + 57 + e`, where `e` is Gaussian noise with
#' mean 0 and variance `sigma2`.
#'
#' @param n Number of observations to simulate.
#' @param sigma2 Variance of the noise term (a single non-negative number).
#' @return A tibble with `n` rows and the columns `x` and `y`.
random_dados <- function(n = 1L, sigma2 = 2){
  stopifnot(is.numeric(sigma2), length(sigma2) == 1L, sigma2 >= 0)
  x <- runif(n = n, min = 8, max = 18)
  # `rnorm()` takes a standard deviation, so the variance is converted with
  # sqrt() (the previous `sigma2^2` produced noise with variance sigma2^4).
  ruido <- rnorm(n = n, mean = 0, sd = sqrt(sigma2))
  y <- 45 * tanh(x / 1.7 - 7) + 57 + ruido
  tibble(x = x, y = y)
}

# Generating data ----------------------------------------------------------
# Fixed seed so the simulated data and the splits below are reproducible.
set.seed(123)
dados <- random_dados(n = 2e3, sigma2 = 2)

# Hold-out split ------------------------------------------------------------
# 80% of the rows go to training; the split is stratified on the outcome `y`.
dados_split <- initial_split(dados, prop = 0.8, strata = y)
treino <- training(dados_split)
teste <- testing(dados_split)

# 5-fold cross-validation, repeated 5 times ---------------------------------
cv <- vfold_cv(treino, v = 5, strata = y, repeats = 5)

# Defining some models to compare -------------------------------------------
# Linear regression: nothing is marked for tuning in the model itself.
modelo_linear <- linear_reg() %>% 
  set_engine("lm")

# Random forest: `min_n`, `trees` and `mtry` are all marked for tuning.
modelo_rf <- rand_forest(min_n = tune(), trees = tune(), mtry = tune()) %>% 
  set_engine("ranger") %>% 
  set_mode("regression")

# Defining the recipe -------------------------------------------------------
# Scale and center the predictors, then expand `x` into polynomial terms.
# The polynomial degree is marked for tuning under the id "p".
receita <- recipe(y ~ ., data = treino) |>
  step_scale(all_predictors()) |> 
  step_center(all_predictors()) |> 
  step_poly(x, degree = tune("p")) 
  
# Recipe parameters ---------------------------------------------------------
# Parameter set extracted from the recipe, with `p` ranging over 1-15.
parametros_receita <-
  receita |>
  hardhat::extract_parameter_set_dials() |>
  update(
    p = degree_int(c(1L, 15L))
  )

# Model parameters ----------------------------------------------------------
# Parameter set extracted from the random forest spec only; it does not
# contain the recipe parameter `p`.
parametros_modelo <- 
  modelo_rf |> 
  hardhat::extract_parameter_set_dials() |>
  update(
    trees = trees(c(1L, 1000L)),
    min_n = min_n(c(2L, 10L)),
    mtry = finalize(mtry(), treino)
  )

# parametros <- dplyr::bind_rows(parametros_receita, parametros_modelo)

# Combined parameter set written out by hand: the recipe degree `p` plus the
# three random forest parameters. Note that `p` ranges over 1-25 here but
# over 1-15 in `parametros_receita`.
parametros <- dials::parameters(
  list(
    p = degree_int(c(1L, 25L)),
    trees = trees(c(1L, 1000L)),
    min_n = min_n(c(2L, 10L)),
    mtry = finalize(mtry(), treino)
  )
)

# Defining the workflow set -------------------------------------------------
# One shared recipe crossed with two models gives the workflow ids
# "modelo_lm" and "modelo_rf". The lm workflow receives the recipe parameter
# set; the rf workflow receives only the model parameter set (without `p`),
# although it uses the same recipe.
wf <- 
  workflow_set(
    preproc = list("modelo" = receita),
    models = list(
      lm = modelo_linear,
      rf = modelo_rf
    )
  ) |>
  option_add(param_info = parametros_receita, id = "modelo_lm") |>
  option_add(param_info = parametros_modelo, id = "modelo_rf")

# Tuning --------------------------------------------------------------------
# Keep the out-of-sample predictions and the workflow in each tuning result.
ctrl <- 
  control_grid(
    save_pred = TRUE,
    save_workflow = TRUE,
    parallel_over = "everything"
  )

# NOTE(review): `param_info = parametros` is passed here for every workflow,
# on top of the per-workflow `option_add()` calls above. `parametros` contains
# random forest parameters that the lm workflow does not have; presumably
# this is related to the `rank_results()` failure -- confirm.
tunagem <- 
  wf |>
  workflow_map(
    fn = "tune_grid",
    grid =  5,
    resamples = cv,
    control = ctrl,
    param_info = parametros,
    seed = 123
  )

# Switch foreach back to sequential execution and shut the cluster down.
foreach::registerDoSEQ()
parallel::stopCluster(cl)

prdm0 · August 6, 2024, 5:03pm

I believe I have fixed it. The problem was that both models share the same recipe, so the random forest workflow also has an additional hyperparameter: the degree of the polynomial, `p`. Its parameter set therefore has to include `p` as well:

# Parameter set for the random forest workflow. Because that workflow uses
# the shared recipe, it must contain the recipe's tunable degree `p` in
# addition to the three model hyperparameters. `p` uses the same 1-15 range
# as `parametros_receita`, so both workflows search the same degrees.
parametros <- dials::parameters(
  list(
    p = degree_int(c(1L, 15L)),
    trees = trees(c(1L, 1000L)),
    min_n = min_n(c(2L, 10L)),
    mtry = finalize(mtry(), treino)
  )
)

And then inform this in the workflow_set(), doing:

# Cross the shared recipe with both models; this yields the workflow ids
# "modelo_lm" and "modelo_rf".
wf <- workflow_set(
  preproc = list(modelo = receita),
  models = list(lm = modelo_linear, rf = modelo_rf)
)

# Attach a parameter set to each workflow individually.
wf <- option_add(wf, param_info = parametros_receita, id = "modelo_lm")
wf <- option_add(wf, param_info = parametros, id = "modelo_rf")

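Note that for `id = "modelo_rf"` I now pass the random forest hyperparameters together with the recipe parameter `p`, that is, `option_add(param_info = parametros, id = "modelo_rf")`. I also removed the `param_info` argument from the `workflow_map()` call, so each workflow keeps its own parameter set. The complete code is below: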

library(tidymodels)
library(parallelly)

tidymodels::tidymodels_prefer()

# Parallel backend: build a fork cluster sized by
# `parallelly::availableCores(logical = FALSE)` and register it with
# doParallel. Fork clusters are not available on Windows.
cores <- parallelly::availableCores(logical = FALSE)
cl <- parallel::makeForkCluster(nnodes = cores)
doParallel::registerDoParallel(cl = cl)

#' Simulate data from a noisy, shifted tanh curve.
#'
#' Draws `x` uniformly on [8, 18] and sets
#' `y = 45 * tanh(x / 1.7 - 7) + 57 + e`, where `e` is Gaussian noise with
#' mean 0 and variance `sigma2`.
#'
#' @param n Number of observations to simulate.
#' @param sigma2 Variance of the noise term (a single non-negative number).
#' @return A tibble with `n` rows and the columns `x` and `y`.
random_dados <- function(n = 1L, sigma2 = 2){
  stopifnot(is.numeric(sigma2), length(sigma2) == 1L, sigma2 >= 0)
  x <- runif(n = n, min = 8, max = 18)
  # `rnorm()` takes a standard deviation, so the variance is converted with
  # sqrt() (the previous `sigma2^2` produced noise with variance sigma2^4).
  ruido <- rnorm(n = n, mean = 0, sd = sqrt(sigma2))
  y <- 45 * tanh(x / 1.7 - 7) + 57 + ruido
  tibble(x = x, y = y)
}

# Generating data ----------------------------------------------------------
# Fixed seed so the simulated data and the splits below are reproducible.
set.seed(123)
dados <- random_dados(n = 2e3, sigma2 = 2)

# Hold-out split ------------------------------------------------------------
# 80% of the rows go to training; the split is stratified on the outcome `y`.
dados_split <- initial_split(dados, prop = 0.8, strata = y)
treino <- training(dados_split)
teste <- testing(dados_split)

# 5-fold cross-validation, repeated 5 times ---------------------------------
cv <- vfold_cv(treino, v = 5, strata = y, repeats = 5)

# Models to compare ---------------------------------------------------------
# Linear regression: nothing is marked for tuning in the model itself.
modelo_linear <- linear_reg() |>
  set_engine("lm")

# Random forest: `min_n`, `trees` and `mtry` are all marked for tuning.
modelo_rf <- rand_forest(min_n = tune(), trees = tune(), mtry = tune()) |>
  set_engine("ranger") |>
  set_mode("regression")

# Recipe --------------------------------------------------------------------
# Scale and center the predictors, then expand `x` into polynomial terms.
# The polynomial degree is marked for tuning under the id "p".
receita <- recipe(y ~ ., data = treino) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors()) |>
  step_poly(x, degree = tune("p"))

# Tuning ranges -------------------------------------------------------------
# Both workflows share `receita`, so both tune `p`. The range is defined once
# so that the two workflows search the same polynomial degrees.
faixa_p <- degree_int(c(1L, 15L))

# Parameter set for the lm workflow: only the recipe parameter `p`.
parametros_receita <-
  receita |>
  hardhat::extract_parameter_set_dials() |>
  update(p = faixa_p)

# Parameter set for the rf workflow: the recipe parameter `p` plus the three
# random forest hyperparameters.
# NOTE(review): `treino` still contains the outcome column `y`, so the upper
# bound that `finalize()` derives for `mtry` counts it as well -- confirm
# that this is intended.
parametros <- dials::parameters(
  list(
    p = faixa_p,
    trees = trees(c(1L, 1000L)),
    min_n = min_n(c(2L, 10L)),
    mtry = finalize(mtry(), treino)
  )
)

# Workflow set --------------------------------------------------------------
# One shared recipe crossed with two models gives the workflow ids
# "modelo_lm" and "modelo_rf"; each one gets its own parameter set.
wf <-
  workflow_set(
    preproc = list(modelo = receita),
    models = list(
      lm = modelo_linear,
      rf = modelo_rf
    )
  ) |>
  option_add(param_info = parametros_receita, id = "modelo_lm") |>
  option_add(param_info = parametros, id = "modelo_rf")

# Tuning --------------------------------------------------------------------
# Keep the out-of-sample predictions and the workflow in each tuning result.
ctrl <-
  control_grid(
    save_pred = TRUE,
    save_workflow = TRUE,
    parallel_over = "everything"
  )

# No `param_info` is passed here: each workflow uses the set attached with
# `option_add()` above. The `finally` clause switches foreach back to
# sequential execution and stops the cluster even if tuning is aborted, so
# worker processes are not left running.
tunagem <- tryCatch(
  workflow_map(
    wf,
    fn = "tune_grid",
    grid = 5,
    resamples = cv,
    control = ctrl,
    seed = 123
  ),
  finally = {
    foreach::registerDoSEQ()
    parallel::stopCluster(cl)
  }
)

rank_results(tunagem)

system · August 13, 2024, 5:04pm

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.