Using recipes with multilevelmod

Wondering if anyone knows how I might use recipes with multilevelmod

Has some suggestions namely

First, instead of using add_formula() , we suggest using add_variables() . This passes the columns as-is to the model fitting function. To add the random effects formula, use the formula argument of add_model() . For example:

library(multilevelmod)
set.seed(1234)
data(sleepstudy, package = "lme4")

lmer_wflow <- 
  workflow() %>% 
  add_variables(outcomes = Reaction, predictors = c(Days, Subject)) %>% 
  add_model(lmer_spec, formula = Reaction ~ Days + (1|Subject))

lmer_wflow %>% fit(data = sleepstudy)

If using a recipe, make sure that functions like step_dummy() do not convert the column for the independent experimental unit (i.e. subject) to dummy variables. The underlying model fit functions require a single column for these data.

Using a recipe also offers the opportunity to set a different role for the independent experiment unit, which can come in handy when more complex preprocessing is needed.

rec <- 
  recipe(Reaction ~ Days + Subject, data = sleepstudy) %>%
  add_role(Subject, new_role = "exp_unit") %>%
  step_zv(all_predictors(), -has_role("exp_unit"))

lmer_wflow %>%
  remove_variables() %>%
  add_recipe(rec) %>%
  fit(data = sleepstudy)

The example here works fine however, I'm having issues when I want to do any kind of variable transformation, such as step_dummy or step_pca?

1 Like

When using recipes with multilevelmod, the main issue with transformations like step_dummy() or step_pca() is that they might interfere with the random effects structure of the model. Here’s how to handle it properly:

Key Considerations:

  1. Preserve the grouping variable (e.g., Subject) as a single column.
  • Avoid applying step_dummy() or step_pca() to the grouping variable.
  1. Apply transformations selectively.
  • Use -has_role("exp_unit") to ensure transformations don’t affect the experimental unit. Use the freecine extension for more guide and solution.

Yes, do you have an example you can share the problem is with things like

add_variables

This passes the columns as-is to the model fitting function.

Secondly the formula

add_formula

Whereas both step_pca and step_dummy will change the column names.

library('lme4')
#> Loading required package: Matrix
library('multilevelmod')
#> Loading required package: parsnip
library('workflowsets')
library('tidymodels')
tidymodels_prefer()

riesby
#> # A tibble: 250 Γ— 7
#>    subject depr_score  week  male endogenous imipramine desipramine
#>    <fct>        <dbl> <dbl> <dbl>      <dbl>      <dbl>       <dbl>
#>  1 101             -8     0     0          0       4.04        4.20
#>  2 101            -19     1     0          0       3.93        4.81
#>  3 101            -22     2     0          0       4.33        4.96
#>  4 101            -23     3     0          0       4.37        4.96
#>  5 103            -18     0     1          0       2.77        5.24
#>  6 103             -9     1     1          0       3.47        5.21
#>  7 103            -18     2     1          0       3.53        5.34
#>  8 103            -20     3     1          0       3.58        5.36
#>  9 104            -11     0     1          1       5.34        4.75
#> 10 104            -16     1     1          1       5.75        5.06
#> # β„Ή 240 more rows

splits <- group_initial_split(riesby, group = subject)

folds <- group_vfold_cv(training(splits), v = 5, group = subject)

rec <- recipe(depr_score ~ ., data = training(splits) %>% filter(week == 0)) %>%
  add_role(subject, new_role = "exp_unit") %>%
  update_role(week, new_role = "id variable") %>%
  step_normalize(all_numeric_predictors(), -all_outcomes(), -has_role("exp_unit") ) %>%
  step_pca(all_numeric_predictors(), -all_outcomes(), -has_role("exp_unit"), num_comp = tune())

lmer_spec <- 
  linear_reg() %>% 
  set_engine("lmer")

lmer_wflow <- 
  workflow() %>% 
  add_variables(outcomes = depr_score, predictors = c(male, endogenous, imipramine, desipramine, subject)) %>% 
  add_model(lmer_spec, formula = depr_score ~ male + endogenous + imipramine + desipramine + (1|subject))



grid_results <- tune_grid(
  lmer_wflow,
  resamples = folds,
  grid = 10,
  metrics = metric_set(rmse),
  control = control_grid(save_pred = T, save_workflow = TRUE)
)
#> Warning: No tuning parameters have been detected, performance will be evaluated
#> using the resamples with no tuning. Did you want to [tune()] parameters?

collect_predictions(grid_results)
#> # A tibble: 187 Γ— 5
#>     .pred id         .row depr_score .config             
#>     <dbl> <chr>     <int>      <dbl> <chr>               
#>  1  -6.14 Resample1     5         -6 Preprocessor1_Model1
#>  2  -6.69 Resample1     6         -6 Preprocessor1_Model1
#>  3  -5.40 Resample1     7         -9 Preprocessor1_Model1
#>  4  -7.04 Resample1     8        -13 Preprocessor1_Model1
#>  5  -7.36 Resample1    23         -6 Preprocessor1_Model1
#>  6  -8.39 Resample1    24         -7 Preprocessor1_Model1
#>  7  -9.80 Resample1    25        -12 Preprocessor1_Model1
#>  8 -10.3  Resample1    26        -13 Preprocessor1_Model1
#>  9  -6.23 Resample1    27         -8 Preprocessor1_Model1
#> 10  -7.15 Resample1    28         -8 Preprocessor1_Model1
#> # β„Ή 177 more rows

collect_metrics(grid_results)
#> # A tibble: 1 Γ— 6
#>   .metric .estimator  mean     n std_err .config             
#>   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
#> 1 rmse    standard    7.35     5   0.687 Preprocessor1_Model1

final_model <- fit_best(grid_results)

final_model
#> ══ Workflow [trained] ══════════════════════════════════════════════════════════
#> Preprocessor: Variables
#> Model: linear_reg()
#> 
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> Outcomes: depr_score
#> Predictors: c(male, endogenous, imipramine, desipramine, subject)
#> 
#> ── Model ───────────────────────────────────────────────────────────────────────
#> Linear mixed model fit by REML ['lmerMod']
#> Formula: depr_score ~ male + endogenous + imipramine + desipramine + (1 |  
#>     subject)
#>    Data: data
#> REML criterion at convergence: 1177.133
#> Random effects:
#>  Groups   Name        Std.Dev.
#>  subject  (Intercept) 5.577   
#>  Residual             4.585   
#> Number of obs: 187, groups:  subject, 50
#> Fixed Effects:
#> (Intercept)         male   endogenous   imipramine  desipramine  
#>     13.7276       0.6072       1.6161      -0.8234      -4.2021

Now try with recipe

lmer_wflow_rec <- lmer_wflow %>%
  remove_variables() %>%
  add_recipe(rec)

grid_results_rec <- tune_grid(
  lmer_wflow_rec,
  resamples = folds,
  grid = 10,
  metrics = metric_set(rmse),
  control = control_grid(save_pred = T, save_workflow = TRUE)
)
#> β†’ A | error:   object 'male' not found
#> There were issues with some computations   A: x1There were issues with some computations   A: x2There were issues with some computations   A: x5There were issues with some computations   A: x9There were issues with some computations   A: x11There were issues with some computations   A: x14There were issues with some computations   A: x17There were issues with some computations   A: x20There were issues with some computations   A: x20
#> Warning: All models failed. Run `show_notes(.Last.tune.result)` for more
#> information.
library('lme4')
Loading required package: Matrix
library('multilevelmod')
Loading required package: parsnip
library('workflowsets')
library('tidymodels')
── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
βœ” broom     1.0.7     βœ” recipes   1.1.1
βœ” dials     1.3.0     βœ” rsample   1.2.1
βœ” dplyr     1.1.4     βœ” tibble    3.2.1
βœ” ggplot2   3.5.1     βœ” tidyr     1.3.1
βœ” infer     1.0.7     βœ” tune      1.2.1
βœ” modeldata 1.4.0     βœ” workflows 1.1.4
βœ” purrr     1.0.4     βœ” yardstick 1.3.2
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
βœ– purrr::discard()  masks scales::discard()
βœ– tidyr::expand()   masks Matrix::expand()
βœ– dplyr::filter()   masks stats::filter()
βœ– dplyr::lag()      masks stats::lag()
βœ– tidyr::pack()     masks Matrix::pack()
βœ– recipes::step()   masks stats::step()
βœ– tidyr::unpack()   masks Matrix::unpack()
βœ– recipes::update() masks Matrix::update(), stats::update()
β€’ Learn how to get started at https://www.tidymodels.org/start/
tidymodels_prefer()
riesby |> head()
# A tibble: 6 Γ— 7
  subject depr_score  week  male endogenous imipramine desipramine
  <fct>        <dbl> <dbl> <dbl>      <dbl>      <dbl>       <dbl>
1 101             -8     0     0          0       4.04        4.20
2 101            -19     1     0          0       3.93        4.81
3 101            -22     2     0          0       4.33        4.96
4 101            -23     3     0          0       4.37        4.96
5 103            -18     0     1          0       2.77        5.24
6 103             -9     1     1          0       3.47        5.21
splits <- group_initial_split(riesby, group = subject)

folds <- group_vfold_cv(training(splits), v = 5, group = subject)
rec <- recipe(depr_score ~ ., data = training(splits) %>% filter(week == 0)) %>%
  add_role(subject, new_role = "exp_unit") %>%
  update_role(week, new_role = "id variable") %>%
  step_normalize(all_numeric_predictors(), -all_outcomes(), -has_role("exp_unit") ) %>%
  step_pca(all_numeric_predictors(), -all_outcomes(), -has_role("exp_unit"), threshold = 1)

prep_rec <- prep(rec, training = training(splits) %>% filter(week == 0))

n_comp <- tidy(prep_rec, 2, type ='variance') %>%
  select(component) %>%
  filter(component == max(component)) %>%
  pull()

tidy(prep_rec, 2, type ='variance') %>%
  filter(terms == 'cumulative percent variance' | terms == 'percent variance') %>%
  mutate(variance = factor(terms, levels = c('cumulative percent variance', 'percent variance'), labels = c("cumulative","percent"))) %>%
  mutate(PC = component) %>%
  mutate(percent = value/100) %>%
  ggplot(aes(x=PC, y=percent, fill=variance, alpha = variance)) +
  geom_bar(stat = "identity",
           position = "identity") +
  scale_fill_manual(values = 
                      c("cumulative" = "#56B4E9", 
                        "percent" = "black")) +
  scale_x_continuous(breaks = 1:n_comp) +
  scale_y_continuous(
    breaks = seq(0, 1, 0.1),
    labels = scales::percent_format()
  ) +
  scale_alpha_manual(values = c(.75,1)) +
  labs(
    x = "Principal Component",
    y = "Percent of Variance Explained",
    title = "Variance Explained by Principal Component"
  ) +
  geom_text(aes(label = scales::percent(percent)), 
            #position = position_stack(), 
            size = 3)
Warning in 1:n_comp: numerical expression has 4 elements: only the first used

rec <- recipe(depr_score ~ ., data = training(splits) %>% filter(week == 0)) %>%
  add_role(subject, new_role = "exp_unit") %>%
  update_role(week, new_role = "id variable") %>%
  step_normalize(all_numeric_predictors(), -all_outcomes(), -has_role("exp_unit") ) %>%
  step_pca(all_numeric_predictors(), -all_outcomes(), -has_role("exp_unit"), num_comp = 3)
lmer_spec <- linear_reg() %>% 
  set_engine("lmer")

lmer_wflow <- workflow() %>% 
  add_variables(outcomes = depr_score, predictors = c(starts_with("PC"), c(subject))) %>% 
  add_model(lmer_spec, formula = depr_score ~ PC1 + PC2 + PC3 + (1|subject))

lmer_wflow2 <- lmer_wflow %>%
  remove_variables() %>%
  add_recipe(rec) 
lmer_fits <- tune_grid(
  lmer_wflow2,
  resamples = folds,
  grid = 10,
  control = control_grid(save_pred = TRUE)
)
Warning: No tuning parameters have been detected, performance will be evaluated
using the resamples with no tuning. Did you want to [tune()] parameters?
collect_predictions(lmer_fits)
# A tibble: 188 Γ— 5
    .pred id         .row depr_score .config             
    <dbl> <chr>     <int>      <dbl> <chr>               
 1  -8.06 Resample1    23         -6 Preprocessor1_Model1
 2  -8.85 Resample1    24         -7 Preprocessor1_Model1
 3  -9.94 Resample1    25        -12 Preprocessor1_Model1
 4 -10.3  Resample1    26        -13 Preprocessor1_Model1
 5 -10.0  Resample1    35         -1 Preprocessor1_Model1
 6  -8.44 Resample1    36         -1 Preprocessor1_Model1
 7 -10.1  Resample1    37         -8 Preprocessor1_Model1
 8 -11.2  Resample1    38         -6 Preprocessor1_Model1
 9 -12.3  Resample1    46         -8 Preprocessor1_Model1
10 -13.0  Resample1    47        -10 Preprocessor1_Model1
# β„Ή 178 more rows