Using weights in tidymodels when predicting on new data.

Oscar1 · May 26, 2025, 10:02am

Hi,
I'm trying to implement weights in regression using tidymodels. I can get it to work using only glmnet, but when doing it within the tidymodels framework I keep getting different errors when I want to use the models on held-out data, such as:

Error in hardhat::forge():
! The required column "weight" is missing.

That is, on the held-out data I no longer want to use weights. Please see the example code below, where I first use only glmnet with and without weights, followed by a tidymodels pipeline with and without weights.

```r
library(glmnet)
library(tibble)
library(tidymodels)

# Fix the RNG state so the simulated data are reproducible
set.seed(123)

# Sample sizes for the training and held-out sets
n_train <- 100
n_test <- 10

# Four standard-normal predictors named x1..x4. The training matrix is drawn
# before the test matrix so the random stream is consumed in the same order.
X_train <- matrix(
  rnorm(n_train * 4),
  ncol = 4,
  dimnames = list(NULL, paste0("x", 1:4))
)
X_test <- matrix(
  rnorm(n_test * 4),
  ncol = 4,
  dimnames = list(NULL, paste0("x", 1:4))
)

# True model: only x1 and x2 drive the outcome, plus standard-normal noise
y_train <- 3 * X_train[, "x1"] - 2 * X_train[, "x2"] + rnorm(n_train)

# Observation weights: 10 for the first 50 cases, 1 for the remainder
weights <- rep(c(10, 1), times = c(50, n_train - 50))

# ----------------------------
# BASELINE glmnet (no tidymodels)
# ----------------------------

# Step 1: Use cross-validation to choose lambda for an elastic net
# (alpha = 0.5), once with observation weights and once without
cv_fit_with_weights <- cv.glmnet(X_train, y_train, alpha = 0.5, weights = weights)
cv_fit_no_weights   <- cv.glmnet(X_train, y_train, alpha = 0.5)

# Step 2: Extract best lambda (the value minimising the CV error)
best_lambda_wt  <- cv_fit_with_weights$lambda.min
best_lambda_nowt <- cv_fit_no_weights$lambda.min

# Step 3: Final model fit using best lambda
final_fit_wt <- glmnet(X_train, y_train, alpha = 0.5, lambda = best_lambda_wt, weights = weights)
final_fit_nowt <- glmnet(X_train, y_train, alpha = 0.5, lambda = best_lambda_nowt)

# Step 4: Predict on new test data (no weights are supplied at this stage)
pred_wt <- predict(final_fit_wt, newx = X_test)
pred_nowt <- predict(final_fit_nowt, newx = X_test)

# Step 5: Compare predictions
# predict() returns a one-column matrix here, so the difference is computed
# from the already-flattened columns; subtracting the raw matrices would store
# `diff` as a matrix column (printed as diff[,"s0"]) instead of a plain
# numeric vector.
comparison <- tibble(
  pred_with_weights = as.numeric(pred_wt),
  pred_no_weights = as.numeric(pred_nowt),
  diff = round(abs(pred_with_weights - pred_no_weights), 4)
)

print("===== glmnet (manual) comparison =====")
print(comparison)

# ----------------------------
# TIDYMODELS PIPELINE
# ----------------------------

# Assemble the training tibble: the predictors plus the outcome and the
# per-row weight
train_df <- as_tibble(X_train) %>%
  mutate(y = y_train, weight = weights)

# Held-out predictors only
test_df <- as_tibble(X_test)

# 5-fold cross-validation resamples of the training data
folds <- vfold_cv(train_df, v = 5)

# --------- Model spec (shared) ---------
# Elastic net (mixture = 0.5) fitted by glmnet; the penalty is left to tuning
glm_spec <- set_engine(
  linear_reg(penalty = tune(), mixture = 0.5),
  "glmnet"
)

# ========== Steps WITHOUT weights ==========
# The recipe only ever sees the training data with the weight column removed
rec_nowt <- recipe(y ~ ., data = select(train_df, -weight))

wf_nowt <- workflow() %>%
  add_recipe(rec_nowt) %>%
  add_model(glm_spec)

# Tune the penalty over a log-spaced grid, scoring each candidate by RMSE
tuned_nowt <- wf_nowt %>%
  tune_grid(
    resamples = folds,
    grid = tibble(penalty = 10^seq(-3, 1, length.out = 10)),
    metrics = metric_set(rmse)
  )

best_lambda_nowt <- tuned_nowt %>%
  select_best(metric = "rmse") %>%
  pull(penalty)

# Final fit: plug in the selected penalty and refit on the training data
final_fit_nowt <- wf_nowt %>%
  finalize_workflow(tibble(penalty = best_lambda_nowt)) %>%
  fit(data = select(train_df, -weight))

# Predict on the held-out predictors
preds_nowt <- final_fit_nowt %>%
  predict(new_data = test_df)

# ========== Same steps WITH weights ==========
# NOTE(review): presumably this is the variant behind the reported error.
# `weight` is still a plain numeric column here and is only relabelled inside
# the recipe; the reply further down instead wraps the column with
# importance_weights() and registers it via add_case_weights().
rec <- recipe(y ~ ., data = train_df) %>%
  update_role(weight, new_role = "case_weight")

# Workflow combining the recipe above with the shared glmnet specification
wf <- workflow() %>%
  add_recipe(rec) %>%
  add_model(glm_spec)

# Tune penalty with weights
# Same penalty grid, resamples and metric as the unweighted run
tuned_wt <- tune_grid(
  wf,
  resamples = folds,
  grid = tibble(penalty = 10^seq(-3, 1, length.out = 10)),
  metrics = metric_set(rmse)
)

# Penalty value of the candidate selected on RMSE
best_lambda_wt <- select_best(tuned_wt, metric = "rmse")$penalty

# Final model fit
final_fit_wt <- finalize_workflow(wf, tibble(penalty = best_lambda_wt)) %>%
  fit(data = train_df)

# Predict
# NOTE(review): test_df is built from X_test only and has no `weight` column;
# presumably this is the call where hardhat::forge() reports that the required
# column "weight" is missing.
preds_wt <- predict(final_fit_wt, new_data = test_df)

# Side-by-side table of the two sets of tidymodels predictions; `diff` is the
# absolute gap between them, rounded to four decimals
comparison_tidy <- tibble(
  pred_with_weights = preds_wt[[".pred"]],
  pred_no_weights = preds_nowt[[".pred"]],
  diff = round(abs(pred_with_weights - pred_no_weights), 4)
)

print("===== tidymodels comparison =====")
print(comparison_tidy)


```


For discussions related to modeling, machine learning and deep learning. Related packages include `caret`, `modelr`, `yardstick`, `rsample`, `parsnip`, `tensorflow`, `keras`, `cloudml`, and `tfestimators`.

Max · May 28, 2025, 8:40pm

tidymodels handles case weights more extensively than just adding them to the model since they affect preprocessing, postprocessing, and performance estimates differently (depending on what you want the weights to do).

For parsnip, see ?case_weights (or the web page). Basically, you first have to use a function that specifies what kind of weights you want. I'll use importance weights; those affect how the model is fit but are not used when computing performance metrics (unlike frequency weights). You can change that if you like.

I’ve modified your code below (look for # changed comments)

With a recipe, it knows that it is a case weight and acts accordingly. It won’t treat it as a predictor.

Finally, the workflows package includes an add_case_weights() function to inform it about them.

Besides the man pages, there is a good worked example on tidymodels.org:

library(glmnet)
#> Loading required package: Matrix
#> Loaded glmnet 4.1-8
library(tibble)
library(tidymodels)

# Fix the RNG state so the simulated data are reproducible
set.seed(123)

# Sample sizes for the training and held-out sets
n_train <- 100
n_test <- 10

# Four standard-normal predictors named x1..x4. The training matrix is drawn
# before the test matrix so the random stream is consumed in the same order.
X_train <- matrix(
  rnorm(n_train * 4),
  ncol = 4,
  dimnames = list(NULL, paste0("x", 1:4))
)
X_test <- matrix(
  rnorm(n_test * 4),
  ncol = 4,
  dimnames = list(NULL, paste0("x", 1:4))
)

# True model: only x1 and x2 drive the outcome, plus standard-normal noise
y_train <- 3 * X_train[, "x1"] - 2 * X_train[, "x2"] + rnorm(n_train)

# Observation weights: 10 for the first 50 cases, 1 for the remainder
weights <- rep(c(10, 1), times = c(50, n_train - 50))

# BASELINE glmnet (no tidymodels)


# Step 1: Use cross-validation with weights to choose lambda
# (elastic net with alpha = 0.5, fitted once with and once without weights)
cv_fit_with_weights <- cv.glmnet(X_train, y_train, alpha = 0.5, weights = weights)
cv_fit_no_weights   <- cv.glmnet(X_train, y_train, alpha = 0.5)

# Step 2: Extract best lambda
best_lambda_wt  <- cv_fit_with_weights$lambda.min
best_lambda_nowt <- cv_fit_no_weights$lambda.min

# Step 3: Final model fit using best lambda
final_fit_wt <- glmnet(X_train, y_train, alpha = 0.5, lambda = best_lambda_wt, weights = weights)
final_fit_nowt <- glmnet(X_train, y_train, alpha = 0.5, lambda = best_lambda_nowt)

# Step 4: Predict on new test data
# (only the test predictors are passed; no weights are supplied here)
pred_wt <- predict(final_fit_wt, newx = X_test)
pred_nowt <- predict(final_fit_nowt, newx = X_test)

# Step 5: Compare predictions
# NOTE(review): `diff` is computed from pred_wt and pred_nowt without
# as.numeric(), which appears to be why that column prints as diff[,"s0"]
# (a matrix column) in the output below.
comparison <- tibble(
  pred_with_weights = as.numeric(pred_wt),
  pred_no_weights = as.numeric(pred_nowt),
  diff = round(abs(pred_wt - pred_nowt), 4)
)

print("===== glmnet (manual) comparison =====")
#> [1] "===== glmnet (manual) comparison ====="
print(comparison)
#> # A tibble: 10 × 3
#>    pred_with_weights pred_no_weights diff[,"s0"]
#>                <dbl>           <dbl>       <dbl>
#>  1            -1.17           -1.12       0.0517
#>  2            -3.75           -3.86       0.114 
#>  3            -2.92           -2.99       0.0738
#>  4             0.222           0.208      0.0142
#>  5             3.04            2.93       0.109 
#>  6             0.602           0.254      0.347 
#>  7            -0.933          -0.884      0.0497
#>  8             1.55            1.54       0.0078
#>  9            -2.55           -2.55       0.0061
#> 10             2.09            1.91       0.180

# TIDYMODELS PIPELINE


# Assemble the training tibble: the predictors plus the outcome and the
# per-row weight
train_df <- as_tibble(X_train) %>%
  mutate(y = y_train, weight = weights)

# Held-out predictors only
test_df <- as_tibble(X_test)

# 5-fold cross-validation resamples, seeded so the split is reproducible
set.seed(1) # changed
folds <- vfold_cv(train_df, v = 5)

# Elastic net (mixture = 0.5) fitted by glmnet; the penalty is left to tuning
glm_spec <- set_engine(
  linear_reg(penalty = tune(), mixture = 0.5),
  "glmnet"
)

# ========== Steps WITHOUT weights ==========
# The recipe only ever sees the training data with the weight column removed
rec_nowt <- recipe(y ~ ., data = select(train_df, -weight))

wf_nowt <- workflow() %>%
  add_recipe(rec_nowt) %>%
  add_model(glm_spec)

# Tune the penalty over a log-spaced grid, scoring each candidate by RMSE
tuned_nowt <- wf_nowt %>%
  tune_grid(
    resamples = folds,
    grid = tibble(penalty = 10^seq(-3, 1, length.out = 10)),
    metrics = metric_set(rmse)
  )

best_lambda_nowt <- tuned_nowt %>%
  select_best(metric = "rmse") %>%
  pull(penalty)

# Final fit: plug in the selected penalty and refit on the training data
final_fit_nowt <- wf_nowt %>%
  finalize_workflow(tibble(penalty = best_lambda_nowt)) %>%
  fit(data = select(train_df, -weight))

# Predict on the held-out predictors
preds_nowt <- final_fit_nowt %>%
  predict(new_data = test_df)

# ========== Same steps WITH weights ==========

# changed: re-type the weight column as importance weights so that it is
# treated as a case weight rather than as a predictor
train_wts_df <- train_df %>%
  mutate(weight = importance_weights(weight))

# changed: rebuild the folds on the re-typed data, reseeding with the same
# value that was used for the unweighted folds
set.seed(1)
folds_wts <- vfold_cv(train_wts_df, v = 5)

rec_wts <- recipe(y ~ ., data = train_wts_df) # changed

wf_wts <- workflow() %>%
  add_recipe(rec_wts) %>%
  add_model(glm_spec) %>%
  add_case_weights(weight) # changed

# Tune penalty with weights
tuned_wt <- wf_wts %>%
  tune_grid(
    resamples = folds_wts,
    grid = tibble(penalty = 10^seq(-3, 1, length.out = 10)),
    metrics = metric_set(rmse)
  )

best_lambda_wt <- tuned_wt %>%
  select_best(metric = "rmse") %>%
  pull(penalty)

# Final model fit
final_fit_wt <- wf_wts %>%
  finalize_workflow(tibble(penalty = best_lambda_wt)) %>% # changed
  fit(data = train_wts_df) # changed

# Predict
# Note: for importance weights, you don't need them at prediction-time
# If you do, use frequency_weights()
preds_wt <- predict(final_fit_wt, new_data = test_df)

# Side-by-side table of the two sets of tidymodels predictions; `diff` is the
# absolute gap between them, rounded to four decimals
comparison_tidy <- tibble(
  pred_with_weights = preds_wt[[".pred"]],
  pred_no_weights = preds_nowt[[".pred"]],
  diff = round(abs(pred_with_weights - pred_no_weights), 4)
)

print("===== tidymodels comparison =====")
#> [1] "===== tidymodels comparison ====="
print(comparison_tidy)
#> # A tibble: 10 × 3
#>    pred_with_weights pred_no_weights   diff
#>                <dbl>           <dbl>  <dbl>
#>  1            -1.24           -1.14  0.0929
#>  2            -3.83           -3.89  0.0567
#>  3            -3.00           -3.02  0.0238
#>  4             0.214           0.205 0.0088
#>  5             3.09            2.95  0.137 
#>  6             0.660           0.274 0.386 
#>  7            -1.02           -0.916 0.103 
#>  8             1.53            1.54  0.0081
#>  9            -2.59           -2.56  0.0293
#> 10             2.11            1.92  0.190

Created on 2025-05-28 with reprex v2.1.1

Oscar1 · June 3, 2025, 12:00pm

Thanks a lot for the clarification – it helped a lot!

system · June 10, 2025, 12:01pm

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.