Hi,
I'm trying to implement weights in regression using tidy models. I can get it to work using only glmnet. But when doing it within the tidymodels framework i keep getting different errors when wantint to use the models on heldout data, such as.
Error in hardhat::forge()
:
! The required column "weight" is missing.
That is, in heldout i would not like to use weights anymore. Please see the example code below where i only use glmnet with and without weights, followed by a tidymodel pipeline with and without weights.
library(glmnet)
library(tibble)
library(tidymodels)
set.seed(123)
# Simulate training and test data
n_train <- 100
n_test <- 10
X_train <- matrix(rnorm(n_train * 4), ncol = 4)
X_test <- matrix(rnorm(n_test * 4), ncol = 4)
colnames(X_train) <- colnames(X_test) <- paste0("x", 1:4)
# True model
y_train <- X_train[,1] * 3 - X_train[,2] * 2 + rnorm(n_train)
# Make weights extreme for first 50 cases
weights <- rep(1, n_train)
weights[1:50] <- 10
# ----------------------------
# BASELINE glmnet (no tidymodels)
# ----------------------------
# Step 1: Use cross-validation with weights to choose lambda
cv_fit_with_weights <- cv.glmnet(X_train, y_train, alpha = 0.5, weights = weights)
cv_fit_no_weights <- cv.glmnet(X_train, y_train, alpha = 0.5)
# Step 2: Extract best lambda
best_lambda_wt <- cv_fit_with_weights$lambda.min
best_lambda_nowt <- cv_fit_no_weights$lambda.min
# Step 3: Final model fit using best lambda
final_fit_wt <- glmnet(X_train, y_train, alpha = 0.5, lambda = best_lambda_wt, weights = weights)
final_fit_nowt <- glmnet(X_train, y_train, alpha = 0.5, lambda = best_lambda_nowt)
# Step 4: Predict on new test data
pred_wt <- predict(final_fit_wt, newx = X_test)
pred_nowt <- predict(final_fit_nowt, newx = X_test)
# Step 5: Compare predictions
comparison <- tibble(
pred_with_weights = as.numeric(pred_wt),
pred_no_weights = as.numeric(pred_nowt),
diff = round(abs(pred_wt - pred_nowt), 4)
)
print("===== glmnet (manual) comparison =====")
print(comparison)
# ----------------------------
# TIDYMODELS PIPELINE
# ----------------------------
# Prepare training data as a tibble
train_df <- as_tibble(X_train)
train_df$y <- y_train
train_df$weight <- weights
test_df <- as_tibble(X_test)
# Cross-validation folds
folds <- vfold_cv(train_df, v = 5)
# --------- Model spec (shared) ---------
glm_spec <- linear_reg(penalty = tune(), mixture = 0.5) %>%
set_engine("glmnet")
# ==========Steps WITHOUT weights ==========
rec_nowt <- recipe(y ~ ., data = train_df %>% select(-weight))
wf_nowt <- workflow() %>%
add_recipe(rec_nowt) %>%
add_model(glm_spec)
# Tune
tuned_nowt <- tune_grid(
wf_nowt,
resamples = folds,
grid = tibble(penalty = 10^seq(-3, 1, length.out = 10)),
metrics = metric_set(rmse)
)
best_lambda_nowt <- select_best(tuned_nowt, metric = "rmse")$penalty
# Final fit
final_fit_nowt <- finalize_workflow(wf_nowt, tibble(penalty = best_lambda_nowt)) %>%
fit(data = train_df %>% select(-weight))
# Predict
preds_nowt <- predict(final_fit_nowt, new_data = test_df)
# ========== Same steps WITH weights ==========
rec <- recipe(y ~ ., data = train_df) %>%
update_role(weight, new_role = "case_weight")
wf <- workflow() %>%
add_recipe(rec) %>%
add_model(glm_spec)
# Tune penalty with weights
tuned_wt <- tune_grid(
wf,
resamples = folds,
grid = tibble(penalty = 10^seq(-3, 1, length.out = 10)),
metrics = metric_set(rmse)
)
best_lambda_wt <- select_best(tuned_wt, metric = "rmse")$penalty
# Final model fit
final_fit_wt <- finalize_workflow(wf, tibble(penalty = best_lambda_wt)) %>%
fit(data = train_df)
# Predict
preds_wt <- predict(final_fit_wt, new_data = test_df)
# Compare predictions
comparison_tidy <- tibble(
pred_with_weights = preds_wt$.pred,
pred_no_weights = preds_nowt$.pred,
diff = round(abs(preds_wt$.pred - preds_nowt$.pred), 4)
)
print("===== tidymodels comparison =====")
print(comparison_tidy)
```
For discussions related to modeling, machine learning and deep learning. Related packages include `caret`, `modelr`, `yardstick`, `rsample`, `parsnip`, `tensorflow`, `keras`, `cloudml`, and `tfestimators`.