I am using the College data set from the ISLR package to implement lasso regression. However, when I compare the lasso's error on the test data with that of a simple linear model, the lasso's error is greater, even though I expected a lasso with a cross-validated penalty to perform at least as well as plain linear regression out of sample. Could you help me see what it is that I am doing wrong? Here is a reprex:
library(ggplot2)
library(dplyr)
library(tidyverse)
library(tidymodels)
library(glmnet)
#> Loaded glmnet 3.0-2
library(reprex)
library(ISLR)
data("College")
College <- as_tibble(College) %>%
  mutate(cost = Outstate + Room.Board) %>%
  select(-c(Outstate, Room.Board))
College <- na.omit(College)
College <- College %>%
  mutate(Private = if_else(Private == "yes", 1L, 0L))
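# Reproducible 50/50 split into training and test sets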
set.seed(123)
cost_split <- College %>% initial_split(prop = 0.5)
cost_train <- training(cost_split)
cost_test <- testing(cost_split)
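# glmnet needs a numeric matrix of predictors and a response vector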
x <- as.matrix(cost_train[, 1:14])  # first 14 columns as predictors
y <- as.matrix(cost_train[, 17])    # column 17: cost, the response
lambdas_to_try <- seq(10^-3, 10^5, length.out = 10000)  # grid of candidate penalties
# Setting alpha = 1 implements lasso regression
lasso_cv <- cv.glmnet(x, y, alpha = 1, lambda = lambdas_to_try,
                      standardize = TRUE, nfolds = 10)
# Best cross-validated lambda
lambda_cv <- lasso_cv$lambda.min
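# Rebuild x and y, this time from the test set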
x <- as.matrix(cost_test[, 1:14])  # test-set predictors
y <- as.matrix(cost_test[, 17])    # test-set response (cost)
# Fit final model, get its sum of squared residuals and multiple R-squared
model_cv <- glmnet(x, y, alpha = 1, lambda = lambda_cv, standardize = TRUE)
y_hat_cv <- predict(model_cv, x)
ssr_cv <- t(y - y_hat_cv) %*% (y - y_hat_cv)
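# R-squared taken as the squared correlation between observed and predicted values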
rsq_lasso_cv <- cor(y, y_hat_cv)^2
error_lasso <- mean((y - y_hat_cv)^2)  # MSE on the test observations
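# Benchmark: OLS fit on the training data with all remaining predictors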
M1 <- lm(cost ~ Apps + Accept + Enroll + Top10perc + Top25perc + F.Undergrad +
           P.Undergrad + Books + Personal + PhD + Terminal + S.F.Ratio +
           perc.alumni + Expend + Grad.Rate + Private,
         data = cost_train)
cost_test %>%
  mutate(pred_linear = predict(M1, newdata = cost_test)) %>%
  mutate(resid_m1 = cost - pred_linear) %>%
  summarize(MSFE1 = mean(resid_m1^2))
#> Warning in predict.lm(M1, newdata = cost_test): prediction from a rank-deficient
#> fit may be misleading
#> # A tibble: 1 x 1
#> MSFE1
#> <dbl>
#> 1 7087178.
error_lasso
#> [1] 8354684
Created on 2020-04-25 by the reprex package (v0.3.0)
Thanks in advance!