I would say that tidymodels is ready for general use, although certain bits of it are still maturing. Here is an example of how you would go about choose models based on resampling, get predictions intervals, and so forth, using some similar models to what you have in your project:
library(tidymodels)
#> ββ Attaching packages βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ tidymodels 0.1.0 ββ
#> β broom 0.5.5 β recipes 0.1.9
#> β dials 0.0.4 β rsample 0.0.5
#> β dplyr 0.8.5 β tibble 2.99.99.9014
#> β ggplot2 3.3.0 β tune 0.0.1.9000
#> β infer 0.5.1 β workflows 0.1.0.9000
#> β parsnip 0.0.5.9000 β yardstick 0.0.5.9000
#> β purrr 0.3.3
#> ββ Conflicts ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ tidymodels_conflicts() ββ
#> x purrr::discard() masks scales::discard()
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag() masks stats::lag()
#> x ggplot2::margin() masks dials::margin()
#> x recipes::step() masks stats::step()
#> x recipes::yj_trans() masks scales::yj_trans()
library(modeldata)
data(biomass)
biomass_tr <- biomass[biomass$dataset == "Training",]
biomass_te <- biomass[biomass$dataset == "Testing",]
## create cross validation folds
bio_folds <- vfold_cv(biomass_tr)
## make a base recipe, to build on later with the polynomials
base_rec <- recipe(HHV ~ carbon + hydrogen + oxygen + nitrogen + sulfur,
data = biomass_tr)
lm_spec <- linear_reg() %>%
set_engine("lm")
## you don't have to use workflows, but it might be nice in this situation
wf <- workflow() %>%
add_model(lm_spec)
The function fit_resamples()
is what you want to use to fit models to resampled data, for computing performance metrics. After you have them you can collect_metrics()
.
poly_2 <- fit_resamples(
wf %>%
add_recipe(base_rec %>%
step_poly(carbon, hydrogen, degree = 2)),
resamples = bio_folds,
control = control_resamples(save_pred = TRUE)
)
poly_3 <- fit_resamples(
wf %>%
add_recipe(base_rec %>%
step_poly(carbon, hydrogen, degree = 3)),
resamples = bio_folds,
control = control_resamples(save_pred = TRUE)
)
poly_2 %>%
collect_metrics()
#> # A tibble: 2 x 5
#> .metric .estimator mean n std_err
#> <chr> <chr> <dbl> <int> <dbl>
#> 1 rmse standard 1.50 10 0.247
#> 2 rsq standard 0.813 10 0.0737
poly_3 %>%
collect_metrics()
#> # A tibble: 2 x 5
#> .metric .estimator mean n std_err
#> <chr> <chr> <dbl> <int> <dbl>
#> 1 rmse standard 1.40 10 0.215
#> 2 rsq standard 0.837 10 0.0716
Looks like the third degree polynomial is better. How do we get mean predictions and confidence intervals?
third_degree_fit <- wf %>%
add_recipe(base_rec %>%
step_poly(carbon, hydrogen, degree = 3)) %>%
fit(biomass_tr)
mean_pred <- predict(third_degree_fit, new_data = biomass_te)
conf_int_pred <- predict(third_degree_fit, new_data = biomass_te, type = "conf_int")
biomass_te %>%
bind_cols(mean_pred) %>%
bind_cols(conf_int_pred) %>%
as_tibble()
#> # A tibble: 80 x 11
#> sample dataset carbon hydrogen oxygen nitrogen sulfur HHV .pred .pred_lower
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Almon⦠Testing 46.4 5.67 47.2 0.3 0.22 18.3 18.5 18.3
#> 2 Almon⦠Testing 43.2 5.5 48.1 2.85 0.34 17.6 17.2 16.9
#> 3 Anima⦠Testing 42.7 5.5 49.1 2.4 0.3 17.2 17.0 16.7
#> 4 Aspar⦠Testing 46.4 6.1 37.3 1.8 0.5 18.9 18.5 18.3
#> 5 Bambo⦠Testing 48.8 6.32 42.8 0.2 0 20.5 19.6 19.4
#> 6 Barle⦠Testing 44.3 5.5 41.7 0.7 0.2 18.5 17.4 17.3
#> 7 Beet β¦ Testing 38.9 5.23 54.1 1.19 0.51 15.1 15.8 15.4
#> 8 Bio-D⦠Testing 42.1 4.66 33.8 0.95 0.2 16.2 16.1 15.8
#> 9 Black⦠Testing 29.2 4.4 31.1 0.14 4.9 11.1 14.3 12.9
#> 10 Brown⦠Testing 27.8 3.77 23.7 4.63 1.05 10.8 12.6 12.1
#> # β¦ with 70 more rows, and 1 more variable: .pred_upper <dbl>
Created on 2020-03-17 by the reprex package (v0.3.0)
Hope that helps! 