I am using tidymodels to stack five models: random forest, Cubist, XGBoost, support vector machine, and neural network, for a final prediction that will result in a raster map. My data cannot be shared, but below is an example where I am predicting the "target" column. My plan is to produce 100 maps from the stacked models and compute a 90% confidence interval map. The code for one map is below.
My way forward is to bootstrap the data 100 times so that for each resample: (1) the models are fine-tuned, (2) then stacked, (3) the final map is predicted, (4) the maps are saved to disk (e.g. map001.tif, map002.tif, … map100.tif), and finally (5) each fitted ensemble that produced a map is saved locally (e.g. fit001.rds, fit002.rds, … fit100.rds). It should be possible to run a loop over the bootstrap resamples for that purpose, and that is where I am having issues (a rough skeleton of what I have in mind is sketched after the code). I would appreciate any help.
# Helper packages
library(dplyr)
library(CAST)
library(sf)
library(raster)
library(terra)
library(rules)
# For modelling
library(tidymodels)
library(tidyverse)
library(finetune)
library(stacks)
library(tidypredict)
# Run in parallel
library(doParallel)
# Let's create rasters for the surface grid
r1 <- raster(ncol = 36, nrow = 18)
r1[] <- 1:ncell(r1)
r2 <- sqrt(r1)
r3 <- r1 / 1000
r4 <- r1 * r1
r5 <- r1^2
rast_grid <- stack(r1, r2, r3, r4, r5)
names(rast_grid) <- c("feature1", "feature2", "feature3", "feature4", "feature5")
# Create data frame with coordinates and target
df <- as.data.frame(r1, xy = TRUE) %>%
  mutate(target = 2 * layer / mean(layer)) %>%
  select(x, y, target)
# Extract the predictor values at the point locations
ext_df <- raster::extract(rast_grid, SpatialPoints(df[, c("x", "y")]), sp = FALSE)
red_data <- as.data.frame(ext_df) %>% mutate(target = df$target)
# Set number of folds and repetitions
set.seed(123)
es_folds <-
  vfold_cv(red_data, strata = target, v = 10, repeats = 3)
# Create two recipes:
# centered and scaled predictors (for the SVM and neural network)
normalized_rec <-
  recipe(target ~ ., data = red_data) %>%
  step_normalize(all_predictors())
# polynomial terms and interactions (defined here but not used in the workflow sets below)
poly_recipe <-
  normalized_rec %>%
  step_poly(all_predictors()) %>%
  step_interact(~ all_predictors():all_predictors())
# Create a set of model specifications
nnet_spec <-
  mlp(hidden_units = tune(), penalty = tune(), epochs = tune()) %>%
  set_engine("nnet", MaxNWts = 2600) %>%
  set_mode("regression")
svm_r_spec <-
  svm_rbf(cost = tune(), rbf_sigma = tune()) %>%
  set_engine("kernlab") %>%
  set_mode("regression")
rf_spec <-
  rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
  set_engine("ranger", importance = "permutation") %>%
  set_mode("regression")
xgb_spec <-
  boost_tree(tree_depth = tune(), learn_rate = tune(), loss_reduction = tune(),
             min_n = tune(), sample_size = tune(), trees = tune()) %>%
  set_engine("xgboost") %>%
  set_mode("regression")
cubist_spec <-
  cubist_rules(committees = tune(), neighbors = tune()) %>%
  set_engine("Cubist")
nnet_param <-
  nnet_spec %>%
  extract_parameter_set_dials() %>%
  update(hidden_units = hidden_units(c(1, 27)))
#------------------------------------------------------------------------------#
#------------------- creating workflow set ------------------------------------#
#------------------------------------------------------------------------------#
normalized <-
  workflow_set(
    preproc = list(normalized = normalized_rec),
    models = list(SVM_radial = svm_r_spec,
                  neural_network = nnet_spec)
  )
# Add the neural network parameter object:
normalized <-
  normalized %>%
  option_add(param_info = nnet_param,
             id = "normalized_neural_network")
# Create another workflow set for the models that need no preprocessing
model_vars <-
  workflow_variables(outcomes = target,
                     predictors = everything())
no_pre_proc <-
  workflow_set(
    preproc = list(simple = model_vars),
    models = list(RF = rf_spec, boosting = xgb_spec,
                  Cubist = cubist_spec)
  )
# Finally, bind the two workflow sets together
all_workflows <-
  bind_rows(no_pre_proc, normalized) %>%
  # Make the workflow IDs a little simpler:
  mutate(wflow_id = gsub("(simple_)|(normalized_)", "", wflow_id))
all_workflows
#------------------------------------------------------------------------------#
#------------------- fine tuning models ---------------------------------------#
#------------------------------------------------------------------------------#
# Register a parallel backend
all_cores <- parallel::detectCores(logical = FALSE)
ncore <- all_cores - 2
cl <- makePSOCKcluster(ncore)
registerDoParallel(cl)
# Racing control settings
race_ctrl <-
  control_race(
    allow_par = TRUE,
    save_pred = TRUE,
    parallel_over = "everything",
    save_workflow = TRUE
  )
race_results <-
  all_workflows %>%
  workflow_map(
    fn = "tune_race_anova",
    seed = 123,
    resamples = es_folds,
    grid = 100,
    control = race_ctrl
  )
#------------------------------------------------------------------------------#
#------------------- Stack the models -----------------------------------------#
#------------------------------------------------------------------------------#
# Stack the candidate models
es_stack <-
  stacks() %>%
  add_candidates(race_results)
# Blend the predictions
set.seed(123)
ens <- blend_predictions(es_stack, penalty = 10^seq(-2, -0.5, length = 20))
# Fit the ensemble members
fit_ens <- fit_members(ens)
#------------------------------------------------------------------------------#
#------------------- Making the maps ------------------------------------------#
#------------------------------------------------------------------------------#
# Convert the raster stack to a SpatRaster and predict the map with the fitted ensemble
t_rast_grid <- rast(rast_grid)
r_target_map <- predict(t_rast_grid, fit_ens)
plot(r_target_map)
stopCluster(cl)
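To show what I am trying to do, here is a rough skeleton of the loop I have in mind. It assumes the tuning/stacking/prediction code above is wrapped into a helper function, which I call fit_and_map() here purely as a placeholder (it does not exist yet); the loop uses rsample::bootstraps() for the 100 resamples and saves the numbered outputs. This is the part I cannot get to work.
# Skeleton of the intended bootstrap loop (not working code)
library(rsample)
set.seed(123)
boots <- bootstraps(red_data, times = 100)
for (i in seq_len(nrow(boots))) {
  boot_data <- analysis(boots$splits[[i]])
  # (1)-(2) re-run the tuning and stacking steps above on boot_data;
  # fit_and_map() is a hypothetical wrapper around that code
  fit_ens_i <- fit_and_map(boot_data)
  # (3) predict the map for this bootstrap fit
  map_i <- predict(t_rast_grid, fit_ens_i)
  # (4) save the map with a zero-padded file name
  terra::writeRaster(map_i, sprintf("map%03d.tif", i), overwrite = TRUE)
  # (5) save the fitted ensemble that produced this map
  saveRDS(fit_ens_i, sprintf("fit%03d.rds", i))
}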
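Once the 100 maps are saved, my idea for the 90% confidence interval map is to read them back in and take per-pixel percentiles, roughly like this (file names following the pattern above):
# Read the saved maps back in as one multi-layer SpatRaster
map_files <- sprintf("map%03d.tif", 1:100)
all_maps <- terra::rast(map_files)
# Per-pixel 5th and 95th percentiles give a 90% bootstrap percentile interval
ci_map <- terra::app(all_maps, fun = function(v) quantile(v, c(0.05, 0.95), na.rm = TRUE))
names(ci_map) <- c("lower_90", "upper_90")
plot(ci_map)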