I am using tidymodels to stack five models: random forest, Cubist, XGBoost, support vector machine, and neural network, for a final prediction that will result in a raster map. My data cannot be shared, but below is an example where I am predicting the "target" column. My plan is to produce 100 maps from the stacked models and compute a 90% confidence interval map. The code for one map is below.
My way forward is to bootstrap the data 100 times so that for each resample: (1) the models are fine-tuned, (2) then stacked, (3) the final map is predicted, (4) the maps are saved to disk (e.g. map001.tif, map002.tif, … map100.tif), and finally (5) each fitted ensemble that produced a map is saved locally (e.g. fit001.rds, fit002.rds, … fit100.rds). It should be possible to run a loop over the bootstrap resamples for that purpose, and that is where I am having issues (a rough skeleton of what I have in mind is sketched after the code). I would appreciate any help.
# Helper packages
library(dplyr)
library(CAST)
library(sf)
library(raster)
library(terra)
library(rules)
# For modelling
library(tidymodels)
library(tidyverse)
library(finetune)
library(stacks)
library(tidypredict)
# Run in parallel
library(doParallel)
# Let's create rasters for the surface grid
r1 <- raster(ncol = 36, nrow = 18)
r1[] <- 1:ncell(r1)
r2 <- sqrt(r1)
r3 <- r1 / 1000
r4 <- r1 * r1
r5 <- r1^2
rast_grid <- stack(r1, r2, r3, r4, r5)
names(rast_grid) <- c("feature1", "feature2", "feature3", "feature4", "feature5")
# Create data frame with coordinates and target
df <- as.data.frame(r1, xy = TRUE) %>%
  mutate(target = 2 * layer / mean(layer)) %>%
  select(x, y, target)
# Extract the predictor values at the point locations
ext_df <- raster::extract(rast_grid, SpatialPoints(df[, c("x", "y")]), sp = FALSE)
red_data <- as.data.frame(ext_df) %>% mutate(target = df$target)
# Set number of folds and repetitions
set.seed(123)
es_folds <-
  vfold_cv(red_data, strata = target, v = 10, repeats = 3)
# Create two recipes:
# centered and scaled predictors (for the SVM and neural network)
normalized_rec <-
  recipe(target ~ ., data = red_data) %>%
  step_normalize(all_predictors())
# polynomial terms and interactions (defined here but not used in the workflow sets below)
poly_recipe <-
  normalized_rec %>%
  step_poly(all_predictors()) %>%
  step_interact(~ all_predictors():all_predictors())
# Create a set of model specifications
nnet_spec <-
  mlp(hidden_units = tune(), penalty = tune(), epochs = tune()) %>%
  set_engine("nnet", MaxNWts = 2600) %>%
  set_mode("regression")
svm_r_spec <-
  svm_rbf(cost = tune(), rbf_sigma = tune()) %>%
  set_engine("kernlab") %>%
  set_mode("regression")
rf_spec <-
  rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
  set_engine("ranger", importance = "permutation") %>%
  set_mode("regression")
xgb_spec <-
  boost_tree(tree_depth = tune(), learn_rate = tune(), loss_reduction = tune(),
             min_n = tune(), sample_size = tune(), trees = tune()) %>%
  set_engine("xgboost") %>%
  set_mode("regression")
cubist_spec <-
  cubist_rules(committees = tune(), neighbors = tune()) %>%
  set_engine("Cubist")
nnet_param <-
  nnet_spec %>%
  extract_parameter_set_dials() %>%
  update(hidden_units = hidden_units(c(1, 27)))
#------------------------------------------------------------------------------#
#------------------- creating workflow set ------------------------------------#
#------------------------------------------------------------------------------#
normalized <-
  workflow_set(
    preproc = list(normalized = normalized_rec),
    models = list(SVM_radial = svm_r_spec,
                  neural_network = nnet_spec)
  )
# Add the neural network parameter object:
normalized <-
  normalized %>%
  option_add(param_info = nnet_param,
             id = "normalized_neural_network")
# Create another workflow set for the models that need no preprocessing
model_vars <-
  workflow_variables(outcomes = target,
                     predictors = everything())
no_pre_proc <-
  workflow_set(
    preproc = list(simple = model_vars),
    models = list(RF = rf_spec, boosting = xgb_spec,
                  Cubist = cubist_spec)
  )
# Finally, bind the two workflow sets together
all_workflows <-
  bind_rows(no_pre_proc, normalized) %>%
  # Make the workflow IDs a little simpler:
  mutate(wflow_id = gsub("(simple_)|(normalized_)", "", wflow_id))
all_workflows
#------------------------------------------------------------------------------#
#------------------- fine tuning models ---------------------------------------#
#------------------------------------------------------------------------------#
# Register a parallel backend
all_cores <- parallel::detectCores(logical = FALSE)
ncore <- all_cores - 2
cl <- makePSOCKcluster(ncore)
registerDoParallel(cl)
# Racing control settings
race_ctrl <-
  control_race(
    allow_par = TRUE,
    save_pred = TRUE,
    parallel_over = "everything",
    save_workflow = TRUE
  )
race_results <-
  all_workflows %>%
  workflow_map(
    fn = "tune_race_anova",
    seed = 123,
    resamples = es_folds,
    grid = 100,
    control = race_ctrl
  )
#------------------------------------------------------------------------------#
#------------------- Stack the models -----------------------------------------#
#------------------------------------------------------------------------------#
# Stack the candidate models
es_stack <-
  stacks() %>%
  add_candidates(race_results)
# Blend the predictions
set.seed(123)
ens <- blend_predictions(es_stack, penalty = 10^seq(-2, -0.5, length = 20))
# Fit the ensemble members
fit_ens <- fit_members(ens)
#------------------------------------------------------------------------------#
#------------------- Making the maps ------------------------------------------#
#------------------------------------------------------------------------------#
# Convert the raster stack to a SpatRaster and predict the map with the fitted ensemble
t_rast_grid <- rast(rast_grid)
r_target_map <- predict(t_rast_grid, fit_ens)
plot(r_target_map)
stopCluster(cl)
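To show what I am trying to do, here is a rough skeleton of the loop I have in mind. It assumes the tuning/stacking/prediction code above is wrapped into a helper function, which I call fit_and_map() here purely as a placeholder (it does not exist yet); the loop uses rsample::bootstraps() for the 100 resamples and saves the numbered outputs. This is the part I cannot get to work.
# Skeleton of the intended bootstrap loop (not working code)
library(rsample)
set.seed(123)
boots <- bootstraps(red_data, times = 100)
for (i in seq_len(nrow(boots))) {
  boot_data <- analysis(boots$splits[[i]])
  # (1)-(2) re-run the tuning and stacking steps above on boot_data;
  # fit_and_map() is a hypothetical wrapper around that code
  fit_ens_i <- fit_and_map(boot_data)
  # (3) predict the map for this bootstrap fit
  map_i <- predict(t_rast_grid, fit_ens_i)
  # (4) save the map with a zero-padded file name
  terra::writeRaster(map_i, sprintf("map%03d.tif", i), overwrite = TRUE)
  # (5) save the fitted ensemble that produced this map
  saveRDS(fit_ens_i, sprintf("fit%03d.rds", i))
}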
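Once the 100 maps are saved, my idea for the 90% confidence interval map is to read them back in and take per-pixel percentiles, roughly like this (file names following the pattern above):
# Read the saved maps back in as one multi-layer SpatRaster
map_files <- sprintf("map%03d.tif", 1:100)
all_maps <- terra::rast(map_files)
# Per-pixel 5th and 95th percentiles give a 90% bootstrap percentile interval
ci_map <- terra::app(all_maps, fun = function(v) quantile(v, c(0.05, 0.95), na.rm = TRUE))
names(ci_map) <- c("lower_90", "upper_90")
plot(ci_map)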