I have a `data.frame`

with 41 columns. The first column is my dependent variable (called *ntl*) and the rest of the columns are me independent variables. The independent variables are named *pop*, *tirs*, *agbh*, *lc* and they have an associated number to their names which starts from 020 to 200 with step 20 (for example, *pop020*, *lc080*, *agbh140* etc).

I am running multiple random forest (RF) regression models and each time I am using the dependent variable and a set of 4 independent variables. The set of the independent variables should have the same 3 digit number, for instance the model will be *ntl ~ pop080 + tirs080 + agbh080 + lc080* or *ntl ~ pop200 + tirs200 + agbh200 + lc200* etc, After finishing the RF I am saving the prediction and the residuals of the RF as *rf_pred020.tif* and *rf_resids020.tif*, depending on the 3digit I used in the analysis.

So far, I am chaning manually the 3 digit number where necessary. I want to create a `for`

loop that changes automatically the 3 digit numbers where necessary after every RF model.

Here is the code:

```
library(caret)
library(terra)
library(randomForest)
library(doParallel)
wd = "path/"
s <- vect(paste0(wd, "lc.shp"))
block.data = read.csv(paste0(wd, "block.data.csv"))
block.data = subset(block.data, select = c(ntl, tirs020, pop020, agbh020, lc020))
set.seed(1123)
samp <- sample(nrow(block.data), 0.80 * nrow(block.data))
train <- block.data[samp, ]
test <- block.data[-samp, ]
no_cores <- detectCores() - 1
cl = makePSOCKcluster(no_cores)
registerDoParallel(cl)
# define the control
trControl = trainControl(method = "repeatedcv",
number = 10,
search = "grid",
repeats = 5,
savePredictions = FALSE)
rf_default = train(ntl ~ pop020 + tirs020 + agbh020 + lc020,
data = train,
method = "rf",
metric = "Rsquared",
# tuneLenght = 50,
trControl = trControl)
print(rf_default)
# plot(rf_default)
# Search best mtry
set.seed(1234444)
tuneGrid <- expand.grid(.mtry = c(1:4))
rf_mtry <- train(ntl ~ pop + tirs + agbh,
data = train,
method = "rf",
metric = "Rsquared",
tuneGrid = tuneGrid,
trControl = trControl,
importance = TRUE,
nodesize = 20,
ntree = 1000)
# print(rf_mtry)
best_mtry <- rf_mtry$bestTune$mtry
# best_mtry
# search best maxnodes
# store_maxnode = list()
tuneGrid = expand.grid(.mtry = best_mtry)
best.rsq <- -1
best.maxnodes <- 0
for (maxnodes in c(5:50)){
set.seed(3455556)
rf_maxnode = train(ntl ~ pop020 + tirs020 + agbh020 + lc020,
data = train,
method = "rf",
metric = "Rsquared",
tuneGrid = tuneGrid,
trControl = trControl,
importance = TRUE,
nodesize = 20,
maxnodes = maxnodes,
# tuneLenght = 50,
ntree = 1000)
rsq <- rf_maxnode$finalModel$rsq
if (rsq > best.rsq) {
best.rsq <- rsq
best.maxnodes <- maxnodes
}
}
# print(paste0("Best R-squared: ", best.rsq, " with maxnodes: ", best.maxnodes))
# best.maxnodes
# search best ntree
# store_maxtrees = list()
best.ntree <- -1
best.rsq <- -1
for (ntree in seq(from = 500, to = 8000, by = 500)) {
set.seed(67777789)
rf_maxtrees = train(ntl ~ pop020 + tirs020 + agbh020 + lc020,
data = train,
method = "rf",
metric = "Rsquared",
tuneGrid = tuneGrid,
trControl = trControl,
importance = TRUE,
nodesize = 20,
maxnodes = best.maxnodes,
# tuneLenght = 50,
ntree = ntree)
rsq <- rf_maxtrees$finalModel$rsq
if (rsq > best.rsq) {
best.rsq <- rsq
best.ntree <- ntree
}
}
# print(paste0("Best R-squared: ", best.rsq, " with best tree number: ", best.ntree))
# best.ntree
# train the model with the optimum rf
fit_rf = train(ntl ~ pop020 + tirs020 + agbh020 + lc020,
train,
method = "rf",
metric = "Rsquared",
tuneGrid = tuneGrid,
trControl = trControl,
importance = TRUE,
# tuneLenght = 50,
nodesize = 20,
ntree = best.ntree,
maxnodes = best.maxnodes)
fit_rf
# check Rsquared in the test data
fit_rf2 = train(ntl ~ pop020 + tirs020 + agbh020 + lc020,
test,
method = "rf",
metric = "Rsquared",
tuneGrid = tuneGrid,
trControl = trControl,
importance = TRUE,
# tuneLenght = 50,
nodesize = 20,
ntree = best.ntree,
maxnodes = best.maxnodes)
fit_rf2
stopCluster(cl)
pop = rast(paste0(wd, "pop.tif"))
pop_res = rast(paste0(wd, "pop020.tif"))
tirs = rast(paste0(wd, "tirs.tif"))
tirs_res = rast(paste0(wd, "tirs020.tif"))
agbh = rast(paste0(wd, "agbh.tif"))
agbh_res = rast(paste0(wd, "agbh020.tif"))
lc = rast(paste0(wd, "lc.tif"))
lc_res = rast(paste0(wd, "lc020.tif"))
sm = c(pop, tirs, agbh, lc)
names(sm) = c("pop", "tirs", "agbh", "lc")
res(sm)
ntl = rast(paste0(wd, "ntl.tif"))
ntl = resample(ntl, pop_res)
sb = c(ntl, pop_res, tirs_res, agbh_res, lc_res)
names(sb) = c("ntl", "pop", "tirs", "agbh", "lc")
# apply best model (fit_rf) using the whole data.set
m = randomForest(ntl ~ pop020 + tirs020 + agbh020 + lc020,
data = sb,
mtry = best_mtry,
importance = TRUE,
na.action = na.omit,
nodesize = 20,
maxnodes = best.maxnodes,
ntree = best.ntree,
corr.bias = TRUE,
replace = TRUE)
m
# predict rf prediction
ps = predict(sm, m, na.rm = TRUE)
ps = crop(ps, ext(s))
ps = mask(ps, s)
writeRaster(ps, paste0(wd, "rf_pred020.tif"), overwrite = TRUE)
# export rf residuals
pb = predict(sb, m, na.rm = TRUE)
rsds = sb$ntl - pb
rsds = crop(rsds, ext(s))
rsds = mask(rsds, s)
writeRaster(rsds, paste0(wd, "rf_resids020.tif"), overwrite = TRUE)
```

I have tried to create a `list`

of the column names with a 3-digit number using the `from`

, `to`

and `seq`

commands and then use a `for`

loop to loop over the column names and modify the column names in the model formula, read the corresponding raster layer, and perform model fitting and prediction but without success. Can someone provide some guidance? I can share the `data.frame`

via GDrive.