I have a data.frame
with 41 columns. The first column is my dependent variable (called ntl) and the rest of the columns are me independent variables. The independent variables are named pop, tirs, agbh, lc and they have an associated number to their names which starts from 020 to 200 with step 20 (for example, pop020, lc080, agbh140 etc).
I am running multiple random forest (RF) regression models and each time I am using the dependent variable and a set of 4 independent variables. The set of the independent variables should have the same 3 digit number, for instance the model will be ntl ~ pop080 + tirs080 + agbh080 + lc080 or ntl ~ pop200 + tirs200 + agbh200 + lc200 etc, After finishing the RF I am saving the prediction and the residuals of the RF as rf_pred020.tif and rf_resids020.tif, depending on the 3digit I used in the analysis.
So far, I am chaning manually the 3 digit number where necessary. I want to create a for
loop that changes automatically the 3 digit numbers where necessary after every RF model.
Here is the code:
library(caret)
library(terra)
library(randomForest)
library(doParallel)
wd = "path/"
s <- vect(paste0(wd, "lc.shp"))
block.data = read.csv(paste0(wd, "block.data.csv"))
block.data = subset(block.data, select = c(ntl, tirs020, pop020, agbh020, lc020))
set.seed(1123)
samp <- sample(nrow(block.data), 0.80 * nrow(block.data))
train <- block.data[samp, ]
test <- block.data[-samp, ]
no_cores <- detectCores() - 1
cl = makePSOCKcluster(no_cores)
registerDoParallel(cl)
# define the control
trControl = trainControl(method = "repeatedcv",
number = 10,
search = "grid",
repeats = 5,
savePredictions = FALSE)
rf_default = train(ntl ~ pop020 + tirs020 + agbh020 + lc020,
data = train,
method = "rf",
metric = "Rsquared",
# tuneLenght = 50,
trControl = trControl)
print(rf_default)
# plot(rf_default)
# Search best mtry
set.seed(1234444)
tuneGrid <- expand.grid(.mtry = c(1:4))
rf_mtry <- train(ntl ~ pop + tirs + agbh,
data = train,
method = "rf",
metric = "Rsquared",
tuneGrid = tuneGrid,
trControl = trControl,
importance = TRUE,
nodesize = 20,
ntree = 1000)
# print(rf_mtry)
best_mtry <- rf_mtry$bestTune$mtry
# best_mtry
# search best maxnodes
# store_maxnode = list()
tuneGrid = expand.grid(.mtry = best_mtry)
best.rsq <- -1
best.maxnodes <- 0
for (maxnodes in c(5:50)){
set.seed(3455556)
rf_maxnode = train(ntl ~ pop020 + tirs020 + agbh020 + lc020,
data = train,
method = "rf",
metric = "Rsquared",
tuneGrid = tuneGrid,
trControl = trControl,
importance = TRUE,
nodesize = 20,
maxnodes = maxnodes,
# tuneLenght = 50,
ntree = 1000)
rsq <- rf_maxnode$finalModel$rsq
if (rsq > best.rsq) {
best.rsq <- rsq
best.maxnodes <- maxnodes
}
}
# print(paste0("Best R-squared: ", best.rsq, " with maxnodes: ", best.maxnodes))
# best.maxnodes
# search best ntree
# store_maxtrees = list()
best.ntree <- -1
best.rsq <- -1
for (ntree in seq(from = 500, to = 8000, by = 500)) {
set.seed(67777789)
rf_maxtrees = train(ntl ~ pop020 + tirs020 + agbh020 + lc020,
data = train,
method = "rf",
metric = "Rsquared",
tuneGrid = tuneGrid,
trControl = trControl,
importance = TRUE,
nodesize = 20,
maxnodes = best.maxnodes,
# tuneLenght = 50,
ntree = ntree)
rsq <- rf_maxtrees$finalModel$rsq
if (rsq > best.rsq) {
best.rsq <- rsq
best.ntree <- ntree
}
}
# print(paste0("Best R-squared: ", best.rsq, " with best tree number: ", best.ntree))
# best.ntree
# train the model with the optimum rf
fit_rf = train(ntl ~ pop020 + tirs020 + agbh020 + lc020,
train,
method = "rf",
metric = "Rsquared",
tuneGrid = tuneGrid,
trControl = trControl,
importance = TRUE,
# tuneLenght = 50,
nodesize = 20,
ntree = best.ntree,
maxnodes = best.maxnodes)
fit_rf
# check Rsquared in the test data
fit_rf2 = train(ntl ~ pop020 + tirs020 + agbh020 + lc020,
test,
method = "rf",
metric = "Rsquared",
tuneGrid = tuneGrid,
trControl = trControl,
importance = TRUE,
# tuneLenght = 50,
nodesize = 20,
ntree = best.ntree,
maxnodes = best.maxnodes)
fit_rf2
stopCluster(cl)
pop = rast(paste0(wd, "pop.tif"))
pop_res = rast(paste0(wd, "pop020.tif"))
tirs = rast(paste0(wd, "tirs.tif"))
tirs_res = rast(paste0(wd, "tirs020.tif"))
agbh = rast(paste0(wd, "agbh.tif"))
agbh_res = rast(paste0(wd, "agbh020.tif"))
lc = rast(paste0(wd, "lc.tif"))
lc_res = rast(paste0(wd, "lc020.tif"))
sm = c(pop, tirs, agbh, lc)
names(sm) = c("pop", "tirs", "agbh", "lc")
res(sm)
ntl = rast(paste0(wd, "ntl.tif"))
ntl = resample(ntl, pop_res)
sb = c(ntl, pop_res, tirs_res, agbh_res, lc_res)
names(sb) = c("ntl", "pop", "tirs", "agbh", "lc")
# apply best model (fit_rf) using the whole data.set
m = randomForest(ntl ~ pop020 + tirs020 + agbh020 + lc020,
data = sb,
mtry = best_mtry,
importance = TRUE,
na.action = na.omit,
nodesize = 20,
maxnodes = best.maxnodes,
ntree = best.ntree,
corr.bias = TRUE,
replace = TRUE)
m
# predict rf prediction
ps = predict(sm, m, na.rm = TRUE)
ps = crop(ps, ext(s))
ps = mask(ps, s)
writeRaster(ps, paste0(wd, "rf_pred020.tif"), overwrite = TRUE)
# export rf residuals
pb = predict(sb, m, na.rm = TRUE)
rsds = sb$ntl - pb
rsds = crop(rsds, ext(s))
rsds = mask(rsds, s)
writeRaster(rsds, paste0(wd, "rf_resids020.tif"), overwrite = TRUE)
I have tried to create a list
of the column names with a 3-digit number using the from
, to
and seq
commands and then use a for
loop to loop over the column names and modify the column names in the model formula, read the corresponding raster layer, and perform model fitting and prediction but without success. Can someone provide some guidance? I can share the data.frame
via GDrive.