How can I export in a csv, multiple r2 of multiple Random Forest regressions?

nikos_geo · August 23, 2023, 12:31pm

I have a csv which contains the response variable (called ntl) and the predictors. I am running a Random Forest (RF) regression to predict the ntl values. Because I execute multiple times the RF algorithm, I added a number (030, 040) after the predictors name in order to make my life easier when I execute the for loop (see code below). The first six rows of the csv look like this:

head(block.data)
                 ntl  agbh030  agbh040   blue030   blue040    nir030    nir040   pop030   pop040  road030  road040  tirs030
1 52.94 1.773147 1.778323 0.7179366 0.7178904 0.9911458 0.9909238 33.98462 33.99507 152.4204 153.0130 25.39851
2 49.45 2.895677 2.893922 0.9956004 0.9937335 1.3766788 1.3741488 62.45861 62.34375 435.8899 434.7412 35.85143
3 49.12 3.996930 3.992193 1.0742660 1.0725741 1.4975094 1.4952157 78.47204 78.32093 412.6563 411.8447 37.86271
4 44.37 4.146727 4.142556 1.1628517 1.1606724 1.5424725 1.5395863 64.81855 64.78107 295.8405 295.5381 38.16120
5 39.97 4.324530 4.319309 1.1125427 1.1113752 1.4355043 1.4340464 69.18198 68.98098 289.2147 288.5684 36.89496
6 38.44 3.946339 3.943006 1.0540453 1.0531118 1.4150538 1.4135966 47.66867 47.65901 194.1575 193.8339 34.88610
   tirs040
1 25.39134
2 35.77946
3 37.79913
4 38.09968
5 36.84203
6 34.84804

As you can see, when I execute the first loop I am using the predictors which have the 030 in their names, and in the second loop the ones that have the 040 in their names etc.

library("randomForest")
library("doFuture") 

wd <- "test/"

# Load the data
block.data <- read.csv(paste0(wd, "block.data.psf.csv"))
eq1 <- ntl ~ .

# for reproduciblity
set.seed(123)

foreach (i = seq(030, 040, by = 10)) %do% {
  std <- sprintf("%03.0f", i)
  
  popCol <- paste0("pop", std)
  tirsCol <- paste0("tirs", std)
  agbhCol <- paste0("agbh", std)
  roadCol <- paste0("road", std)
  blueCol <- paste0("blue", std)
  nirCol <- paste0("nir", std)

  testVect = c("ntl", 
               popCol,
               tirsCol,
               agbhCol,
               roadCol,
               blueCol,
               nirCol)
  
  subBlockData <- subset(block.data, select = testVect)
  
  # default RF model
  m1 <- randomForest(
    formula = eq1,
    data    = subBlockData)
  
  # number of trees with highest r2
  btree = which.max(m1$rsq)
  
  features <- subBlockData[, 2:7]
  
  m1 <- randomForest(
    formula = ntl ~ .,
    data    = subBlockData,
    ntree = btree,
    mtry = ncol(features)/3,
    importance = FALSE)
  
  m1 # "% Var explained" is the number I want in the csv
}

What I want is to create a csv (called r2.csv) which will have two columns (called std and r2). The first column will contain the number (030, 040 etc) after the predictors name and the second the r-squared of the RF (called m1 in my code). How can I do that?

Here is my csv:

block.data = structure(list(ntl = c(52.93999863, 49.45000076, 49.11999893, 
44.36999893, 39.97000122, 38.43999863, 38.99000168, 40.88000107, 
37.47999954, 40.65000153, 44.79999924, 50.38000107, 49.38000107, 
68.80000305, 59.88000107), agbh030 = c(1.773146749, 2.89567709, 
3.996930361, 4.146727085, 4.324529648, 3.946338654, 3.430744886, 
2.554599762, 2.052541256, 1.614771366, 1.991117001, 2.35665369, 
0.801872492, 3.310310841, 5.005721092), agbh040 = c(1.778323054, 
2.893922091, 3.992192984, 4.14255619, 4.319309235, 3.943005562, 
3.425927639, 2.553015947, 2.049137354, 1.619293809, 1.997152209, 
2.3637712, 0.81288743, 3.306827068, 4.997349739), blue030 = c(0.717936635, 
0.995600402, 1.074265957, 1.162851691, 1.112542748, 1.05404532, 
0.972470403, 0.864918828, 0.797388136, 0.780449748, 0.717492938, 
0.659686863, 0.544718802, 0.954298615, 1.098036289), blue040 = c(0.717890441, 
0.993733525, 1.072574139, 1.160672426, 1.111375213, 1.053111792, 
0.971249402, 0.863858879, 0.796197593, 0.780405819, 0.717574954, 
0.661465645, 0.546935678, 0.95236063, 1.097277403), nir030 = c(0.99114579, 
1.376678824, 1.49750936, 1.542472482, 1.435504317, 1.415053844, 
1.312831402, 1.290184379, 1.151046991, 1.062780142, 0.961566031, 
0.851697564, 0.746279657, 1.258480549, 1.483649731), nir040 = c(0.990923822, 
1.374148846, 1.495215654, 1.539586306, 1.434046388, 1.41359663, 
1.31127429, 1.288556933, 1.149601102, 1.062715054, 0.961549222, 
0.854078174, 0.749200106, 1.256223083, 1.48290813), pop030 = c(33.98461914, 
62.45861053, 78.47203827, 64.81855011, 69.18198395, 47.66866684, 
85.6471405, 63.21319962, 17.80360603, 57.13486862, 88.91275024, 
58.64352036, 34.46327209, 54.0263176, 61.17818451), pop040 = c(33.99507141, 
62.34374619, 78.32093048, 64.78107452, 68.98097992, 47.65901184, 
85.38433838, 63.14036942, 17.84538078, 57.06893158, 88.97719574, 
58.98400879, 34.64493942, 53.96364212, 61.17742538), road030 = c(152.4203644, 
435.8898926, 412.6563416, 295.8404541, 289.2147217, 194.1574554, 
136.9489441, 213.915741, 195.634903, 275.4249573, 129.0567017, 
110.0097885, 35.32381439, 138.662796, 365.485321), road040 = c(153.013031, 
434.7412109, 411.8447266, 295.538147, 288.5683899, 193.8339233, 
137.1492004, 213.4812012, 196.2715302, 276.1116638, 129.8981934, 
110.5063858, 35.86360931, 139.2144165, 364.8994446), tirs030 = c(25.39851189, 
35.85142899, 37.86270905, 38.16120148, 36.8949585, 34.88610458, 
33.1179924, 29.98550034, 26.51055908, 26.58303452, 25.10464287, 
22.69356346, 18.32171249, 30.8216877, 36.87643814), tirs040 = c(25.39133835, 
35.77946091, 37.79912567, 38.09967804, 36.84203339, 34.84803772, 
33.07061386, 29.94636917, 26.48450851, 26.57940674, 25.10551834, 
22.75337219, 18.39633369, 30.77775002, 36.85960007)), class = "data.frame", row.names = c(NA, 
-15L))

nikos_geo · August 24, 2023, 11:27am

nikos_geo:

library("randomForest")
library("doFuture") 

wd <- "test/"

# Load the data
block.data <- read.csv(paste0(wd, "block.data.psf.csv"))
eq1 <- ntl ~ .

# for reproduciblity
set.seed(123)

foreach (i = seq(030, 040, by = 10)) %do% {
  std <- sprintf("%03.0f", i)
  
  popCol <- paste0("pop", std)
  tirsCol <- paste0("tirs", std)
  agbhCol <- paste0("agbh", std)
  roadCol <- paste0("road", std)
  blueCol <- paste0("blue", std)
  nirCol <- paste0("nir", std)

  testVect = c("ntl", 
               popCol,
               tirsCol,
               agbhCol,
               roadCol,
               blueCol,
               nirCol)
  
  subBlockData <- subset(block.data, select = testVect)
  
  # default RF model
  m1 <- randomForest(
    formula = eq1,
    data    = subBlockData)
  
  # number of trees with highest r2
  btree = which.max(m1$rsq)
  
  features <- subBlockData[, 2:7]
  
  m1 <- randomForest(
    formula = ntl ~ .,
    data    = subBlockData,
    ntree = btree,
    mtry = ncol(features)/3,
    importance = FALSE)
  
  m1 # "% Var explained" is the number I want in the csv
}

The solution:

library("randomForest")
library("doFuture") 

wd <- "test/"

# Load the data
block.data <- read.csv(paste0(wd, "block.data.psf.csv"))
eq1 <- ntl ~ .

r2.df <- NULL

# for reproduciblity
set.seed(123)

foreach (i = seq(030, 040, by = 10)) %do% {
  std <- sprintf("%03.0f", i)
  
  popCol <- paste0("pop", std)
  tirsCol <- paste0("tirs", std)
  agbhCol <- paste0("agbh", std)
  roadCol <- paste0("road", std)
  blueCol <- paste0("blue", std)
  nirCol <- paste0("nir", std)

  testVect = c("ntl", 
               popCol,
               tirsCol,
               agbhCol,
               roadCol,
               blueCol,
               nirCol)
  
  subBlockData <- subset(block.data, select = testVect)
  
  # default RF model
  m1 <- randomForest(
    formula = eq1,
    data    = subBlockData)
  
  # number of trees with highest r2
  btree = which.max(m1$rsq)
  
  features <- subBlockData[, 2:7]
  
  m1 <- randomForest(
    formula = ntl ~ .,
    data    = subBlockData,
    ntree = btree,
    mtry = ncol(features)/3,
    importance = FALSE)
  
  m1 # "% Var explained" is the number I want in the csv

  r2.df <- rbind(r2.df, data.frame(column=i, r2=m1$rsq[btree]))
}

write.csv(r2.df, paste0(wd, 'result.csv'), row.names = FALSE)

system · August 31, 2023, 11:27am

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.