Hello,
I have written code to tune the hyperparameters of a random forest regression model. I defined a search grid with h2o.grid. The grid search finds a best model, but when I use that model to predict the target for a new dataset (the test data), every predicted value is the same, so when I calculate the correlation between the actual target and the predicted target, the result is NaN.
I should mention that once the grid search has trained all the models, I sort them from best to worst by error and take the first one as the best model; that is the model that gives the NaN result. However, I also tried some of the other models from the middle of the sorted list, and they gave much better results. Here is my code:
Can you help me with what the problem is?
# ---- Setup ----
# Load the H2O R client, initialise the H2O connection and print the
# cluster information.
library(h2o)
h2o.init()
h2o.clusterInfo()
# tidyverse is attached after h2o; the script below uses its `%>%` pipe.
library(tidyverse)
# `original_dataset` is not defined in this script and must already exist in
# the session. According to the author it has 173 rows (samples) and
# 1850 columns (features).
df = original_dataset # a dataset which has 173(rows) samples and 1850 features(col)
#' Min-max scale a numeric vector to the range [0, 1].
#'
#' Missing values are ignored when the minimum and maximum are computed and
#' are kept as `NA` in the result.
#'
#' @param x A numeric vector.
#' @return A numeric vector of the same length as `x`. When all observed
#'   values are equal (zero range) every observed element maps to 0, so the
#'   result still has `length(x)` elements. When `x` is empty or entirely
#'   `NA`, a vector of `NA` of the same length is returned.
normalize <- function(x) {
  observed <- x[!is.na(x)]

  # Nothing to scale: empty input or only missing values.
  if (length(observed) == 0L) {
    return(rep(NA_real_, length(x)))
  }

  lo <- min(observed)
  hi <- max(observed)

  # Constant column: avoid dividing by zero, but keep the original length so
  # the result can be used as a data-frame column without relying on recycling.
  if (hi == lo) {
    scaled <- rep(0, length(x))
    scaled[is.na(x)] <- NA
    return(scaled)
  }

  (x - lo) / (hi - lo)
}
# ---- Prepare the data ----
# Drop the first column; `drop = FALSE` keeps a data frame even if only one
# column remains.
df <- df[, -1, drop = FALSE]

# Min-max scale every remaining column. The response column is scaled
# together with the predictors.
maxmindf <- as.data.frame(lapply(df, normalize))

# NOTE: the former `attach(maxmindf)` call was removed. Nothing in the script
# refers to the attached columns, and attaching them only places every column
# name on the search path, where it can mask other objects.
df_norm <- as.matrix(maxmindf)
h_df <- as.h2o(df_norm)

# Split the data into train (80%) and test (20%); the seed keeps the split
# reproducible.
df.split <- h2o.splitFrame(data = h_df, ratios = 0.8, seed = 200)
h_train <- df.split[[1]]
h_test <- df.split[[2]]

target <- "Expression"

# Take the predictor names from the frame that is actually sent to H2O, so
# they are guaranteed to match the training columns (names may be sanitised
# during the data-frame / H2O conversion), and fail early if the response
# column is missing.
if (!target %in% colnames(h_df)) {
  stop("Response column '", target, "' not found in the H2O frame.",
       call. = FALSE)
}
features <- setdiff(colnames(h_df), target)
# ---- Hyper-parameter grid ----
# Candidate values for mtries: p / 3 and sqrt(p), where p is the number of
# predictors. The count comes from `features` rather than `ncol(h_train)`,
# because the training frame also contains the response column.
n_predictors <- length(features)
a1 <- max(1, floor(n_predictors / 3))
a2 <- max(1, floor(sqrt(n_predictors)))

# Search grid. `unique()` avoids training duplicate models when both mtries
# candidates coincide (possible for a very small number of predictors).
hyper_grid.h2o <- list(
  ntrees = seq(501, 801, by = 100),
  mtries = unique(c(a1, a2))
)
hyper_grid.h2o

# Number of models in the Cartesian grid (product of the option counts).
prod(vapply(hyper_grid.h2o, length, integer(1)))
# ---- Cartesian grid search over the random forest hyper-parameters ----
# NOTE(review): the grid id is fixed. If a grid with this id already exists on
# the cluster from an earlier run, its models may still be listed together
# with the new ones; confirm against the h2o.grid documentation and clear the
# old grid if so.
rf_grid_id <- "rf_grid1"

# Train one model per combination in `hyper_grid.h2o` and report the elapsed
# time.
# NOTE(review): `nfolds` is commented out and no validation frame is passed,
# so the grid is ranked without an explicit hold-out set; consider enabling
# cross-validation before trusting the ranking.
system.time(
  grid_cartesian <- h2o.grid(
    algorithm = "randomForest",
    grid_id = rf_grid_id,
    training_frame = h_train,
    x = features,
    y = target,
    hyper_params = hyper_grid.h2o,
    search_criteria = list(strategy = "Cartesian"),
    parallelism = 64,
    # nfolds = 5,
    seed = 200
  )
)
grid_cartesian

# Fetch the grid sorted by residual deviance in ascending order, so the model
# with the lowest deviance comes first.
grid_perf <- h2o.getGrid(
  grid_id = rf_grid_id,
  sort_by = "residual_deviance",
  decreasing = FALSE
)
grid_perf@summary_table

# The first model id of the sorted grid is taken as the best model.
best_model_id <- grid_perf@model_ids[[1]]
best_model1 <- h2o.getModel(best_model_id)
best_model1
# ---- Evaluate the selected model on the test frame ----
pred <- h2o.predict(object = best_model1, newdata = h_test)

# Pull the actual and predicted values into plain R vectors once, so the
# metrics below are computed on ordinary numeric vectors.
actual_values <- as.vector(h_test[[target]])
predicted_values <- as.vector(pred$predict)

# Root mean squared error on the test set.
rmse <- sqrt(mean((actual_values - predicted_values)^2))
print(rmse)

# Pearson correlation is undefined when either vector has zero variance
# (for example, when the model predicts the same value for every row).
# Report that case explicitly instead of silently returning NaN.
if (length(unique(predicted_values)) < 2L ||
    length(unique(actual_values)) < 2L) {
  warning(
    "Correlation is undefined: the predictions or the actual values are ",
    "constant on the test set.",
    call. = FALSE
  )
  Corelation1 <- NA_real_
} else {
  Corelation1 <- stats::cor(actual_values, predicted_values)
}
print(Corelation1)