Here's a reprex
library(randomForest)
#> randomForest 4.6-14
#> Type rfNews() to see new features/changes/bug fixes.
# Load the imports85 example data set and show its column types, so the
# column positions used when building `mydata` can be checked against it.
data("imports85", package = "randomForest")
str(imports85)
#> 'data.frame': 205 obs. of 26 variables:
#> $ symboling : int 3 3 1 2 2 2 1 1 1 0 ...
#> $ normalizedLosses: int NA NA NA 164 164 NA 158 NA 158 NA ...
#> $ make : Factor w/ 22 levels "alfa-romero",..: 1 1 1 2 2 2 2 2 2 2 ...
#> $ fuelType : Factor w/ 2 levels "diesel","gas": 2 2 2 2 2 2 2 2 2 2 ...
#> $ aspiration : Factor w/ 2 levels "std","turbo": 1 1 1 1 1 1 1 1 2 2 ...
#> $ numOfDoors : Factor w/ 2 levels "four","two": 2 2 2 1 1 2 1 1 1 2 ...
#> $ bodyStyle : Factor w/ 5 levels "convertible",..: 1 1 3 4 4 4 4 5 4 3 ...
#> $ driveWheels : Factor w/ 3 levels "4wd","fwd","rwd": 3 3 3 2 1 2 2 2 2 1 ...
#> $ engineLocation : Factor w/ 2 levels "front","rear": 1 1 1 1 1 1 1 1 1 1 ...
#> $ wheelBase : num 88.6 88.6 94.5 99.8 99.4 ...
#> $ length : num 169 169 171 177 177 ...
#> $ width : num 64.1 64.1 65.5 66.2 66.4 66.3 71.4 71.4 71.4 67.9 ...
#> $ height : num 48.8 48.8 52.4 54.3 54.3 53.1 55.7 55.7 55.9 52 ...
#> $ curbWeight : int 2548 2548 2823 2337 2824 2507 2844 2954 3086 3053 ...
#> $ engineType : Factor w/ 7 levels "dohc","dohcv",..: 1 1 6 4 4 4 4 4 4 4 ...
#> $ numOfCylinders : Ord.factor w/ 7 levels "two"<"three"<..: 3 3 5 3 4 4 4 4 4 4 ...
#> $ engineSize : int 130 130 152 109 136 136 136 136 131 131 ...
#> $ fuelSystem : Factor w/ 8 levels "1bbl","2bbl",..: 6 6 6 6 6 6 6 6 6 6 ...
#> $ bore : num 3.47 3.47 2.68 3.19 3.19 3.19 3.19 3.19 3.13 3.13 ...
#> $ stroke : num 2.68 2.68 3.47 3.4 3.4 3.4 3.4 3.4 3.4 3.4 ...
#> $ compressionRatio: num 9 9 9 10 8 8.5 8.5 8.5 8.3 7 ...
#> $ horsepower : int 111 111 154 102 115 110 110 110 140 160 ...
#> $ peakRpm : int 5000 5000 5000 5500 5500 5500 5500 5500 5500 5500 ...
#> $ cityMpg : int 21 21 19 24 18 19 19 19 17 16 ...
#> $ highwayMpg : int 27 27 26 30 22 25 25 25 20 22 ...
#> $ price : int 13495 16500 16500 13950 17450 15250 17710 18920 23875 NA ...
# Keep four factors (columns 4, 5, 8, 9: fuelType, aspiration, driveWheels,
# engineLocation) and four numeric body dimensions (columns 10-13: wheelBase,
# length, width, height).
mydata <- imports85[c(4:5, 8:13)]

# One-hot encode each factor. Removing the intercept (`0 +`) makes
# model.matrix() emit one indicator column per level instead of dropping the
# reference level.
fuelType.matrix <- model.matrix(~ 0 + fuelType, data = mydata)
aspiration.matrix <- model.matrix(~ 0 + aspiration, data = mydata)
driveWheels.matrix <- model.matrix(~ 0 + driveWheels, data = mydata)
engineLocation.matrix <- model.matrix(~ 0 + engineLocation, data = mydata)

# Append the 2 + 2 + 3 + 2 = 9 indicator columns in the same order as before.
mydata <- cbind(
  mydata,
  fuelType.matrix,
  aspiration.matrix,
  driveWheels.matrix,
  engineLocation.matrix
)

# Drop the four original factor columns; 13 columns remain (4 numeric
# dimensions, including the response `length`, plus 9 indicators).
mydata <- mydata[, -seq_len(4)]
# Fit a regression forest with the continuous variable `length` as response.
# mydata has 13 columns, i.e. 12 predictors once `length` is set aside, so
# mtry = 13 is deliberately one past the valid range.
rndforest <- randomForest(
  length ~ .,
  data = mydata,
  ntree = 501,
  mtry = 13,
  nodesize = 1,
  importance = TRUE
)
#> Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
#> range
# This is a warning, not an error: the fit still completes, and the printed
# summary reports 12 variables tried at each split.
rndforest
#>
#> Call:
#> randomForest(formula = length ~ ., data = mydata, ntree = 501, mtry = 13, nodesize = 1, importance = TRUE)
#> Type of random forest: regression
#> Number of trees: 501
#> No. of variables tried at each split: 12
#>
#> Mean of squared residuals: 9.741354
#> % Var explained: 93.57
# Refit with a small mtry, well inside the valid range.
rndforest <- randomForest(
  length ~ .,
  data = mydata,
  ntree = 501,
  mtry = 3,
  nodesize = 1,
  importance = TRUE
)
# Fits without a warning; mtry is used as given.
rndforest
#>
#> Call:
#> randomForest(formula = length ~ ., data = mydata, ntree = 501, mtry = 3, nodesize = 1, importance = TRUE)
#> Type of random forest: regression
#> Number of trees: 501
#> No. of variables tried at each split: 3
#>
#> Mean of squared residuals: 14.59681
#> % Var explained: 90.36
# Refit with an intermediate mtry.
rndforest <- randomForest(
  length ~ .,
  data = mydata,
  ntree = 501,
  mtry = 8,
  nodesize = 1,
  importance = TRUE
)
# Fits without a warning; mtry is used as given.
rndforest
#>
#> Call:
#> randomForest(formula = length ~ ., data = mydata, ntree = 501, mtry = 8, nodesize = 1, importance = TRUE)
#> Type of random forest: regression
#> Number of trees: 501
#> No. of variables tried at each split: 8
#>
#> Mean of squared residuals: 9.448357
#> % Var explained: 93.76
# Narrow in again, moving mtry closer to the value that triggered the warning.
rndforest <- randomForest(
  length ~ .,
  data = mydata,
  ntree = 501,
  mtry = 10,
  nodesize = 1,
  importance = TRUE
)
# Still fits without a warning.
rndforest
#>
#> Call:
#> randomForest(formula = length ~ ., data = mydata, ntree = 501, mtry = 10, nodesize = 1, importance = TRUE)
#> Type of random forest: regression
#> Number of trees: 501
#> No. of variables tried at each split: 10
#>
#> Mean of squared residuals: 10.03625
#> % Var explained: 93.37
# Use the largest mtry that does not trigger the warning: one less than 13.
rndforest <- randomForest(
  length ~ .,
  data = mydata,
  ntree = 501,
  mtry = 12,
  nodesize = 1,
  importance = TRUE
)
# Conclusion: for this data set mtry cannot exceed 12, which is the number of
# predictor columns in mydata (13 columns minus the response `length`).
rndforest
#>
#> Call:
#> randomForest(formula = length ~ ., data = mydata, ntree = 501, mtry = 12, nodesize = 1, importance = TRUE)
#> Type of random forest: regression
#> Number of trees: 501
#> No. of variables tried at each split: 12
#>
#> Mean of squared residuals: 9.964372
#> % Var explained: 93.42
# Also, the "variable lengths differ" error does not arise in any of these runs.
Created on 2020-02-27 by the reprex package (v0.3.0)