KNN method with different accuracy

Inuraghe · November 9, 2021, 1:54pm

I fitted a kNN model with two different methods and I get different results, with very different accuracy between the two, even though both are kNN.

This is my actual code:

library(class)
library(ggplot2)
library(gmodels)
library(scales)
library(caret)
library(tidyverse)
library(caret)

# Work on a copy of the built-in iris data and hold out 20% of the rows.
db_data <- iris

# Draw 80% of the row indices at random for training. No seed is set before
# this call, so the split differs from run to run.
row_train <- sample(x = nrow(db_data), size = nrow(db_data) * 0.8)
db_train <- db_data[row_train, ]
db_test <- db_data[-row_train, ]

# Sanity check: which species ended up in the training set, and how many of
# each.
unique(db_train[["Species"]])
table(db_train[["Species"]])
#--------

#FIRST METHOD

#KNN
#-------
# Fit kNN through caret. tuneLength = 10 asks caret to try 10 candidate values
# of k from its default grid and keep the one it scores best; k is therefore
# chosen by caret rather than fixed by the caller.
model_knn <- train(
  Species ~ .,
  data = db_train,
  method = "knn",
  tuneLength = 10
)
summary(model_knn)
#-------

#PREDICTION NEW RECORD
#-------
# Score the held-out rows with the caret model and compare the predicted
# species against the true ones.
# test_data is a snapshot of db_test taken before the `predict` column is
# appended to it.
test_data <- db_test
# No `interval` argument is passed: that is an lm-style option which predict()
# for caret `train` objects does not use (it only distinguishes
# type = "raw" / "prob"), so supplying it had no effect on the result.
db_test$predict <- predict(model_knn, newdata = test_data)
confusionMatrix(
  data = factor(db_test$predict),
  reference = factor(db_test$Species)
)
#-------

#SECOND METHOD
#-------
# Same train/test rows as the first method, split into predictors (every
# column except the last) and class labels (the last column, Species).
db_class_train_x <- db_data[row_train, -ncol(db_data)]
db_class_train_y <- db_data[row_train, ncol(db_data)]

db_class_test_x <- db_data[-row_train, -ncol(db_data)]
db_class_test_y <- db_data[-row_train, ncol(db_data)]

#knn
# class::knn() has no separate fit step: it returns the predicted classes for
# the test rows directly, here with a fixed k = 12. Note that this overwrites
# the caret model previously stored in model_knn.
model_knn <- knn(
  train = db_class_train_x,
  test = db_class_test_x,
  cl = db_class_train_y,
  k = 12
)
confusionMatrix(
  data = factor(model_knn),
  reference = factor(db_class_test_y)
)
#-------

My question is: why does the second method perform better?

Max · November 9, 2021, 4:21pm

You are comparing different models. The caret model does not test k = 12, so it would not have selected it: with `tuneLength = 10`, caret's default kNN grid only tries the odd values k = 5, 7, ..., 23. When I test that value, I get the same results (see below).

One minor thing: you should set the random number seed. These fits use random numbers and you will not get reproducible results if the seed is not fixed.

library(class)
library(ggplot2)
library(gmodels)
library(scales)
library(caret)
#> Loading required package: lattice
library(tidyverse)
library(caret)

# Work on a copy of the built-in iris data and hold out 20% of the rows.
db_data <- iris

# Draw 80% of the row indices at random for training. This runs before the
# set.seed(1) call further down, so the split itself is not reproducible and
# the counts printed below come from one particular run.
row_train <- sample(x = nrow(db_data), size = nrow(db_data) * 0.8)
db_train <- db_data[row_train, ]
db_test <- db_data[-row_train, ]

unique(db_train[["Species"]])
#> [1] setosa     virginica  versicolor
#> Levels: setosa versicolor virginica
table(db_train[["Species"]])
#> 
#>     setosa versicolor  virginica 
#>         43         40         37

#FIRST METHOD

#KNN

# Fix the RNG state before fitting, then evaluate only k = 12 (a one-row
# tuning grid) so the caret model uses the same k as the class::knn() call in
# the second method.
set.seed(1)
model_knn <- train(
  Species ~ .,
  data = db_train,
  method = "knn",
  tuneGrid = data.frame(k = 12)
)
summary(model_knn)
#>             Length Class      Mode     
#> learn       2      -none-     list     
#> k           1      -none-     numeric  
#> theDots     0      -none-     list     
#> xNames      4      -none-     character
#> problemType 1      -none-     character
#> tuneValue   1      data.frame list     
#> obsLevels   3      -none-     character
#> param       0      -none-     list

#PREDICTION NEW RECORD

# Score the held-out rows with the caret model and compare the predicted
# species against the true ones.
# test_data is a snapshot of db_test taken before the `predict` column is
# appended to it.
test_data <- db_test
# No `interval` argument is passed: that is an lm-style option which predict()
# for caret `train` objects does not use (it only distinguishes
# type = "raw" / "prob"), so supplying it had no effect on the result.
db_test$predict <- predict(model_knn, newdata = test_data)
confusionMatrix(
  data = factor(db_test$predict),
  reference = factor(db_test$Species)
)
#> Confusion Matrix and Statistics
#> 
#>             Reference
#> Prediction   setosa versicolor virginica
#>   setosa          7          0         0
#>   versicolor      0          9         0
#>   virginica       0          1        13
#> 
#> Overall Statistics
#>                                           
#>                Accuracy : 0.9667          
#>                  95% CI : (0.8278, 0.9992)
#>     No Information Rate : 0.4333          
#>     P-Value [Acc > NIR] : 5.119e-10       
#>                                           
#>                   Kappa : 0.9482          
#>                                           
#>  Mcnemar's Test P-Value : NA              
#> 
#> Statistics by Class:
#> 
#>                      Class: setosa Class: versicolor Class: virginica
#> Sensitivity                 1.0000            0.9000           1.0000
#> Specificity                 1.0000            1.0000           0.9412
#> Pos Pred Value              1.0000            1.0000           0.9286
#> Neg Pred Value              1.0000            0.9524           1.0000
#> Prevalence                  0.2333            0.3333           0.4333
#> Detection Rate              0.2333            0.3000           0.4333
#> Detection Prevalence        0.2333            0.3000           0.4667
#> Balanced Accuracy           1.0000            0.9500           0.9706

#SECOND METHOD

# Same train/test rows as the first method, split into predictors (every
# column except the last) and class labels (the last column, Species).
db_class_train_x <- db_data[row_train, -ncol(db_data)]
db_class_train_y <- db_data[row_train, ncol(db_data)]

db_class_test_x <- db_data[-row_train, -ncol(db_data)]
db_class_test_y <- db_data[-row_train, ncol(db_data)]

#knn
# class::knn() has no separate fit step: it returns the predicted classes for
# the test rows directly, here with a fixed k = 12. Note that this overwrites
# the caret model previously stored in model_knn.
model_knn <- knn(
  train = db_class_train_x,
  test = db_class_test_x,
  cl = db_class_train_y,
  k = 12
)
confusionMatrix(
  data = factor(model_knn),
  reference = factor(db_class_test_y)
)
#> Confusion Matrix and Statistics
#> 
#>             Reference
#> Prediction   setosa versicolor virginica
#>   setosa          7          0         0
#>   versicolor      0          9         0
#>   virginica       0          1        13
#> 
#> Overall Statistics
#>                                           
#>                Accuracy : 0.9667          
#>                  95% CI : (0.8278, 0.9992)
#>     No Information Rate : 0.4333          
#>     P-Value [Acc > NIR] : 5.119e-10       
#>                                           
#>                   Kappa : 0.9482          
#>                                           
#>  Mcnemar's Test P-Value : NA              
#> 
#> Statistics by Class:
#> 
#>                      Class: setosa Class: versicolor Class: virginica
#> Sensitivity                 1.0000            0.9000           1.0000
#> Specificity                 1.0000            1.0000           0.9412
#> Pos Pred Value              1.0000            1.0000           0.9286
#> Neg Pred Value              1.0000            0.9524           1.0000
#> Prevalence                  0.2333            0.3333           0.4333
#> Detection Rate              0.2333            0.3000           0.4333
#> Detection Prevalence        0.2333            0.3000           0.4667
#> Balanced Accuracy           1.0000            0.9500           0.9706

Created on 2021-11-09 by the reprex package (v2.0.0)

system · November 16, 2021, 4:22pm

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.