Hello,
I do get an error when trying to run a confusion matrix.
> Error: `data` and `reference` should be factors with the same levels.
This what I am doing:
I am creating my own function to get my best cutoff points for the ROC curve for training and validating dataset.
AccuracyCutoffInfo <- function(train, test, predict, actual)
{
#change the cutoff value's range as you please
cutoff <- seq(.1 , .9, by = .05)
accuracy <- lapply(cutoff, function(c)
{
# use the confusionMatrix from the caret package
cm_train <- confusionMatrix(train[[predict]] > c, train[[actual]])
cm_test <- confusionMatrix(test[[predict]] > c, test[[actual]] )
return(dt)
}) %>% rbindlist()
accuracy_long <- gather(accuracy, "data", "accuracy", -1)
plot <- ggplot(accuracy_long, aes(cutoff, accuracy, group = data, color = data)) +
geom_line(size = 1) + geom_poin(size = 3) +
scale_y_continuous(label = percent) +
ggtitle("Train/Test Accuracy for Different Cutoff")
return(list(data = accuracy, plot = plot))
}
theme_set(theme_minimal())
I am using then the same function:
> accuracy_info <- AccuracyCutoffInfo(train = train, test = validate, predict = "pred", actual = "real")
Now, my training and testing datasets have the class as factors.
Check this out:
For training data set my dput is:
structure(list(shortness_breath = structure(c(1L, 1L, 1L, 1L,
1L, 1L), .Label = c("No", "Yes"), class = "factor"), obesity = structure(c(2L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
asthma = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), diabetes_type_one = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
diabetes_type_two = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), hypertension = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
heart_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), lung_condition = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
liver_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), kidney_disease = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
Covid_tested = c("negative", "negative", "negative", "negative",
"negative", "negative"), Age = c(42, 53, 42, 50, 27, 26),
Gender = c("Female", "Female", "Female", "Male", "Female",
"Male"), pred = c(`1` = 0.194445511752173, `2` = 0.157691990854952,
`3` = 0.158715363855891, `4` = 0.157970559536371, `5` = 0.160119548044875,
`6` = 0.160213516891202), real = structure(c(1L, 1L, 1L,
1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor")), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"), problems = structure(list(
row = c(2910L, 35958L), col = c("how_unwell", "how_unwell"
), expected = c("a double", "a double"), actual = c("How Unwell",
"How Unwell"), file = c("'/Users/gabrielburcea/Rprojects/data/data_lev_categorical_no_sev.csv'",
"'/Users/gabrielburcea/Rprojects/data/data_lev_categorical_no_sev.csv'"
)), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"
)))
And then for validation dataset my dput is:
structure(list(shortness_breath = structure(c(1L, 2L, 2L, 1L,
1L, 1L), .Label = c("No", "Yes"), class = "factor"), obesity = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
asthma = structure(c(2L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), diabetes_type_one = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
diabetes_type_two = structure(c(2L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), hypertension = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
heart_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), lung_condition = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
liver_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), kidney_disease = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
Covid_tested = c("negative", "negative", "negative", "negative",
"negative", "negative"), Age = c(63, 19, 31, 26, 30, 45),
Gender = c("Male", "Female", "Male", "Male", "Female", "Female"
), pred = c(`1` = 0.26594006201297, `2` = 0.160872548705087,
`3` = 0.159744118695227, `4` = 0.160213516891202, `5` = 0.159837909145038,
`6` = 0.15843572889978), real = structure(c(1L, 2L, 2L, 1L,
1L, 1L), .Label = c("No", "Yes"), class = "factor")), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"), problems = structure(list(
row = c(2910L, 35958L), col = c("how_unwell", "how_unwell"
), expected = c("a double", "a double"), actual = c("How Unwell",
"How Unwell"), file = c("'/Users/gabrielburcea/Rprojects/data/data_lev_categorical_no_sev.csv'",
"'/Users/gabrielburcea/Rprojects/data/data_lev_categorical_no_sev.csv'"
)), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"
)))