# Installing libraries
# Only packages that are not yet installed are downloaded, so re-running the
# script does not reinstall everything. e1071 is included because it is
# loaded below.
pkgs <- c("rpart", "caret", "rpart.plot", "rattle", "party", "e1071")
missing_pkgs <- pkgs[
  !vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
# Loading libraries
library(rpart)
library(caret)
library(rpart.plot)
library(rattle)
library(party)
library(e1071)
# Reading the data set as a data frame (file.choose() opens a file picker)
data <- read.csv(file.choose(), header = TRUE)
# Structure of the data
str(data)
# Factor copy of the outcome, stored next to the original `pass` column
data$PF <- factor(data$pass)
# Number of rows with at least one missing value
sum(!complete.cases(data))
# NOTE(review): the disabled line below refers to a `veil.type` column, which
# looks like a leftover from a different dataset - confirm before re-enabling.
#data$veil.type <- NULL
# TRUE if the data frame contains any missing value at all
anyNA(data)
# Cross-tabulate the outcome `pass` against each candidate predictor
xtabs(~ pass + sex, data = data)
xtabs(~ pass + studytime, data = data)
xtabs(~ pass + internet, data = data)
xtabs(~ pass + freetime, data = data)
xtabs(~ pass + goout, data = data)
xtabs(~ pass + absences, data = data)
xtabs(~ pass + G1, data = data)
xtabs(~ pass + G2, data = data)
xtabs(~ pass + G3, data = data)
# Partition data into Training (~80%) and Validation (~20%) datasets
set.seed(1234)
pd <- sample(2, nrow(data), replace = TRUE, prob = c(0.8, 0.2))
train <- data[pd == 1, ]
validate <- data[pd == 2, ]

# Decision Tree with rpart: model `pass` from the other variables.
# The tree is fitted on `train` only, so the rows in `validate` stay unseen
# and can be used for an honest evaluation afterwards.
# NOTE(review): if `pass` is numeric 0/1, rpart fits a regression tree and
# the predictions are fractions rather than class labels - confirm intent.
tree <- rpart(
  pass ~ sex + studytime + internet + freetime + goout + absences +
    G1 + G2 + G3,
  data = train
)
# Predict `pass` for a single hand-made observation
a <- data.frame(
  sex = 0, studytime = 2, internet = 0, freetime = 3, goout = 4,
  absences = 6, G1 = 5, G2 = 6, G3 = 6
)
result <- predict(tree, a)
print(result)

# Second tree: model `absences` instead. This overwrites `tree`, so
# everything below refers to the absences model, not the `pass` model.
tree <- rpart(
  absences ~ pass + sex + studytime + internet + freetime + goout +
    G1 + G2 + G3,
  data = train
)
# Predict `absences` for a single hand-made observation
a <- data.frame(
  pass = 0, sex = 0, studytime = 2, internet = 1, freetime = 3, goout = 3,
  G1 = 5, G2 = 5, G3 = 6
)
result <- predict(tree, a)
print(result)

# Visualize the decision tree with rpart plot
rpart.plot(tree, extra = 1)
# Prediction on the held-out validation rows
predict(tree, validate)
# Calculating accuracy (disabled attempts, kept for reference)
#table(ifelse(data$sex,"0","1"))
#table(data$pass)
#confusionMatrix(as.factor(ifelse(data$sex,"0","1")),data$pass)
#predict<-(as.factor(predict))
#cm
#confusionMatrix(predict,validate$pass,positive = "1")
`
Hi please share the dataset
Hello, I find your code fairly confusing, not least because you create `tree` one way and then replace it with another. As such, I will refer to the second one.
A confusion matrix is the result of tabulating the known values of the dependent variable against the predicted values of that variable. I take it from the second rpart formula that absences is the dependent variable. I would guess you can make a confusion matrix for that over the validation dataset like this:
# Tabulate the tree's predictions for the validation rows against the
# observed `absences` values
table(predict(tree, newdata = validate), validate$absences)
Oh, I see, and now I understand. I'm new to RStudio, so I depend on YouTube and Google to find code. Thanks for that. Is there any part of my code that is wrong?
I don't know how to share the dataset here, but I found it on the internet and I only use 10 of its variables: https://www.kaggle.com/datasets/dinhanhx/studentgradepassorfailprediction?resource=download
Hi
In your dataset, the target variable (pass) is binary, so we should use logistic regression. Using the glm function, we can build multiple models, make predictions, and produce a confusion matrix. The dataset I used is from the link that you provided above. Just copy this code into RStudio, run it, and see. I hope it is useful.
For more detailed analysis, you may also refer to my channel "Happy Learning-GP" for the video "Shiny App Logistic Regression multiple model".
Here is the code :
library(dplyr)
library(caret) # logistic regression related package,
# Reading the data set as a data frame (file.choose() opens a file picker)
df <- read.csv(file.choose(), header = TRUE)
# Keep only the outcome (`pass`) and the nine predictors used below
data <- df[, c(
  "pass", "sex", "studytime", "internet", "freetime", "goout",
  "absences", "G1", "G2", "G3"
)]
# Print the confusion matrix of a fitted model on a test set.
#   model    - fitted model; predict() is called on it with
#              type = "response"
#   testdata - data frame passed to predict() as newdata; its `pass`
#              column supplies the actual outcomes
# Predictions above 0.5 are labelled 1, all others 0. Only the table part
# of caret's confusion-matrix result is printed.
fnsummarystat <- function(model, testdata) {
  prob <- predict(model, newdata = testdata, type = "response")
  # Threshold into 0/1 labels and fix the level set so both classes are
  # always present, even if only one of them was predicted
  predicted <- factor(as.numeric(prob > 0.5), levels = c(0, 1))
  actual <- testdata[, "pass"]
  cm <- caret::confusionMatrix(data = predicted, reference = actual)
  print(cm$table)
}
# Delete NA rows
data <- na.omit(data)
# Declare Dependent Variable and convert as factor
mdependvar <- "pass"
# Move the dependent variable to the last column. all_of() marks
# `mdependvar` as an external character vector of column names, which
# tidyselect requires instead of a bare variable.
data <- data %>% dplyr::select(-all_of(mdependvar), everything())
data$pass <- factor(ifelse(data$pass == 1, 1, 0), levels = c(0, 1))
# Partition data into Training (~80%) and testdata (~20%) datasets
set.seed(1234)
pd <- sample(2, nrow(data), replace = TRUE, prob = c(0.8, 0.2))
traindata <- data[pd == 1, ]
testdata <- data[pd == 2, ]
# Building Multiple models: one with all predictors, then one per predictor
# NOTE(review): some of these fits have been reported to warn that glm.fit
# did not converge or that fitted probabilities were numerically 0 or 1;
# the confusion matrices below are still produced.
fullmodel <- glm(pass ~ ., family = "binomial", data = traindata)
model1 <- glm(pass ~ sex, data = traindata, family = "binomial")
model2 <- glm(pass ~ studytime, data = traindata, family = "binomial")
model3 <- glm(pass ~ internet, data = traindata, family = "binomial")
model4 <- glm(pass ~ freetime, data = traindata, family = "binomial")
model5 <- glm(pass ~ goout, data = traindata, family = "binomial")
model6 <- glm(pass ~ absences, data = traindata, family = "binomial")
model7 <- glm(pass ~ G1, data = traindata, family = "binomial")
model8 <- glm(pass ~ G2, data = traindata, family = "binomial")
model9 <- glm(pass ~ G3, data = traindata, family = "binomial")
# Confusion matrix of each model on the held-out test data
fnsummarystat(fullmodel, testdata)
fnsummarystat(model1, testdata)
fnsummarystat(model2, testdata)
fnsummarystat(model3, testdata)
fnsummarystat(model4, testdata)
fnsummarystat(model5, testdata)
fnsummarystat(model6, testdata)
fnsummarystat(model7, testdata)
fnsummarystat(model8, testdata)
fnsummarystat(model9, testdata)
oh, I see, that's why I was having a problem doing the decision tree model. thanks for the suggestion!!!
You are welcome. That dataset is very interesting; it has lots of key variables. Have a look at the video on my channel. Apart from the confusion matrix, you can evaluate all models and see which variables are important and influence the target variable, along with the likelihood ratio, KS plot, AUC, and a lot more.
source code is provided as a link in the description. the beauty of the script is that it is like an application and you just have to download and run the application, no further coding is required. the video has a full demo
please mark it as a solution, if it meets your requirements
Will do, but I'm sorry, it shows this warning. Can I know what the problem is?
Warning messages:
1: glm.fit: algorithm did not converge
2: glm.fit: fitted probabilities numerically 0 or 1 occurred
yes, i also noticed, that some of the models give such warning. however, you get the confusion matrix
The following link may clarify this scenario: "How to Handle R Warning: glm.fit: algorithm did not converge" (Statology).
sure, thank you a lot!!!
Hi
please mark it as a solution, if it meets your requirements
This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.
If you have a query related to it or one of the replies, start a new topic and refer back with a link.