Hypothesis -
The Loans Division of Bank want to know the accounts who are likely to default in repaying the loans when the contract ends
execution problem -
getting the following error while running confusion matrix (Error: data
and reference
should be factors with the same levels.) . please check and help me on this
loan <-
read.csv("C:/Users/sao/Downloads/banking_data/Banking_Data/loan.txt", sep=';')
trans <- read.csv("C:/Users/sao/Downloads/banking_data/Banking_Data/trans.txt", sep=';')
trans <- subset(trans, select = c(account_id,balance,k_symbol))
loanaccount <- merge(trans, loan, by="account_id")
loanaccount <- subset(loanaccount,select = -c(loan_id))
##checking missing value
is.na(loanaccount)
which(is.na(loanaccount))
##duplicated values
unique(loanaccount)
distinct(loanaccount)
## create training and test data
install.packages("DMwR")
library(DMwR)
str(loanaccount)
##data split
datasplit <- sample(nrow(loanaccount), round(nrow(loanaccount)*0.8))
trainigdata <- loanaccount[datasplit,]
testdata <- loanaccount[-datasplit,]
unique(trainigdata)
## loan amount distribution and box plot
library(ggplot2)
give_count <- stat_summary(fun.data = function(x) return(c(y = median(x)*1.06, label = length(x))),
geom = "text")
give_mean <-
stat_summary(fun.y = mean, colour = "darkgreen", geom = "point",
shape = 18, size = 3, show.legend = FALSE)
ggplot(trainigdata, aes(x=k_symbol, y=amount))+ +
geom_boxplot(outlier.colour="black", outlier.shape=16,outlier.size=2, notch=FALSE) +
give_count +
give_mean +
scale_y_continuous(labels = comma) +
labs(title="Loan Amount by status", x = "loan purpose", y = "Loan Amount \n")
## summary on training dataset
summary(trainigdata)
summary(trainigdata$status)
summary(trainigdata$k_symbol)
## t-test result
install.packages("graphics")
library(graphics)
install.packages("pwr")
library(pwr)
install.packages("nparcomp")
library(nparcomp)
t.test(trainigdata$amount, testdata$amount)
t.test(trainigdata$amount, loanaccount$amount)
## making tree model from train data
install.packages("tree")
library(tree)
train.loan <- tree(status~.-duration-date-payments-account_id, testdata)
plot(train.loan)
text(train.loan, pretty=0)
summary(train.loan)
## tree data prediction
treeloanprediction <- predict(train.loan,trainigdata, type = "class")
##logistic regression
lmloan <- glm(cbind(account_id,status)~.-payments,family="binomial", trainigdata)
summary(lmloan)$coeff
plot(lmloan)
##predict
predictlm <- predict(lmloan,newdata = testdata, type="response")
predictlm
## confufusion matrix sensitivy, secifity
library(heuristica)
library(caret)
library(ROCR)
library(stringi)
model_glm <- predict.glm(lmloan, testdata, type = "response", na.action = na.pass)
model_predict <- function(pred, t) ifelse (pred>t, TRUE, FALSE)
testdata <- testdata[complete.cases(testdata),]
caret::confusionMatrix(model_predict(model_glm, 0.5), reference = testdata, positive="TRUE")
## test set area under the curve
library(ROCR)
rocrpred <- prediction(model_glm, trainigdata$status)
pred <- prediction(predicttestdata,testdata$status)
as.numeric(performance(pred, "auc")@y.values)