I'm looking at decision trees (with a view to random forest style modelling) to compare against my logistic regression baseline. The other day the predict() step was working fine, but now every time I get to line 141 R encounters a fatal error and aborts the session. I have reinstalled R and RStudio, restarted my computer, and tried everything else I can think of.
#-----Section 01-------------------------------------------
## Load Packages
library(caret)
library(C50)
library(plyr)
library(gmodels)
library(ROCR)
##xG
#Read in events Data
# set working directory
setwd(dirname(file.choose()))
getwd()
events <- read.csv("desc.csv", stringsAsFactors = FALSE)
head(events) # Inspect top rows of the data
str(events)
# select variables
myvars <- names(events[c(17:22)])
events <- events[myvars]
rm(myvars)
str(events)
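# If the raw file's column order ever changes, selecting the six modelling
# variables by name is safer than positions 17:22 (a sketch, assuming those
# positions hold exactly these columns):
# events <- events[c("location", "bodypart", "assist_method",
#                    "situation", "fast_break", "is_goal")]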
#-----Section 02-------------------------------------------
## structure data
# BodyPart
events$bodypart <- factor(events$bodypart, levels = c("1", "2", "3"), exclude = NA,
labels = c("right_foot","left_foot", "header"))
summary(events$bodypart)
# Situation
events$situation <- factor(events$situation, levels = c("1", "2", "3", "4"), exclude = NA,
labels = c("open_play","set_piece", "corner", "free_kick"))
summary(events$situation)
# Assist Method
events$assist_method <- factor(events$assist_method, levels = c("0", "1", "2", "3", "4"), exclude = NA,
labels = c("no_assist","assist_pass", "assist_cross", "assist_header", "assist_throughball"))
summary(events$assist_method)
# Fast_break
events$fast_break <- factor(events$fast_break, levels = c("0", "1"), exclude = NA,
labels = c("no","yes"))
summary(events$fast_break)
# Is_goal
events$is_goal <- factor(events$is_goal, levels = c("0", "1"), exclude = NA,
labels = c("0","1"))
summary(events$is_goal)
# Location
events$location <- factor(events$location, levels = c("3","7","9","10","11","12","13","14","16"), exclude = NA,
labels = c("centre_box","diff_angle","left_side_box", "left_side_6ybox",
"right_side_box","right_side_6ybox","close_range","penalty","long_range"))
summary(events$location)
# create dummy variables
library(dummies)
shots <- dummy.data.frame(events, names = c("location", "assist_method") , sep = ".")
colnames(shots) <- c("centre_box","diff_angle", "left_side_box", "left_side_6ybox", "right_side_box","right_side_6ybox", "close_range",
"penalty", "long_range","bodypart","no_assist", "assist_pass", "assist_cross", "assist_header","assist_through_ball","situation",
"fast_break","is_goal")
# check for missing data
apply(shots, MARGIN = 2, FUN = function(x) sum(is.na(x)))
library(Amelia)
missmap(shots, col = c("black", "grey"), legend = FALSE)
shots <- na.omit(shots) # remove any missing data
str(shots)
#-----Section 03-------------------------------------------
# train and test subsets
set.seed(12345)
shots.rand <- shots[order(runif(nrow(shots))), ] # randomise row order (3812 rows)
# split into training (75%) and test (25%) data sets
shots_tr <- shots.rand[1:2859, ]
shots_te <- shots.rand[2860:3812, ]
round(prop.table(table(shots_tr$is_goal))*100,1)
round(prop.table(table(shots_te$is_goal))*100,1)
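# Alternative split: caret::createDataPartition() stratifies on the outcome, so
# the goal rate stays almost identical in both subsets (a sketch; it would
# replace the runif()-based split above):
# in_train <- createDataPartition(shots$is_goal, p = 0.75, list = FALSE)
# shots_tr <- shots[in_train, ]
# shots_te <- shots[-in_train, ]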
#-----Section 04-------------------------------------------
# explore the data
# look at two of the shot location dummies
table(shots$left_side_box)
table(shots$diff_angle)
# look at two more shot characteristics
summary(shots$close_range)
summary(shots$penalty)
# look at the classification variable
table(shots$is_goal)
prop.table(table(shots$is_goal))
# Outcome as factor
# is_goal was already recoded to a factor with levels "0"/"1" in Section 02, and
# C5.0 needs a factor outcome, so keep it as a factor in both subsets
shots_tr$is_goal <- factor(shots_tr$is_goal, levels = c("0", "1"))
shots_te$is_goal <- factor(shots_te$is_goal, levels = c("0", "1"))
# training a model on the data
# build the simplest decision tree
library(caret)
library(C50)
library(AppliedPredictiveModeling)
set.seed(12345)
shots_model <- C5.0(shots_tr[-18], shots_tr$is_goal)
# [-18] excludes column 18, 'is_goal', so the outcome is not used as a predictor
# display simple facts about the tree
shots_model
# display detailed information about the tree
summary(shots_model)
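# Optional extras from the C50 package: C5imp() ranks how heavily each predictor
# is used by the tree, and plot() draws the tree via partykit when it is small
# enough to read
C5imp(shots_model)
# plot(shots_model)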
#-----Section 06-------------------------------------------
# is_goal is already a factor in both subsets (see Section 02 and the split above);
# no character/numeric copy is needed here, and adding one as an extra column
# would leak the outcome into the predictors used by the later C5.0 calls
# evaluating model performance
# create a factor vector of predictions on test data
shots_pred1 <- predict(shots_model, shots_te)
# cross tabulation of predicted versus actual classes
library(gmodels)
CrossTable(shots_te$is_goal, shots_pred1,
prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
dnn = c('actual goal', 'predicted goal'))
# more diagnostics
library(caret)
confusionMatrix(shots_pred1, shots_te$is_goal, positive = "1")
#-----Section 07-------------------------------------------
# improving model performance
# pruning the tree to simplify and/or avoid over-fitting
?C5.0Control
set.seed(12345)
shots_prune <- C5.0(shots_tr[-18], shots_tr$is_goal,
control = C5.0Control(minCases = 9)) # at least 9 training obs. (~0.3%) per leaf
shots_prune
summary(shots_prune)
shots_prune_pred <- predict(shots_prune, shots_te)
CrossTable(shots_te$is_goal, shots_prune_pred,
prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
dnn = c('actual goal', 'predicted goal'))
confusionMatrix(shots_prune_pred, shots_te$is_goal, positive = "1")
# boosting the accuracy of decision trees
# boosted decision tree with 10 trials
set.seed(12345)
shots_boost10 <- C5.0(shots_tr[-18], shots_tr$is_goal, control = C5.0Control(minCases = 9), trials = 10)
shots_boost10
summary(shots_boost10)
shots_boost_pred10 <- predict(shots_boost10, shots_te)
CrossTable(shots_te$is_goal, shots_boost_pred10,
prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
dnn = c('actual is_goal', 'predicted is_goal'))
confusionMatrix(shots_boost_pred10, shots_te$is_goal, positive = "1")
# boosted decision tree with 100 trials
set.seed(12345)
# predictors and labels must both come from the same subset (shots_tr), with the
# outcome column excluded; mixing in the full shots$is_goal vector here gives
# C5.0 mismatched input lengths, which can abort the session
shots_boost100 <- C5.0(shots_tr[-18], shots_tr$is_goal, control = C5.0Control(minCases = 9), trials = 100)
shots_boost100
shots_boost_pred100 <- predict(shots_boost100, shots_te)
CrossTable(shots_te$is_goal, shots_boost_pred100,
prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
dnn = c('actual is_goal', 'predicted is_goal'))
confusionMatrix(shots_boost_pred100, shots_te$is_goal, positive = "1")
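# Optional: put accuracy and kappa for the three trees side by side, ready to
# compare against the logistic regression baseline (re-runs the confusionMatrix()
# calls above and pulls out the overall statistics)
cm_base  <- confusionMatrix(shots_pred1,         shots_te$is_goal, positive = "1")
cm_prune <- confusionMatrix(shots_prune_pred,    shots_te$is_goal, positive = "1")
cm_boost <- confusionMatrix(shots_boost_pred100, shots_te$is_goal, positive = "1")
sapply(list(base = cm_base, pruned = cm_prune, boost100 = cm_boost),
       function(cm) cm$overall[c("Accuracy", "Kappa")])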
#-----Section 08-------------------------------------------
# evaluating using ROC curve and value of AUC
# note: only works on two-value outcomes and NOT when using a cost matrix
library(ROCR)
# prepare probability data for outcomes
shots_prob <- predict(shots_boost100, shots_te, type = "prob")
# bind with test and earlier predicted data
shots_res <- cbind(shots_te, shots_boost_pred100, shots_prob)
head(shots_res)
# create a prediction object
# use the predicted probability of class "1" (a goal) and the actual test labels
shots_pred <- prediction(predictions = shots_prob[, "1"], labels = shots_te$is_goal)
# plot ROC curve
?performance()
shots_perf1 <- performance(shots_pred, measure = "tpr", x.measure = "fpr")
plot(shots_perf1, lwd = 2)
abline(a = 0, b = 1, lty = 2)
# calculate the area under the curve (AUC)
shots_perf2 <- performance(shots_pred, measure ="auc")
shots_perf2@y.values
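# For the comparison with the logistic regression baseline mentioned at the top,
# the same ROCR steps work on a glm fit. A sketch only ('shots_logit' is a name
# made up here; the redundant dummy columns make the fit rank-deficient, so
# predict() will warn about that but still return probabilities):
shots_logit <- glm(is_goal ~ ., data = shots_tr, family = binomial)
logit_prob  <- predict(shots_logit, shots_te, type = "response")
logit_pred  <- prediction(predictions = logit_prob, labels = shots_te$is_goal)
performance(logit_pred, measure = "auc")@y.values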
# remove all variables from the environment
rm(list=ls())