hi guys, im new here so sorry if this is the wrong category!
Netflix Movie and TV Shows (June 2021) | Kaggle
I have a netflix dataset and i'd like to predict the success of a tvseries with its age certificate is that possible ? because i couldnt do it ! can anyone help me please?
library(tidyverse)
library(forecast)
library(dbplyr)
library(stats)
library(caret)
library(pROC)
library(e1071)
library(rpart)
library(rpart.plot)
library(caret)
library(factoextra)
library(tidyr)
library(ggplot2)
library(cluster)
netflix <- read_csv("C:/Users/User/Desktop/netflix_list.csv",
col_types = cols(imdb_id = col_number(),
popular_rank = col_number(), startYear = col_number(),
endYear = col_number(), episodes = col_number(),
runtime = col_number(), rating = col_number(),
numVotes = col_number(), cast = col_skip(),
image_url = col_skip()))
serie <- netflix %>%
select(popular_rank, type, certificate) %>%
filter(type == "tvSeries")
serie_true <- serie %>%
mutate( certificate = case_when(certificate == "13" ~ "jugendlich", certificate == "13+" ~ "jugendlich" , certificate == "12+" ~ "jugendlich" , certificate == "15" ~ "jugendlich", certificate == "15+" ~ "jugendlich", certificate == "7" ~ "jugendlich", certificate == "All" ~ "jugendlich", certificate == "UA" ~ "jugendlich", certificate == "U" ~ "jugendlich" , certificate == "7" ~ "jugendliche", TRUE ~ "volljährig"))
serie_true <- serie_true %>%
mutate(popular_rank = if_else(popular_rank <150, 1, 0))
serie_true <- serie_true %>%
select(-type)
dummy_serie <- dummyVars("~certificate", data = serie_true, fullRank = TRUE)
dummy_certificate <- data.frame(predict(dummy_serie, newdata = serie_true))
serie_true <- serie_true %>%
mutate(certificate = as.factor(certificate))%>%
mutate(popular_rank = as.factor(popular_rank))
serie_subset <- cbind(serie_true, dummy_certificate)
# set seed
set.seed(456)
# train and test data
train_index <- createDataPartition(serie_subset$popular_rank, p = 0.8)$Resample1
train_serie <- serie_subset[train_index, ]
test_serie <- serie_subset[-train_index, ]
# check ratio of variables
serie_true %>%
group_by(popular_rank, certificate) %>%
count()
train_serie %>%
group_by(popular_rank, certificatevolljährig) %>%
count()
# load packages
library(rpart)
library(rpart.plot)
# train our model
tree_serie <- rpart(popular_rank ~ ., data = train_serie, method = "class")
# plot our first decision tree
rpart.plot(tree_serie)
# apply model to test data
dt_results_serie <- predict(tree_serie, test_serie[, -1], type = "prob")
dt_results_serie
# bind model data to titanic test data
model_dt_results_serie <- cbind(test_serie, dt_results_serie)
head(model_dt_results_serie)