I am a PhD student learning to use machine learning algorithms for my dissertation in Information Assurance. I have been using XGBoost for text classification (sentiment analysis) on the Pang and Lee movie review dataset, which contains 2000 movie reviews, both positive and negative. With XGBoost alone I get 98.33% accuracy.

I then wanted to apply PCA to my bag-of-words feature set for dimensionality reduction. The prcomp function in R worked fine on the training set, and the training-set accuracy was 99.8%. However, when I call prcomp on the test partition I get the error "cannot rescale a constant/zero column to unit variance". This is the line where it fails:

prin_comp <- prcomp(dtm_test, scale. = TRUE)
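My understanding of the error is that some columns of dtm_test are constant (for example, vocabulary terms that never appear in any test review), and prcomp cannot rescale those to unit variance when scale. = TRUE. A quick check along those lines (dtm_test being the sparse matrix returned by create_dtm in the code below) would be something like:

col_variances <- apply(as.matrix(dtm_test), 2, var)
sum(col_variances == 0)                        # how many constant/zero-variance columns
head(colnames(dtm_test)[col_variances == 0])   # which terms they correspond to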
I am stuck at this step and cannot evaluate the model on the test set. My full R code for the project is below. Thanks for any help you can provide.
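# read the labelled movie reviews and load the required packages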
setwd('C:/rscripts/movies')
imdb = read.csv('movies.csv', stringsAsFactors = FALSE)
library(text2vec)
library(caret)
library(magrittr)
library(xgboost)
library(glmnet)
library(stringr)
colnames(imdb) <- c("class", "text")
imdb$text <- as.character(imdb$text)
head(imdb)
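# 70/30 split of the labelled reviews into training and test partitions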
set.seed(100)
inTrain1 <- createDataPartition(imdb$class, p = 0.70, list = FALSE)
train <- imdb[inTrain1, ]
test <- imdb[-inTrain1, ]
train <- cbind(train, id = rownames(train))
test <- cbind(test, id = rownames(test))
rownames(train) <- c(1:nrow(train))
rownames(test) <- c(1:nrow(test))
nrow(train)
nrow(test)
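# preprocessing (lower-casing and regex clean-up) and tokenization functions for text2vec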
prep_fun = function(x) {
  x = tolower(x)
  x = iconv(x, "latin1", "ASCII", sub = "")  # drop non-ASCII characters
  x = gsub('[[:punct:] ]+', ' ', x)          # collapse punctuation to single spaces
  x = gsub('[0-9]+', '', x)                  # strip digits
  x = gsub('@\\w+ *', '', x)                 # strip @handles
  x = gsub('http', '', x)                    # strip "http" fragments
  gsub('^\\s+|\\s+$', '', x)                 # trim leading/trailing whitespace
}
tok_fun = word_tokenizer
it_train = itoken(train$text,
                  preprocessor = prep_fun,
                  tokenizer = tok_fun,
                  ids = train$id,
                  progressbar = FALSE)
class(it_train)
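# build the initial vocabulary and document-term matrix from the training tokens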
vocab = create_vocabulary(it_train)
vectorizer = vocab_vectorizer(vocab)
t1 = Sys.time()
dtm_train = create_dtm(it_train, vectorizer)
print(difftime(Sys.time(), t1, units = 'sec'))
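# rebuild the vocabulary with a small custom stop-word list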
stop_words = c("the", "is", "and", "have", "off", "why", "my", "too", "for")
t1 = Sys.time()
vocab = create_vocabulary(it_train, stopwords = stop_words)
# punctuation, digits, @handles, "http" fragments, and stray whitespace
# are stripped by prep_fun before tokenization
print(difftime(Sys.time(), t1, units = 'sec'))
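# prune very rare and very common terms, then rebuild the training DTM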
pruned_vocab = prune_vocabulary(vocab,
                                term_count_min = 10,
                                doc_proportion_max = 0.5,
                                doc_proportion_min = 0.001)
vectorizer = vocab_vectorizer(pruned_vocab)
head(vocab)
t1 = Sys.time()
dtm_train = create_dtm(it_train, vectorizer)
print(difftime(Sys.time(), t1, units = 'sec'))
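# tokenize the test reviews and build the test DTM with the same vectorizer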
it_test = test$text %>%
  prep_fun %>%
  tok_fun %>%
  itoken(ids = test$id, progressbar = FALSE)
dtm_test = create_dtm(it_test, vectorizer)
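# PCA on each document-term matrix (the dtm_test call below is the one that throws the error)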
prin_comp <- prcomp(dtm_test, scale. = TRUE)
prin_comp2 <- prcomp(dtm_train, scale. = TRUE)
test_matrix <- xgb.DMatrix(prin_comp$x, label = test$class)
train_matrix <- xgb.DMatrix(prin_comp2$x, label = train$class)
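# XGBoost hyperparameters for the binary sentiment classifier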
xgb_params = list(
  objective = "binary:logistic",
  eta = 0.01,
  max.depth = 7,
  gamma = 1,
  colsample_bytree = 0.5,
  min_child_weight = 1,
  eval_metric = "error")
set.seed(1234)
# Pass in our hyperparameters and train the model
system.time(xgb <- xgboost(params = xgb_params,
                           data = train_matrix,  # label is already stored in the xgb.DMatrix
                           nrounds = 500,
                           print_every_n = 100,
                           verbose = 1))
xgb_fit <- xgboost(data = train_matrix, params = xgb_params, nrounds = 100)
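# in-sample predictions and confusion matrix on the training partition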
pred <- predict(xgb_fit,train_matrix)
pred.resp <- ifelse(pred >= 0.50, 1, 0)
table(pred.resp,train$class)
confusionMatrix(table(pred.resp, train$class))