Sys.setlocale(category = "LC_ALL", locale = "")
library(tidyverse)
dat<-readxl::read_xlsx("examplexlsx.xlsx")
dat<-dat%>%
select(v1,v2)
dat$v1<-as.factor(dat$v1)
library(tm)
library(tidytext)
library(textdata)
sms_corpus<-VCorpus(VectorSource(dat$v2))
inspect(sms_corpus[1:2])
as.character(sms_corpus[[1]])
lapply(sms_corpus[1:4],as.character)
library(stringi)
sms_corpus<-tm::tm_map(sms_corpus,tm::content_transformer(function(x) stri_trans_tolower(x)))
sms_corpus<-tm::tm_map(sms_corpus,tm::removeNumbers)
english_stopwords <- tm::stopwords("english")
english_stopwords <- iconv(english_stopwords, from = "UTF-8", to = "UTF-8")
sms_corpus<-tm_map(sms_corpus,removeWords,english_stopwords)
sms_corpus<-tm_map(sms_corpus,removePunctuation)
sms_corpus<-tm_map(sms_corpus,tm::stemDocument)
sms_corpus<-tm_map(sms_corpus,stripWhitespace)
sms_dtm<-DocumentTermMatrix(sms_corpus)
sms_dtm_train<-sms_dtm[1:350,]
sms_dtm_test<-sms_dtm[351:500,]
sms_train_labels<-dat[1:350,]$v1
sms_test_labels<-dat[351:500,]$v1
library(wordcloud)
wordcloud::wordcloud(sms_corpus,random.order = FALSE)
sms_freq_words<-tm::findFreqTerms(sms_dtm_train,5)
#sms_dtm_freq_train<-sms_dtm_train[,sms_freq_words]
#sms_dtm_freq_test<-sms_dtm_test[,sms_freq_words]
convert_counts<-function(x){
x<-ifelse(x > 0, "Yes", "No")
}
sms_train<-apply(sms_dtm_train, MARGIN = 2,convert_counts)
sms_test<-apply(sms_dtm_test, MARGIN = 2,convert_counts)
library(naivebayes)
sms_classifier <- naive_bayes(sms_train,sms_train_labels)
library(gmodels)
sms_test_pred<-predict(sms_classifier,sms_test)
最后预测报错信息如下:
Error in `[.default`(tab, V, ) : 下标出界
I'm look forward your advice as soon as possible,thank you!