Hi all.
I cant classify text using simple data and example.
Here data:
CATEGORY WORDS
животное животное лиса волк заяц
растение растение выращивание пшеница агрохолдинг
люди люди ресторан еда кинотеатр айфон портфель бизнес
археология археология раскопки черепок ваза кости динозавр
математика математика плюс делить произведение частное формула
отдых отдых море ресторан еда пляж арктика эверест лыжи
here code for training:
library(tm)
library(qdap)
Sys.setlocale("LC_ALL", 'ru_RU.CP1251')
mydata = read.delim("data.txt",header=TRUE,"\t",encoding="CP1251")
mydata$WORDS[1]
# Make a vector source
mydata_vector <- VectorSource(mydata$WORDS[1:3])
# Make a volatile corpus
mydata_corpus <- VCorpus(mydata_vector)
# Print out mydata_corpus
mydata_corpus
# Print data on the 15th row in mydata_corpus
mydata_corpus[[1]]
# Print the content of the 15th row in mydata_corpus
mydata_corpus[[1]]$content
# Alter the function code to match the instructions
clean_corpus <- function(corpus){
corpus <- tm_map(corpus, content_transformer(bracketX))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, content_transformer(function(x) gsub(x, pattern = "ё", replacement = "е")))
#через стоп-слова почему-то не удаляется этот символ
corpus <- tm_map(corpus, content_transformer(function(x) gsub(x, pattern = "№", replacement = " ")))
corpus <- tm_map(corpus, content_transformer(function(x) gsub(x, pattern = '(сч.|счет|дог.|договор|документ|счф)(а|у)? [^о]+от [^ ]+ ', replacement = "")))
corpus <- tm_map(corpus, content_transformer(function(x) gsub(x, pattern = "сумм(е|а)", replacement = "")))
corpus <- tm_map(corpus, content_transformer(function(x) gsub(x, pattern = "[./-]", replacement = " ")))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, c(stopwords("ru"), "№", "т ч", "том числе", "включая", "ндс", "rub","руб","коп","по счетам","по счету","счет", "счф", "дог", "договору", "оплата","этапу","года"))
#все слова длиной два и менее
corpus <- tm_map(corpus, content_transformer(function(x) gsub(x, pattern = "\\b\\S{1,2}\\b", replacement = "")))
corpus <- tm_map(corpus, stripWhitespace)
return(corpus)
}
clean_corp <- clean_corpus(mydata_corpus)
# Print out a cleaned up text
clean_corp[[1]][1]
# Print out the same text in original form
mydata_corpus[[1]][1]
# The RWeka package is already loaded
library(RWeka)
# Define bigram tokenizer
tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
# Create tdm and matrix with tf-idf weighting
# control = list(weighting = weightTfIdf, tokenize = tokenizer)
tf_idf_dtm <- DocumentTermMatrix(
clean_corp,
#control = list(tokenize = tokenizer)
list(removePunctuation = TRUE, stopwords = TRUE, stemming = TRUE, removeNumbers = TRUE)
)
tf_idf_dtm_m <- as.matrix(tf_idf_dtm)
#tf_idf_dtm_m[order(tf_idf_dtm_m$1),]
#tf_idf_dtm_m.sorted=tf_idf_dtm_m[order(tf_idf_dtm_m[,2], decreasing = TRUE),]
# Print the dimensions of the matrix
dim(tf_idf_dtm_m)
tf_idf_dtm_m[1:3, 1:15]
#tf_idf_dtm_m.sorted[1:20, 1:6]
CATEGORY.factor = as.factor(mydata$CATEGORY)
tf_idf_dtm_m2 <- cbind(tf_idf_dtm_m, c(0, 1, 2) )
colnames(tf_idf_dtm_m2)[ncol(tf_idf_dtm_m2)] <- 'CATEGORY'
tf_idf_dtm_m3 <- as.data.frame(tf_idf_dtm_m2)
tf_idf_dtm_m3$CATEGORY <- as.factor(tf_idf_dtm_m3$CATEGORY)
library(caret)
# Train.
# list of all available methods: names(getModelInfo())
# http://topepo.github.io/caret/train-models-by-tag.html
# Bayesian Generalized Linear Model
fit <- train(CATEGORY ~ ., data = tf_idf_dtm_m3, method = 'bayesglm')
this code raise warnings:
Warning messages:
1: predictions failed for Resample03: parameter=none Error in family(object)$linkinv(pred) :
Argument eta must be a nonempty numeric vector2: predictions failed for Resample10: parameter=none Error in family(object)$linkinv(pred) :
Argument eta must be a nonempty numeric vector3: predictions failed for Resample15: parameter=none Error in family(object)$linkinv(pred) :
Argument eta must be a nonempty numeric vector4: In nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
There were missing values in resampled performance measures.
and cant classify words correctly:
# Test data.
test_data <- c('заяц')
corpus <- VCorpus(VectorSource(test_data))
test_dtm <- DocumentTermMatrix(corpus, control = list(dictionary = Terms(tf_idf_dtm), removePunctuation = TRUE, stopwords = TRUE, stemming = TRUE, removeNumbers = TRUE))
test_matrix <- as.matrix(test_dtm)
# Check accuracy on test.
result = predict(fit, newdata = test_matrix)
#result
#mydata$CATEGORY[result]
paste("type of '",test_data,"' word is: '",mydata$CATEGORY[result],"'", sep="")
Questions:
- Why warnings appears, whats wrong with code?
- Why prediction returns incorrect result (because "заяц" word is included only in one category = "животные", but code returns "растения" category )?