How to change language of termDocumentmatrix?

oktayozden · December 2, 2019, 7:14pm

I need to change language to Turkish in the function of termDocumentmatrix. Could you possibly help me?

this code works. I reach result what i want for stemming, stopwords etc for Turkish.


dat<-"BirGün, Türkiye'de günlük olarak yayımlanan ulusal bir gazete.
Gazetenin yazı işleri müdürü Berkant Gültekin, yayın danışmanı Barış İnce, 
sorumlu müdürü Cansever Uğur ve haber koordinatörü İbrahim Varlı'dır. Yayın 
hayatına 14 Nisan 2004'te başlayan gazetenin sahibi Birgün Yayıncılık ve İletişim Ticaret AŞ'd"

dat%>% 
  tokens(remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE)%>%
  tokens_remove(stopwords("tr", source = "stopwords-iso")) %>%
  tokens_wordstem(language = "turkish") %>%
  tokens_tolower() 

Result:
[1] "birg"        "türkiye'"    "günlük"      "yayımlana"   "ulusal"      "gaze"        "gazete"      "yaz"         "iş"         
[10] "müdür"       "berkant"     "gültek"      "yay"         "danışma"     "barış"       "ince"        "sorumlu"     "müdür"      
[19] "cansever"    "uğur"        "haber"       "koordinatör" "ibrah"       "varlı'"      "yay"         "hayat"       "nisa"       
[28] "te"          "başlaya"     "gazete"      "sahip"       "birgi"       "yayıncılık"  "iletiş"      "ticaret"     "aş'd"       


#> Error in dat %>% tokens(remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE) %>% : "%>%" fonksiyonu bulunamadı

^{Created on 2019-12-02 by the reprex package (v0.3.0)}

But, i dont integrate these process into term document matrix which is below that i try to mine pdf file


library(pdftools)
library(tm)
library(SnowballC)
library(dplyr)
library(stringr)
library(tidytext)
library(quanteda)



files <- list.files(pattern = "pdf$")
file<-as.character(files)
opinions <- lapply(files, pdf_text)
length(opinions)


lapply(opinions, length) 


corp <- Corpus(URISource(files),
               readerControl = list(reader = readPDF))


opinions.tdm <- TermDocumentMatrix(corp, 
                                   control = 
                                     list(language="turkish",
                                          stopwords = TRUE,
                                          removePunctuation = TRUE,
                                          tolower = TRUE,
                                          stemming = TRUE,
                                          removeNumbers = TRUE,
                                          bounds = list(global = c(1, Inf)))) 



inspect(opinions.tdm[1:10,]) 


opinions.tdm <- TermDocumentMatrix(corp, 
                                   control = 
                                     list(language="turkish",
                                          stopwords = TRUE,
                                          tolower = TRUE,
                                          stemming = TRUE,
                                          removePunctuation=TRUE,
                                          removeNumbers = TRUE,
                                          bounds = list(global = c(1, Inf))))    



findFreqTerms(opinions.tdm, lowfreq = 100, highfreq = Inf)

ft <- findFreqTerms(opinions.tdm, lowfreq = 100, highfreq = Inf)
as.matrix(opinions.tdm[ft,]) 

ft.tdm <- as.matrix(opinions.tdm[ft,])
sort(apply(ft.tdm, 1, sum), decreasing = TRUE)

a<-sort(apply(ft.tdm, 1, sum), decreasing = TRUE)

a<-as.data.frame(a)

a


a$word<-rownames(a)


aa<-a %>%  filter(a > 200) %>%
  mutate(word = reorder(word, a))

^{Created on 2019-12-02 by the reprex package (v0.3.0)}

system · December 24, 2019, 12:32pm

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.