For some strange reason, the special characters that I deleted appear again in my analysis in the bi-grams and trigrams. Here is my code in R Markdown, which knitted just fine.
Load the required packages.
library(tm)
library(ggplot2)
library(quanteda)
library(stringr)
Load the text files from the Working Directory.
tfilePath <- "C:/Users/mlrob/Documents/en_US/en_US.twitter.txt"
twitter <- readLines(tfilePath, skipNul = TRUE)
The Twitter file size of 2360148 elements (read from the environment window),
was too large for r to analyze. I analyzed a random sample of .01 percent of the file.
twittersample <- twitter[rbinom(length(twitter)*.01,length(twitter),.01)]
special characters in Twitter & â €¦™ ð Ÿ ¥ were removed.
twitter_edited <- str_replace_all(string=twittersample, pattern= "[&…™ðŸ¥]" , replacement= "")
Convert to a corpus, and use the tm package to clean the texts
tdocs<- Corpus(VectorSource(twitter_edited))
#Convert to lower case
tdocs <-tm_map(tdocs, content_transformer(tolower))
#Remove numbers
tdocs <-tm_map(tdocs,removeNumbers)
#Remove common stopwords
tdocs <- tm_map(tdocs,removeWords, stopwords("english"))
# Remove punctuations
tdocs <- tm_map(tdocs, removePunctuation)
#Eliminate extra white spaces
tdocs <- tm_map(tdocs, stripWhitespace)
a list of the 25 most commonly used words in Twitter
dtm <- TermDocumentMatrix(tdocs)
m <- as.matrix(dtm)
v <- sort(rowSums(m),decreasing=TRUE)
d <- data.frame(word=names(v), freq=v)
head(d, 25)
d25<- d[1:25,]
print(d25)
#Frequency word plots are here as follows
```{r}
barplot(d25$freq, las=2, names.arg=d25$word, col="lightblue",main="most frequent words
in Twitter",
ylab="Word frequencies")
# transform to quanteda corpus
qtdocs <- corpus(tdocs)
summary(qtdocs, 5)
##Twitter bi-grams
toks <- tokens(qtdocs)
toks_bigram <- tokens_ngrams(toks, n=2)
#Get document feature matrix of bigrams
dfm_bigrams <- dfm(toks_bigram)
bi_dat <- textstat_frequency(dfm_bigrams)
print(bi_dat[1:20])
#Plot the bigrams for Twitter
# plot 20 most frequent bigrams
library("ggplot2")
# plot the data
ggplot(bi_dat[1:20], aes(x= reorder(feature, frequency), y= frequency)) +
geom_bar(stat="identity") +
coord_flip() +
labs(x = NULL, y = "Frequency") +
labs(title="Twitter bi-grams")
#head(toks_bigram[[1]], 50)
head(toks_bigram, 25)
##Twitter tri-grams
#toks <- tokens(qtdocs)
toks_trigram <- tokens_ngrams(toks, n=3)
#Get document feature matrix of bigrams
dfm_trigrams <- dfm(toks_trigram)
tri_dat <- textstat_frequency(dfm_trigrams)
print(tri_dat[1:20])
#Plot the trigrams for Twitter
# plot the data
ggplot(tri_dat[1:20], aes(x= reorder(feature, frequency), y= frequency)) +
geom_bar(stat="identity") +
coord_flip() +
labs(x = NULL, y = "Frequency") +
labs(title="Twitter tri-grams")