Thank you to anyone who can lend some guidance.
I'm looking to calculate word frequencies for a database of transcribed interviews that I access online. Specifically, I would like to count how often each word in a dictionary that I have appears in those transcripts.
I have pasted the dictionary and the code I have so far for analyzing it below (I would provide a reprex, but I don't know how). Please let me know if you have done something similar before or know a way I could do this.
# NOTE: the workspace is deliberately not wiped here. `rm(list = ls())` deletes
# whatever objects the person running this script already had, yet leaves
# attached packages and options untouched, so it does not give a clean slate.
# Every object used below is assigned before it is read; restart R if a truly
# fresh session is needed.
library (stringr)
library (dplyr)
library(tidytext)
library(tidyverse)
library(rvest)
# Download the player index page (category 11, names starting with "a").
main.page <- read_html(x = "http://www.asapsports.com/show_player.php?category=11&letter=a")

# Build a one-column tibble `urls` from the table cells of the index page.
# Each matched cell is serialised to its HTML source, characters 30-79 are
# kept, and only the pieces that mention "show_player" survive.
# The explicit as.character() makes the nodeset -> string conversion visible
# instead of relying on stringr to coerce an xml_nodeset implicitly.
# NOTE(review): the fixed 30-79 window presumably isolates the player-page URL
# and silently breaks if the markup or the URL length changes; extracting the
# link with html_attr("href") would be sturdier -- confirm against the page.
urls <- main.page %>% # feed `main.page` to the next step
  html_nodes("tr+ tr td+ td") %>%
  as.character() %>%
  str_sub(30, 79) %>%
  str_subset("show_player") %>%
  tibble(urls = .) # tibble() replaces the deprecated as.tibble() + rename
names(urls)
# Build a one-column tibble `links` holding a short label for every player
# cell on the index page. For each cell whose characters 30-100 mention
# "show_player", characters 53-66 of that piece are kept, commas become
# underscores, and the characters < > \ / . space and ' are dropped.
# The explicit as.character() makes the nodeset -> string conversion visible
# instead of relying on stringr to coerce an xml_nodeset implicitly.
# NOTE(review): the fixed offsets presumably land on the link text; labels are
# cut at 14 characters and may keep leftovers of the closing tag -- confirm
# against the page, html_text() on the anchor would be sturdier.
links <- main.page %>% # feed `main.page` to the next step
  html_nodes("tr+ tr td+ td") %>%
  as.character() %>%
  str_sub(30, 100) %>%
  str_subset("show_player") %>%
  str_sub(53, 66) %>%
  str_replace_all(",", "_") %>%
  # One character class replaces the former chain of single-character removals.
  str_replace_all("[<>\\\\/. ']", "") %>%
  tibble(links = .) # tibble() replaces the deprecated as.tibble() + rename
names(links)
# Combine the player labels and their page URLs into one data frame with the
# columns `links` and `urls` (one row per player).
sotu <- data.frame(
  links = links$links,
  urls = urls$urls,
  stringsAsFactors = FALSE
)
head(sotu)

# View() opens a spreadsheet-style viewer, so it only makes sense when a
# person is sitting at the console; skip it in batch runs (e.g. Rscript).
if (interactive()) {
  View(sotu)
}
# Visit every player page listed in `sotu` and collect the href of each link
# matched by the selector below.
# Results are gathered in a pre-allocated list and combined once, instead of
# growing a table with rbind() from a "" seed (which also left an empty first
# row in `outfilea`).
# seq_len() keeps the loop from running when `sotu` has no rows.
player_hrefs <- vector("list", nrow(sotu))
for (i in seq_len(nrow(sotu))) {
  # read_html() replaces html(), the long-deprecated rvest alias.
  player_hrefs[[i]] <- read_html(sotu$urls[i]) %>% # load the page
    html_nodes("td td tr:nth-child(1) b a") %>%
    html_attr("href") # extract the URLs
}

# One row per collected href, in page order.
outfilea <- tibble(url = as.character(unlist(player_hrefs)))
outfilea

# Keep only real links and derive an id from the last five characters of the
# URL.
text2 <- outfilea %>%
  filter(str_detect(url, "http")) %>%
  mutate(id = str_sub(url, -5, -1))

# The viewer is only useful in an interactive session.
if (interactive()) {
  View(outfilea)
}
# Download every page listed in `text2` and keep the text of the matched table
# cells, one row per cell, tagged with the page id plus the suffix "basket".
# Rows are dropped when the text is missing, contains no lower-case letter, or
# contains one of the script fragments listed in the last filter.
# Pages are collected in a pre-allocated list and bound once at the end; the
# former rbind() onto a "" seed left a row of empty strings at the top of
# `outfileb`. seq_len() keeps the loop from running when `text2` is empty.
transcript_rows <- vector("list", nrow(text2))
for (i in seq_len(nrow(text2))) {
  cell_text <- read_html(text2$url[i]) %>% # load the page
    html_nodes("tr+ tr tr td~ td+ td") %>% # isolate the text
    html_text()

  transcript_rows[[i]] <-
    tibble(value = cell_text, id = paste0(text2$id[i], "basket")) %>%
    filter(!is.na(value)) %>%
    filter(grepl("[a-z]", value)) %>%
    # Same five exclusions as before, written as one alternation.
    filter(!grepl("var cx|function|var gcse|gcse.type|gcse.async", value))
}
outfileb <- bind_rows(transcript_rows)
head(outfileb)
# Word stems for the "care" category. Every stem starts with a space.
# NOTE(review): the leading space presumably anchors a substring search at the
# start of a word -- confirm against the matching code.
# " impair" used to be listed twice, which produced a duplicated dictionary
# row; it now appears once.
carefz1 <- c(
  " safe", " peace", " compassion", " help", " empath", " sympath",
  " protect", " secur", " benefit", " defen", " guard", " care", " caring",
  " shield", " shelter", " amity", " harm", " suffer", " warl",
  " fight", " violen", " hurt", " killer", " endanger", " cruel", " brutal",
  " abuse", " damag", " detriment",
  " crush", " attack", " annihilate", " impair", " war", " wars", " warring",
  " kill", " killing", " ravage",
  " destroy", " stomp", " spurn"
)

# Lookup table: one row per stem with the columns word, category and code.
# tibble() replaces the deprecated as.tibble() followed by a column rename.
carefz1care <- tibble(word = carefz1, category = "care", code = 1)
# Word stems for the "fair" category. Every stem starts with a space.
# NOTE(review): the leading space presumably anchors a substring search at the
# start of a word -- confirm against the matching code.
fairfz1 <- c(
  " fair-", " fairmind", " equal", " justifi", " reciproc", " impartial",
  " egalitar", " unbias", " balance",
  " unprejudice", " fair", " fairly", " fairness", " fairplay", " justice",
  " justness", " rights", " equity",
  " evenness", " equivalent", " tolerant", " equable", " homologous",
  " reasonable", " constant", " unfair",
  " unequal", " bias", " unjust", " injust", " bigot", " discriminat",
  " disproportion", " prejud", " exclud",
  " inequitable", " dishonest", " unscrupulous", " dissociate", " preference",
  " favoritism", " exclusion"
)

# Lookup table: one row per stem with the columns word, category and code.
# tibble() replaces the deprecated as.tibble() followed by a column rename.
fairfz1fair <- tibble(word = fairfz1, category = "fair", code = 1)
# The two word lists below were previously attached to each other's names:
# the obey/duty/law/rebel... stems were labelled "ingroup" and the
# nation/family/foreign/enemy... stems were labelled "authority". In the Moral
# Foundations Dictionary the former belong to Authority and the latter to
# Ingroup, so the lists have been swapped back. Variable names and table
# columns are unchanged; only which stems carry which label differs.
# Every stem starts with a space.
# NOTE(review): the leading space presumably anchors a substring search at the
# start of a word -- confirm against the matching code.

# Word stems for the "authority" category.
authorityfz1 <- c(
  " obey", " obedien", " duti", " honor", " respectful", " order", " father",
  " mother", " tradition",
  " hierarch", " authorit", " status", " rank", " leader", " caste",
  " complian", " submi", " allegian",
  " defere", " revere", " venerat", " duty", " law", " respect", " permit",
  " permission", " class",
  " bourgeoisie", " position", " command", " supremacy", " control", " serve",
  " abide", " comply",
  " defian", " rebel", " dissent", " subver", " disrespect", " disobe",
  " sediti", " agitat", " insubordinat",
  " illegal", " lawless", " defy", " riot", " insurgent", " mutinous",
  " dissident", " unfaithful",
  " alienate", " defector", " nonconformist", " oppose", " protest",
  " refuse", " denounce", " remonstrate", " obstruct"
)

# Lookup table: one row per stem with the columns word, category and code.
# tibble() replaces the deprecated as.tibble() followed by a column rename.
authorityfz1authority <- tibble(
  word = authorityfz1,
  category = "authority",
  code = 1
)

# Word stems for the "ingroup" category.
# " deceiv" used to be listed twice, which produced a duplicated dictionary
# row; it now appears once.
ingroupfz1 <- c(
  " nation", " homeland", " patriot", " commune", " communit", " communis",
  " comrad", " collectiv",
  " unite", " fellow", " devot", " cliqu", " together", " family",
  " families", " familial", " group",
  " communal", " cadre", " joint", " unison", " guild", " solidarity",
  " member", " cohort", " ally",
  " insider", " foreign", " enem", " individual", " deceiv", " jilt",
  " terroris", " immigra",
  " imposter", " miscreant", " spy", " sequester", " renegade"
)

# Lookup table: one row per stem with the columns word, category and code.
ingroupfz1ingroup <- tibble(word = ingroupfz1, category = "ingroup", code = 1)
# Word stems for the "purity" category. Every stem starts with a space.
# NOTE(review): the leading space presumably anchors a substring search at the
# start of a word -- confirm against the matching code.
purityfz1 <- c(
  " pure", " clean", " steril", " sacred", " chast", " saint", " celiba",
  " abstinen",
  " church", " purity", " holy", " holiness", " abstention", " virgin",
  " austerity", " modesty",
  " abstemiousness", " limpid", " unadulterated", " maiden", " virtuous",
  " refined", " immaculate",
  " innocent", " pristine", " disgust", " deprav", " disease", " unclean",
  " contagio", " sinful",
  " sinner", " slut", " dirt", " profan", " repuls", " sick", " promiscu",
  " lewd", " adulter",
  " debauche", " defile", " prostitut", " filth", " obscen", " taint",
  " stain", " tarnish",
  " debase", " desecrat", " exploitat", " sin", " whore", " impiety",
  " impious", " gross",
  " tramp", " unchaste", " intemperate", " wanton", " profligate", " trashy",
  " lax", " blemish", " pervert"
)

# Lookup table: one row per stem with the columns word, category and code.
# tibble() replaces the deprecated as.tibble() followed by a column rename.
purityfz1purity <- tibble(word = purityfz1, category = "purity", code = 1)
# Stack the five per-category lookup tables into one dictionary, in the order
# purity, ingroup, authority, fair, care.
foundationsdictionary <- bind_rows(
  purityfz1purity,
  ingroupfz1ingroup,
  authorityfz1authority,
  fairfz1fair,
  carefz1care
)