rm(list = ls())
pacman::p_load(tidyverse, lubridate,
rvest, glue,
gtrendsR, rtweet,
broom, modelr,
Hmisc, DT,
jtools, huxtable,
interactions,
gvlma, ggfortify, car, psych,
ggthemes, scales, gridExtra
)
glimpse(covid_us_jurisdiction)
glimpse(covid_us_tidy)
glimpse(US_2020)
glimpse(us_vaccination)
glimpse(us_vaccination_news)
view(us_vaccination_news)
view(covid_us_jurisdiction)
view(covid_us_merged)
# Import of CSV files
covid_us_jurisdiction <- read_csv("cdc_vaccines_distributed_administered_by_jurisdiction.csv")
us_2020 <- read_csv("voting.csv")
# Tidy merged dataset
covid_us_tidy <- covid_us_jurisdiction %>%
select(date, state_abbreviation, state_name, Administered_Dose2_Recip, population) %>%
rename(state = state_name) %>%
mutate(vaccination_rate = Administered_Dose2_Recip/population)
# Identify Republican or Democratic State
rep_dem <- us_2020 %>%
mutate(blue_red_state = ifelse(trump_win, "Republican", "Democratic"))
# Join covid vaccination data set with red/blue state data
us_vaccination <- merge(x = covid_us_tidy, y = rep_dem[ , c("state", "blue_red_state")], by = "state", all.x=TRUE)
# Find out unemployment data for US
# Extracting volume searches of fake news via Gtrends R
covid_search <- gtrends(keyword = c("covid 19 vaccine + dna",
"covid 19 vaccine + death",
"covid 19 vaccine + pregnancy",
"covid 19 vaccine + sick"),
geo = "US",
gprop = "web",
time = "2021-01-04 2021-10-24")
covid_region <- covid_search$interest_by_region
covid_region <- covid_region %>%
rename(state = location,
volume_of_hits = `hits`) %>%
mutate(volume_of_hits = as.numeric(volume_of_hits))
us_vaccination_news <- us_vaccination %>%
inner_join(covid_region,
by = "state") %>%
select(-gprop)
# Dummification of Google trends result
# Set K-1 dummy variables only. This is to avoid multi-collinearity problem
# In our case, Covid vaccine + sick will be our baseline, therefore dummy variable for this keyword will not be created
us_vaccination_news$covid_dna <- ifelse(us_vaccination_news$keyword == 'covid 19 vaccine + dna', 1, 0)
us_vaccination_news$covid_death <- ifelse(us_vaccination_news$keyword == 'covid 19 vaccine + death', 1, 0)
us_vaccination_news$covid_pregnancy <- ifelse(us_vaccination_news$keyword == 'covid 19 vaccine + pregnancy', 1, 0)
# Use Analysis of variance for
model1_log <- us_vaccination_news %>%
lm(sqrt(vaccination_rate) ~ volume_of_hits,
.)
gvlma(model1_log)
model1_poissons <- us_vaccination_news %>%
glm(vaccination_rate ~ volume_of_hits,
data = .,
family = poisson(link = "log")
)
anova(model1_poissons, test = "Chisq")
plot(model1_log, 1)
plot(model1_poissons, 1)
# Check the distribution of Y variable
Density_Y <- density(us_vaccination_news$vaccination_rate)
plot(Density_Y)
histogram <- us_vaccination_news %>%
ggplot(aes(x = vaccination_rate)
) +
geom_histogram(fill = "tomato3",
bins = 75) # NOT robust
density <- our_merged_dataset %>%
ggplot(aes(x = cases)
) +
geom_density(fill = "deepskyblue4",
alpha = 0.25)
'''