Hi community,
I have been trying to scrape the links and published dates of PDF documents from a website using rvest, but the function I wrote keeps printing its own definition instead of giving a result. I want my function to run and return the scraped data rather than just echoing the function back to me.
# Load Packages ------
pacman::p_load(
# Data Wrangling
tidyverse, lubridate, magrittr,
# Web scraping
rvest, xopen,
# Text data mining
readtext, tidytext,
quanteda, textclean
)
# Listing pages to scrape, stored as a one-column tibble (`page`).
# The URLs must be plain strings: the markdown link wrapping (`[url](url)`)
# has been removed and the `?` that starts the query string restored.
search_pages <- c(
  "https://www.cbn.gov.ng/Documents/quarterlyecoreports.asp?beginrec=1&endrec=20&keyword=&from=&tod=",
  "https://www.cbn.gov.ng/Documents/quarterlyecoreports.asp?beginrec=21&endrec=40&keyword=&from=&tod="
) %>%
  tibble(page = .) %>%
  print()
# Create a function to grab the links
#' Scrape report titles, published dates and download links from one page
#'
#' @param page Either the URL of a listing page (a length-one character
#'   vector) or an already parsed document as returned by `read_html()`.
#' @return A tibble with the columns `title`, `date` (a character string
#'   formatted as "%Y%m%d") and `links` (absolute download URLs).
#' @examples
#' # Defining the function only stores it; it has to be *called* to get a
#' # result, e.g. once per search page:
#' # all_links <- map(search_pages$page, get_links) %>% bind_rows()
get_links <- function(page) {
  # Use the page that was passed in. Reading the global `search_pages`
  # tibble here ignored the argument and handed a tibble to `read_html()`.
  if (is.character(page)) {
    page <- read_html(page)
  }

  # The same anchors supply both the title text and the href.
  link_nodes <- page %>%
    html_elements(".dbasetable a")

  # Create a table of extracted data.
  # NOTE(review): tibble() requires every column to have the same length
  # (or length 1); this assumes one '#publishedDt' match per anchor --
  # confirm against the live page.
  tibble(
    # Get Title
    title = link_nodes %>%
      html_text2() %>%
      str_remove_all(
        "(CBN )|(Economic Report)|(for )|(the )|(Published\\s\\d+/\\d+/\\d+)|(of)"
      ) %>%
      str_squish(),
    # Get Published Date
    date = page %>%
      html_elements("#publishedDt") %>%
      html_text2() %>%
      str_squish() %>%
      str_replace("Published ", "") %>%
      str_extract("\\d+/\\d+/\\d+") %>%
      mdy() %>%
      format(., format = "%Y%m%d"),
    # Get the download links: drop the leading ".." of the relative href
    # and prepend the site root to make the URL absolute.
    links = link_nodes %>%
      html_attr("href") %>%
      str_replace("^(\\.\\.)", "") %>%
      str_c("https://www.cbn.gov.ng", .)
  )
}
Thank you