I'm scraping Google News and storing the data in a data frame with two columns: title and URL. However, I want the absolute URL of the destination page, that is, the URL of the page the link redirects to, and not the news.google.com/... URL. How do I do that?
#' Scrape Google News search results for a search term
#'
#' Downloads the Google News search page for `term` and extracts each
#' headline together with its link.
#'
#' @param term A single search term. Characters that are unsafe in a URL
#'   (such as spaces) are percent-encoded before the request is made; a term
#'   that already looks percent-encoded is passed through unchanged.
#' @return A data frame with one row per result and two columns: `Title`
#'   (the headline text) and `Link` (the link on news.google.com, with the
#'   leading "./" replaced by the site root so the URL is absolute).
news <- function(term) {
  stopifnot(length(term) == 1L)

  base_url <- "https://news.google.com/"

  # Encode spaces and other unsafe characters so multi-word terms form a
  # valid URL. Reserved characters such as "+" are deliberately left alone
  # so terms already formatted for a query string keep working.
  query <- utils::URLencode(as.character(term))
  html_dat <- read_html(
    paste0(base_url, "search?q=", query, "&hl=en-US&gl=US&ceid=US%3Aen")
  )

  # NOTE(review): these CSS class names are presumably generated by Google
  # and may change without notice; verify against the live page.
  links <- html_dat %>%
    html_nodes(".WwrzSb") %>%
    html_attr("href")

  # The hrefs are expected to be relative ("./articles/..."). Anchor the
  # match at the start and escape the dot so only a real leading "./" is
  # rewritten, whatever path segment follows it.
  links <- sub("^\\./", base_url, links)

  titles <- html_dat %>%
    html_nodes(".IFHyqb.DeXSAc .JtKRv") %>%
    html_text()

  # Titles and links come from two independent selectors. data.frame() would
  # silently recycle the shorter vector when the lengths divide, pairing
  # headlines with the wrong links, so fail loudly instead.
  if (length(titles) != length(links)) {
    stop(
      "Found ", length(titles), " titles but ", length(links),
      " links; the page structure may have changed.",
      call. = FALSE
    )
  }

  data.frame(Title = titles, Link = links)
}