I try to scrap article information (title, authors, abstract), but there exists a problem when I scrap the abstract. I have 261 weblinks. The number of abstracts is only 19. And there is an error. Can anyone help me~ Thanks!
The following is the data:
read_html(url[1],encoding = 'utf-8') %>%
html_nodes(' #search-results > section.search-results-list > div.search-results-chunks > div > article:nth-child(2) > div.docsum-wrap > div.docsum-content > a') %>%
html_text(trim = TRUE)
read_html(url[1],encoding = 'utf-8') %>%
html_nodes('.docsum-title') %>%
html_text(trim = TRUE)
title <- c()
for (i in url) {
title <- c(title,read_html(i,encoding = 'utf-8') %>% html_nodes(".docsum-title") %>% html_text(trim = T))
}
check numbers
length(title)
author <- c()
for (i in url) {
author <- c(author,read_html(i,encoding = 'utf-8') %>%
html_nodes('.full-authors') %>%
html_text())
}
length(author)
web <- c()
for (i in url) {
web <- c(web,read_html(i,encoding = 'utf-8') %>% html_nodes('.docsum-title') %>% html_attr(name = 'href'))
}
length(web)
web_link <- paste('https://pubmed.ncbi.nlm.nih.gov',web,sep = '')
web_link
abstract <- list()
for (i in web_link) {
abstract[[i]] <- read_html(i,encoding = 'utf-8') %>% html_nodes("#eng-abstract > p") %>% html_text(trim = T)
}
Error in open.connection(x, "rb") : HTTP error 404.