Hi community,
I want to scrape this page, but I have little experience with loops. The idea is to obtain the Title, Date, and Authors of each result. There are 23 results spread across several pages; my rough idea for handling the pagination is sketched just below, and my parsing attempts for a single page follow after that.
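This pagination part is only a sketch: the "page" query parameter is a guess on my part (taken from what the browser address bar seems to show when clicking page 2), so it may need adjusting.

library(httr)   # GET(), add_headers()
library(rvest)  # read_html(), html_nodes(), html_text(), and the %>% pipe

base_url <- "https://dataverse.harvard.edu/dataverse/harvard?q=cassava&fq1=authorAffiliation_ss%3A%22International+Center+for+Tropical+Agriculture+-+CIAT%22&fq0=dvObjectType%3A%28dataverses+OR+datasets+OR+files%29&types=dataverses%3Adatasets%3Afiles&sort=score&order="

# Assumption: the results pager just appends a page number to the query string
page_urls <- paste0(base_url, "&page=", 1:3)   # 23 results, 10 per page -> 3 pages

pages <- lapply(page_urls, function(u) {
  GET(u, add_headers('user-agent' = 'Gov employment data scraper ([[your email]])')) %>%
    read_html()
})

Then the parsing below would run once per element of pages. Here is what I have so far for a single page: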
url<-"https://dataverse.harvard.edu/dataverse/harvard?q=cassava&fq1=authorAffiliation_ss%3A%22International+Center+for+Tropical+Agriculture+-+CIAT%22&fq0=dvObjectType%3A%28dataverses+OR+datasets+OR+files%29&types=dataverses%3Adatasets%3Afiles&sort=score&order="
url <- GET(url, add_headers('user-agent' = 'Gov employment data scraper ([[your email]])'))
text_html <- url %>% read_html()
text_html
Title <- text_html %>%
  html_nodes(".card-title-icon-block") %>%
  html_text(trim = TRUE)
Title <- as.data.frame(Title)
# Title
# 1 Global Climate Regions for Cassava
# 2 Informal “Seed” systems and the management of gene flow in traditional agroecosystems: the case of cassava in Cauca, Colombia
# 3 Vietnam household survey data for cassava varietal adoption study
# 4 Screening for gemini- and potyviridae-related viruses infecting cassava in South America.
# 5 Adoption of cassava improved modern varieties in the Cauca department in Colombia using DNA fingerprinting
# 6 Adoption of cassava improved modern varieties in the Colombian Caribbean Region using DNA Fingerprinting
# 7 Cassava Diseases Evaluation - Edaphoclimatic Zone 4: Medium Altitude Tropics
# 8 Cassava pest and disease surveillance data for mainland SE Asia – 2014
# 9 Replication Data for: Cassava Breeding I: The value of breeding value
# 10 Replication data for: The Cassava Mealybug (Phenacoccus manihoti) in Asia: First Records, Potential Distribution, and an Identification Key
## How do I iterate over tr[1]? The date sits at a different tr[] index for each item (my guess at a fix is right after the output below).
Fecha <- text_html %>%
  html_nodes(xpath = '//*[@id="resultsTable"]/tbody/tr[1]/td/div/span[1]') %>%
  html_text(trim = TRUE)
Fecha <- as.data.frame(Fecha)
# Fecha
#1 Aug 4, 2020
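My guess is that dropping the positional index on tr would return the date of every row in one call, so no loop over tr[i] would be needed. A minimal sketch, assuming every result row keeps the same td/div/span[1] structure (I only checked the first row):

# Select every <tr>, not just tr[1]; assumes the date is always in span[1]
Fecha_all <- text_html %>%
  html_nodes(xpath = '//*[@id="resultsTable"]/tbody/tr/td/div/span[1]') %>%
  html_text(trim = TRUE)
Fecha_all

Is that the right way, or will a row with a missing date silently shift everything?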
## Same question for the authors: how do I iterate over tr[1] when each item sits at a different tr[] index? (A combined sketch follows the output below.)
Autors <- text_html %>%
  html_nodes(xpath = '//*[@id="resultsTable"]/tbody/tr[1]/td/div/div[3]') %>%
  html_text(trim = TRUE)
Autors <- as.data.frame(Autors)
# Autors
# 1 Hyman, Glenn Graham, 2020, "Global Climate Regions for Cassava", # https://doi.org/10.7910/DVN/WFAMUM, Harvard Dataverse, V2
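Putting the three fields together, I think a cleaner pattern might be to grab the row nodes once and then pull each field inside every row with html_node() (singular), so a missing field comes back as NA instead of shifting positions. A rough sketch of what I mean, assuming the div[1] / span[1] / div[3] layout holds for every row (only verified on the first one):

# One node per result row
rows <- text_html %>%
  html_nodes(xpath = '//*[@id="resultsTable"]/tbody/tr')

# html_node() returns exactly one match (or NA) per row, so the columns stay aligned
results <- data.frame(
  Title  = rows %>% html_node(xpath = './td/div/div[1]') %>% html_text(trim = TRUE),
  Fecha  = rows %>% html_node(xpath = './td/div/span[1]') %>% html_text(trim = TRUE),
  Autors = rows %>% html_node(xpath = './td/div/div[3]') %>% html_text(trim = TRUE),
  stringsAsFactors = FALSE
)
results

Does that look reasonable, or is there a better way to keep the rows aligned?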
The other way I tried is below, but it throws an error:
website <-"https://dataverse.harvard.edu/dataverse/harvard?q=cassava&fq1=authorAffiliation_ss%3A%22International+Center+for+Tropical+Agriculture+-+CIAT%22&fq0=dvObjectType%3A%28dataverses+OR+datasets+OR+files%29&types=dataverses%3Adatasets%3Afiles&sort=score&order="
Title <- vector()
Autor <- vector()
#loop through nodes
for (i in 1:10) {
  Title[i] <- website %>%
    read_html() %>%
    html_nodes(xpath = ' //*[@id="resultsTable"]/tbody/tr[i]/td/div/div[1]') %>%
    html_text(trim = T)
  Autor[i] <- website %>%
    read_html() %>%
    html_nodes(xpath = '//*[@id="resultsTable"]/tbody/tr[i]/td/div/div[3]') %>%
    html_text(trim = T)
}
# Error in Title[i] <- website %>% read_html() %>% html_nodes(xpath = " //*[@id=\"resultsTable\"]/tbody/tr[i]/td/div/div[1]") %>% :
# replacement has length zero
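I suspect the problem is that the i inside the quoted xpath is taken literally, so tr[i] matches nothing, html_text() returns a zero-length vector, and the assignment fails. If a row-by-row loop is really the way to go, I guess the index has to be pasted into the string, and the page only needs to be downloaded once outside the loop. A sketch of that fix, assuming the same row layout as above:

page <- website %>% read_html()

Title <- character(10)
Autor <- character(10)
for (i in 1:10) {
  # Build the xpath with the actual row number instead of the literal "i"
  row_xpath <- sprintf('//*[@id="resultsTable"]/tbody/tr[%d]/td/div', i)
  Title[i] <- page %>%
    html_node(xpath = paste0(row_xpath, '/div[1]')) %>%
    html_text(trim = TRUE)
  Autor[i] <- page %>%
    html_node(xpath = paste0(row_xpath, '/div[3]')) %>%
    html_text(trim = TRUE)
}
data.frame(Title, Autor)

But maybe the loop is not needed at all if the all-rows version above is the right approach?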
Thanks!