Help for web scraping loop

I want to scrape this page, but I have little experience with loops. The idea is to obtain the title, date and authors of each result. There are 23 results spread across several pages.


# Request the results page, sending a custom user-agent header.
# NOTE(review): `url` must already hold the page address as a string before
# this line runs; the address itself is not shown in this post.
url <- GET(url, add_headers('user-agent' = 'Gov employment data scraper ([[your email]])'))

# Parse the HTTP response into an HTML document.
text_html <- url %>% read_html()

# Text of every node carrying the CSS class "card-title-icon-block".
Title <- text_html %>%
  html_nodes(".card-title-icon-block") %>%
  html_text(trim = TRUE)
# Wrap the character vector in a one-column data frame so it prints in the
# tabular form shown below (the original right-hand side was lost in the post).
Title <- data.frame(Title = Title)

#                                                                                                                                    Title
# 1                                                                                                           Global Climate Regions for Cassava
# 2                Informal “Seed” systems and the management of gene flow in traditional agroecosystems: the case of cassava in Cauca, Colombia
# 3                                                                            Vietnam household survey data for cassava varietal adoption study
# 4                                                    Screening for gemini- and potyviridae-related viruses infecting cassava in South America.
# 5                                   Adoption of cassava improved modern varieties in the Cauca department in Colombia using DNA fingerprinting
# 6                                     Adoption of cassava improved modern varieties in the Colombian Caribbean Region using DNA Fingerprinting
# 7                                                                 Cassava Diseases Evaluation - Edaphoclimatic Zone 4: Medium Altitude Tropics
# 8                                                                       Cassava pest and disease surveillance data for mainland SE Asia – 2014
# 9                                                                        Replication Data for: Cassava Breeding I: The value of breeding value
# 10 Replication data for: The Cassava Mealybug (Phenacoccus manihoti) in Asia: First Records, Potential Distribution, and an Identification Key

## How do I iterate over the index in /tr[1]/? The number inside [] is different for each item, so this XPath only returns the date of the first one.
# Text of the first <span> inside row 1 (tr[1]) of #resultsTable; the printed
# result below shows this is the date of the first item only.
Fecha <- text_html %>%
  html_nodes(xpath = ' //*[@id="resultsTable"]/tbody/tr[1]/td/div/span[1]') %>%
  html_text(trim = TRUE)
# Wrap the result in a one-column data frame so it prints in the tabular form
# shown below (the original right-hand side was lost in the post).
Fecha <- data.frame(Fecha = Fecha)

#       Fecha
#1 Aug 4, 2020

## How do I iterate over the index in /tr[1]/? The number inside [] is different for each item, so this XPath only returns the authors of the first one.
# Text of the third <div> inside row 1 (tr[1]) of #resultsTable; the printed
# result below shows this is the citation/author line of the first item only.
Autors <- text_html %>%
  html_nodes(xpath = '//*[@id="resultsTable"]/tbody/tr[1]/td/div/div[3]') %>%
  html_text(trim = TRUE)
# Wrap the result in a one-column data frame so it prints in the tabular form
# shown below (the original right-hand side was lost in the post).
Autors <- data.frame(Autors = Autors)

#                                               Autors
# 1 Hyman, Glenn Graham, 2020, "Global Climate Regions for Cassava", #, Harvard Dataverse, V2

Here is another approach I tried, but it throws an error:

# Address of the results page (left empty in this post).
website <-""

# Result vectors, filled one element per table row by the loop below.
Title <- vector()
Autor <- vector()

#loop through nodes
# NOTE: this is the failing attempt, kept as posted. Inside the quoted XPath,
# `tr[i]` is literal text -- the loop variable i is never substituted -- so
# html_nodes() matches nothing and the assignment fails with the
# "replacement has length zero" error shown below.
for (i in 1:10){
  Title[i]<- website %>%
    read_html() %>%
    html_nodes(xpath=' //*[@id="resultsTable"]/tbody/tr[i]/td/div/div[1]') %>% 
    html_text(trim = T)  
  Autor[i]<- website %>%
    read_html() %>%
    html_nodes(xpath='//*[@id="resultsTable"]/tbody/tr[i]/td/div/div[3]') %>%
    html_text(trim = T)  
}

# Error in Title[i] <- website %>% read_html() %>% html_nodes(xpath = " //*[@id=\"resultsTable\"]/tbody/tr[i]/td/div/div[1]") %>%  : 
#   replacement has length zero


The issue is string interpolation. The xpath argument to html_nodes contains tr[i], but inside a quoted string R does not substitute the value of the loop variable i. The literal text tr[i] is therefore passed to the XPath engine, which matches nothing in the HTML, so the result has length zero. One simple way to fix it is to wrap each xpath argument in glue::glue and put {} around the i, like so:

# Fill Title[i] and Autor[i] from row i of #resultsTable.
# glue() replaces {i} with the current loop value, so each pass queries
# tr[1], tr[2], ... instead of the literal text "tr[i]".
# Requires library(glue) to be attached (or call glue::glue() directly).
for (i in 1:10){
    Title[i]<- website %>%
        read_html() %>%
        html_nodes(xpath=glue(' //*[@id="resultsTable"]/tbody/tr[{i}]/td/div/div[1]')) %>% 
        html_text(trim = TRUE)
    Autor[i]<- website %>%
        read_html() %>%
        html_nodes(xpath=glue('//*[@id="resultsTable"]/tbody/tr[{i}]/td/div/div[3]')) %>%
        html_text(trim = TRUE)
}

Strings fed to glue have special rules where anything wrapped in curly braces is evaluated as R code. You could achieve the same thing as above using paste or paste0, but glue makes it a little easier to read.

Lastly, just as a performance note - you probably don't want to call website %>% read_html() on each pass through the loop. Since the result doesn't change, I think you could probably just do it once outside the loop, which would speed up your loop dramatically:

#> Warning: package 'microbenchmark' was built under R version 3.6.3
#> Warning: package 'magrittr' was built under R version 3.6.3
#> Warning: package 'httr' was built under R version 3.6.3
#> Warning: package 'rvest' was built under R version 3.6.3
#> Loading required package: xml2
#> Warning: package 'xml2' was built under R version 3.6.3

# Address of the results page (left empty in this post).
website <-""

# Result vectors, filled one element per table row by the loop below.
Title <- vector()
Autor <- vector()
# Benchmark 1: the page is downloaded and parsed with read_html() twice on
# every pass through the loop. The microbenchmark() wrapper is restored to
# match the `expr` and `neval` columns printed below; `trim = T` is kept so
# the code matches that deparsed expression.
microbenchmark({
    for (i in 1:10){
        Title[i]<- website %>%
            read_html() %>%
            html_nodes(xpath=glue(' //*[@id="resultsTable"]/tbody/tr[{i}]/td/div/div[1]')) %>% 
            html_text(trim = T)  
        Autor[i]<- website %>%
            read_html() %>%
            html_nodes(xpath=glue('//*[@id="resultsTable"]/tbody/tr[{i}]/td/div/div[3]')) %>%
            html_text(trim = T)  
    }
}, times = 10)
#> Unit: seconds
#>                                                                                                                                                                                                                                                                                                                                                                                expr
#>  {     for (i in 1:10) {         Title[i] <- website %>% read_html() %>% html_nodes(xpath = glue(" //*[@id=\\"resultsTable\\"]/tbody/tr[{i}]/td/div/div[1]")) %>%              html_text(trim = T)         Autor[i] <- website %>% read_html() %>% html_nodes(xpath = glue("//*[@id=\\"resultsTable\\"]/tbody/tr[{i}]/td/div/div[3]")) %>%              html_text(trim = T)     } }
#>       min       lq     mean   median       uq      max neval
#>  32.74991 33.76215 37.44991 37.56984 38.80656 44.29856    10

# Result vectors, filled one element per table row by the loop below.
Title <- vector()
Autor <- vector()
# Benchmark 2: the page is downloaded and parsed once, before the loop, and
# the parsed document is reused for every row. The microbenchmark() wrapper
# and the read_html() call are restored to match the `expr` column printed
# below; `trim = T` is kept so the code matches that deparsed expression.
microbenchmark({
    tmp_website <- website %>% 
        read_html()
    for (i in 1:10){
        Title[i]<- tmp_website %>% 
            html_nodes(xpath=glue(' //*[@id="resultsTable"]/tbody/tr[{i}]/td/div/div[1]')) %>% 
            html_text(trim = T)  
        Autor[i]<- tmp_website %>% 
            html_nodes(xpath=glue('//*[@id="resultsTable"]/tbody/tr[{i}]/td/div/div[3]')) %>%
            html_text(trim = T)  
    }
}, times = 10)
#> Unit: seconds
#>                                                                                                                                                                                                                                                                                                                                                                                                   expr
#>  {     tmp_website <- website %>% read_html()     for (i in 1:10) {         Title[i] <- tmp_website %>% html_nodes(xpath = glue(" //*[@id=\\"resultsTable\\"]/tbody/tr[{i}]/td/div/div[1]")) %>%              html_text(trim = T)         Autor[i] <- tmp_website %>% html_nodes(xpath = glue("//*[@id=\\"resultsTable\\"]/tbody/tr[{i}]/td/div/div[3]")) %>%              html_text(trim = T)     } }
#>       min       lq     mean   median       uq      max neval
#>  1.407044 1.581103 1.869299 1.643043 2.002832 3.227948    10

Created on 2022-10-24 by the reprex package (v1.0.0)

