I'm trying to scrape the data within `#boxgoals` for each scheduled game
listed at my `url`. However, it seems like `splashr` stops scraping after
155 observations in `mydata`, or -- what I believe to be -- 25 games.
Any idea if this is a `splashr` issue, or something else? Thanks!
library(tidyverse)
library(splashr)
library(rvest)
# Scoreboard page that lists the Michigan men's hockey schedule.
url <- "https://www.uscho.com/scoreboard/michigan/mens-hockey/"

# Start the Splash rendering service through splashr. The returned handle is
# kept in `sp` so it can be passed to stop_splash() at the end of the script.
sp <- start_splash()
# Build a table of box-score links and game types from a scoreboard page.
#
# Args:
#   myurl: Location of the scoreboard page; passed straight to read_html().
#
# Returns:
#   A tibble with two character columns:
#     url        "https://www.uscho.com" prepended to the href of every anchor
#                found inside a 13th-position table cell.
#     game_type  Text of every 12th-position table cell except the first one.
#   Both columns must end up the same length; tibble() raises an error
#   otherwise (a single value is recycled).
get_data <- function(myurl) {
  # Download and parse the page once. Both columns are read from the same
  # document, so there is no need for a second request.
  page <- read_html(myurl)

  hrefs <- page %>%
    html_nodes("td:nth-child(13) a") %>%
    html_attr("href")

  cell_text <- page %>%
    html_nodes("td:nth-child(12)") %>%
    html_text()

  # The first matching 12th-column cell is dropped, exactly as before
  # (presumably it is a header cell -- confirm against the page markup).
  tibble(
    url = str_c("https://www.uscho.com", hrefs),
    game_type = cell_text[-1]
  )
}
# Scrape the schedule table, then keep only the box-score links of games
# whose type is not "EX".
link_list <- get_data(url)
urls <- pull(filter(link_list, game_type != "EX"), url)
# Render one box-score page through Splash and return its "#boxgoals" table.
#
# Args:
#   my_url: URL of a single box-score page.
#
# Returns:
#   A tibble built from the table found at the "#boxgoals" node. An error is
#   raised when rendering fails or the table cannot be parsed; the retry
#   wrapper below relies on that.
#
# Note: this function deliberately does NOT touch the progress bar. It is
# re-run on every retry, and ticking here would advance the bar once per
# attempt instead of once per URL.
get_box_score <- function(my_url) {
  # Small random pause (0 to 0.1 s) before each request.
  Sys.sleep(sample(seq(0, 0.1, by = 0.001), 1))

  render_html(url = my_url) %>%
    html_node("#boxgoals") %>%
    html_table() %>%
    as_tibble()
}

# Retrying variant of get_box_score().
# NOTE(review): presumably re-invokes the function up to `max_attempts` times,
# waiting `wait_seconds` between attempts -- confirm against the warrenr docs.
persistently_get_box_score <- warrenr::persistently(
  get_box_score,
  max_attempts = 15,
  wait_seconds = 0.001
)

# Fetch one box score without ever aborting the overall scrape.
#
# Args:
#   url: URL of a single box-score page.
#
# Returns:
#   The scraped tibble, or an empty data.frame() if every attempt failed. A
#   failure is reported as a warning naming the URL and the underlying error,
#   so missing games can be diagnosed instead of silently disappearing.
try_get_box_score <- function(url) {
  # Advance the progress bar exactly once per URL. dplyr's progress bar
  # raises "No more ticks" once it has been ticked `n` times, so ticking per
  # retry attempt could exhaust it early and make every later URL fail.
  # The display is best-effort: a progress-bar problem must not stop scraping.
  try(progress_bar$tick()$print(), silent = TRUE)

  tryCatch(
    persistently_get_box_score(url),
    error = function(e) {
      warning(
        "Failed to scrape ", url, ": ", conditionMessage(e),
        call. = FALSE
      )
      data.frame()
    }
  )
}
# Progress bar sized to the number of pages that will be requested. `urls`
# already holds exactly the non-"EX" links, so its length is the total; a
# plain count is passed rather than a one-cell data frame.
progress_bar <- progress_estimated(length(urls), min_time = 0)

# Scrape every box score and row-bind the results into one data frame.
# stop_splash() sits in `finally` so the Splash service started at the top of
# the script is shut down even if the scrape is interrupted or errors out.
mydata <- tryCatch(
  map_df(urls, try_get_box_score),
  finally = stop_splash(sp)
)