Hey everybody, I keep getting an HTTP 403 error when running my scraper, and I'm confused about why:
a. I get it in the first place
b. I get it on some runs but not on others
Any ideas? Thanks!
library(tidyverse)
library(rvest)
library(RSelenium)
library(progress)
#' Scrape schedule links from KHL calendar pages
#'
#' Builds every combination of `league` and `season`, loads the matching
#' calendar page on en.khl.ru in a Selenium-driven browser, and collects the
#' `href` of every anchor matched by the CSS selector
#' `"ul+ ul li:nth-child(1) a"`, prefixed with the site root.
#'
#' @param league Character vector of league names. Only `"KHL"` is supported;
#'   anything else raises "League not available. Sorry!".
#' @param season Character vector of seasons such as `"2018-19"`. Supported
#'   values run from `"2008-09"` to `"2018-19"`; anything else raises
#'   "Season not available. Sorry!".
#' @param ... Accepted for backward compatibility; not used by this function.
#' @param progress Should a progress bar be shown? Defaults to `TRUE`.
#' @return A tibble of distinct rows with the columns `url`, `season`, and
#'   `league`.
get_schedule <- function(league, season, ..., progress = TRUE) {
  # Calendar page id on en.khl.ru for each supported season.
  khl_calendar_ids <- c(
    "2018-19" = "671",
    "2017-18" = "468",
    "2016-17" = "405",
    "2015-16" = "309",
    "2014-15" = "266",
    "2013-14" = "244",
    "2012-13" = "222",
    "2011-12" = "202",
    "2010-11" = "185",
    "2009-10" = "167",
    "2008-09" = "160"
  )

  mydata <- tidyr::crossing(league, season)

  if (progress) {
    pb <- progress::progress_bar$new(
      format = "get_schedule() [:bar] :percent eta: :eta",
      clear = FALSE,
      total = nrow(mydata),
      show_after = 0
    )
    pb$tick(0)
  }

  # Scrape a single league/season combination.
  .get_schedule <- function(league, season, ...) {
    # Validate before launching a browser so bad input fails fast.
    if (!isTRUE(league == "KHL")) {
      stop("League not available. Sorry!")
    }
    season_key <- as.character(season)
    if (!season_key %in% names(khl_calendar_ids)) {
      stop("Season not available. Sorry!")
    }
    url <- str_c(
      "https://en.khl.ru/calendar/",
      khl_calendar_ids[[season_key]],
      "/00/"
    )

    # NOTE(review): rsDriver() defaults to check = TRUE, which looks up driver
    # versions online on every call. The intermittent "HTTP error 403" may
    # come from that lookup (e.g. a rate-limited endpoint) rather than from
    # en.khl.ru -- consider check = FALSE once the drivers are installed.
    # TODO confirm.
    driver <- rsDriver(verbose = FALSE)
    # Always shut the browser and server down, even when navigation or
    # parsing fails; otherwise the server keeps its port and the next
    # rsDriver() call cannot start.
    on.exit(
      {
        # Stop the server even if closing the browser window fails.
        try(driver$client$close(), silent = TRUE)
        driver$server$stop()
      },
      add = TRUE
    )

    driver$client$navigate(url)
    page <- driver$client$getPageSource() %>%
      purrr::pluck(1) %>%
      read_html()

    links <- page %>%
      html_nodes("ul+ ul li:nth-child(1) a") %>%
      html_attr("href") %>%
      str_c("https://en.khl.ru", .)

    schedule <- tibble(url = links) %>%
      mutate(season = season) %>%
      mutate(league = league) %>%
      distinct()

    # Advance the bar created above; it was previously left at 0%.
    if (progress) {
      pb$tick()
    }

    schedule
  }

  map2_dfr(mydata[["league"]], mydata[["season"]], .get_schedule)
}
# Reproduce the problem: request the KHL schedule for the 2018-19 season.
get_schedule(league = "KHL", season = "2018-19")
#> Error in open.connection(con, "rb"): HTTP error 403.
Created on 2018-11-25 by the reprex package (v0.2.0).