library(RSelenium)
library(XML)
library(dplyr)
library(rvest)
remDr <- remoteDriver(browserName = "chrome", port = 4444,
                      remoteServerAddr = "localhost")
remDr$open()
Sys.sleep(1)
remDr$navigate("https://members.parliament.uk/members/Commons")
html <- remDr$getPageSource()[[1]]
url_data1 <- html %>%
  read_html() %>%
  html_nodes(xpath = '//*[@id="main-content"]/div/article/div/div/div[3]/a[1]') %>%
  html_attr("href"); url_data1
# "/member/172/contact"
url_data2 <- html %>%
  read_html() %>%
  html_nodes(xpath = '//*[@id="main-content"]/div/article/div/div/div[3]/a[2]') %>%
  html_attr("href"); url_data2
# "/member/4212/contact"
url_data3 <- html %>%
  read_html() %>%
  html_nodes(xpath = '//*[@id="main-content"]/div/article/div/div/div[3]/a[3]') %>%
  html_attr("href"); url_data3
# "/member/4639/contact"
# But when I try to make a loop over all the entries on page 1, it shows me this error:
for (i in 1:20) {
  url_data <- html %>%
    html_nodes(xpath = paste('//*[@id="main-content"]/div/article/div/div/div[3]/a[', i, ']')) %>%
    html_attr("href")
  Sys.sleep(2)
  # data frame
  df <- df %>% bind_rows(data.frame(url_data))
}
# Error in UseMethod("xml_find_all") :
# no applicable method for 'xml_find_all' applied to an object of class "character"
# The idea is to repeat this loop for all the pages.
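# The error likely comes from piping the raw character string returned by
# getPageSource() straight into html_nodes(); the three working snippets above
# parse it with read_html() first, but the loop does not, and `df` is never
# initialised. A minimal sketch for page 1, assuming the Selenium session is
# still open (paste0() is just a tidier stand-in for the paste() call above):
page <- remDr$getPageSource()[[1]] %>% read_html()

df <- data.frame()
for (i in 1:20) {
  url_data <- page %>%
    html_nodes(xpath = paste0('//*[@id="main-content"]/div/article/div/div/div[3]/a[', i, ']')) %>%
    html_attr("href")
  df <- df %>% bind_rows(data.frame(url_data))
}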
It may be possible to script this with the {curl} package and some tweaking of custom handle options. Otherwise, you need to know either the constituency numbers, which follow no regular order, or the member id numbers, which don't either, and then you can use their API. Maybe outsource it to Mechanical Turk?
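If you do go the member-id route, a rough sketch along the lines below might work; the base URL, endpoint, and field names are assumptions about the Parliament Members API and should be checked against its documentation before relying on them.
library(httr)
library(jsonlite)

# one of the member ids already scraped above ("/member/172/contact")
member_id <- 172
# assumed endpoint of the UK Parliament Members API; verify against the docs
resp <- GET(paste0("https://members-api.parliament.uk/api/Members/", member_id))
if (status_code(resp) == 200) {
  member <- fromJSON(content(resp, as = "text", encoding = "UTF-8"))
  # the field names here are assumptions; inspect str(member) to see the real structure
  print(member$value$nameDisplayAs)
}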