I wanted to scrape all the MDs and their specialties from a website. For simplicity I am only looping through the first three pages, but I end up with a few empty rows every time. Can you see why this happens and suggest a solution?
library(rvest)
library(dplyr)
# Define the base URL (the search-results URL, truncated here; the page number
# is appended to it below)
base_url <- "https://doctors.massgeneralbrigham.org/..."
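# Pre-allocate 3 pages x 10 providers per page = 30 rows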
provider_data <- matrix(NA, nrow = 30, ncol = 2,
                        dimnames = list(NULL, c("Name", "Specialities")))
for (page_num in 1:3) { # Adjust the range based on the number of pages you want to scrape
  url <- paste0(base_url, page_num)
  page <- read_html(url)
  # Get the links to individual provider pages
  provider_links <- page %>%
    html_nodes(".e16v8r6n5 .e1anvbbl0") %>%
    html_attr("href")
  for (i in seq_along(provider_links)) {
    full_link <- paste0("https://doctors.massgeneralbrigham.org", provider_links[i])
    provider_page <- read_html(full_link)
    # Extract provider name
    name <- provider_page %>%
      html_node("#provider-name") %>%
      html_text()
    # Extract provider specialties
    specialties <- provider_page %>%
      html_nodes(".is-hierarchy") %>%
      html_text() %>%
      paste(collapse = ", ")
    # Add the name and specialties to the matrix
    # (the row index assumes exactly 10 providers per page)
    provider_data[i + (page_num - 1) * 10, 1] <- name
    provider_data[i + (page_num - 1) * 10, 2] <- specialties
  }
}
# Convert the matrix to a data frame
provider_data <- as.data.frame(provider_data)
# View the resulting data frame
View(provider_data)
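My guess is that the empty rows appear because the hard-coded row index assumes every page returns exactly 10 provider links, so any page with fewer links leaves NA rows behind. Would growing a list of one-row data frames and binding them at the end, as in the sketch below, be the right way to fix it? This is just my attempt, using the same base_url and CSS selectors as above:

library(rvest)
library(dplyr)

rows <- list()
for (page_num in 1:3) {
  page <- read_html(paste0(base_url, page_num))
  provider_links <- page %>%
    html_nodes(".e16v8r6n5 .e1anvbbl0") %>%
    html_attr("href")
  for (link in provider_links) {
    provider_page <- read_html(paste0("https://doctors.massgeneralbrigham.org", link))
    name <- provider_page %>%
      html_node("#provider-name") %>%
      html_text(trim = TRUE)
    specialties <- provider_page %>%
      html_nodes(".is-hierarchy") %>%
      html_text(trim = TRUE) %>%
      paste(collapse = ", ")
    # One row per provider actually found, so there is no index arithmetic
    # and no pre-sized matrix to leave gaps in
    rows[[length(rows) + 1]] <- data.frame(Name = name, Specialities = specialties)
  }
}
provider_data <- bind_rows(rows)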