Hello! Disclaimer: I am a layperson when it comes to R and programming in general. I was given a script to scrape Goodreads reviews using RSelenium. In this script (to be run in RStudio) I fill in the URL of the Goodreads page of the book whose reviews I want to scrape. The script then starts the browser and navigates to the URL:
#Paste the GoodReads Url
url <- "https://www.goodreads.com/book/show/96290.Die_unendliche_Geschichte"
englishOnly = FALSE #If FALSE, all languages are chosen
#Set your browser settings (if Chrome is not working, pick the closest version)
rD <- rsDriver(browser = "chrome", chromever = "latest")
remDr <- rD[["client"]]
remDr$setTimeout(type = "implicit", milliseconds = 2000) #implicit wait of 2 seconds
remDr$navigate(url)
The disadvantage is that I have to go in and paste a new url for each book I wish to scrape. If it is possible, I would like to put all of the relevant urls in a csv-file (each line = 1 url) and then let the script do its job for each of the urls in that file; I have added a rough sketch of what I have in mind below, after the full script. Would this be possible? I don't have any experience with R myself, except for googling things and praying they work (which they usually don't), so I'm at a loss. Thank you very much for your kind help in advance!
I will add the full script here, so you can try it out if necessary (just set the directory for the output at the end):
library(rJava) # Required to use RSelenium
library(data.table) # Required for rbindlist
library(dplyr) # Required to use the pipes %>% and some table manipulation commands
library(magrittr) # Required to use the pipes %>%
library(rvest) # Required for read_html
library(RSelenium) # Required for webscraping with javascript
library(lubridate) # Required to scrape the correct dates
library(stringr) # Required to cut off any leading or trailing whitespace from text
library(purrr) # Required for map and map_df
options(stringsAsFactors = FALSE) #needed to prevent errors when merging data frames
#Paste the GoodReads Url
url <- "https://www.goodreads.com/book/show/96290.Die_unendliche_Geschichte"
englishOnly = FALSE #If FALSE, all languages are chosen
#Set your browser settings (if Chrome is not working, pick the closest version)
rD <- rsDriver(browser = "chrome", chromever = "latest")
remDr <- rD[["client"]]
remDr$setTimeout(type = "implicit", milliseconds = 2000) #implicit wait of 2 seconds
remDr$navigate(url)
bookTitle = unlist(remDr$getTitle()) #the page title doubles as the output file name
finalData = data.frame()
# Main loop going through the website pages
morePages = TRUE
pageNumber = 1
while(morePages){
#Select reviews in the correct language
#Go to the Goodreads page of the book in Chrome and right-click.
#Click on "View Page Source".
#Look for the language selector; it will look like this:
#<select name="language_code" id="language_code"><option value="">All Languages</option><option value="de">Deutsch (9)</option>
#<option value="en">English (9)</option><option value="es">Español (1)</option>
#The numeric language code is the option's position in the list, so here "All Languages" is 1, "Deutsch" is 2, "English" is 3...
#This order is not the same for every book, so check it each time!
#Filling in that position number in the xpath below is sufficient.
selectLanguage = if(englishOnly){
remDr$findElement("xpath", "//select[@id='language_code']/option[@value='en']")
} else {
remDr$findElement("xpath", "//select[@id='language_code']/option[4]")
}
selectLanguage$clickElement()
Sys.sleep(1)
#Expand all reviews
expandMore <- remDr$findElements("link text", "...more")
expandMore = sapply(expandMore, function(x) x$clickElement())
#Extracting the reviews from the page
reviews <- remDr$findElements("css selector", "#bookReviews .stacked")
reviews.html <- lapply(reviews, function(x){x$getElementAttribute("outerHTML")[[1]]})
#Remove the duplicated text that appears when a review is expanded
reviews.html <- lapply(reviews.html, function(x){
if(str_count(x, "span id=\"freeText") > 1) {
str_remove(x, "<span id=\"freeTextContainer.*")
} else {
x
}
})
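#Convert each HTML fragment to plain text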
reviews.list <- lapply(reviews.html, function(x){read_html(x) %>% html_text()} )
reviews.text <- unlist(reviews.list)
#Some reviews have only rating and no text, so we process them separately
onlyRating = str_detect(reviews.text, "^\\n\\n")
#Full reviews
if(sum(!onlyRating) > 0){
filterData = reviews.text[!onlyRating]
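#Each full review spans two consecutive entries: the metadata block first, the review text second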
fullReviews = purrr::map_df(seq(1, length(filterData), by=2), function(i){
review = unlist(strsplit(filterData[i], "\n"))
data.frame(
date = mdy(review[2]), #date
username = str_trim(review[5]), #user
rating = str_trim(review[9]), #overall
comment = str_trim(review[12]) #comment
)
})
#Add review text to full reviews
fullReviews$review = unlist(purrr::map(seq(2, length(filterData), by=2), function(i){
str_trim(str_remove(filterData[i], "\\s*\\n\\s*\\(less\\)"))
}))
} else {
fullReviews = data.frame()
}
#Partial reviews (rating only)
if(sum(onlyRating) > 0){
filterData = reviews.text[onlyRating]
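#Each partial review occupies a single entry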
partialReviews = purrr::map_df(1:length(filterData), function(i){
review = unlist(strsplit(filterData[i], "\n"))
data.frame(
date = mdy(review[9]), #date
username = str_trim(review[4]), #user
rating = str_trim(review[8]), #overall
comment = "",
review = ""
)
})
} else {
partialReviews = data.frame()
}
#Get the review IDs from all the links
reviewId = unlist(reviews.html) %>% str_extract("/review/show/\\d+")
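#Rating-only reviews are listed last on the page, so their IDs form the tail of reviewId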
if(nrow(partialReviews) > 0){
partialId = reviewId[(length(reviewId) - nrow(partialReviews) + 1):length(reviewId)] %>%
str_extract("\\d+")
} else {
partialId = NULL
}
if(nrow(fullReviews) > 0){
reviewId = reviewId[1:(length(reviewId) - nrow(partialReviews))]
reviewId = reviewId[seq(1, length(reviewId), 2)] %>% str_extract("\\d+")
} else {
reviewId = NULL
}
reviewId = c(reviewId, partialId)
finalData = rbind(finalData, cbind(reviewId, rbind(fullReviews, partialReviews)))
#Go to next page if possible
nextPage = remDr$findElements("xpath", "//a[@class='next_page']")
if(length(nextPage) > 0){
message(paste("PAGE", pageNumber, "Processed - Going to next"))
nextPage[[1]]$clickElement()
pageNumber = pageNumber + 1
Sys.sleep(2)
} else {
message(paste("PAGE", pageNumber, "Processed - Last page"))
morePages = FALSE
}
}
#end of the main loop
#Replace missing ratings by 'not rated'
finalData$rating = ifelse(finalData$rating == "", "not rated", finalData$rating)
#Stop server
remDr$close()
rD$server$stop()
rm(rD, remDr)
gc()
system("taskkill /im java.exe /f", intern=FALSE, ignore.stdout=FALSE)
#Set the directory where you wish the file to go
#Copy your working directory and replace all backslashes with forward slashes
getwd()
setwd("C:/Users/...")
#Write results
write.csv(finalData, paste0(bookTitle, ".csv"), row.names = FALSE)
message("FINISHED!")