Hello
I have been following this thread in order to acquire data from the Tesla Model 3 forum (forums.tesla.com/forum/tesla-model-3).
Thanks to gueyenono, I have been able to get data from page 1 of the forum. Can you please advise on how I can automate this code so that I can get more than one page?
Also, can you please recommend further steps after I have scraped several pages of the forum?
thanks!
The following code will scrape the data you are looking for. I did my best in the little time I had. It only covers the first two pages of the forum, but you can easily scrape more pages by adapting the code. Do not hesitate to ask questions if you want to understand what some chunks of the code do.
# Load packages
library(tibble)
library(tidyr)
library(rvest)
library(dplyr)
library(stringr)
library(purrr)
# Consult the site's robots.txt before scraping: may this path be crawled?
robotstxt::paths_allowed(paths = "https://forums.tesla.com/") # Yes
# Custom functions for scraping the data
# Scrape one listing page of the forum.
#
# page_url: URL of a forum listing page.
#
# Returns a tibble with one row per thread link found on the page and the
# columns `topics` (link text), `date_of_creation`, `thread_author` and
# `topic_urls` (absolute thread URL).
scrape_page_info <- function(page_url) {
  html <- read_html(page_url)

  # Select the title links once; both the text and the href come from them.
  title_links <- html_nodes(html, ".title a")
  topics <- html_text(title_links)
  topic_urls <- paste0(
    "https://forums.tesla.com",
    html_attr(title_links, name = "href")
  )

  created_info <- html %>%
    html_nodes(".created") %>%
    html_text() %>%
    str_squish()

  tibble(topics, created_info, topic_urls) %>%
    separate(
      col = created_info,
      into = c("date_of_creation", "thread_author"),
      sep = " by ",
      # Split only on the first " by " so that an author name which itself
      # contains " by " is kept whole instead of being truncated.
      extra = "merge"
    )
}
# Turn the `.clearfix` nodes of a parsed thread page into a three-column
# tibble.
#
# thread_html: a parsed HTML document (as returned by `read_html()`).
#
# Returns a tibble with the columns `author`, `date` and `content`, one row
# per `.clearfix` node.
scrape_thread_info <- function(thread_html) {
  # Whitespace-normalised text of every `.clearfix` node.
  raw_posts <- str_squish(html_text(html_nodes(thread_html, ".clearfix")))

  # Drop the second "|" of each string.
  one_pipe <- str_replace(
    raw_posts,
    pattern = "^(.*?(\\|.*?){1})\\|",
    replacement = "\\1"
  )

  # Insert a "|" right after the first run of four digits (presumably the
  # year of the post date), so each string has two "|" field separators.
  delimited <- str_replace(
    one_pipe,
    pattern = "(^.*?\\d{4})",
    replacement = "\\1 \\|"
  )

  posts <- enframe(delimited, name = NULL, value = "content")
  separate(
    posts,
    col = "content",
    into = c("author", "date", "content"),
    sep = "\\|"
  )
}
# Scrape all posts of a forum thread, following the pager when the thread
# spans several pages.
#
# url: URL of the first page of the thread.
#
# Returns a tibble with the columns `author`, `date` and `content`. The first
# row is rebuilt from dedicated nodes (`.username`, `.submitted`, first
# `.clearfix`) because the generic parsing in `scrape_thread_info()` is
# overwritten for it here.
scrape_thread <- function(url) {
  html <- read_html(url)

  # Trailing number of the pager's "last" link; NA when there is no pager,
  # i.e. the thread fits on a single page.
  last_page_index <- html %>%
    html_node("#article_content > div.panel-pane.pane-node-comments > div > div.item-list > ul > li.pager-last.last > a") %>%
    html_attr("href") %>%
    str_extract(pattern = "(\\d+)$") %>%
    as.numeric()

  # Same parsing as for the follow-up pages, shared through the helper.
  df_page_1 <- scrape_thread_info(html)

  df_page_1$author[1] <- html %>%
    html_node(".username") %>%
    html_text()

  df_page_1$date[1] <- html %>%
    html_node(".submitted") %>%
    html_text() %>%
    str_squish() %>%
    str_extract(pattern = "\\w+\\s\\d+\\W\\s\\d{4}$")

  df_page_1$content[1] <- html %>%
    html_node(".clearfix") %>%
    html_text() %>%
    str_squish() %>%
    str_replace(pattern = "^.*\\d+\\,\\s\\d{4}\\s", replacement = "")

  extra_page_data <- NULL
  # The `>= 1` guard matters: paste0() treats a zero-length vector as "", so
  # an empty page sequence would otherwise produce the bogus URL "<url>?page=".
  if (!is.na(last_page_index) && last_page_index >= 1) {
    # NOTE(review): assumes zero-based paging (the forum listing uses
    # "?page=1" for its second page), so the number in the "last" link is the
    # index of the final page and pages 1..last_page_index remain to be
    # fetched. Confirm against a live multi-page thread.
    extra_urls <- paste0(url, "?page=", seq_len(last_page_index))
    other_htmls <- lapply(extra_urls, read_html)
    # The first row of every follow-up page is dropped (presumably the
    # opening post repeated on each page).
    extra_page_data <- bind_rows(
      lapply(other_htmls, function(page) scrape_thread_info(page)[-1, ])
    )
  }

  bind_rows(df_page_1, extra_page_data)
}
# Error-tolerant wrapper: a thread that fails to scrape yields NA instead of
# aborting the whole run.
scrape_thread_possibly <- possibly(scrape_thread, otherwise = NA)

# Actual scraping
# Number of forum listing pages to scrape; raise this to collect more pages.
n_forum_pages <- 2
forum_url <- "https://forums.tesla.com/forum/tesla-model-3"

# The first listing page is the bare forum URL; later ones are "?page=1",
# "?page=2", ... sprintf() returns character(0) when no later page is wanted.
page_urls <- c(
  forum_url,
  sprintf("%s?page=%d", forum_url, seq_len(n_forum_pages - 1))
)

# One row per thread, with the scraped posts of each thread stored in the
# list-column `forum_data`.
master_data <- map_dfr(page_urls, function(url) {
  scrape_page_info(url) %>%
    mutate(forum_data = map(topic_urls, scrape_thread_possibly))
})