Scraping several pages of a forum

andresmorago · March 29, 2020, 1:17am

Hello
I have been following this thread in order to acquire data from the Tesla Model 3 forum (forums.tesla.com/forum/tesla-model-3).
Thanks to gueyenono, I have been able to get data from page 1 of the forum. Can you please advise how I can automate this code so that it scrapes more than one page?
Also, can you please recommend further steps after i have scraped several pages of the forum ?
thanks!

library(rvest)
library(dplyr)
library(stringr)
library(purrr)

# Scrape thread titles, thread links, authors and number of replies
# from the first listing page of the forum.
# NOTE: deliberately no rm(list = ls()) here; clearing the global environment
# from inside a script destroys whatever the caller had in their workspace.
url <- "https://forums.tesla.com/forum/tesla-model-3"
h <- read_html(url)

# The title anchors carry both the thread title (node text) and the
# site-relative thread link (href), so query them once and reuse the node set.
title_nodes <- html_nodes(h, "#content-body .title a")

threads <- html_text(title_nodes)

# hrefs are prefixed with the site root to form absolute URLs
thread_links <- title_nodes %>%
  html_attr(name = "href") %>%
  paste0("https://forums.tesla.com", .)

# Author names, with tabs, carriage returns and newlines stripped
authors <- h %>%
  html_nodes("#content-body .created .username") %>%
  html_text() %>%
  str_replace_all(pattern = "\t|\r|\n", replacement = "")

# Reply counts: strip newlines, spaces and the literal "Reply" from the cell
# text, then convert what is left to a number (NA where it is not numeric)
replies <- h %>%
  html_nodes("#content-body .replies ") %>%
  html_text() %>%
  str_replace_all(pattern = "\n", replacement = "") %>%
  str_replace_all(pattern = " ", replacement = "") %>%
  str_replace_all(pattern = "Reply", replacement = "") %>%
  as.numeric()


# Custom function to scrape messages in each thread

# Fetch one thread page and return the text of every ".field-items" node as a
# character vector, with tabs/CRs/newlines removed and outer whitespace trimmed.
scrape_messages <- function(link){
  thread_page <- read_html(link)
  message_nodes <- html_nodes(thread_page, css = ".field-items")
  raw_text <- html_text(message_nodes)
  flattened <- str_replace_all(raw_text, pattern = "\t|\r|\n", replacement = "")
  str_trim(flattened)
}



# Assemble the master dataset: one row per thread, with the messages of each
# thread scraped on the fly and stored as a list-column.

master_data <- tibble(threads, authors, replies, thread_links)
master_data$messages <- map(master_data$thread_links, scrape_messages)
# Put the messages column before the links column
master_data <- master_data[, c("threads", "authors", "replies", "messages", "thread_links")]

gueyenono · March 30, 2020, 8:28am

Hi @andresmorago,

The following code will scrape the data you are looking for. I did my best in the little time I had. It only covers the first two pages of the forum, but you can easily scrape more pages by adapting the code. Do not hesitate to ask questions if you want to understand what some chunks of the code do.

# Load packages

library(tibble)
library(tidyr)
library(rvest)
library(dplyr)
library(stringr)
library(purrr)


# Is it legal to scrape from the website?
# NOTE(review): presumably paths_allowed() consults the site's robots.txt for
# this path, which is a crawling-permission check rather than a legal one;
# confirm against the robotstxt package documentation.

robotstxt::paths_allowed("https://forums.tesla.com/") # Yes


# Custom functions for scraping the data

#' Scrape the thread listing of one forum page.
#'
#' @param page_url URL of a forum listing page.
#' @return A tibble with one row per listed thread and the columns `topics`,
#'   `date_of_creation`, `thread_author` and `topic_urls` (absolute URLs).
scrape_page_info <- function(page_url){
  
  html <- read_html(page_url)
  
  # The title anchors hold both the topic text and the site-relative href,
  # so query them once and reuse the node set.
  title_links <- html_nodes(html, ".title a")
  
  topics <- html_text(title_links)
  
  topic_urls <- title_links %>%
    html_attr(name = "href") %>%
    paste0("https://forums.tesla.com", .)
  
  # Text of each ".created" cell, whitespace-normalised; it is split below
  # into creation date and author on the literal " by "
  created_info <- html %>%
    html_nodes(".created") %>%
    html_text() %>%
    str_squish()
  
  # extra = "merge": split on the FIRST " by " only, so an author name that
  # itself contains " by " is kept whole instead of being truncated with a
  # warning.
  tibble(topics, created_info, topic_urls) %>%
    separate(
      col = created_info,
      into = c("date_of_creation", "thread_author"),
      sep = " by ",
      extra = "merge"
    )
  
}

# Turn one parsed thread page into a tibble with the character columns
# author / date / content, one row per ".clearfix" node.
scrape_thread_info <- function(thread_html){
  
  post_nodes <- html_nodes(thread_html, ".clearfix")
  post_text <- str_squish(html_text(post_nodes))
  
  # Remove second "|"
  one_pipe <- str_replace(
    post_text,
    pattern = "^(.*?(\\|.*?){1})\\|",
    replacement = "\\1"
  )
  
  # Add "|" after first date
  delimited <- str_replace(
    one_pipe,
    pattern = "(^.*?\\d{4})",
    replacement = "\\1 \\|"
  )
  
  posts <- enframe(delimited, name = NULL, value = "content")
  separate(posts, col = "content", into = c("author", "date", "content"), sep = "\\|")
  
}


#' Scrape every post of one forum thread, following its pager.
#'
#' @param url URL of the first page of the thread.
#' @return A tibble with the character columns `author`, `date` and `content`.
#'   Row 1 is the opening post; the remaining rows come from the ".clearfix"
#'   nodes of the first page followed by those of each further page.
scrape_thread <- function(url){
  
  html <- read_html(url)
  
  # Trailing number of the pager's "last" link; NA when the thread has no pager.
  # The pager is zero-based ("?page=1" is the second page), so this number is
  # the index of the last page and the further pages are 1..last_page_index.
  last_page_index <- html %>%
    html_node("#article_content > div.panel-pane.pane-node-comments > div > div.item-list > ul > li.pager-last.last > a") %>%
    html_attr("href") %>%
    str_extract(pattern = "(\\d+)$") %>%
    as.numeric()
  
  # Same parsing as for every other page of the thread
  df_page_1 <- scrape_thread_info(html)
  
  # The generic parse does not fit the opening post, so its three fields are
  # re-extracted from dedicated nodes and overwrite the first row.
  df_page_1$author[1] <- html %>%
    html_node(".username") %>%
    html_text()
  
  df_page_1$date[1] <- html %>%
    html_node(".submitted") %>%
    html_text() %>%
    str_squish() %>%
    str_extract(pattern = "\\w+\\s\\d+\\W\\s\\d{4}$")
  
  df_page_1$content[1] <- html %>%
    html_node(".clearfix") %>%
    html_text() %>%
    str_squish() %>%
    str_replace(pattern = "^.*\\d+\\,\\s\\d{4}\\s", replacement = "")
  
  extra_page_data <- NULL
  
  if (!is.na(last_page_index) && last_page_index >= 1) {
    # Build the page URLs from the `url` argument (not an undefined global)
    extra_urls <- paste0(url, "?page=", seq_len(last_page_index))
    extra_page_data <- extra_urls %>%
      map(function(page_url){
        # Drop the first parsed row of each further page, as the first page's
        # first row is handled separately above
        scrape_thread_info(read_html(page_url))[-1, ]
      }) %>%
      bind_rows()
  }
  
  # bind_rows() ignores a NULL second argument (single-page threads)
  bind_rows(df_page_1, extra_page_data)
  
}

# Failure-tolerant wrapper: a thread whose scrape errors yields NA instead of
# aborting the whole run.
scrape_thread_possibly <- possibly(scrape_thread, otherwise = NA)


# Actual scraping

# Listing pages: the bare forum URL plus "?page=1" and "?page=2"
page_urls <- paste0(
  "https://forums.tesla.com/forum/tesla-model-3",
  c("", "?page=1", "?page=2")
)

# Only the first two listing pages are scraped; each thread's posts are
# stored as a tibble (or NA on failure) in the `forum_data` list-column.
master_data <- page_urls[1:2] %>%
  map(function(url){
    listing <- scrape_page_info(url)
    mutate(listing, forum_data = map(topic_urls, scrape_thread_possibly))
  }) %>%
  bind_rows()

system · April 6, 2020, 8:34am

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.