Thanks to @gueyenono reply to my previous post
https://forum.posit.co/t/scraping-q-a-forum-user-info/27923/7
now I've been trying to scrape two more fields:
- the date of the posts
- info about the posters
Here is the code from @gueyenono I added the scrape_poster_dates
function but it is not working
library(dplyr)
library(rvest)
library(purrr)
library(RCurl)
library(stringr)
library(tidyr)
# Estimate the number of pages on the forum by dividing the number of pages by 20
page1_html <- getURL("https://www.medhelp.org/forums/Aspergers-Syndrome/show/191?page=1")
n_pages <- page1_html %>%
read_html() %>%
html_node("div.forum_title") %>%
html_text() %>%
str_extract_all("\\d+") %>%
flatten_chr() %>%
as.numeric() %>%
`[`(3) %>%
{. / 20}
# Get all thread titles and thread links
page_urls <- paste0("https://www.medhelp.org/forums/Aspergers-Syndrome/show/191?page=", seq_len(n_pages))
page_htmls <- map_chr(page_urls[1], getURL) # use page_urls instead of page_urls[1] if you want to scrape everything!
scrape_thread_titles <- function(html){
read_html(html) %>%
html_nodes(".subj_title a") %>%
html_text()
}
scrape_thread_links <- function(html){
read_html(html) %>%
html_nodes(".subj_title a") %>%
html_attr("href") %>%
paste0("https://www.medhelp.org", .)
}
thread_titles <- map(page_htmls, scrape_thread_titles) %>%
discard(~ length(.x) == 0)
correct_n_pages <- length(thread_titles)
thread_titles <- thread_titles %>%
flatten_chr()
thread_links <- map(page_htmls, scrape_thread_links) %>%
`[`(seq_len(correct_n_pages)) %>%
flatten_chr()
master_data <- tibble(thread_titles, thread_links)
# Scrape all thread posts and poster's IDs
thread_htmls <- map_chr(master_data$thread_links, getURL)
html <- thread_htmls[1]
link <- master_data$thread_links[1]
scrape_poster_ids <- function(html){
read_html(html) %>%
html_nodes(css = "span span") %>%
html_text()
}
scrape_poster_dates <- function(html){
read_html(html) %>%
html_nodes(css = ".subj_info .mh_timestamp") %>%
html_text()
}
scrape_posts <- function(html){
read_html(html) %>%
html_nodes(".resp_body , #subject_msg") %>%
html_text() %>%
str_replace_all("\r|\n", "") %>%
str_trim()
}
master_data <- master_data %>%
mutate(
poster_ids = map(thread_htmls, scrape_poster_ids),
posts = map(thread_htmls, scrape_posts),
dates = map(thread_htmls, scrape_poster_dates)
) %>%
unnest()
head(master_data, 15)
titles<-master_data$thread_titles
posters<-master_data$poster_ids
posts<-master_data$posts
dates<-master_data$dates
employ.data <- data.frame(titles, posters, posts, fechas)
write.csv(employ.data, "C:/Asperger/page1.csv", na = "")
For scraping the dates I used Selectorgadget, it is difficult to find the actual tag for the dates of the questions and answers and not including the comments, because I'm not including comments here, so I tried all of the following:
.username .mh_timestamp
.username:nth-child(2) .mh_timestamp
div time
.subj_info .mh_timestamp
But none of them works, I don't know which one else to try.. I keep getting the message
Error: All nested columns must have the same number of elements.
Regarding the user info I'm trying to scrape the gender and the age of each user that posts a question or an answer. For example it would be Female,35
for this one
https://www.medhelp.org/personal_pages/user/365714
Using Selectorgadget it seems to be here
.section .title+ span
But I could not scrape them either