Hi,
My issue is that I want to webscrapping with parallel computing, but read_html fails with parLapply. I cant find the reason, but I enclose the following minimal example. Any suggestion?
URLs <- c( # 2 URL from imdb
"https://www.imdb.com/title/tt0111161/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=8JCSP6GA953V0B89HFAC&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_1",
"https://www.imdb.com/title/tt0068646/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=8JCSP6GA953V0B89HFAC&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_2"
)
library(rvest)
#> Loading required package: xml2
read_html(URLs[1])
#> {html_document}
#> <html xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/2008/fbml">
#> [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
#> [2] <body id="styleguide-v2" class="fixed">\n <img height="1" widt ...
read_html(URLs[2])
#> {html_document}
#> <html xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/2008/fbml">
#> [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
#> [2] <body id="styleguide-v2" class="fixed">\n <img height="1" widt ...
lapply(URLs, function(x) read_html(x))
#> [[1]]
#> {html_document}
#> <html xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/2008/fbml">
#> [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
#> [2] <body id="styleguide-v2" class="fixed">\n <img height="1" widt ...
#>
#> [[2]]
#> {html_document}
#> <html xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/2008/fbml">
#> [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
#> [2] <body id="styleguide-v2" class="fixed">\n <img height="1" widt ...
library(parallel)
cl <- makeCluster(7)
clusterExport(cl, list("URLs"))
clusterEvalQ(cl, library(rvest))
#> [[1]]
#> [1] "rvest" "xml2" "stats" "graphics" "grDevices" "utils"
#> [7] "datasets" "methods" "base"
#>
#> [[2]]
#> [1] "rvest" "xml2" "stats" "graphics" "grDevices" "utils"
#> [7] "datasets" "methods" "base"
#>
#> [[3]]
#> [1] "rvest" "xml2" "stats" "graphics" "grDevices" "utils"
#> [7] "datasets" "methods" "base"
#>
#> [[4]]
#> [1] "rvest" "xml2" "stats" "graphics" "grDevices" "utils"
#> [7] "datasets" "methods" "base"
#>
#> [[5]]
#> [1] "rvest" "xml2" "stats" "graphics" "grDevices" "utils"
#> [7] "datasets" "methods" "base"
#>
#> [[6]]
#> [1] "rvest" "xml2" "stats" "graphics" "grDevices" "utils"
#> [7] "datasets" "methods" "base"
#>
#> [[7]]
#> [1] "rvest" "xml2" "stats" "graphics" "grDevices" "utils"
#> [7] "datasets" "methods" "base"
parLapply(cl = cl, X = URLs, fun = function(X) read_html(x = X))
#> [[1]]
#> Error in doc_type(x): external pointer is not valid
stopCluster(cl)
Created on 2021-01-28 by the reprex package (v0.3.0)