Hi,
I want to run a read_html (and some unenclosed calculations) parallel, but read_html fails. The code works perfectly with lapply, but does not with parLapply. I tried already almost everything, but I cant solve this issue. I enclose a minimal example. Thank you for the help in advance!
Marcell
library(tidyverse)
library(rvest)
#> Loading required package: xml2
#>
#> Attaching package: 'rvest'
#> The following object is masked from 'package:purrr':
#>
#> pluck
#> The following object is masked from 'package:readr':
#>
#> guess_encoding
library(parallel)
URL.raw <- c('https://www.france24.com/fr/tag/covid-19/#paper',
paste0('https://www.france24.com/fr/tag/covid-19/', 1:2, '/#paper') # sample ofURLs to read
)
read_URL.raw <- function(URL.raw) { # URL reader function <--- error occurs sometimes
tryCatch({
read_html(URL.raw)
}, error = function(e) NULL)
}
w_lapply <- lapply(URL.raw, read_URL.raw) # works w lapply !!!
w_lapply
#> [[1]]
#> {html_document}
#> <html dir="ltr" lang="fr">
#> [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
#> [2] <body class="u-header-sticky u-has-nav-bar" data-cmp-theme="france24" dat ...
#>
#> [[2]]
#> NULL
#>
#> [[3]]
#> {html_document}
#> <html dir="ltr" lang="fr">
#> [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
#> [2] <body class="u-header-sticky u-has-nav-bar" data-cmp-theme="france24" dat ...
cl <- makeCluster(7)
clusterExport(cl, list("URL.raw", "read_URL.raw"), envir = environment())
clusterEvalQ(cl, library(rvest))
#> [[1]]
#> [1] "rvest" "xml2" "stats" "graphics" "grDevices" "utils"
#> [7] "datasets" "methods" "base"
#>
#> [[2]]
#> [1] "rvest" "xml2" "stats" "graphics" "grDevices" "utils"
#> [7] "datasets" "methods" "base"
#>
#> [[3]]
#> [1] "rvest" "xml2" "stats" "graphics" "grDevices" "utils"
#> [7] "datasets" "methods" "base"
#>
#> [[4]]
#> [1] "rvest" "xml2" "stats" "graphics" "grDevices" "utils"
#> [7] "datasets" "methods" "base"
#>
#> [[5]]
#> [1] "rvest" "xml2" "stats" "graphics" "grDevices" "utils"
#> [7] "datasets" "methods" "base"
#>
#> [[6]]
#> [1] "rvest" "xml2" "stats" "graphics" "grDevices" "utils"
#> [7] "datasets" "methods" "base"
#>
#> [[7]]
#> [1] "rvest" "xml2" "stats" "graphics" "grDevices" "utils"
#> [7] "datasets" "methods" "base"
w_parLapply <- parLapply(cl = cl, X = URL.raw, fun = read_URL.raw) # fails with parLapply
stopCluster(cl)
w_parLapply
#> [[1]]
#> Error in doc_type(x): external pointer is not valid
Created on 2021-01-28 by the reprex package (v0.3.0)