I'm parsing some rather large xml files into tibbles and in an attempt to speed the process, using the combination of purrr::map and future. I've been able to complete a successful end to end test in a Linux environment, but the same test fails in Windows. Here's the reprex:
# Large XML future::multiprocess Parse Test - Windows Error
library(httr)
library(xml2)
library(tidyverse)
library(RCurl)
#> Loading required package: bitops
#>
#> Attaching package: 'RCurl'
#> The following object is masked from 'package:tidyr':
#>
#> complete
library(future)
sessionInfo()
#> R version 3.4.3 (2017-11-30)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 16299)
#>
#> Matrix products: default
#>
#> locale:
#> [1] LC_COLLATE=English_United States.1252
#> [2] LC_CTYPE=English_United States.1252
#> [3] LC_MONETARY=English_United States.1252
#> [4] LC_NUMERIC=C
#> [5] LC_TIME=English_United States.1252
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] future_1.7.0 RCurl_1.95-4.10 bitops_1.0-6 forcats_0.3.0
#> [5] stringr_1.3.0 dplyr_0.7.4 purrr_0.2.4 readr_1.1.1
#> [9] tidyr_0.8.0 tibble_1.4.2 ggplot2_2.2.1 tidyverse_1.2.1
#> [13] xml2_1.2.0 httr_1.3.1
#>
#> loaded via a namespace (and not attached):
#> [1] listenv_0.7.0 reshape2_1.4.3 haven_1.1.1 lattice_0.20-35
#> [5] colorspace_1.3-2 htmltools_0.3.6 yaml_2.1.18 rlang_0.2.0
#> [9] pillar_1.2.1 foreign_0.8-69 glue_1.2.0 modelr_0.1.1
#> [13] readxl_1.0.0 bindrcpp_0.2 bindr_0.1 plyr_1.8.4
#> [17] munsell_0.4.3 gtable_0.2.0 cellranger_1.1.0 rvest_0.3.2
#> [21] codetools_0.2-15 psych_1.7.8 evaluate_0.10.1 knitr_1.20
#> [25] parallel_3.4.3 broom_0.4.3 Rcpp_0.12.15 backports_1.1.2
#> [29] scales_0.5.0 jsonlite_1.5 mnormt_1.5-5 hms_0.4.2
#> [33] digest_0.6.15 stringi_1.1.6 grid_3.4.3 rprojroot_1.3-2
#> [37] cli_1.0.0 tools_3.4.3 magrittr_1.5 lazyeval_0.2.1
#> [41] crayon_1.3.4 pkgconfig_2.0.1 lubridate_1.7.2 assertthat_0.2.0
#> [45] rmarkdown_1.9 R6_2.2.2 globals_0.11.0 nlme_3.1-131.1
#> [49] compiler_3.4.3
# New Mexico Oil and Gas FTP Root
url <- "ftp://164.64.106.6/Public/OCD/OCD%20Data/"
# str_split string determined by OS Env
split_on_char <- if (Sys.info()[1] == "Windows") {
"\r\n"
} else {
"\n"
}
# Get Core Data .zip file
file_name <- str_split(getURL(url, ftp.use.epsv = FALSE, dirlistonly = TRUE), split_on_char)[[1]] %>% str_subset('OCDCoreData(\\d{8}).zip$')
file_path <- paste(url, file_name, sep = "")
GET(file_path, write_disk(paste0(tempdir(),"\\nm.zip"), overwrite = TRUE))
#> Warning in parse_http_status(lines[[1]]): NAs introduced by coercion
#> Warning: Failed to parse headers:
#> 331 Password required for anonymous
#> 230 Logged on
#> 257 "/" is current directory.
#> 250 CWD successful. "/Public" is current directory.
#> 250 CWD successful. "/Public/OCD" is current directory.
#> 250 CWD successful. "/Public/OCD/OCD Data" is current directory.
#> 213 20180319063106
#> 229 Entering Extended Passive Mode (|||50028|)
#> 200 Type set to I
#> 213 20518739
#> 150 Opening data channel for file download from server of "/Public/OCD/OCD Data/OCDCoreData20180319.zip"
#> 226 Successfully transferred "/Public/OCD/OCD Data/OCDCoreData20180319.zip"
#> Response [ftp://164.64.106.6/Public/OCD/OCD%20Data/OCDCoreData20180319.zip]
#> Date: 2018-03-20 15:26
#> Status: 226
#> Content-Type: <unknown>
#> Size: 20.5 MB
#> <ON DISK> C:\Users\gmccomas\AppData\Local\Temp\RtmpgR0Jc1\nm.zip
unzip(paste0(tempdir(),"\\nm.zip"), files = "TEMP\\T_OGRID.xml", exdir = tempdir(), junkpaths = TRUE, overwrite = TRUE)
# Generate NM operator xml document
nm_xml <- read_xml(list.files(tempdir(), pattern = "T_OGRID.xml", full.names = TRUE))
cols <- nm_xml %>%
xml_find_first("./Table") %>%
xml_children() %>%
xml_name(ns = xml_ns(.))
# future::sequential xml parse - no error in any environment
plan(sequential)
system.time(
nm_df <- cols %>%
map(~ future({
nm_xml %>%
xml_find_all(sprintf("./Table/%s", .x)) %>%
xml_text() %>%
trimws() %>%
list() %>%
set_names(tolower(gsub(".*:","",.x)))
})) %>%
values() %>%
flatten_df()
)
#> user system elapsed
#> 26.89 0.19 27.08
# future::multiprocess xml parse - Generates 'Error: external pointer is not valid' on Windows but completes on Linux
plan(multiprocess)
system.time(
nm_df <- cols %>%
map(~ future({
nm_xml %>%
xml_find_all(sprintf("./Table/%s", .x)) %>%
xml_text() %>%
trimws() %>%
list() %>%
set_names(tolower(gsub(".*:","",.x)))
})) %>%
values() %>%
flatten_df()
)
#> Error: external pointer is not valid
#> Timing stopped at: 1.86 0.05 8.42
unlink(tempdir())
Created on 2018-03-20 by the reprex package (v0.2.0).
Any idea why this fails to complete in Windows?