I wanted to extract information on about 10000 papers to a data.frame. Each paper is its own node, with many child nodes containing the relevant information. An example of the xml can be found here: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=11850928,11482001&version=2.0
It seems that looping over the 10000 nodes take much more than 100 time longer than looping over the 100 nodes used for testing purposes.
Can I improve the code somehow, or is this rather an issue with xml2
and pointer lookup?
The problem is emulated here
library(xml2)
library(dplyr)
library(purrr)
library(glue)
create_child_node <- function(x = read_xml("<DocEntry/>"), num_useless = 0){
xml_attr(x, "uid") <- 1
xml_add_child(x, read_xml("<E1>Entry 1 text</E1>"))
xml_add_child(x, read_xml("<E2>Entry 2 text</E2>"))
xml_add_child(x, read_xml("<E3><flag>Entry 3a text</flag><flag>Entry 3a text</flag></E3>"))
for(i in seq_len(num_useless)){
xml_str <- glue("<UE{i}>Useless entry {i} text</UE{i}>")
xml_add_child(x, read_xml(xml_str))
}
return(x)
}
add_child_nodes <- function(root, child, times = 10){
for(i in seq_len(times)){
xml_add_child(root, child)
}
}
cn_to_df <- function(xml_doc){
xml_doc %>%
xml_find_all("./DocEntry") %>%
map_dfr(
~list(
ID = xml_attr(.x, "uid"),
Entry1 = xml_child(.x, "E1") %>% xml_text(),
Entry2 = xml_child(.x, "E2") %>% xml_text(),
Entry3 = xml_child(.x, "E3") %>%
xml_children() %>%
xml_text() %>%
paste(collapse = "|")
)
)
}
x <- create_child_node()
xu <- create_child_node(num_useless = 10)
r10 <- read_xml("<outer_1/>")
add_child_nodes(root = r10, child = x, times = 10)
r10 %>% cn_to_df()
r10u <- read_xml("<outer_1/>")
add_child_nodes(root = r10u, child = xu, times = 10)
r100 <- read_xml("<outer_1/>")
add_child_nodes(root = r100, child = x, times = 100)
r100u <- read_xml("<outer_1/>")
add_child_nodes(root = r100u, child = xu, times = 100)
r1000 <- read_xml("<outer_1/>")
add_child_nodes(root = r1000, child = x, times = 1000)
r1000u <- read_xml("<outer_1/>")
add_child_nodes(root = r1000u, child = xu, times = 1000)
microbenchmark::microbenchmark(
`10n` = r10 %>% cn_to_df(),
`100n` = r100 %>% cn_to_df(),
`1000n` = r1000 %>% cn_to_df(),
`10nu` = r10u %>% cn_to_df(),
`100nu` = r100u %>% cn_to_df(),
`1000nu` = r1000u %>% cn_to_df(),
times = 10
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# 10n 6.990414 7.301658 7.931157 7.402987 7.754296 12.44875 10
# 100n 69.019426 74.343384 75.675192 75.106936 77.007598 80.21526 10
# 1000n 1223.347959 1235.432667 1271.188656 1279.414423 1298.329585 1304.84773 10
# 10nu 7.035068 7.117824 7.755804 7.255503 8.496271 10.30647 10
# 100nu 75.430356 78.497456 89.083573 81.702070 82.691879 156.27016 10
# 1000nu 2741.825817 2826.365735 3022.082937 3096.974694 3147.563586 3168.02681 10
It does indeed seem to be related to the loop, because the number of child nodes do not greatly affect the runtime:
cn_to_df_nl <- function(xml_doc){
tibble(
ID = xml_attr(xml_doc, "uid"),
Entry1 = xml_child(xml_doc, "E1") %>% xml_text(),
Entry2 = xml_child(xml_doc, "E2") %>% xml_text(),
Entry3 = xml_child(xml_doc, "E3") %>%
xml_children() %>%
xml_text() %>%
paste(collapse = "|")
)
}
cn_to_df_nl_find <- function(xml_doc){
tibble(
ID = xml_attr(xml_doc, "uid"),
Entry1 = xml_find_first(xml_doc, "./E1") %>% xml_text(),
Entry2 = xml_find_first(xml_doc, "./E2") %>% xml_text(),
Entry3 = xml_find_first(xml_doc, "./E3") %>%
xml_children() %>%
xml_text() %>%
paste(collapse = "|")
)
}
x <- create_child_node()
xu10 <- create_child_node(num_useless = 10)
xu100 <- create_child_node(num_useless = 100)
xu1000 <- create_child_node(num_useless = 1000)
microbenchmark::microbenchmark(
`1n_chld` = cn_to_df_nl(x),
`1n_fnd` = cn_to_df_nl_find(x),
`1nu_chld` = cn_to_df_nl(xu),
`1n10u_fnd` = cn_to_df_nl_find(xu10),
`1n100u_fnd` = cn_to_df_nl_find(xu100),
`1n1000u_fnd` = cn_to_df_nl_find(xu1000)
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# 1n_chld 1.507413 1.535759 1.758106 1.577304 1.653047 6.335169 100
# 1n_fnd 1.498048 1.527944 1.689174 1.549376 1.651442 7.071299 100
# 1nu_chld 1.510079 1.540442 1.712164 1.566106 1.644284 6.862856 100
# 1n10u_fnd 1.497976 1.536436 1.783982 1.566508 1.761430 7.099310 100
# 1n100u_fnd 1.513050 1.579365 1.788714 1.620808 1.829052 6.285841 100
# 1n1000u_fnd 1.723999 1.794999 2.101975 1.848664 1.961226 7.327973 100
Best wishes,
Ulrik