xml2 slows in looping when the number nodes increase

UlrikStervbo · August 8, 2020, 9:23am

I wanted to extract information on about 10000 papers to a data.frame. Each paper is its own node, with many child nodes containing the relevant information. An example of the xml can be found here: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=11850928,11482001&version=2.0

It seems that looping over the 10000 nodes take much more than 100 time longer than looping over the 100 nodes used for testing purposes.

Can I improve the code somehow, or is this rather an issue with xml2 and pointer lookup?

The problem is emulated here

library(xml2)
library(dplyr)
library(purrr)
library(glue)

create_child_node <- function(x = read_xml("<DocEntry/>"), num_useless = 0){
  xml_attr(x, "uid") <- 1
  xml_add_child(x, read_xml("<E1>Entry 1 text</E1>"))
  xml_add_child(x, read_xml("<E2>Entry 2 text</E2>"))
  xml_add_child(x, read_xml("<E3><flag>Entry 3a text</flag><flag>Entry 3a text</flag></E3>"))

  for(i in seq_len(num_useless)){
      xml_str <- glue("<UE{i}>Useless entry {i} text</UE{i}>")
      xml_add_child(x, read_xml(xml_str))
  }

  return(x)
}

add_child_nodes <- function(root, child, times = 10){
  for(i in seq_len(times)){
    xml_add_child(root, child)
  }
}

cn_to_df <- function(xml_doc){
  xml_doc %>%
    xml_find_all("./DocEntry") %>%
    map_dfr(
      ~list(
        ID = xml_attr(.x, "uid"),
        Entry1 = xml_child(.x, "E1") %>% xml_text(),
        Entry2 = xml_child(.x, "E2") %>% xml_text(),
        Entry3 = xml_child(.x, "E3") %>%
          xml_children() %>%
          xml_text() %>%
          paste(collapse = "|")
      )
    )
}

x <- create_child_node()
xu <- create_child_node(num_useless = 10)

r10 <- read_xml("<outer_1/>")
add_child_nodes(root = r10, child = x, times = 10)
r10 %>% cn_to_df()

r10u <- read_xml("<outer_1/>")
add_child_nodes(root = r10u, child = xu, times = 10)

r100 <- read_xml("<outer_1/>")
add_child_nodes(root = r100, child = x, times = 100)

r100u <- read_xml("<outer_1/>")
add_child_nodes(root = r100u, child = xu, times = 100)

r1000 <- read_xml("<outer_1/>")
add_child_nodes(root = r1000, child = x, times = 1000)

r1000u <- read_xml("<outer_1/>")
add_child_nodes(root = r1000u, child = xu, times = 1000)

microbenchmark::microbenchmark(
  `10n` = r10 %>% cn_to_df(),
  `100n` = r100 %>% cn_to_df(),
  `1000n` = r1000 %>% cn_to_df(),
  `10nu` = r10u %>% cn_to_df(),
  `100nu` = r100u %>% cn_to_df(),
  `1000nu` = r1000u %>% cn_to_df(),
  times = 10
)

# Unit: milliseconds
#   expr         min          lq        mean      median          uq        max neval
#    10n    6.990414    7.301658    7.931157    7.402987    7.754296   12.44875    10
#   100n   69.019426   74.343384   75.675192   75.106936   77.007598   80.21526    10
#  1000n 1223.347959 1235.432667 1271.188656 1279.414423 1298.329585 1304.84773    10
#   10nu    7.035068    7.117824    7.755804    7.255503    8.496271   10.30647    10
#  100nu   75.430356   78.497456   89.083573   81.702070   82.691879  156.27016    10
# 1000nu 2741.825817 2826.365735 3022.082937 3096.974694 3147.563586 3168.02681    10

It does indeed seem to be related to the loop, because the number of child nodes do not greatly affect the runtime:

cn_to_df_nl <- function(xml_doc){
  tibble(
    ID = xml_attr(xml_doc, "uid"),
    Entry1 = xml_child(xml_doc, "E1") %>% xml_text(),
    Entry2 = xml_child(xml_doc, "E2") %>% xml_text(),
    Entry3 = xml_child(xml_doc, "E3") %>%
      xml_children() %>%
      xml_text() %>%
      paste(collapse = "|")
  )
}

cn_to_df_nl_find <- function(xml_doc){
  tibble(
    ID = xml_attr(xml_doc, "uid"),
    Entry1 = xml_find_first(xml_doc, "./E1") %>% xml_text(),
    Entry2 = xml_find_first(xml_doc, "./E2") %>% xml_text(),
    Entry3 = xml_find_first(xml_doc, "./E3") %>%
      xml_children() %>%
      xml_text() %>%
      paste(collapse = "|")
  )
}

x <- create_child_node()
xu10 <- create_child_node(num_useless = 10)
xu100 <- create_child_node(num_useless = 100)
xu1000 <- create_child_node(num_useless = 1000)

microbenchmark::microbenchmark(
  `1n_chld` = cn_to_df_nl(x),
  `1n_fnd` = cn_to_df_nl_find(x),
  `1nu_chld` = cn_to_df_nl(xu),
  `1n10u_fnd` = cn_to_df_nl_find(xu10),
  `1n100u_fnd` = cn_to_df_nl_find(xu100),
  `1n1000u_fnd` = cn_to_df_nl_find(xu1000)
)

# Unit: milliseconds
#        expr      min       lq     mean   median       uq      max neval
#     1n_chld 1.507413 1.535759 1.758106 1.577304 1.653047 6.335169   100
#      1n_fnd 1.498048 1.527944 1.689174 1.549376 1.651442 7.071299   100
#    1nu_chld 1.510079 1.540442 1.712164 1.566106 1.644284 6.862856   100
#   1n10u_fnd 1.497976 1.536436 1.783982 1.566508 1.761430 7.099310   100
#  1n100u_fnd 1.513050 1.579365 1.788714 1.620808 1.829052 6.285841   100
# 1n1000u_fnd 1.723999 1.794999 2.101975 1.848664 1.961226 7.327973   100

Best wishes,
Ulrik

system · August 29, 2020, 9:23am

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.