How to convert a large XML file to a dataframe faster?

nirgrahamuk · February 4, 2022, 4:41pm

I duplicated some of the data in sample.xml a few times to make my benchmarking tractable, and I found some opportunities to improve the speed.

library(xml2)
library(tidyverse)

# Parse the XML file, then select every kf:Series element in the document.
pg <- read_xml("sample2.xml")
node <- xml_find_all(pg, "//kf:Series")


# Baseline implementation used in the benchmark: build one data frame per
# series node, filling it one child at a time.
#
# Arguments:
#   pg   - not referenced anywhere in the body.
#   node - indexable collection of series nodes (in this script, the result
#          of xml_find_all()).
# Returns a list holding one data frame per element of `node`, with columns
# date, value, SERIES_NAME and UNIT.
f0 <- function(pg,node){
  
  datalist=list()
  
  for (i in seq_along(node)){
    
    # One row per child of the current series node; every cell starts as NA.
    l<-length(xml_children( node[[i]]))
    df<-data.frame(matrix(ncol = 4,nrow=l))
    colnames(df) <- c('date','value',"SERIES_NAME","UNIT")
    
    # NOTE(review): 1:length(...) evaluates to c(1, 0) when a series has no
    # children; seq_len(l) would skip the loop in that case.
    for (z in 1:length(xml_children( node[[i]]))){
      
      # Child z and its attributes are fetched again for each of the two cells.
      df$date[z]<-xml_attrs(xml_child(node[[i]], z))[["DATE"]]
      df$value[z]<-xml_attrs(xml_child(node[[i]], z))[["VALUE"]]
      
      # These two assignments do not depend on z, yet they are repeated on
      # every pass of the inner loop.
      df$SERIES_NAME<-xml_attrs(node[[i]])[["SERIES_NAME"]]
      df$UNIT<-xml_attrs(node[[i]])[["UNIT"]]
      
      # The partially filled data frame is stored again on every pass; only
      # the last store for a given i survives.
      datalist[[i]]<-df
    }   
  }
  datalist
}

# Faster alternative to f0: read the series-level attributes once per series
# and turn the children's attributes into rows with a single map_dfr() call.
#
# Arguments:
#   pg   - not used; kept so the signature matches f0.
#   node - indexable collection of series nodes (in this script, the result
#          of xml_find_all()).
# Returns a list holding one data frame per element of `node`, with columns
# date, value, SERIES_NAME and UNIT.
f1 <- function(pg, node) {
  datalist <- vector("list", length(node))

  for (i in seq_along(node)) {
    series <- node[[i]]
    # Fetch the children once; they supply both the row count and the rows.
    kids <- xml_children(series)
    series_attrs <- xml_attrs(series)

    # Series-level attributes, repeated once per child row.
    df <- data.frame(
      SERIES_NAME = rep(series_attrs[["SERIES_NAME"]], length(kids)),
      UNIT = rep(series_attrs[["UNIT"]], length(kids)),
      stringsAsFactors = FALSE
    )

    # Look the attributes up by name, as f0 does, so the result does not
    # depend on the order in which DATE and VALUE appear in the XML.
    subdf <- map_dfr(
      xml_attrs(kids),
      ~ data.frame(
        date = .x[["DATE"]],
        value = .x[["VALUE"]],
        stringsAsFactors = FALSE
      )
    )

    datalist[[i]] <- bind_cols(subdf, df)
  }

  datalist
}
# f0 and f1 declare `pg` and `node` without defaults, so both must be passed
# explicitly; calling them with no arguments fails once `node` is used.
f1(pg, node)

# Time both implementations on the same parsed document and node set.
bench::mark(
  f0(pg, node),
  f1(pg, node),
  iterations = 500L,
  filter_gc = FALSE
)

# A tibble: 2 x 13
  expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result   memory          time        gc          
  <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list>   <list>          <list>      <list>      
1 f0()         54.1ms   69.2ms      14.2     3.9MB    10.7    500   376      35.2s <list [~ <Rprofmem [1,6~ <bench_tm ~ <tibble [50~
2 f1()         38.1ms   43.9ms      22.2   677.9KB     6.97   500   157      22.5s <list [~ <Rprofmem [272~ <bench_tm ~ <tibble [50~

So if f0 would take 6 hours, f1 might take about 4? Hard to say.