I duplicated some of the data in sample.xml a few times so that the benchmark would be meaningful, and I found some opportunities to improve the speed.
library(xml2)
library(tidyverse)
# Parse the benchmark document, then collect every element matched by the
# "//kf:Series" XPath expression; both objects are reused further down.
pg <- read_xml("sample2.xml")
node <- xml_find_all(pg, "//kf:Series")
# Baseline conversion: build one data frame per series node, filling the
# observation columns one row at a time.
#
# Args:
#   pg:   Not used by the body; kept so the call signature stays unchanged.
#   node: Collection of series nodes (e.g. the result of xml_find_all()).
#         Each node is expected to carry SERIES_NAME and UNIT attributes and
#         to have children carrying DATE and VALUE attributes.
#
# Returns:
#   A list with one data frame per element of `node`, with columns
#   date, value, SERIES_NAME and UNIT (one row per child of the series).
f0 <- function(pg, node) {
  datalist <- vector("list", length(node))
  for (i in seq_along(node)) {
    series <- node[[i]]
    n_obs <- length(xml_children(series))
    df <- data.frame(matrix(ncol = 4, nrow = n_obs))
    colnames(df) <- c("date", "value", "SERIES_NAME", "UNIT")
    # seq_len() (not 1:n_obs) so a series without children is skipped
    # instead of iterating over c(1, 0) and failing in xml_child().
    # The per-row xml_child() lookups are intentionally kept: this function
    # is the reference implementation that f1 is benchmarked against.
    for (z in seq_len(n_obs)) {
      df$date[z] <- xml_attrs(xml_child(series, z))[["DATE"]]
      df$value[z] <- xml_attrs(xml_child(series, z))[["VALUE"]]
    }
    # Series-level values do not depend on the row, so set them once.
    # rep() keeps the assignment valid when the series has zero rows.
    series_attrs <- xml_attrs(series)
    df$SERIES_NAME <- rep(series_attrs[["SERIES_NAME"]], n_obs)
    df$UNIT <- rep(series_attrs[["UNIT"]], n_obs)
    datalist[[i]] <- df
  }
  datalist
}
# Faster conversion: fetch the attributes of all children of a series in a
# single xml_attrs() call and assemble each data frame in one step.
#
# Args:
#   pg:   Not used by the body; kept so the call signature stays unchanged.
#   node: Collection of series nodes (e.g. the result of xml_find_all()).
#         Each node is expected to carry SERIES_NAME and UNIT attributes and
#         to have children carrying DATE and VALUE attributes.
#
# Returns:
#   A list with one data frame per element of `node`, with columns
#   date, value, SERIES_NAME and UNIT (one row per child of the series).
f1 <- function(pg, node) {
  datalist <- vector("list", length(node))
  for (i in seq_along(node)) {
    series <- node[[i]]
    series_attrs <- xml_attrs(series)
    obs_attrs <- xml_attrs(xml_children(series))
    n_obs <- length(obs_attrs)
    # Attributes are looked up by name rather than by position, so the
    # result does not depend on the order in which DATE and VALUE appear.
    # vapply() yields character(0) for a series without children, and rep()
    # keeps the series-level columns the same length as the others.
    datalist[[i]] <- data.frame(
      date = vapply(
        obs_attrs, function(a) a[["DATE"]], character(1),
        USE.NAMES = FALSE
      ),
      value = vapply(
        obs_attrs, function(a) a[["VALUE"]], character(1),
        USE.NAMES = FALSE
      ),
      SERIES_NAME = rep(series_attrs[["SERIES_NAME"]], n_obs),
      UNIT = rep(series_attrs[["UNIT"]], n_obs),
      stringsAsFactors = FALSE
    )
  }
  datalist
}
# Both functions need the parsed series nodes; `node` has no default, so
# calling them without arguments fails with 'argument "node" is missing'.
f1(pg, node)
# Time both implementations on the same input, keeping the iterations during
# which garbage collection ran (filter_gc = FALSE).
bench::mark(
  f0(pg, node),
  f1(pg, node),
  iterations = 500L,
  filter_gc = FALSE
)
# A tibble: 2 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
1 f0() 54.1ms 69.2ms 14.2 3.9MB 10.7 500 376 35.2s <list [~ <Rprofmem [1,6~ <bench_tm ~ <tibble [50~
2 f1() 38.1ms 43.9ms 22.2 677.9KB 6.97 500 157 22.5s <list [~ <Rprofmem [272~ <bench_tm ~ <tibble [50~
So if f0 would take 6 hours, f1 might take roughly 4 (going by the median times above), though it is hard to say how well that extrapolates.