Increase performance of function to extract data from xml files

krose · November 7, 2017, 12:02pm

I'm writing a R package for the Entsoe-e API but I have some performance issues when I parse the XML files that are returned by the API.

The xml file is not very tidy and not all elements are always returned, so currently I'm experimenting.

I have profiled the function and id_extractor is called a lot of times and is not very fast.

The API returns a zip file with up to 200 xml files, that I extract in a temp folder, read with xml2::read_html and then parse each file individually with the outages_helper below.

library(dplyr)
library(rvest)
library(purrr)
library(readr)
library(tibble)
library(tidyr)
library(xml2)

outages_helper <- function(html_doc){
  
  html_doc <- html_doc %>% rvest::html_node("unavailability_marketdocument")
  
  id_extractor <- function(html_doc, id){
    
    rvest::html_nodes(html_doc, xpath = id) %>%
      rvest::html_text() %>%
      tibble::tibble(id = id, value = .)
  }
  
  ###########################################
  # extract doc info
  #############################################
  
  ids <- c("mRID", 
           "revisionNumber", 
           "type", 
           "process.processType", 
           "createdDateTime", 
           "sender_MarketParticipant.mRID", 
           "sender_MarketParticipant.marketRole.type", 
           "receiver_MarketParticipant.mRID", 
           "receiver_MarketParticipant.marketRole.type", 
           "unavailability_Time_Period.timeInterval",
           "docStatus")
  ids <- tolower(ids)
  
  doc_result <- 
    purrr::map(ids, ~id_extractor(html_doc, .x)) %>% 
    dplyr::bind_rows() %>%
    tidyr::spread(id, value) %>%
    dplyr::mutate_all(dplyr::funs(readr::parse_guess(.)))
  
  ####################################
  # extract timeseries
  ######################################
  
  ids <- c("mRID",
           "businessType",
           "biddingZone_Domain.mRID",
           "in_Domain.mRID",
           "out_Domain.mRID",
           "start_DateAndOrTime.date",
           "start_DateAndOrTime.time",
           "end_DateAndOrTime.date",
           "end_DateAndOrTime.time",
           "quantity_Measure_Unit.name",
           "curveType",
           "production_RegisteredResource.mRID",
           "production_RegisteredResource.name",
           "production_RegisteredResource.location.name",
           "production_RegisteredResource.pSRType.psrType",
           "production_RegisteredResource.pSRType.powerSystemResources.mRID",
           "production_RegisteredResource.pSRType.powerSystemResources.name",
           "production_RegisteredResource.pSRType.powerSystemResources.nominalP")
  ids <- tolower(ids)
  
  html_ts <- 
    html_doc %>% 
    rvest::html_nodes("timeseries")
  
  doc_result_ts <- 
    purrr::map(ids, ~id_extractor(html_ts, .x)) %>% 
    dplyr::bind_rows() %>%
    tidyr::spread(id, value) %>%
    dplyr::mutate_all(dplyr::funs(readr::parse_guess(.)))
  
  doc_result$timeseries <- list(doc_result_ts)
  
  ids <- c("timeInterval", 
           "resolution")
  ids <- tolower(ids)
  
  html_ts_ps <- 
    html_ts %>% 
    rvest::html_nodes("available_period")
  
  doc_result_ts_ps <- 
    purrr::map(ids, ~id_extractor(html_ts_ps, .x)) %>% 
    dplyr::bind_rows() %>%
    tidyr::spread(id, value) %>%
    dplyr::mutate_all(dplyr::funs(readr::parse_guess(.)))
  
  doc_result$point_series <- list(doc_result_ts_ps)
  
  ids <- c("position", 
           "quantity")
  ids <- tolower(ids)
  
  html_ts_ps_p <- 
    html_ts_ps %>% 
    rvest::html_nodes("point")
  
  doc_result_ts_ps_p <- 
    purrr::map(ids, ~id_extractor(html_ts_ps_p, .x)) %>% 
    dplyr::bind_rows() %>%
    tidyr::spread(id, value) %>%
    dplyr::mutate_all(dplyr::funs(readr::parse_guess(.)))
  
  doc_result$point <- list(doc_result_ts_ps_p)
  
  ##########################################
  # extract reason
  #############################################
  
  ids <- c("code", "text")
  
  html_reason <- 
    html_doc %>% 
    rvest::html_nodes("reason")
  
  doc_result_reason <- 
    purrr::map(ids, ~id_extractor(html_reason, .x)) %>% 
    dplyr::bind_rows() %>%
    tidyr::spread(id, value)
  
  doc_result$reason <- list(doc_result_reason)
  doc_result <- tidyr::unnest(doc_result, reason, .sep = "_")
  
  doc_result
}


# path to xml file on github
path_to_data <- paste0("https://raw.githubusercontent.com/krose/", 
                       "entsoeR/master/inst/", 
                       "001-001-PLANNED_UNAVAIL_OF_GENERATION_UNITS_201711080000-201801010000.xml")

# Read the xml file and create ten copies.
doc <- xml2::read_html(path_to_data, encoding = "UTF-8")
doc_list <- lapply(1:10, function(x){doc})

# parse the list of xml files and row bind the output of each file.
system.time({purrr::map(doc_list, ~outages_helper(.x)) %>%
  dplyr::bind_rows()})

Other suggestion are also welcome.

Update: I forgot to wrap the last part in system.time.

cderv · November 7, 2017, 10:13pm

What the result of system.time have you on your system ?

I got 2.84 for ten files 60 seconds for 200 hundred copies of your example files.

What performance would you achieve ?

nick · November 7, 2017, 10:46pm

Much of your time is spent on the tibble function. While it's a useful function, since you are effectively calling it on every single row of the resulting table, it's a lot of overhead. You may be able to build your value column with the appropriate map function, and then build the tibble with that, though it will likely be more roundabout to deal with what are currently empty tibbles.

hadley · November 8, 2017, 3:02am

You might be able to speed things up by collapsing ids into a single xpath expression (e.g. using paste())

mgirlich · November 8, 2017, 7:45am

Using @hadley's suggestions of a single xpath expression and using readr::type_convert() instead of dplyr::mutate_all(dplyr::funs(readr::parse_guess(.))) one gets a nice speed up.

library(dplyr)
library(rvest)
library(purrr)
library(readr)
library(tibble)
library(tidyr)
library(xml2)

outages_helper <- function(html_doc){
  
  html_doc <- html_doc %>% rvest::html_node("unavailability_marketdocument")
  
  id_extractor <- function(html_doc, id){
    rvest::html_nodes(html_doc, xpath = id) %>%
      rvest::html_text() %>%
      tibble::tibble(id = id, value = .)
  }
  
  ###########################################
  # extract doc info
  #############################################
  
  ids <- c("mRID", 
           "revisionNumber", 
           "type", 
           "process.processType", 
           "createdDateTime", 
           "sender_MarketParticipant.mRID", 
           "sender_MarketParticipant.marketRole.type", 
           "receiver_MarketParticipant.mRID", 
           "receiver_MarketParticipant.marketRole.type", 
           "unavailability_Time_Period.timeInterval",
           "docStatus")
  ids <- tolower(ids)
  
  doc_result <-
    purrr::map(ids, ~id_extractor(html_doc, .x)) %>%
    dplyr::bind_rows() %>%
    tidyr::spread(id, value) %>%
    dplyr::mutate_all(dplyr::funs(readr::parse_guess(.)))
  
  ####################################
  # extract timeseries
  ######################################
  
  ids <- c("mRID",
           "businessType",
           "biddingZone_Domain.mRID",
           "in_Domain.mRID",
           "out_Domain.mRID",
           "start_DateAndOrTime.date",
           "start_DateAndOrTime.time",
           "end_DateAndOrTime.date",
           "end_DateAndOrTime.time",
           "quantity_Measure_Unit.name",
           "curveType",
           "production_RegisteredResource.mRID",
           "production_RegisteredResource.name",
           "production_RegisteredResource.location.name",
           "production_RegisteredResource.pSRType.psrType",
           "production_RegisteredResource.pSRType.powerSystemResources.mRID",
           "production_RegisteredResource.pSRType.powerSystemResources.name",
           "production_RegisteredResource.pSRType.powerSystemResources.nominalP")
  ids <- tolower(ids)
  
  html_ts <- 
    html_doc %>% 
    rvest::html_nodes("timeseries")
  
  doc_result_ts <-
    purrr::map(ids, ~id_extractor(html_ts, .x)) %>%
    dplyr::bind_rows() %>%
    tidyr::spread(id, value) %>%
    dplyr::mutate_all(dplyr::funs(readr::parse_guess(.)))
  
  doc_result$timeseries <- list(doc_result_ts)
  
  ids <- c("timeInterval", 
           "resolution")
  ids <- tolower(ids)
  
  html_ts_ps <- 
    html_ts %>% 
    rvest::html_nodes("available_period")
  
  doc_result_ts_ps <-
    purrr::map(ids, ~id_extractor(html_ts_ps, .x)) %>%
    dplyr::bind_rows() %>%
    tidyr::spread(id, value) %>%
    dplyr::mutate_all(dplyr::funs(readr::parse_guess(.)))
  
  doc_result$point_series <- list(doc_result_ts_ps)
  
  ids <- c("position", 
           "quantity")
  ids <- tolower(ids)
  
  html_ts_ps_p <- 
    html_ts_ps %>% 
    rvest::html_nodes("point")
  
  doc_result_ts_ps_p <-
    purrr::map(ids, ~id_extractor(html_ts_ps_p, .x)) %>%
    dplyr::bind_rows() %>%
    tidyr::spread(id, value) %>%
    dplyr::mutate_all(dplyr::funs(readr::parse_guess(.)))
  
  doc_result$point <- list(doc_result_ts_ps_p)
  
  ##########################################
  # extract reason
  #############################################
  
  ids <- c("code", "text")
  
  html_reason <- 
    html_doc %>% 
    rvest::html_nodes("reason")
  
  doc_result_reason <-
    purrr::map(ids, ~id_extractor(html_reason, .x)) %>%
    dplyr::bind_rows() %>%
    tidyr::spread(id, value)
  
  doc_result$reason <- list(doc_result_reason)
  doc_result <- tidyr::unnest(doc_result, reason, .sep = "_")
  
  doc_result
}


outages_helper2 <- function(html_doc){
  
  html_doc <- html_doc %>% rvest::html_node("unavailability_marketdocument")
  
  id_extractor <- function(html_doc, ids) {
    r <- rvest::html_nodes(html_doc, xpath = paste(ids, collapse = "|"))
    as.list(r %>% html_text()) %>% 
      set_names(r %>% html_name()) %>% 
      as_tibble()
  }
  
  ###########################################
  # extract doc info
  #############################################
  
  ids <- c("mRID", 
           "revisionNumber", 
           "type", 
           "process.processType", 
           "createdDateTime", 
           "sender_MarketParticipant.mRID", 
           "sender_MarketParticipant.marketRole.type", 
           "receiver_MarketParticipant.mRID", 
           "receiver_MarketParticipant.marketRole.type", 
           "unavailability_Time_Period.timeInterval",
           "docStatus")
  ids <- tolower(ids)
  
  doc_result <-
    id_extractor(html_doc, ids) %>%
    readr::type_convert()
  
  ####################################
  # extract timeseries
  ######################################
  
  ids <- c("mRID",
           "businessType",
           "biddingZone_Domain.mRID",
           "in_Domain.mRID",
           "out_Domain.mRID",
           "start_DateAndOrTime.date",
           "start_DateAndOrTime.time",
           "end_DateAndOrTime.date",
           "end_DateAndOrTime.time",
           "quantity_Measure_Unit.name",
           "curveType",
           "production_RegisteredResource.mRID",
           "production_RegisteredResource.name",
           "production_RegisteredResource.location.name",
           "production_RegisteredResource.pSRType.psrType",
           "production_RegisteredResource.pSRType.powerSystemResources.mRID",
           "production_RegisteredResource.pSRType.powerSystemResources.name",
           "production_RegisteredResource.pSRType.powerSystemResources.nominalP")
  ids <- tolower(ids)
  
  html_ts <- 
    html_doc %>% 
    rvest::html_nodes("timeseries")
  
  doc_result_ts <-
    id_extractor(html_ts, ids) %>% 
    readr::type_convert()
  
  doc_result$timeseries <- list(doc_result_ts)
  
  ids <- c("timeInterval", 
           "resolution")
  ids <- tolower(ids)
  
  html_ts_ps <- 
    html_ts %>% 
    rvest::html_nodes("available_period")
  
  doc_result_ts_ps <- 
    id_extractor(html_ts_ps, ids) %>% 
    readr::type_convert()
  
  doc_result$point_series <- list(doc_result_ts_ps)
  
  ids <- c("position", 
           "quantity")
  ids <- tolower(ids)
  
  html_ts_ps_p <- 
    html_ts_ps %>% 
    rvest::html_nodes("point")
  
  doc_result_ts_ps_p <- 
    id_extractor(html_ts_ps_p, ids) %>% 
    readr::type_convert()
  
  doc_result$point <- list(doc_result_ts_ps_p)
  
  ##########################################
  # extract reason
  #############################################
  
  ids <- c("code", "text")
  
  html_reason <- 
    html_doc %>% 
    rvest::html_nodes("reason")
  
  doc_result_reason <- 
    id_extractor(html_reason, ids)
  
  doc_result$reason <- list(doc_result_reason)
  doc_result <- tidyr::unnest(doc_result, reason, .sep = "_")
  
  doc_result
}


# path to xml file on github
path_to_data <- paste0("https://raw.githubusercontent.com/krose/", 
                       "entsoeR/master/inst/", 
                       "001-001-PLANNED_UNAVAIL_OF_GENERATION_UNITS_201711080000-201801010000.xml")

# Read the xml file and create ten copies.
doc <- xml2::read_html(path_to_data, encoding = "UTF-8")
doc_list <- lapply(1:10, function(x){doc})

# parse the list of xml files and row bind the output of each file.
microbenchmark::microbenchmark(
  purrr::map(doc_list, ~outages_helper(.x)) %>%
    dplyr::bind_rows(),
  suppressMessages(purrr::map(doc_list, ~outages_helper2(.x)) %>%
    dplyr::bind_rows()),
  times = 10
)

krose · November 8, 2017, 9:50am

Thanks for all your suggestions. I had a feeling there was a few low hanging fruits .

@mgirlich: I didn't know about type_convert and it makes somewhat easier to understand what is going on than what I was doing. I'll continue with your function

@hadley: Your suggestion to collapse the xpath made a nice improvement.

For future reference: w3schools has a very short intro to xpath.