I'm writing an R package for the ENTSO-E API, but I have some performance issues when parsing the XML files that the API returns.
The XML files are not very tidy, and not every element is always present, so I'm currently experimenting.
I have profiled the function: id_extractor is called many times and is not very fast.
The API returns a zip file with up to 200 XML files, which I extract into a temp folder and read with xml2::read_html; I then parse each file individually with the outages_helper below.
library(dplyr)
library(rvest)
library(purrr)
library(readr)
library(tibble)
library(tidyr)
library(xml2)
#' Parse one unavailability market document into a tibble
#'
#' Looks up the `unavailability_marketdocument` node in `html_doc` and
#' extracts the document-level fields, the `timeseries` fields, the
#' `available_period` fields, the `point` fields and the `reason` fields.
#'
#' @param html_doc A parsed document, e.g. the result of `xml2::read_html()`.
#' @return A tibble holding the document-level fields that were found (column
#'   types guessed with `readr::parse_guess()`), the list-columns
#'   `timeseries`, `point_series` and `point` (each holding one tibble), and
#'   the reason fields unnested into columns prefixed with `reason_`.
outages_helper <- function(html_doc) {
  html_doc <- rvest::html_node(html_doc, "unavailability_marketdocument")

  # Extract the text of every node matching each id (each id is used as a
  # relative XPath from `nodes`) and reshape to one column per id.
  #
  # All ids are gathered into a single long tibble in one go; building one
  # tibble per id and row-binding them afterwards was the dominant cost.
  # Ids without a matching node contribute no rows and hence no column.
  # NOTE(review): tidyr::spread() expects at most one value per id -- confirm
  # that this holds for every document the API can return.
  extract_fields <- function(nodes, ids, guess_types = TRUE) {
    found <- lapply(ids, function(xpath) {
      rvest::html_text(rvest::html_nodes(nodes, xpath = xpath))
    })
    long <- tibble::tibble(
      id = rep(ids, times = lengths(found)),
      value = as.character(unlist(found, use.names = FALSE))
    )
    wide <- tidyr::spread(long, id, value)
    if (guess_types) {
      # Pass the function itself; dplyr::funs() is deprecated.
      wide <- dplyr::mutate_all(wide, readr::parse_guess)
    }
    wide
  }

  # Document info ----
  # Ids are lower-cased before lookup (presumably because the HTML parser
  # lower-cases element names -- confirm).
  doc_ids <- tolower(c(
    "mRID",
    "revisionNumber",
    "type",
    "process.processType",
    "createdDateTime",
    "sender_MarketParticipant.mRID",
    "sender_MarketParticipant.marketRole.type",
    "receiver_MarketParticipant.mRID",
    "receiver_MarketParticipant.marketRole.type",
    "unavailability_Time_Period.timeInterval",
    "docStatus"
  ))
  doc_result <- extract_fields(html_doc, doc_ids)

  # Time series ----
  ts_ids <- tolower(c(
    "mRID",
    "businessType",
    "biddingZone_Domain.mRID",
    "in_Domain.mRID",
    "out_Domain.mRID",
    "start_DateAndOrTime.date",
    "start_DateAndOrTime.time",
    "end_DateAndOrTime.date",
    "end_DateAndOrTime.time",
    "quantity_Measure_Unit.name",
    "curveType",
    "production_RegisteredResource.mRID",
    "production_RegisteredResource.name",
    "production_RegisteredResource.location.name",
    "production_RegisteredResource.pSRType.psrType",
    "production_RegisteredResource.pSRType.powerSystemResources.mRID",
    "production_RegisteredResource.pSRType.powerSystemResources.name",
    "production_RegisteredResource.pSRType.powerSystemResources.nominalP"
  ))
  html_ts <- rvest::html_nodes(html_doc, "timeseries")
  doc_result$timeseries <- list(extract_fields(html_ts, ts_ids))

  # Available periods (searched below the time series nodes) ----
  period_ids <- tolower(c("timeInterval", "resolution"))
  html_ts_ps <- rvest::html_nodes(html_ts, "available_period")
  doc_result$point_series <- list(extract_fields(html_ts_ps, period_ids))

  # Points (searched below the available periods) ----
  point_ids <- tolower(c("position", "quantity"))
  html_ts_ps_p <- rvest::html_nodes(html_ts_ps, "point")
  doc_result$point <- list(extract_fields(html_ts_ps_p, point_ids))

  # Reason ----
  # Reason values are kept as character; no type guessing is applied here.
  html_reason <- rvest::html_nodes(html_doc, "reason")
  reason <- extract_fields(html_reason, c("code", "text"), guess_types = FALSE)

  # Prefix the reason columns before unnesting instead of relying on the
  # deprecated `.sep` argument of tidyr::unnest(); the resulting column names
  # (`reason_code`, `reason_text`) are the same.
  if (length(names(reason)) > 0) {
    names(reason) <- paste0("reason_", names(reason))
  }
  doc_result$reason <- list(reason)

  tidyr::unnest(doc_result, reason)
}
# URL of the sample XML document hosted on GitHub.
path_to_data <- paste0(
  "https://raw.githubusercontent.com/krose/",
  "entsoeR/master/inst/",
  "001-001-PLANNED_UNAVAIL_OF_GENERATION_UNITS_201711080000-201801010000.xml"
)

# Read the document once and put ten references to it into a list.
doc <- xml2::read_html(path_to_data, encoding = "UTF-8")
doc_list <- rep(list(doc), times = 10)

# Time how long it takes to parse every document in the list and row-bind
# the per-document results.
system.time({
  dplyr::bind_rows(purrr::map(doc_list, outages_helper))
})
Other suggestions are also welcome.
Update: I forgot to wrap the last part in system.time.