I need to create codebooks for social science datasets (typically upwards of 500-600 variables each). I used the base function cat() with good results, but xml2 seems like a much better alternative. However, xml2 needs about 15-16 seconds per dataset, while creating the same XML file with sink() and cat() takes no more than 4-5 seconds.
Below is a trimmed-down reprex that takes about 8 seconds on my computer to create the XML file. In the real situation there are many more nodes and attributes to create (some of which depend on other logical conditions), but this is the bottleneck.
library(xml2)
# Build a codebook XML document: <codeBook>/<dataDscr> holding one <var> per
# variable, each with a <labl> and one <catgry> (value + label) per entry of
# `values`, then write it to "test.xml".

# Codes that are flagged with missing="Y" on their <catgry> node.
missing <- c(-1, -2, -3)

# Value labels: names are the label texts, elements are the numeric codes.
values <- c(
  "Very weak" = 1, "Weak" = 2, "Middle" = 3, "Strong" = 4,
  "Very strong" = 5, "Don't know" = -1
)

# Everything derived from `values` is identical for every variable, so it is
# computed once here instead of once per variable and category.
value_labels <- names(values)
value_text <- as.character(values)
value_is_missing <- values %in% missing

root <- xml_new_document()
codeBook <- xml_add_child(root, "codeBook")
dataDscr <- xml_add_child(codeBook, "dataDscr")

# seq_len()/seq_along() are used instead of seq(n) because seq(0) would
# iterate over c(1, 0) rather than not at all.
for (i in seq_len(600)) {
  var <- xml_add_child(dataDscr, "var", name = paste("V", i, sep = "_"))

  if (TRUE) { # something needs to be checked here, as an example
    xml_attr(var, "nature") <- "ordinal"
    xml_attr(var, "representationType") <- "text"
  }

  # An unnamed argument to xml_add_child() becomes the text of the new node,
  # which avoids a separate xml_text<- call for every text element.
  labl <- xml_add_child(
    var, "labl",
    paste("Variable label for V", i, sep = "_")
  )

  for (v in seq_along(values)) {
    ismiss <- value_is_missing[v]

    catgry <- xml_add_child(var, "catgry")
    if (ismiss) {
      xml_attr(catgry, "missing") <- "Y"
    }

    catValu <- xml_add_child(catgry, "catValu", value_text[v])
    labl <- xml_add_child(catgry, "labl", value_labels[v])
  }
}

write_xml(root, "test.xml")