Not possible to add data to the database.

llrs · April 13, 2023, 6:36pm

I'm using the latest RStudio desktop 2023.03 and when filling a database I'm not able to finish adding all the data.
Rstudio crashes with errors similar to this one: R Studio crashes with fatal error: elf_dynamic_array_reader.h:61] tag not found or this issue.

library("progress")
library("dplyr")
library("DBI")
library("BOE") # devtools::install_github("ropenspain/BOE")
library("duckdb")

update_sumarios <- function(con, start = "2009-01-01") {
    last_date <- con |>
        tbl("sumario")  |>
        filter(date == max(date)) |>
        pull(date) |>
        unique()
    if (length(last_date) == 0) {
        last_date <- as.Date(start)
    }

    date <- as.Date(last_date) + 1
    days <- as.numeric(Sys.Date() - date)
    pb <- progress_bar$new(total = days,
                           format = "  downloading :what [:bar] :percent :elapsedfull :eta",
    )
    t.init <- Sys.time()
    pb$tick(0)
    while (date < Sys.Date()) {
        pb$tick(tokens = list(what = date))
        suma <- try(retrieve_sumario(date), silent = TRUE)
        if (is(suma, "try-error")) {
            date <- date + 1
            next
        }
        suma$date <- as.character(suma$date)
        dbAppendTable(con, "sumario", suma)
        date <- date + 1
    }
    t.end <- Sys.time() - t.init
    message("Updated ", days, " in ", round(t.end, 2), units(t.end))
}

create_tables <- function(con) {
    types_sumario <- c(date = "DATE", sumario_nbo = "TEXT", sumario_code = "TEXT",
                       section = "TEXT", section_number = "TEXT", departament = "TEXT",
                       departament_etq = "TEXT", epigraph = "TEXT", text = "TEXT", publication = "TEXT",
                       pages = "DOUBLE")

    types_pub <- c(identificador = "TEXT", titulo = "TEXT", diario = "TEXT",
                   diario_numero = "TEXT",
                   seccion = "TEXT", subseccion = "TEXT", departamento = "TEXT",
                   rango = "TEXT", numero_oficial = "TEXT", fecha_disposicion = "DATE",
                   fecha_publicacion = "DATE", fecha_vigencia = "DATE",
                   fecha_derogacion = "DATE",
                   letra_imagen = "TEXT", pagina_inicial = "TEXT", pagina_final = "TEXT",
                   suplemento_letra_imagen = "TEXT", suplemento_pagina_inicial = "TEXT",
                   suplemento_pagina_final = "TEXT", estatus_legislativo = "TEXT",
                   origen_legislativo = "TEXT", estado_consolidacion = "TEXT",
                   judicialmente_anulada = "TEXT",
                   vigencia_agotada = "TEXT", estatus_derogacion = "TEXT", url_epub = "TEXT",
                   url_pdf = "TEXT", url_pdf_catalan = "TEXT", url_pdf_euskera = "TEXT",
                   url_pdf_gallego = "TEXT", url_pdf_valenciano = "TEXT", url_eli = "TEXT",
                   departamento_codigo = "TEXT", fecha_actualizacion = "TIMESTAMP",
                   text = "TEXT")

    types_an <- c(
        identificador = "TEXT", titulo = "TEXT", diario = "TEXT",
        diario_numero = "TEXT",
        seccion = "TEXT", departamento = "TEXT", numero_anuncio = "TEXT",
        fecha_publicacion = "DATE", letra_imagen = "TEXT", pagina_inicial = "TEXT",
        pagina_final = "TEXT", url_pdf = "TEXT", departamento_codigo = "TEXT",
        fecha_actualizacion = "TIMESTAMP", subseccion = "TEXT",
        suplemento_letra_imagen = "TEXT", suplemento_pagina_inicial = "TEXT",
        suplemento_pagina_final = "TEXT", text = "TEXT")

    if (!dbExistsTable(con, "sumario")) {
        dbCreateTable(con, name = "sumario", fields = types_sumario,
                      temporary = FALSE)
    }

    if (!dbExistsTable(con, "disposicion")) {
        dbCreateTable(con, name = "disposicion", fields = types_pub,
                      temporary = FALSE)
    }

    if (!dbExistsTable(con, "anuncio")) {
        dbCreateTable(con, name = "anuncio", fields = types_an, temporary = FALSE)
    }
    if (!dbExistsTable(con, "notas")) {
        dbCreateTable(con, name = "notas",
                      fields = c(publication = "TEXT", codigo = "TEXT",
                                 orden = "TEXT", text = "TEXT"), temporary = FALSE)
    }
    if (!dbExistsTable(con, "materias")) {
        dbCreateTable(con,
                      name = "materias",
                      fields = c(publication = "TEXT", codigo = "TEXT",
                                 orden = "TEXT", text = "TEXT"), temporary = FALSE)
    }
    if (!dbExistsTable(con, "alertas")) {
        dbCreateTable(con,
                      name = "alertas",
                      fields = c(publication = "TEXT", codigo = "TEXT",
                                 orden = "TEXT", text = "TEXT"), temporary = FALSE)
    }
    if (!dbExistsTable(con, "referencias")) {
        dbCreateTable(con,
                      name = "referencias",
                      fields = c(publication = "TEXT", type = "TEXT",
                                 referencia = "TEXT", orden = "TEXT",
                                 codigo = "TEXT", palabra = "TEXT", texto = "TEXT"),
                      temporary = FALSE)
    }
}

pub_collect <- function(conection) {
    p <- tbl(conection, "sumario") |>
        anti_join(tbl(conection, "anuncio"),
                  by = c(publication = "identificador")) |>
        anti_join(tbl(conection, "disposicion"),
                  by = c(publication = "identificador")) |>
        arrange(publication) |>
        pull(publication)
    p
}

add_anuncio <- function(con, doc) {
    if (!is.list(doc$analysis[[1]])) {
        return(FALSE)
    }

    doc <- select(doc, -analysis, -text_xml)
    doc$fecha_publicacion <- as.character(doc$fecha_publicacion)
    doc$fecha_actualizacion <- as.character(doc$fecha_actualizacion)
    dbAppendTable(con, "anuncio", doc)

}

add_disposicion <- function(con, doc) {
    redoc <- select(doc, -analysis, -text_xml)
    redoc$fecha_publicacion <- as.character(redoc$fecha_publicacion)
    redoc$fecha_disposicion <- as.character(redoc$fecha_disposicion)
    redoc$fecha_actualizacion <- as.character(redoc$fecha_actualizacion)
    redoc$fecha_derogacion <- as.character(redoc$fecha_derogacion)
    redoc$fecha_vigencia <- as.character(redoc$fecha_vigencia)

    dbAppendTable(con, "disposicion", redoc)
    if (!is.list(doc$analysis[[1]])) {
        return(FALSE)
    }
    if (is.list(doc$analysis[[1]]$referencias)) {
        add_references(con, doc)
    }

}

add_references <- function(con, doc) {
    if (is.matrix(doc$analysis[[1]]$referencias$anteriores)) {
        anteriores <- cbind(type = "anteriores", doc$analysis[[1]]$referencias$anteriores)
    }
    if (is.matrix(doc$analysis[[1]]$referencias$posteriores)) {
        posteriores <- cbind(type = "posteriores", doc$analysis[[1]]$referencias$posteriores)
    }
    if (exists("anteriores") && exists("posteriores")) {
        ref <- rbind(anteriores, posteriores)
    } else if (exists("anteriores")) {
        ref <- anteriores
    } else if (exists("posteriores")) {
        ref <- posteriores
    }

    if (exists("ref")) {
        ref <- cbind(publication = doc$identificador, ref) %>%
            as.data.frame()
        dbAppendTable(con, "referencias", ref)
    }
}

is.named <- function(x) {
    !is.null(names(x))
}

update_publications <- function(con, publi) {
    out <- anti_join(data.frame(identificador = publi),
              tbl(con, "disposicion") |> select("identificador"), copy = TRUE,
              by = join_by(identificador)) |>
        anti_join(tbl(con, "anuncio") |> select("identificador"), copy = TRUE,
                  by = join_by(identificador))
    publi <- out$identificador

    t.init <- Sys.time()
    pb <- progress_bar$new(total = length(publi),
                           format = "  downloading :what [:bar] :percent :elapsedfull :eta",
    )
    pb$tick(0)
    for (pub in publi) {
        # Skip if already exists:

        pb$tick(tokens = list(what = pub))
        doc <- retrieve_document(pub)
        # Empty value appearing for first time in BOE-A-2011-3164
        doc$fecha_anulacion <- NULL
        # message(pub)
        if (strsplit(pub, "-", fixed = TRUE)[[1]][2] == "B") {
            add_anuncio(con, doc)
        } else {
            add_disposicion(con, doc)
        }
        analysis <- doc$analysis[[1]]
        if (!is.null(analysis) && is.named(analysis)) {
            if (!is.null(analysis$notas) && is.data.frame(analysis$notas)) {
                    dbAppendTable(con, "notas", cbind(publication = pub, analysis$notas))
            }
            if (!is.null(analysis$alertas) && is.data.frame(analysis$alertas)) {
                dbAppendTable(con, "alertas", cbind(publication = pub, analysis$alertas))
            }
            if (!is.null(analysis$materias) && is.data.frame(analysis$materias)) {
                dbAppendTable(con, "materias", cbind(publication = pub, analysis$materias))
            }
            if (!is.null(analysis$referencias) && is.list(analysis$referencias)) {
                add_references(con, doc)
            }
        }
    }
    t.end <- Sys.time() - t.init
    message("Updated ", length(publi), " in ", round(t.end, 2), units(t.end))
}

clean_duplicates <- function(con, table) {
    # Delete duplicated on from https://stackoverflow.com/a/8190671/2886003

    str <- paste0("delete from ", table)
    str2 <- paste0(" where rowid not in (select  min(rowid) from ", table,
                   " group by ")
    columns <- dbListFields(con, table)
    columns <- paste0(paste0(columns, collapse = ", "), ")")
    query <- paste0(str, str2, columns)
    res <- dbSendStatement(con, query)
    dbClearResult(res)
}

clean_con <- function(con) {
    if (length(dbListTables(con)) == 0) {
        create_tables(con)
        res <- dbSendQuery(con, "CREATE INDEX sumario_code ON sumario (sumario_code)")
        dbClearResult(res)
        res <- dbSendQuery(con, "CREATE UNIQUE INDEX publication ON sumario (publication)")
        dbClearResult(res)
        res <- dbSendQuery(con, "CREATE UNIQUE INDEX identificador_d ON disposicion (identificador)")
        dbClearResult(res)
        res <- dbSendQuery(con, "CREATE UNIQUE INDEX identificador_a ON anuncio (identificador)")
        dbClearResult(res)
        res <- dbSendQuery(con, "CREATE INDEX publication_a ON alertas (publication)")
        dbClearResult(res)
        res <- dbSendQuery(con, "CREATE INDEX publication_m ON materias (publication)")
        dbClearResult(res)
        res <- dbSendQuery(con, "CREATE INDEX publication_n ON notas (publication)")
        dbClearResult(res)
        res <- dbSendQuery(con, "CREATE INDEX publication_r ON referencias (publication)")
        dbClearResult(res)
    } else {
        clean_duplicates(con, "sumario")
        clean_duplicates(con, "disposicion")
        clean_duplicates(con, "anuncio")
        clean_duplicates(con, "alertas")
        clean_duplicates(con, "materias")
        clean_duplicates(con, "notas")
        clean_duplicates(con, "referencias")
    }
}

And then call the functions like this:

# Update sumarios table
con <- dbConnect(duckdb::duckdb(dbdir = "boe_ddbb2.duck"))
update_sumarios(con) # Takes around 10 minutes
publications <- pub_collect(con)
# Takes around 15 minutes to hang Rstudio
update_publications(con, publications)

After 15 minutes or so adding data to the database the following message is printed in the R terminal and Rstudio crashes: ERROR elf_dynamic_array_reader.h:61] tag not found

I have tried changing the database (Originally it was MySQLite) but it didn't register entries after some time and hang up Rstudio too.
I tried to fill the database from R without RStudio but the error is similar.

Any advice on how to debug this error will be appreciated too.

system · May 25, 2023, 6:37pm

This topic was automatically closed 42 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.