Not possible to add data to the database.

I'm using the latest RStudio desktop 2023.03 and when filling a database I'm not able to finish adding all the data.
Rstudio crashes with errors similar to this one: R Studio crashes with fatal error: elf_dynamic_array_reader.h:61] tag not found or this issue.

library("progress")
library("dplyr")
library("DBI")
library("BOE") # devtools::install_github("ropenspain/BOE")
library("duckdb")

update_sumarios <- function(con, start = "2009-01-01") {
    last_date <- con |>
        tbl("sumario")  |>
        filter(date == max(date)) |>
        pull(date) |>
        unique()
    if (length(last_date) == 0) {
        last_date <- as.Date(start)
    }

    date <- as.Date(last_date) + 1
    days <- as.numeric(Sys.Date() - date)
    pb <- progress_bar$new(total = days,
                           format = "  downloading :what [:bar] :percent :elapsedfull :eta",
    )
    t.init <- Sys.time()
    pb$tick(0)
    while (date < Sys.Date()) {
        pb$tick(tokens = list(what = date))
        suma <- try(retrieve_sumario(date), silent = TRUE)
        if (is(suma, "try-error")) {
            date <- date + 1
            next
        }
        suma$date <- as.character(suma$date)
        dbAppendTable(con, "sumario", suma)
        date <- date + 1
    }
    t.end <- Sys.time() - t.init
    message("Updated ", days, " in ", round(t.end, 2), units(t.end))
}

create_tables <- function(con) {
    types_sumario <- c(date = "DATE", sumario_nbo = "TEXT", sumario_code = "TEXT",
                       section = "TEXT", section_number = "TEXT", departament = "TEXT",
                       departament_etq = "TEXT", epigraph = "TEXT", text = "TEXT", publication = "TEXT",
                       pages = "DOUBLE")

    types_pub <- c(identificador = "TEXT", titulo = "TEXT", diario = "TEXT",
                   diario_numero = "TEXT",
                   seccion = "TEXT", subseccion = "TEXT", departamento = "TEXT",
                   rango = "TEXT", numero_oficial = "TEXT", fecha_disposicion = "DATE",
                   fecha_publicacion = "DATE", fecha_vigencia = "DATE",
                   fecha_derogacion = "DATE",
                   letra_imagen = "TEXT", pagina_inicial = "TEXT", pagina_final = "TEXT",
                   suplemento_letra_imagen = "TEXT", suplemento_pagina_inicial = "TEXT",
                   suplemento_pagina_final = "TEXT", estatus_legislativo = "TEXT",
                   origen_legislativo = "TEXT", estado_consolidacion = "TEXT",
                   judicialmente_anulada = "TEXT",
                   vigencia_agotada = "TEXT", estatus_derogacion = "TEXT", url_epub = "TEXT",
                   url_pdf = "TEXT", url_pdf_catalan = "TEXT", url_pdf_euskera = "TEXT",
                   url_pdf_gallego = "TEXT", url_pdf_valenciano = "TEXT", url_eli = "TEXT",
                   departamento_codigo = "TEXT", fecha_actualizacion = "TIMESTAMP",
                   text = "TEXT")

    types_an <- c(
        identificador = "TEXT", titulo = "TEXT", diario = "TEXT",
        diario_numero = "TEXT",
        seccion = "TEXT", departamento = "TEXT", numero_anuncio = "TEXT",
        fecha_publicacion = "DATE", letra_imagen = "TEXT", pagina_inicial = "TEXT",
        pagina_final = "TEXT", url_pdf = "TEXT", departamento_codigo = "TEXT",
        fecha_actualizacion = "TIMESTAMP", subseccion = "TEXT",
        suplemento_letra_imagen = "TEXT", suplemento_pagina_inicial = "TEXT",
        suplemento_pagina_final = "TEXT", text = "TEXT")

    if (!dbExistsTable(con, "sumario")) {
        dbCreateTable(con, name = "sumario", fields = types_sumario,
                      temporary = FALSE)
    }

    if (!dbExistsTable(con, "disposicion")) {
        dbCreateTable(con, name = "disposicion", fields = types_pub,
                      temporary = FALSE)
    }

    if (!dbExistsTable(con, "anuncio")) {
        dbCreateTable(con, name = "anuncio", fields = types_an, temporary = FALSE)
    }
    if (!dbExistsTable(con, "notas")) {
        dbCreateTable(con, name = "notas",
                      fields = c(publication = "TEXT", codigo = "TEXT",
                                 orden = "TEXT", text = "TEXT"), temporary = FALSE)
    }
    if (!dbExistsTable(con, "materias")) {
        dbCreateTable(con,
                      name = "materias",
                      fields = c(publication = "TEXT", codigo = "TEXT",
                                 orden = "TEXT", text = "TEXT"), temporary = FALSE)
    }
    if (!dbExistsTable(con, "alertas")) {
        dbCreateTable(con,
                      name = "alertas",
                      fields = c(publication = "TEXT", codigo = "TEXT",
                                 orden = "TEXT", text = "TEXT"), temporary = FALSE)
    }
    if (!dbExistsTable(con, "referencias")) {
        dbCreateTable(con,
                      name = "referencias",
                      fields = c(publication = "TEXT", type = "TEXT",
                                 referencia = "TEXT", orden = "TEXT",
                                 codigo = "TEXT", palabra = "TEXT", texto = "TEXT"),
                      temporary = FALSE)
    }
}

pub_collect <- function(conection) {
    p <- tbl(conection, "sumario") |>
        anti_join(tbl(conection, "anuncio"),
                  by = c(publication = "identificador")) |>
        anti_join(tbl(conection, "disposicion"),
                  by = c(publication = "identificador")) |>
        arrange(publication) |>
        pull(publication)
    p
}

add_anuncio <- function(con, doc) {
    if (!is.list(doc$analysis[[1]])) {
        return(FALSE)
    }

    doc <- select(doc, -analysis, -text_xml)
    doc$fecha_publicacion <- as.character(doc$fecha_publicacion)
    doc$fecha_actualizacion <- as.character(doc$fecha_actualizacion)
    dbAppendTable(con, "anuncio", doc)

}

add_disposicion <- function(con, doc) {
    redoc <- select(doc, -analysis, -text_xml)
    redoc$fecha_publicacion <- as.character(redoc$fecha_publicacion)
    redoc$fecha_disposicion <- as.character(redoc$fecha_disposicion)
    redoc$fecha_actualizacion <- as.character(redoc$fecha_actualizacion)
    redoc$fecha_derogacion <- as.character(redoc$fecha_derogacion)
    redoc$fecha_vigencia <- as.character(redoc$fecha_vigencia)

    dbAppendTable(con, "disposicion", redoc)
    if (!is.list(doc$analysis[[1]])) {
        return(FALSE)
    }
    if (is.list(doc$analysis[[1]]$referencias)) {
        add_references(con, doc)
    }

}

add_references <- function(con, doc) {
    if (is.matrix(doc$analysis[[1]]$referencias$anteriores)) {
        anteriores <- cbind(type = "anteriores", doc$analysis[[1]]$referencias$anteriores)
    }
    if (is.matrix(doc$analysis[[1]]$referencias$posteriores)) {
        posteriores <- cbind(type = "posteriores", doc$analysis[[1]]$referencias$posteriores)
    }
    if (exists("anteriores") && exists("posteriores")) {
        ref <- rbind(anteriores, posteriores)
    } else if (exists("anteriores")) {
        ref <- anteriores
    } else if (exists("posteriores")) {
        ref <- posteriores
    }

    if (exists("ref")) {
        ref <- cbind(publication = doc$identificador, ref) %>%
            as.data.frame()
        dbAppendTable(con, "referencias", ref)
    }
}

is.named <- function(x) {
    !is.null(names(x))
}

update_publications <- function(con, publi) {
    out <- anti_join(data.frame(identificador = publi),
              tbl(con, "disposicion") |> select("identificador"), copy = TRUE,
              by = join_by(identificador)) |>
        anti_join(tbl(con, "anuncio") |> select("identificador"), copy = TRUE,
                  by = join_by(identificador))
    publi <- out$identificador

    t.init <- Sys.time()
    pb <- progress_bar$new(total = length(publi),
                           format = "  downloading :what [:bar] :percent :elapsedfull :eta",
    )
    pb$tick(0)
    for (pub in publi) {
        # Skip if already exists:

        pb$tick(tokens = list(what = pub))
        doc <- retrieve_document(pub)
        # Empty value appearing for first time in BOE-A-2011-3164
        doc$fecha_anulacion <- NULL
        # message(pub)
        if (strsplit(pub, "-", fixed = TRUE)[[1]][2] == "B") {
            add_anuncio(con, doc)
        } else {
            add_disposicion(con, doc)
        }
        analysis <- doc$analysis[[1]]
        if (!is.null(analysis) && is.named(analysis)) {
            if (!is.null(analysis$notas) && is.data.frame(analysis$notas)) {
                    dbAppendTable(con, "notas", cbind(publication = pub, analysis$notas))
            }
            if (!is.null(analysis$alertas) && is.data.frame(analysis$alertas)) {
                dbAppendTable(con, "alertas", cbind(publication = pub, analysis$alertas))
            }
            if (!is.null(analysis$materias) && is.data.frame(analysis$materias)) {
                dbAppendTable(con, "materias", cbind(publication = pub, analysis$materias))
            }
            if (!is.null(analysis$referencias) && is.list(analysis$referencias)) {
                add_references(con, doc)
            }
        }
    }
    t.end <- Sys.time() - t.init
    message("Updated ", length(publi), " in ", round(t.end, 2), units(t.end))
}

clean_duplicates <- function(con, table) {
    # Delete duplicated on from https://stackoverflow.com/a/8190671/2886003

    str <- paste0("delete from ", table)
    str2 <- paste0(" where rowid not in (select  min(rowid) from ", table,
                   " group by ")
    columns <- dbListFields(con, table)
    columns <- paste0(paste0(columns, collapse = ", "), ")")
    query <- paste0(str, str2, columns)
    res <- dbSendStatement(con, query)
    dbClearResult(res)
}

clean_con <- function(con) {
    if (length(dbListTables(con)) == 0) {
        create_tables(con)
        res <- dbSendQuery(con, "CREATE INDEX sumario_code ON sumario (sumario_code)")
        dbClearResult(res)
        res <- dbSendQuery(con, "CREATE UNIQUE INDEX publication ON sumario (publication)")
        dbClearResult(res)
        res <- dbSendQuery(con, "CREATE UNIQUE INDEX identificador_d ON disposicion (identificador)")
        dbClearResult(res)
        res <- dbSendQuery(con, "CREATE UNIQUE INDEX identificador_a ON anuncio (identificador)")
        dbClearResult(res)
        res <- dbSendQuery(con, "CREATE INDEX publication_a ON alertas (publication)")
        dbClearResult(res)
        res <- dbSendQuery(con, "CREATE INDEX publication_m ON materias (publication)")
        dbClearResult(res)
        res <- dbSendQuery(con, "CREATE INDEX publication_n ON notas (publication)")
        dbClearResult(res)
        res <- dbSendQuery(con, "CREATE INDEX publication_r ON referencias (publication)")
        dbClearResult(res)
    } else {
        clean_duplicates(con, "sumario")
        clean_duplicates(con, "disposicion")
        clean_duplicates(con, "anuncio")
        clean_duplicates(con, "alertas")
        clean_duplicates(con, "materias")
        clean_duplicates(con, "notas")
        clean_duplicates(con, "referencias")
    }
}

And then call the functions like this:

# Update sumarios table
con <- dbConnect(duckdb::duckdb(dbdir = "boe_ddbb2.duck"))
update_sumarios(con) # Takes around 10 minutes
publications <- pub_collect(con)
# Takes around 15 minutes to hang Rstudio
update_publications(con, publications)

After 15 minutes or so adding data to the database the following message is printed in the R terminal and Rstudio crashes: ERROR elf_dynamic_array_reader.h:61] tag not found

I have tried changing the database (Originally it was MySQLite) but it didn't register entries after some time and hang up Rstudio too.
I tried to fill the database from R without RStudio but the error is similar.

Any advice on how to debug this error will be appreciated too.

This topic was automatically closed 42 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.