Troubleshooting R Shiny app for finding and merging duplicate references in a RIS file

MouseB · June 29, 2023, 2:48pm

I'm trying to create an R Shiny application that takes in a .RIS (bibliographic citation file) as input, finds potential duplicates based on certain fields (like title, authors, year, etc.), allows the user to decide if they are duplicates and choose the better reference to keep, and then merges these duplicates and exports a cleaned .RIS file.

Here's the R code that I'm using to build the app:

##Part 1
# Load necessary libraries
library(shiny) 
library(bibliometrix)
library(DT)
library(dplyr)
library(stringr)
library(stringdist)
library(iotools)
library(RefManageR)

# Removing all HTML tags function
remove_html_tags <- function(x) {
  gsub("<.*?>", "", x)
}

# Function to find duplicates
find_duplicates <- function(df, selected_fields) {
  df %>%
    group_by(across(all_of(selected_fields))) %>%
    filter(n() > 1)
}

# Function to merge duplicates
merge_duplicates <- function(df, selected_ids) {
  df %>%
    group_by(id) %>%
    summarise(across(everything(), ~ first(.x[which.max(Score)]))) %>%
    ungroup() %>%
    slice(-which(id %in% selected_ids))
}


# Function to write RIS file
write_ris <- function(df, file) {
  for(i in 1:nrow(df)) {
    line <- paste0("TY  - ", df[i, "TY"])
    write(line, file, append = TRUE)
    if(!is.na(df[i, "TI"])) {
      line <- paste0("TI  - ", df[i, "TI"])
      write(line, file, append = TRUE)
    }
    if(!is.na(df[i, "AU"])) {
      authors <- strsplit(as.character(df[i, "AU"]), ";")[[1]]
      for(author in authors) {
        line <- paste0("AU  - ", author)
        write(line, file, append = TRUE)
      }
    }
    if(!is.na(df[i, "PY"])) {
      line <- paste0("PY  - ", df[i, "PY"])
      write(line, file, append = TRUE)
    }
    if(!is.na(df[i, "T2"])) {
      line <- paste0("T2  - ", df[i, "T2"])
      write(line, file, append = TRUE)
    }
    if(!is.na(df[i, "DO"])) {
      line <- paste0("DO  - ", df[i, "DO"])
      write(line, file, append = TRUE)
    }
    if(!is.na(df[i, "SP"])) {
      line <- paste0("SP  - ", df[i, "SP"])
      write(line, file, append = TRUE)
    }
    if(!is.na(df[i, "IS"])) {
      line <- paste0("IS  - ", df[i, "IS"])
      write(line, file, append = TRUE)
    }
    if(!is.na(df[i, "VL"])) {
      line <- paste0("VL  - ", df[i, "VL"])
      write(line, file, append = TRUE)
    }
    if(!is.na(df[i, "CN"])) {
      line <- paste0("CN  - ", df[i, "CN"])
      write(line, file, append = TRUE)
    }
    if(!is.na(df[i, "LB"])) {
      line <- paste0("LB  - ", df[i, "LB"])
      write(line, file, append = TRUE)
    }
    if(!is.na(df[i, "AN"])) {
      line <- paste0("AN  - ", df[i, "AN"])
      write(line, file, append = TRUE)
    }
    # End of entry
    line <- "ER  -"
    write(line, file, append = TRUE)
  }
}


##Part 2
# Define the UI
ui <- fluidPage(
  fileInput('file', 'Choose RIS file', accept = c('text/ris', '.ris')),
  checkboxGroupInput("duplicateFields", "Select Fields for Checking Duplicates", choices = c("Author" = "AU", "Year" = "PY", "Title" = "TI", "Pages" = "SP"), selected = "TI"),
  actionButton("start", "Start Duplicates Check"),
  DTOutput('contents'),
  uiOutput("checkboxesUI"),
  actionButton("merge", "Merge Selected References"),
  downloadButton("download", "Download Cleaned RIS file")
)


##Part 3
# Define the server logic
server <- function(input, output, session) {
  
  data <- reactive({
    browser()  # add browser() here
    
    req(input$file)  # replace if (is.null(file)) { return(NULL); } with req(file)
    
    # Read RIS file
    ris <- readLines(input$file$datapath)
    
    # Convert to dataframe
    df <- convert2df(ris, dbsource = "isi", format = "plaintext")
    
    print(dim(df))  # Debug line to print dimensions of the dataframe
    
    
    # Preprocess the title field
    df$TI <- df$TI %>%
      remove_html_tags() %>%
      iconv(to = "ASCII//TRANSLIT") %>%
      str_to_title() %>%
      str_replace_all("Alpha", "alpha") %>%
      str_replace_all("Beta", "beta") %>%
      str_replace_all("Gamma", "gamma")
    
    print(dim(df))  # Debug line to print dimensions of the dataframe after preprocessing the title field
    
    
    # Handle the abstract field
    df$AB <- ifelse(df$AB %in% c("", "no abstract"), "no abstract", "abstract available")
    
    print(dim(df))  # Debug line to print dimensions of the dataframe
    
    
    # Indicate preferable references
    df$Score <- 0
    df$Score[df$AU != ""] <- df$Score[df$AU != ""] + 1
    df$Score[df$PG != ""] <- df$Score[df$PG != ""] + 1
    df$Score[df$TI != ""] <- df$Score[df$TI != ""] + 1
    df$Score[df$AB != "no abstract"] <- df$Score[df$AB != "no abstract"] + 2
    
    return(df)
  })
  
  observeEvent(input$start, {
    browser()  # add browser() here
    
    showModal(modalDialog(
      title = "Please wait",
      "Checking for duplicates...",
      easyClose = FALSE
    ))
    
    df <- data()
    
    selected_fields <- input$duplicateFields
    
    duplicates <- find_duplicates(df, selected_fields)
    
    output$contents <- DT::renderDataTable({
      duplicates
    })
    
    removeModal()
    
    output$checkboxesUI <- renderUI({
      checkboxGroupInput('selected_ids', 'Select Duplicates to Merge:', choices = duplicates$id)
    })
  })
  
  observeEvent(input$merge, {
    browser()  # add browser() here
    
    showModal(modalDialog(
      title = "Please wait",
      "Merging selected references...",
      easyClose = FALSE
    ))
    
    df <- data()
    
    selected_ids <- input$selected_ids
    
    df_clean <- merge_duplicates(df, selected_ids)
    
    output$contents <- DT::renderDataTable({
      df_clean
    })
    
    removeModal()
    
    data <<- df_clean
  })
  
  # Download data
  output$download <- downloadHandler(
    filename = function() {
      paste("cleaned_data-", Sys.Date(), ".ris", sep="")
    },
    content = function(file) {
      write_ris(data(), file)
    }
  )
}

# Run the application
shinyApp(ui = ui, server = server)

The issue I'm facing is that the application crashes after clicking the "Start Duplicate Check" button with the error message in the console:

Converting your isi collection into a bibliographic dataframe

Warning: Error in : 'TY  - JOUR' does not exist in current working directory ('C:/Users/north/Documents/_GDIT/R').
  1: runApp

In terms of debugging, I've tried checking the dimensions of the data frame at various points in the code using print(dim(df)), but everything seems to be in order until the application crashes.

What pops up in the browser isn't helpful to my understanding either.

I'm also including a sample RIS file that I've been using to test the application: Bibliographic Reference Sample with Dups

Does anyone have any suggestions on how to debug this issue? Also, if you have any suggestions for improving the design or efficiency of the code, I would appreciate that as well. Thanks so much!

Edit:
Making some updates and browser()'s, the console gives:

Called from: observe()
Browse[1]> c
Called from: `<reactive:data>`(...)
Browse[1]> c

Converting your isi collection into a bibliographic dataframe

Warning: Error in : 'TY  - JOUR' does not exist in current working directory ('C:/Users/north/Documents/_GDIT/R').
  1: runApp

'TY - JOUR' seems to be interpreted as a path, which shouldn't be the case. In the RIS file format, 'TY - JOUR' is used to denote that a given entry is a journal article, it's not a file or a directory.

May be the convert2df function that seems to be triggering the issue? Should I use something different?

    # Convert to dataframe
    df <- convert2df(ris, dbsource = "isi", format = "plaintext")

system · August 22, 2023, 6:48pm

This topic was automatically closed 54 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.