I'm trying to create an R Shiny application that takes in a .RIS (bibliographic citation file) as input, finds potential duplicates based on certain fields (like title, authors, year, etc.), allows the user to decide if they are duplicates and choose the better reference to keep, and then merges these duplicates and exports a cleaned .RIS file.
Here's the R code that I'm using to build the app:
##Part 1
# Load necessary libraries
library(shiny)
library(bibliometrix)
library(DT)
library(dplyr)
library(stringr)
library(stringdist)
library(iotools)
library(RefManageR)
# Removing all HTML tags function
remove_html_tags <- function(x) {
gsub("<.*?>", "", x)
}
# Function to find duplicates
find_duplicates <- function(df, selected_fields) {
df %>%
group_by(across(all_of(selected_fields))) %>%
filter(n() > 1)
}
# Function to merge duplicates
merge_duplicates <- function(df, selected_ids) {
df %>%
group_by(id) %>%
summarise(across(everything(), ~ first(.x[which.max(Score)]))) %>%
ungroup() %>%
slice(-which(id %in% selected_ids))
}
# Function to write RIS file
write_ris <- function(df, file) {
for(i in 1:nrow(df)) {
line <- paste0("TY - ", df[i, "TY"])
write(line, file, append = TRUE)
if(!is.na(df[i, "TI"])) {
line <- paste0("TI - ", df[i, "TI"])
write(line, file, append = TRUE)
}
if(!is.na(df[i, "AU"])) {
authors <- strsplit(as.character(df[i, "AU"]), ";")[[1]]
for(author in authors) {
line <- paste0("AU - ", author)
write(line, file, append = TRUE)
}
}
if(!is.na(df[i, "PY"])) {
line <- paste0("PY - ", df[i, "PY"])
write(line, file, append = TRUE)
}
if(!is.na(df[i, "T2"])) {
line <- paste0("T2 - ", df[i, "T2"])
write(line, file, append = TRUE)
}
if(!is.na(df[i, "DO"])) {
line <- paste0("DO - ", df[i, "DO"])
write(line, file, append = TRUE)
}
if(!is.na(df[i, "SP"])) {
line <- paste0("SP - ", df[i, "SP"])
write(line, file, append = TRUE)
}
if(!is.na(df[i, "IS"])) {
line <- paste0("IS - ", df[i, "IS"])
write(line, file, append = TRUE)
}
if(!is.na(df[i, "VL"])) {
line <- paste0("VL - ", df[i, "VL"])
write(line, file, append = TRUE)
}
if(!is.na(df[i, "CN"])) {
line <- paste0("CN - ", df[i, "CN"])
write(line, file, append = TRUE)
}
if(!is.na(df[i, "LB"])) {
line <- paste0("LB - ", df[i, "LB"])
write(line, file, append = TRUE)
}
if(!is.na(df[i, "AN"])) {
line <- paste0("AN - ", df[i, "AN"])
write(line, file, append = TRUE)
}
# End of entry
line <- "ER -"
write(line, file, append = TRUE)
}
}
##Part 2
# Define the UI
ui <- fluidPage(
fileInput('file', 'Choose RIS file', accept = c('text/ris', '.ris')),
checkboxGroupInput("duplicateFields", "Select Fields for Checking Duplicates", choices = c("Author" = "AU", "Year" = "PY", "Title" = "TI", "Pages" = "SP"), selected = "TI"),
actionButton("start", "Start Duplicates Check"),
DTOutput('contents'),
uiOutput("checkboxesUI"),
actionButton("merge", "Merge Selected References"),
downloadButton("download", "Download Cleaned RIS file")
)
##Part 3
# Define the server logic
server <- function(input, output, session) {
data <- reactive({
browser() # add browser() here
req(input$file) # replace if (is.null(file)) { return(NULL); } with req(file)
# Read RIS file
ris <- readLines(input$file$datapath)
# Convert to dataframe
df <- convert2df(ris, dbsource = "isi", format = "plaintext")
print(dim(df)) # Debug line to print dimensions of the dataframe
# Preprocess the title field
df$TI <- df$TI %>%
remove_html_tags() %>%
iconv(to = "ASCII//TRANSLIT") %>%
str_to_title() %>%
str_replace_all("Alpha", "alpha") %>%
str_replace_all("Beta", "beta") %>%
str_replace_all("Gamma", "gamma")
print(dim(df)) # Debug line to print dimensions of the dataframe after preprocessing the title field
# Handle the abstract field
df$AB <- ifelse(df$AB %in% c("", "no abstract"), "no abstract", "abstract available")
print(dim(df)) # Debug line to print dimensions of the dataframe
# Indicate preferable references
df$Score <- 0
df$Score[df$AU != ""] <- df$Score[df$AU != ""] + 1
df$Score[df$PG != ""] <- df$Score[df$PG != ""] + 1
df$Score[df$TI != ""] <- df$Score[df$TI != ""] + 1
df$Score[df$AB != "no abstract"] <- df$Score[df$AB != "no abstract"] + 2
return(df)
})
observeEvent(input$start, {
browser() # add browser() here
showModal(modalDialog(
title = "Please wait",
"Checking for duplicates...",
easyClose = FALSE
))
df <- data()
selected_fields <- input$duplicateFields
duplicates <- find_duplicates(df, selected_fields)
output$contents <- DT::renderDataTable({
duplicates
})
removeModal()
output$checkboxesUI <- renderUI({
checkboxGroupInput('selected_ids', 'Select Duplicates to Merge:', choices = duplicates$id)
})
})
observeEvent(input$merge, {
browser() # add browser() here
showModal(modalDialog(
title = "Please wait",
"Merging selected references...",
easyClose = FALSE
))
df <- data()
selected_ids <- input$selected_ids
df_clean <- merge_duplicates(df, selected_ids)
output$contents <- DT::renderDataTable({
df_clean
})
removeModal()
data <<- df_clean
})
# Download data
output$download <- downloadHandler(
filename = function() {
paste("cleaned_data-", Sys.Date(), ".ris", sep="")
},
content = function(file) {
write_ris(data(), file)
}
)
}
# Run the application
shinyApp(ui = ui, server = server)
The issue I'm facing is that the application crashes after clicking the "Start Duplicate Check" button with the error message in the console:
Converting your isi collection into a bibliographic dataframe
Warning: Error in : 'TY - JOUR' does not exist in current working directory ('C:/Users/north/Documents/_GDIT/R').
1: runApp
In terms of debugging, I've tried checking the dimensions of the data frame at various points in the code using print(dim(df))
, but everything seems to be in order until the application crashes.
What pops up in the browser isn't helpful to my understanding either.
I'm also including a sample RIS file that I've been using to test the application: Bibliographic Reference Sample with Dups
Does anyone have any suggestions on how to debug this issue? Also, if you have any suggestions for improving the design or efficiency of the code, I would appreciate that as well. Thanks so much!
Edit:
Making some updates and browser()'s, the console gives:
Called from: observe()
Browse[1]> c
Called from: `<reactive:data>`(...)
Browse[1]> c
Converting your isi collection into a bibliographic dataframe
Warning: Error in : 'TY - JOUR' does not exist in current working directory ('C:/Users/north/Documents/_GDIT/R').
1: runApp
'TY - JOUR' seems to be interpreted as a path, which shouldn't be the case. In the RIS file format, 'TY - JOUR' is used to denote that a given entry is a journal article, it's not a file or a directory.
May be the convert2df function that seems to be triggering the issue? Should I use something different?
# Convert to dataframe
df <- convert2df(ris, dbsource = "isi", format = "plaintext")