Hello,
I have created the following r script to extract data from various pdf files and create an excel file. I'd like to be able to do a conditional formatting (fill in the cells with colours) according to threshold values in my column 3 (Escherichia coli) and 4 (enterococci) but I can't find a correct script or the beginnings of an answer. Can you help me?
library(pdftools)
library(writexl)
Set the source folder path where the PDF files are located
folder_path <- "path/Données estivales EDP"
Get a list of PDF files in the folder
pdf_files <- list.files(folder_path, pattern = "\.pdf$", full.names = TRUE)
Create an empty list to store the data frames
dfs <- list()
Loop through each PDF file
for (pdf_file in pdf_files) {
Convert the PDF to plain text
text <- pdf_text(pdf_file)
Split the text into individual lines
lines <- unlist(strsplit(text, "\n"))
#Filter lines containing specific keywords
filtered_lines <- lines[grepl("Escherichia coli par microplaque|Entérocoques intestinaux par microplaque|Date de prélèvement|Adresse :", lines, ignore.case = TRUE)]
Split each filtered line into columns based on spaces
split_lines <- lapply(filtered_lines, function(line) unlist(strsplit(line, "\s+")))
Determine the maximum number of columns
max_cols <- max(lengths(split_lines))
Create a data frame with a fixed number of columns
df <- as.data.frame(matrix("", nrow = length(split_lines), ncol = max_cols))
Fill the data frame with the split lines
for (i in seq_along(split_lines)) {
if (split_lines[[i]][1] == "") {
df[i, 1] <- paste0(split_lines[[i]][2], " ", split_lines[[i]][3])
df[i, 4:length(split_lines[[i]])] <- split_lines[[i]][4:length(split_lines[[i]])]
} else {
df[i, 1:length(split_lines[[i]])] <- split_lines[[i]]
}
}
Reorganize variables
df[df[, 1] == "Date", 2] <- df[df[, 1] == "Date", 5]
df[df[, 1] == "Date", 5] <- ""
df[df[, 1] == "Adresse", 2] <-apply(df[df[, 1] == "Adresse", 3:max_cols],1, function(x) paste(x,collapse=" "))
df[df[, 1] == "Escherichia coli", 2]<-df[df[, 1] == "Escherichia coli", 6]
df[df[, 1] == "Entérocoques intestinaux", 2]<-df[df[, 1] == "Entérocoques intestinaux", 6]
df<-df[,1:2]
Transpose the data frame
df <- as.data.frame(t(df), stringsAsFactors = FALSE)
Set the first row as variable names
names(df) <- df[1, ]
df <- df[-1, ]
Reshape the data frame
reshaped_df <- matrix(df, nrow = ncol(df)/4, ncol = 4, byrow = TRUE)
Convert the reshaped matrix back to a data frame
df_new <- as.data.frame(reshaped_df)
colnames(df_new)<-colnames(df)[1:4]
df_new<-sapply(df_new,function(x) as.character(x))
df_new <- as.data.frame(df_new)
df_new$Escherichia coli
<- as.numeric(df_new$Escherichia coli
)
df_new$Entérocoques intestinaux
<- as.numeric(df_new$Entérocoques intestinaux
)
Append the data frame to the list
dfs[[pdf_file]] <- df_new
}
Combine all data frames into a single data frame
combined_df <- do.call(rbind, dfs)
Set the path for the output Excel file
output_file <- "path/Données estivales EDP/Extraction_données_EDP.xlsx"
Write the combined data frame to an Excel file
write_xlsx(combined_df, output_file)