Hey everybody!! looking for some help again.
I have multiple excel files that have in sheet 1 multiple tables so I want to get one data frame of all imported excel files joining the multiple tables in sheet 1.
I found this post How to efficiently import multiple excel tables located in one sheet into an R list? - Stack Overflow that I was trying to reproduce. Basically, this imports an excel file, map the tables in the sheet then split them in different elements of a list.
I tried to do the same but I don´t get clean tables.
the first row of each table has a name that I want to pass it to a new column so in the final data frame I would know from whom the table is. I try to get the name that is in position [1,1] and goes like UNIDAD 1
then I delete that row and 2 more so I get the table just for the data that I need but sometimes I don't get the name from the position [1,1].
Something to tell is that not always all the tables are fill with data.
Another thing is that the column HORA
is supposed to show hours from 2 to 24 but when files are loaded, the format is changed to values form 0 to 1. I don't know how to get hours again. I tried with hm()
from lubridate
but it didn't work.
I hope you can help me with a better solution, I got stuck with this one.
I'm sharing 3 files for you to test.
This is my code. (Not so clean)
pacman::p_load(readxl, tidyverse, janitor)
# UBICACION ARCHIVOS----
path <- "D:/Downloads/DATOS_LUCY"
# FUNCIONES----
# carga de datos
input_files_excel <- function(path) {
require(stringr)
table_raw <- read_excel(path = path,
sheet = 1, #col_types = "text",
col_names = F)
table_raw <- clean_names(table_raw)
return(table_raw)
}
# utility function to get rle as a named vector
vec_rle <- function(v){
temp <- rle(v)
out <- temp$values
names(out) <- temp$lengths
return(out)
}
# utility function to map table with their columns/rows in a bigger table
make_df_index <- function(v){
table_rle <- vec_rle(v)
divide_points <- c(0,cumsum(names(table_rle)))
table_index <- map2((divide_points + 1)[1:length(divide_points)-1],
divide_points[2:length(divide_points)],
~.x:.y)
return(table_index[table_rle])
}
# split a large table in one direction if there are blank columns or rows
split_direction <- function(df,direction = "col"){
if(direction == "col"){
col_has_data <- unname(map_lgl(df,~!all(is.na(.x))))
df_mapping <- make_df_index(col_has_data)
out <- map(df_mapping,~df[,.x])
} else if(direction == "row"){
row_has_data <- df %>%
mutate_all(~!is.na(.x)) %>%
as.matrix() %>%
apply(1,any)
df_mapping <- make_df_index(row_has_data)
out <- map(df_mapping,~df[.x,])
}
return(out)
}
# split a large table into smaller tables if there are blank columns or rows
# if you still see entire rows or columns missing. Please increase complexity
split_df <- function(df,showWarnig = TRUE,complexity = 1){
if(showWarnig){
warning("Please don't use first row as column names.")
}
out <- split_direction(df,"col")
for(i in 1 :complexity){
out <- out %>%
map(~split_direction(.x,"row")) %>%
flatten() %>%
map(~split_direction(.x,"col")) %>%
flatten()
}
return(out)
}
# funciones para nombres de columnas
nom_col_tab <- function(df){
names(df) <- col_tab
return(df)
}
# This function is to take the name in position [1,1] of each table and pass it to column named UNIDAD
# Crear columna unidad
c_unidad <- function(i){
uni <- i[1,1]
i$UNIDAD <- uni
return(i)
}
# Eliminar filas superiores
rm_filas <- function(i){
i <- i[-c(1:3),]
}
# CODIGO----
# Columns names
col_tab <- c("HORA", "TANQUE_EXPAN", "POTENCIA_ACTIVA", "ENERGIA",
"AA_INY1", "AA_INY2", "AA_INY3", "AA_INY4", "AA_INY5",
"AA_INY6", "AD_DEF1", "AD_DEF2", "AD_DEF3", "AD_DEF4",
"AD_DEF5", "AD_DEF6", "CE_TEM_MAX_PATIN_EMPUJE_C",
"CE_TEM_ACEITE_ENTRADA_C", "CE_TEM_ACEITE_SALIDA_C",
"CGS_TEM_MAX_PATIN_C_SUPERIOR_C", "CGS_TEM_ACEITE_ENTRADA_C",
"CGS_TEM_ACEITE_SALIDA_C", "CGI_TEM_MAX_PATIN_C_INFERIOR_C",
"CGI_TEM_ACEITE_ENTRADA_C", "CGI_TEM_ACEITE_SALIDA_C",
"CGT_TEM_MAX_PATIN_C_TURBINA_C", "CGT_TEM_ACEITE_ENTRADA_C",
"CGT_TEM_ACEITE_SALIDA_C", "ENTRADA_AIRE")
# PATH COMPLETO DE ARCHIVOS----
pathsx <- list.files(path=path,
pattern="\\.xls",
full.names=TRUE)
table_raw <- lapply(pathsx, input_files_excel) # load files
#> New names:
#> New names:
#> New names:
#> • `` -> `...1`
#> • `` -> `...2`
#> • `` -> `...3`
#> • `` -> `...4`
#> • `` -> `...5`
#> • `` -> `...6`
#> • `` -> `...7`
#> • `` -> `...8`
#> • `` -> `...9`
#> • `` -> `...10`
#> • `` -> `...11`
#> • `` -> `...12`
#> • `` -> `...13`
#> • `` -> `...14`
#> • `` -> `...15`
#> • `` -> `...16`
#> • `` -> `...17`
#> • `` -> `...18`
#> • `` -> `...19`
#> • `` -> `...20`
#> • `` -> `...21`
#> • `` -> `...22`
#> • `` -> `...23`
#> • `` -> `...24`
#> • `` -> `...25`
#> • `` -> `...26`
#> • `` -> `...27`
#> • `` -> `...28`
#> • `` -> `...29`
# create a df with all mapped tables
dd <- bind_rows(table_raw)
# Split each table as an element of a list
split_table <- dd %>%
split_df(complexity = 1) # another custom function I wrote
#> Warning in split_df(., complexity = 1): Please don't use first row as column
#> names.
# name columns
split_table <- lapply(split_table, nom_col_tab)
# creating column UNIDAD with the name of position [1,1] of each table before deleting unuseful rows.
split_table <- lapply(split_table, c_unidad)
# Deleting unuseful rows
split_table <- lapply(split_table, rm_filas)
# Creating final df
datos <- bind_rows(split_table)
Created on 2022-07-15 by the reprex package (v2.0.1)