Hi! I've got some pretty ugly date data reading into excel that is a mix of numeric encoding and character strings. I've got some cleaning steps to this below, and if you have some suggestions on easier cleaning steps I would love to see it!
Otherwise, what I am struggling with is how to convert this to a function that allows the user to apply the data cleaning steps with mutate_at()
or mutate + across
in a single go, instead of for each variable individually. I would greatly appreciate help conceptualizing this!
library(tidyverse)
library(janitor)
#>
#> Attaching package: 'janitor'
#> The following objects are masked from 'package:stats':
#>
#> chisq.test, fisher.test
# example data
dat <- tibble(
x = c("42736", NA, "09/UN/2020", NA),
y = c("43017", "42988", "01/UN/2018", NA)
)
# helper function 1 - identify if character string should be treated as numeric
is_numeric <- function(x){
!str_detect(x, "[:alpha:]+|[:punct:]+")
}
# helper function 2 - substitute UN with 01 and convert to date
convert_un <- function(x){
stringr::str_replace(x, pattern = "UN", replacement = "01") %>%
lubridate::parse_date_time(., '%m %d %y') %>%
lubridate::as_date()
}
# clean up weird dates containing excel numeric imports and UN
clean_date <- function(data, var_date){
var_date_chr <- rlang::as_label(rlang::ensym(var_date))
data %>%
mutate(
dt_un = case_when(
str_detect( as.character({{var_date}}) , "UN") ~ as.character({{var_date}}) ,
TRUE ~ NA_character_
),
dt_num = case_when(
is_numeric( as.character({{var_date}} )) ~ as.character({{var_date}}) ,
TRUE ~ NA_character_
),
dt_num = as.numeric(dt_num),
dt_clean_un = convert_un(dt_un),
dt_clean_num = janitor::convert_to_date(dt_num),
"{var_date_chr}" := dplyr::coalesce(dt_clean_num, dt_clean_un)
) %>%
dplyr::select(-c(dt_un, dt_num, dt_clean_un, dt_clean_num))
}
# can clean date one variable at a time
dat_clean <- dat %>%
clean_date(x) %>%
clean_date(y)
# goal would be to do a mutate across or at like this:
# dat_clean <- dat %>%
# mutate_at(vars(x, y), clean_date)
Created on 2021-12-15 by the reprex package (v2.0.1)