Optimize code to generate same results in R

JojoSouza · March 30, 2022, 5:05pm

library(dplyr)
library(tidyr)
library(lubridate)
library(data.table)

#database
df1 <- data.frame( Id = rep(1:5, length=900),
                   date1 =  as.Date( "2021-12-01"),
                   date2= rep(seq( as.Date("2021-01-01"), length.out=450, by=1), each = 2),
                   Category = rep(c("ABC", "EFG"), length.out = 900),
                   Week = rep(c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
                                "Saturday", "Sunday"), length.out = 900),
                   DR1 = sample( 200:250, 900, repl=TRUE),  
                   setNames( replicate(365, { sample(0:900, 900)}, simplify=FALSE),
                             paste0("DRM", formatC(1:365, width = 2, format = "d", flag = "0"))))

# First idea: Calculate the median of the values resulting from the subtraction between DR01 and the values of the DRM columns

selection = startsWith(names(df1), "DRM")

df1[selection][is.na(df1[selection])] = 0

data1<-subset(df1,df1$date2<df1$date1)

dt1 <- as.data.table(data1)

cols <- grep("^DRM", colnames(dt1), value = TRUE)

result_median <- 
  dt1[, (paste0(cols, "_PV")) := DR1 - .SD, .SDcols = cols
  ][, lapply(.SD, median), by = .(Id, Category, Week), .SDcols = paste0(cols, "_PV") ]


# Second idea: After obtaining the median, I add the values found with the values of the DRM columns of my df1 database.

SP<-df1%>%
  inner_join(result_median, by = c('Id','Category', 'Week')) %>%
  mutate(across(matches("^DRM\\d+$"), ~.x + 
                  get(paste0(cur_column(), '_PV')),
                .names = '{col}_{col}_PV')) %>%
  select(Id:Category, DRM01_DRM01_PV:last_col())

SP<-data.frame(SP)
 

# Third idea: The idea here is to specifically filter a line from `SP`, which will depend on what the user chooses, for that it will be necessary to choose an Id, date and Category.

# This code remove_values_0 I use because sometimes i have values equal to zero so i remove these columns ((this question was solved here: https://stackoverflow.com/questions/69452882/delete-column-depending-on-the-date-and-code-you-choose)  

remove_values_0 <- df1 %>%
  dplyr::filter(Id==idd,date2 == ymd(dmda), Category == CategoryChosse) %>%
  select(starts_with("DRM")) %>%
  pivot_longer(cols = everything()) %>%
  arrange(desc(row_number())) %>%
  mutate(cs = cumsum(value)) %>%
  dplyr::filter(cs == 0) %>%
  pull(name)
(dropnames <- paste0(remove_values_0,"_",remove_values_0, "_PV"))

 # Selecting the id, dmda and CategoryChoose

idd="5"
dmda="2022-03-19"
CategoryChosse="ABC"

filterid_date_category <- SP %>%
  filter(Id==idd,date2 == ymd(dmda), Category == CategoryChosse) %>%
  select(-any_of(dropnames))

system · April 20, 2022, 5:06pm

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.