I would very much like to optimize the code below. I used the tictoc
function to calculate the calculation time, the time is even short (0.17 sec elapsed), but I would like to know if there is a way to make it even faster. I made an explanation in some parts of the code. I see that I leave a lot of code line by line, instead of using %>%
, but I don't know that would also reduce computational time.
One idea would be to know which command is taking the longest to execute and thus adjust.
library(dplyr)
library(tidyverse)
library(lubridate)
library(tictoc)
df1 <- structure(
list(
Id = c(
4,
4,
4,
4,
4,
4,
4,
4,
4,
4,
4,
4,
4,
4,
4,
4,
1011,
1011,
1011,
1011,
1011,
1011,
1011,
1011,
1011
),
Tp = c(
1,
1,
1,
1,
1,
0,
1,
1,
1,
1,
0,
1,
1,
1,
1,
1,
NA,
NA,
NA,
NA,
NA,
NA,
NA,
NA,
NA
),
date1 = structure(
c(
1641945600,
1641945600,
1641945600,
1641945600,
1641945600,
1641945600,
1641945600,
1641945600,
1641945600,
1641945600,
1641945600,
1641945600,
1641945600,
1641945600,
1641945600,
1641945600,
1641945600,
1641945600,
1641945600,
1641945600,
1641945600,
1641945600,
1641945600,
1641945600,
1641945600
),
class = c("POSIXct",
"POSIXt"),
tzone = "UTC"
),
date2 = structure(
c(
1641340800,
1641340800,
1641427200,
1641427200,
1641513600,
1641513600,
1641600000,
1641600000,
1641686400,
1641686400,
1641772800,
1641772800,
1641859200,
1641859200,
1641945600,
1641945600,
1641254400,
1641340800,
1641427200,
1641513600,
1641600000,
1641686400,
1641772800,
1641859200,
1641945600
),
class = c("POSIXct", "POSIXt"),
tzone = "UTC"
),
Week = c(
"Wednesday",
"Wednesday",
"Thursday",
"Thursday",
"Friday",
"Friday",
"Saturday",
"Saturday",
"Sunday",
"Sunday",
"Monday",
"Monday",
"Tuesday",
"Tuesday",
"Wednesday",
"Wednesday",
"Tuesday",
"Wednesday",
"Thursday",
"Friday",
"Saturday",
"Sunday",
"Monday",
"Tuesday",
"Wednesday"
),
Category = c(
"ABC",
"EFG",
"ABC",
"EFG",
"ABC",
"EFG",
"ABC",
"EFG",
"ABC",
"EFG",
"ABC",
"EFG",
"ABC",
"EFG",
"ABC",
"EFG",
"ABC",
"ABC",
"ABC",
"ABC",
"ABC",
"ABC",
"ABC",
"ABC",
"ABC"
),
DR1 = c(
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
NA,
NA,
0,
200,
350,
330,
400,
400,
332,
327.9,
383.6,
0
),
DRM0 = c(
300,
300,
300,
300,
300,
300,
300,
300,
300,
300,
300,
300,
300,
300,
0,
0,
200,
350,
330,
400,
400,
332,
327.9,
327.6,
323.75
),
DRM01 = c(
300,
300,
300,
300,
300,
300,
300,
300,
300,
300,
300,
300,
300,
300,
0,
0,
200,
350,
330,
400,
400,
332,
327.9,
340,
329.17
),
DRM02 = c(
300,
300,
300,
300,
300,
300,
300,
300,
300,
300,
300,
300,
300,
300,
300,
0,
200,
350,
330,
400,
400,
332,
340,
340,
329.17
),
DRM03 = c(
300,
300,
300,
300,
300,
300,
300,
300,
300,
300,
300,
300,
300,
300,
300,
0,
200,
350,
330,
400,
400,
338.8,
340,
340,
329.17
),
DRM04 = c(
300,
250,
250,
250,
250,
250,
250,
250,
250,
250,
300,
300,
300,
300,
300,
0,
200,
350,
330,
400,
400,
338.8,
340,
340,
NA
)
),
row.names = c(NA,-25L),
class = c("tbl_df",
"tbl", "data.frame")
)
tic()
idd<-"4"
dmda<-"2022-01-12"
CategoryChosse<-"ABC"
df1$Week <- weekdays(df1$date2) #changing weekday names from Week column according to dates from date2
df1$Tp[is.na(df1$Tp)] <- 0 # IF you have NA in the TP column, use 0.
selection = startsWith(names(df1), "DR")
df1[selection][is.na(df1[selection])] = 0 # IF you have NA in the DRs columns, use 0.
df2<-subset(df1,df1$date2<df1$date1) # The idea here is to use dates smaller than the value of date1 to calculate the median after (historical data)
x<-df2 %>% select(starts_with("DRM0"))
x<-cbind(df2, setNames(df2$DR1 - x, paste0(names(x), "_PV")))
PV<-select(x,Id, date2,Week, Category, DR1, ends_with("PV"))
med<-PV %>%
group_by(Id,Category,Week) %>%
dplyr::summarize(dplyr::across(ends_with("PV"), median))
SPV<-df1%>% #
inner_join(med, by = c('Id','Category', 'Week')) %>%
mutate(across(matches("^DRM0\\d+$"), ~.x +
get(paste0(cur_column(), '_PV')),
.names = '{col}_{col}_PV')) %>%
select(Id:Category, DRM01_DRM01_PV:last_col())
SPV<-data.frame(SPV)
SPV <- SPV %>% #Here I specifically filter the id, category and date I want from SPV
filter(Id==idd,date2 == ymd(dmda), Category == CategoryChosse)
toc()