The idea of this code below is to find the coef
value for each date/category
. To find the value of coef
I use the lm
function. The code uses the data.table
function. However, I would like to know if it is possible to optimize the code somehow and still get the same result? The idea is to reduce the code in order to decrease the processing time.
library(dplyr)
library(tidyr)
library(lubridate)
library(data.table)
#database
df1 <- structure(
list(date1= c("2021-06-28","2021-06-28","2021-06-28","2021-06-28","2021-06-28",
"2021-06-28","2021-06-28","2021-06-28"),
date2 = c("2021-06-25","2021-06-25","2021-06-27","2021-07-07","2021-07-07","2021-07-09","2021-07-09","2021-07-09"),
Code = c("FDE","ABC","ABC","ABC","CDE","FGE","ABC","CDE"),
Week= c("Wednesday","Wednesday","Friday","Wednesday","Wednesday","Friday","Friday","Friday"),
DR1 = c(4,1,4,3,6,4,3,5),
DR01 = c(4,1,4,3,3,4,3,6), DR02= c(4,2,6,7,3,2,7,4),DR03= c(9,5,4,3,3,2,1,5),
DR04 = c(5,4,3,3,6,2,1,9),DR05 = c(5,4,5,3,6,2,1,9),
DR06 = c(2,4,3,3,5,6,7,8),DR07 = c(2,5,4,4,9,4,7,8),
DR08 = c(4,0,0,1,2,4,4,4),DR09 = c(2,5,4,4,9,4,7,8),DR010 = c(2,5,4,4,9,4,7,8),DR011 = c(4,7,3,2,2,7,7,7),
DR012 = c(4,4,2,3,0,4,4,5),DR013 = c(4,4,1,4,0,3,2,0),DR014 = c(0,3,1,2,0,2,NA,NA)),
class = "data.frame", row.names = c(NA, -8L))
#Replace by value equal to 0 when you have columns equal to NA in DR0 columns
selection = startsWith(names(df1), "DR0")
df1[selection][is.na(df1[selection])] = 0
# First idea: Calculate the median of the values resulting from the subtraction between DR1 and the values of the DR0 columns
dt1 <- as.data.table(df1)
cols <- grep("^DR0", colnames(dt1), value = TRUE)
medi_ana <-
dt1[, (paste0(cols, "_PV")) := DR1 - .SD, .SDcols = cols
][, lapply(.SD, median), by = .(Code, Week), .SDcols = paste0(cols, "_PV") ]
# Second idea: After obtaining the median, I add the values found with the values of the DRM columns of my df1 database.
f1 <- function(nm, pat) grep(pat, nm, value = TRUE)
nm1 <- f1(names(df1), "^DR0\\d+$")
nm2 <- f1(names(medi_ana), "_PV")
nm3 <- paste0("i.", nm2)
setDT(df1)[medi_ana, (nm2) := Map(`+`, mget(nm1), mget(nm3)), on = .(Code, Week)]
SPV1 <- df1[, c('date1', 'date2', 'Code', 'Week', nm2), with = FALSE]
# Third idea: The idea here is to specifically filter a line from SPV1, which will depend on what the user chooses, for that it will be necessary to choose an Date and Code.
dmda<-"2021-07-09"
code<-"CDE"
SPV2<-melt(SPV1[date2 == dmda & Code == code][,
lapply(.SD, sum, na.rm = TRUE), by = Code,
.SDcols = patterns("^DR0")],
id.var = "Code", variable.name = "name", value.name = "val")[,
name := readr::parse_number(as.character(name))][]
result<-SPV2[na.omit(SPV2[, .I[(as.Date(dmda) - min(as.Date(df1$date1) [
df1$Code == first(Code)])):max(name)+1], .(Code)]$V1)]
> result
Code name val
1: CDE 12 5
2: CDE 13 5
3: CDE 14 5
## After I calculate the result dataset, I used the lm function to obtain the coef value.
mod <- lm(val ~ I(name^2), result)
coef<-as.numeric(coef(mod)[1])
> coef
[1] 5