Use data.table instead of pivot_longer

JojoSouza · March 31, 2022, 10:09pm

I would like some help with the output called adjusted that I generate below. My goal is to make it faster to compute. Notice that I'm using pivot_longer, which is slow. One idea would be to keep using data.table, as I did to generate SPV. However, I don't know how to do that for adjusted. Can you help me?

I would like to generate the same output table as in the question.

library(dplyr)
library(tidyr)
library(lubridate)
library(data.table)

# Example data: 8 rows identified by date1, date2, Code and Week, with a
# reference column DR1 and fourteen value columns DR01 ... DR014.
df1 <- structure(
  list(
    date1 = rep("2021-06-28", 8),
    date2 = c("2021-06-25", "2021-06-25", "2021-06-27", "2021-07-07",
              "2021-07-07", "2021-07-09", "2021-07-09", "2021-07-09"),
    Code = c("FDE", "ABC", "ABC", "ABC", "CDE", "FGE", "ABC", "CDE"),
    Week = c("Wednesday", "Wednesday", "Friday", "Wednesday",
             "Wednesday", "Friday", "Friday", "Friday"),
    DR1 = c(4, 1, 4, 3, 3, 4, 3, 5),
    DR01 = c(4, 1, 4, 3, 3, 4, 3, 6),
    DR02 = c(4, 2, 6, 7, 3, 2, 7, 4),
    DR03 = c(9, 5, 4, 3, 3, 2, 1, 5),
    DR04 = c(5, 4, 3, 3, 6, 2, 1, 9),
    DR05 = c(5, 4, 5, 3, 6, 2, 1, 9),
    DR06 = c(2, 4, 3, 3, 5, 6, 7, 8),
    DR07 = c(2, 5, 4, 4, 9, 4, 7, 8),
    DR08 = c(0, 0, 0, 1, 2, 0, 0, 0),
    DR09 = c(0, 0, 0, 0, 0, 0, 0, 0),
    DR010 = c(0, 0, 0, 0, 0, 0, 0, 0),
    DR011 = c(4, 0, 0, 0, 0, 0, 0, 0),
    DR012 = c(0, 0, 0, 3, 0, 0, 0, 5),
    DR013 = c(0, 0, 1, 0, 0, 0, 2, 0),
    DR014 = c(0, 0, 0, 0, 0, 2, 0, 0)
  ),
  class = "data.frame", row.names = c(NA, -8L))

# Replace missing values in the numeric DR* columns with 0.
# The value columns are named DR1 and DR01 ... DR014, so the prefix has to be
# "DR"; a "DRM" prefix matches no column and would leave every NA in place.
selection <- startsWith(names(df1), "DR")

df1[selection][is.na(df1[selection])] <- 0

# Build SPV: for every DR0* column, the original value plus the per-group
# median of (DR1 - value), where groups are (Code, Week).

# data.table version of df1; the := below adds columns to dt1, not to df1.
dt1 <- as.data.table(df1)

# Names of all columns starting with "DR0" (DR01 ... DR014 in this data).
cols <- grep("^DR0", colnames(dt1), value = TRUE)

# First [ ]: add one "<col>_PV" column per DR0 column holding DR1 - <col>.
# Second [ ]: median of each "_PV" column within every (Code, Week) group.
medi_ana <- 
  dt1[, (paste0(cols, "_PV")) := DR1 - .SD, .SDcols = cols
  ][, lapply(.SD, median), by = .(Code, Week), .SDcols = paste0(cols, "_PV") ]

# f1 returns the elements of `nm` that match the regular expression `pat`.
f1 <- function(nm, pat) grep(pat, nm, value = TRUE)
# nm1: the DR0<digits> columns of df1 (the original values).
nm1 <- f1(names(df1), "^DR0\\d+$")
# nm2: the "_PV" columns of medi_ana (the group medians).
nm2 <- f1(names(medi_ana), "_PV")
# nm3: nm2 with the "i." prefix, which in a data.table join x[i, ...] refers
# to the column coming from i (here medi_ana).
nm3 <- paste0("i.", nm2)
# Convert df1 to a data.table in place, join medi_ana on (Code, Week) and
# create the nm2 columns in df1 as nm1[k] + nm3[k], paired by position.
setDT(df1)[medi_ana,  (nm2) := Map(`+`, mget(nm1), mget(nm3)), on = .(Code, Week)]
# Keep only the identifying columns and the new "_PV" columns.
SPV <- df1[, c('date1', 'date2', 'Code', 'Week', nm2), with = FALSE]

# Date (compared with date2) and code to report on.
dmda <- "2021-07-09"
code <- "CDE"

# Keep the rows for the chosen date and code, sum every DR0* column per Code,
# then reshape to long form: one row per DR0 column, numbered by `name`.
adjusted <- SPV %>%
  filter(date2 == dmda, Code == code) %>%
  group_by(Code) %>%
  summarise(across(starts_with("DR0"), sum), .groups = "drop") %>%
  pivot_longer(
    cols = -Code,
    names_pattern = "DR0(.+)",
    values_to = "val"
  ) %>%
  mutate(name = readr::parse_number(name))
    
    > adjusted
    # A tibble: 14 x 3
       Code   name   val
       <chr> <dbl> <dbl>
     1 CDE       1     5
     2 CDE       2     5
     3 CDE       3     5
     4 CDE       4     5
     5 CDE       5     5
     6 CDE       6     5
     7 CDE       7     5
     8 CDE       8     5
     9 CDE       9     5
    10 CDE      10     5
    11 CDE      11     5
    12 CDE      12     5
    13 CDE      13     5
    14 CDE      14     5

FJCC · April 1, 2022, 1:26am

Are you looking to replace this part of your code

# Keep the rows for the chosen date and code, sum every DR0* column per Code,
# then reshape to long form: one row per DR0 column, numbered by `name`.
adjusted <- SPV %>%
  filter(date2 == dmda, Code == code) %>%
  group_by(Code) %>%
  summarise(across(starts_with("DR0"), sum), .groups = "drop") %>%
  pivot_longer(
    cols = -Code,
    names_pattern = "DR0(.+)",
    values_to = "val"
  ) %>%
  mutate(name = readr::parse_number(name))

with data.table syntax? I came up with the following code.

library(dplyr)

library(tidyr)
library(lubridate)
library(data.table)

# Example data: 8 rows identified by date1, date2, Code and Week, with a
# reference column DR1 and fourteen value columns DR01 ... DR014.
df1 <- structure(
  list(
    date1 = rep("2021-06-28", 8),
    date2 = c("2021-06-25", "2021-06-25", "2021-06-27", "2021-07-07",
              "2021-07-07", "2021-07-09", "2021-07-09", "2021-07-09"),
    Code = c("FDE", "ABC", "ABC", "ABC", "CDE", "FGE", "ABC", "CDE"),
    Week = c("Wednesday", "Wednesday", "Friday", "Wednesday",
             "Wednesday", "Friday", "Friday", "Friday"),
    DR1 = c(4, 1, 4, 3, 3, 4, 3, 5),
    DR01 = c(4, 1, 4, 3, 3, 4, 3, 6),
    DR02 = c(4, 2, 6, 7, 3, 2, 7, 4),
    DR03 = c(9, 5, 4, 3, 3, 2, 1, 5),
    DR04 = c(5, 4, 3, 3, 6, 2, 1, 9),
    DR05 = c(5, 4, 5, 3, 6, 2, 1, 9),
    DR06 = c(2, 4, 3, 3, 5, 6, 7, 8),
    DR07 = c(2, 5, 4, 4, 9, 4, 7, 8),
    DR08 = c(0, 0, 0, 1, 2, 0, 0, 0),
    DR09 = c(0, 0, 0, 0, 0, 0, 0, 0),
    DR010 = c(0, 0, 0, 0, 0, 0, 0, 0),
    DR011 = c(4, 0, 0, 0, 0, 0, 0, 0),
    DR012 = c(0, 0, 0, 3, 0, 0, 0, 5),
    DR013 = c(0, 0, 1, 0, 0, 0, 2, 0),
    DR014 = c(0, 0, 0, 0, 0, 2, 0, 0)
  ),
  class = "data.frame", row.names = c(NA, -8L))

# Replace missing values in the numeric DR* columns with 0.
# The value columns are named DR1 and DR01 ... DR014, so the prefix has to be
# "DR"; a "DRM" prefix matches no column and would leave every NA in place.
selection <- startsWith(names(df1), "DR")

df1[selection][is.na(df1[selection])] <- 0

# Build SPV: for every DR0* column, the original value plus the per-group
# median of (DR1 - value), where groups are (Code, Week).

# data.table version of df1; the := below adds columns to dt1, not to df1.
dt1 <- as.data.table(df1)

# Names of all columns starting with "DR0" (DR01 ... DR014 in this data).
cols <- grep("^DR0", colnames(dt1), value = TRUE)

# First [ ]: add one "<col>_PV" column per DR0 column holding DR1 - <col>.
# Second [ ]: median of each "_PV" column within every (Code, Week) group.
medi_ana <- 
  dt1[, (paste0(cols, "_PV")) := DR1 - .SD, .SDcols = cols
  ][, lapply(.SD, median), by = .(Code, Week), .SDcols = paste0(cols, "_PV") ]

# f1 returns the elements of `nm` that match the regular expression `pat`.
f1 <- function(nm, pat) grep(pat, nm, value = TRUE)
# nm1: the DR0<digits> columns of df1 (the original values).
nm1 <- f1(names(df1), "^DR0\\d+$")
# nm2: the "_PV" columns of medi_ana (the group medians).
nm2 <- f1(names(medi_ana), "_PV")
# nm3: nm2 with the "i." prefix, which in a data.table join x[i, ...] refers
# to the column coming from i (here medi_ana).
nm3 <- paste0("i.", nm2)
# Convert df1 to a data.table in place, join medi_ana on (Code, Week) and
# create the nm2 columns in df1 as nm1[k] + nm3[k], paired by position.
setDT(df1)[medi_ana,  (nm2) := Map(`+`, mget(nm1), mget(nm3)), on = .(Code, Week)]
# Keep only the identifying columns and the new "_PV" columns.
SPV <- df1[, c('date1', 'date2', 'Code', 'Week', nm2), with = FALSE]

# Date (compared with date2) and code to report on.
dmda <- "2021-07-09"
code <- "CDE"

# data.table replacement for the filter / summarize / pivot_longer pipeline:
#   i  - keep the rows for the chosen date and code
#   j  - sum every "_PV" column (selected by pattern, not by a fixed range)
#   by - one result row per Code
# melt() then reshapes to long form. `measure.vars` is spelled out in full
# rather than relying on partial argument matching, and `value.name = "val"`
# gives the value column the same name as in the pivot_longer version.
adjusted <- SPV[
  date2 == dmda & Code == code,
  lapply(.SD, sum),
  by = Code,
  .SDcols = patterns("_PV$")
] |>
  melt(
    measure.vars = patterns("^DR"),
    variable.name = "name",
    value.name = "val",
    variable.factor = FALSE
  )
# Turn the column label (e.g. "DR01_PV") into its number, updating in place.
adjusted[, name := readr::parse_number(name)]

adjusted
#>     Code name val
#>  1:  CDE    1   5
#>  2:  CDE    2   5
#>  3:  CDE    3   5
#>  4:  CDE    4   5
#>  5:  CDE    5   5
#>  6:  CDE    6   5
#>  7:  CDE    7   5
#>  8:  CDE    8   5
#>  9:  CDE    9   5
#> 10:  CDE   10   5
#> 11:  CDE   11   5
#> 12:  CDE   12   5
#> 13:  CDE   13   5
#> 14:  CDE   14   5

<sup>Created on 2022-03-31 by the reprex package (v2.0.1)</sup>

JojoSouza · April 1, 2022, 1:35am

That's exactly what I wanted @FJCC , thanks for the reply. By any chance, do you know how to calculate the computational time of the two so I can have an idea of the optimized time?

FJCC · April 1, 2022, 1:56am

I do not have experience with tracking computation time. The last post in the following thread has an example.

michaelbgarcia · April 1, 2022, 3:12am

Here is a plug for one of my favorite packages {dtplyr}, and it looks like it supports tidyr::pivot_longer, along with the preceding verbs in your statement. Since you are already passing through a data.table, all you would need is to add as.data.table() at the end to access your results.

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(tidyr)
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#> 
#>     date, intersect, setdiff, union
library(data.table)
#> 
#> Attaching package: 'data.table'
#> The following objects are masked from 'package:lubridate':
#> 
#>     hour, isoweek, mday, minute, month, quarter, second, wday, week,
#>     yday, year
#> The following objects are masked from 'package:dplyr':
#> 
#>     between, first, last
library(dtplyr)

# Example data: 8 rows identified by date1, date2, Code and Week, with a
# reference column DR1 and fourteen value columns DR01 ... DR014.
df1 <- structure(
  list(
    date1 = rep("2021-06-28", 8),
    date2 = c("2021-06-25", "2021-06-25", "2021-06-27", "2021-07-07",
              "2021-07-07", "2021-07-09", "2021-07-09", "2021-07-09"),
    Code = c("FDE", "ABC", "ABC", "ABC", "CDE", "FGE", "ABC", "CDE"),
    Week = c("Wednesday", "Wednesday", "Friday", "Wednesday",
             "Wednesday", "Friday", "Friday", "Friday"),
    DR1 = c(4, 1, 4, 3, 3, 4, 3, 5),
    DR01 = c(4, 1, 4, 3, 3, 4, 3, 6),
    DR02 = c(4, 2, 6, 7, 3, 2, 7, 4),
    DR03 = c(9, 5, 4, 3, 3, 2, 1, 5),
    DR04 = c(5, 4, 3, 3, 6, 2, 1, 9),
    DR05 = c(5, 4, 5, 3, 6, 2, 1, 9),
    DR06 = c(2, 4, 3, 3, 5, 6, 7, 8),
    DR07 = c(2, 5, 4, 4, 9, 4, 7, 8),
    DR08 = c(0, 0, 0, 1, 2, 0, 0, 0),
    DR09 = c(0, 0, 0, 0, 0, 0, 0, 0),
    DR010 = c(0, 0, 0, 0, 0, 0, 0, 0),
    DR011 = c(4, 0, 0, 0, 0, 0, 0, 0),
    DR012 = c(0, 0, 0, 3, 0, 0, 0, 5),
    DR013 = c(0, 0, 1, 0, 0, 0, 2, 0),
    DR014 = c(0, 0, 0, 0, 0, 2, 0, 0)
  ),
  class = "data.frame", row.names = c(NA, -8L))

# Replace missing values in the numeric DR* columns with 0.
# The value columns are named DR1 and DR01 ... DR014, so the prefix has to be
# "DR"; a "DRM" prefix matches no column and would leave every NA in place.
selection <- startsWith(names(df1), "DR")

df1[selection][is.na(df1[selection])] <- 0

# Build SPV: for every DR0* column, the original value plus the per-group
# median of (DR1 - value), where groups are (Code, Week).

# data.table version of df1; the := below adds columns to dt1, not to df1.
dt1 <- as.data.table(df1)

# Names of all columns starting with "DR0" (DR01 ... DR014 in this data).
cols <- grep("^DR0", colnames(dt1), value = TRUE)

# First [ ]: add one "<col>_PV" column per DR0 column holding DR1 - <col>.
# Second [ ]: median of each "_PV" column within every (Code, Week) group.
medi_ana <- 
  dt1[, (paste0(cols, "_PV")) := DR1 - .SD, .SDcols = cols
  ][, lapply(.SD, median), by = .(Code, Week), .SDcols = paste0(cols, "_PV") ]

# f1 returns the elements of `nm` that match the regular expression `pat`.
f1 <- function(nm, pat) grep(pat, nm, value = TRUE)
# nm1: the DR0<digits> columns of df1 (the original values).
nm1 <- f1(names(df1), "^DR0\\d+$")
# nm2: the "_PV" columns of medi_ana (the group medians).
nm2 <- f1(names(medi_ana), "_PV")
# nm3: nm2 with the "i." prefix, which in a data.table join x[i, ...] refers
# to the column coming from i (here medi_ana).
nm3 <- paste0("i.", nm2)
# Convert df1 to a data.table in place, join medi_ana on (Code, Week) and
# create the nm2 columns in df1 as nm1[k] + nm3[k], paired by position.
setDT(df1)[medi_ana,  (nm2) := Map(`+`, mget(nm1), mget(nm3)), on = .(Code, Week)]
# Keep only the identifying columns and the new "_PV" columns.
SPV <- df1[, c('date1', 'date2', 'Code', 'Week', nm2), with = FALSE]

# Date (compared with date2) and code to report on.
dmda <- "2021-07-09"
code <- "CDE"

# Same verbs as the original pipeline, with the result converted to a
# data.table at the end: filter, sum every DR0* column per Code, reshape to
# long form and number the rows by `name`.
adjusted <- SPV %>%
  filter(date2 == dmda, Code == code) %>%
  group_by(Code) %>%
  summarise(across(starts_with("DR0"), sum), .groups = "drop") %>%
  pivot_longer(
    cols = -Code,
    names_pattern = "DR0(.+)",
    values_to = "val"
  ) %>%
  mutate(name = readr::parse_number(name)) %>%
  as.data.table()

adjusted
#>     Code name val
#>  1:  CDE    1   5
#>  2:  CDE    2   5
#>  3:  CDE    3   5
#>  4:  CDE    4   5
#>  5:  CDE    5   5
#>  6:  CDE    6   5
#>  7:  CDE    7   5
#>  8:  CDE    8   5
#>  9:  CDE    9   5
#> 10:  CDE   10   5
#> 11:  CDE   11   5
#> 12:  CDE   12   5
#> 13:  CDE   13   5
#> 14:  CDE   14   5

<sup>Created on 2022-04-01 by the reprex package (v2.0.1)</sup>

mgirlich · April 1, 2022, 5:16am

@JojoSouza You can measure the time with microbenchmark():

# Shared input for both candidates: per-Code sums of the DR0* columns for the
# selected date and code.
adjusted_prep <- SPV %>%
  filter(date2 == dmda, Code == code) %>%
  group_by(Code) %>%
  summarise(across(starts_with("DR0"), sum), .groups = "drop")

# Compare the reshaping step only: plain tidyr versus the same pivot_longer()
# call routed through dtplyr (conversion to lazy_dt and back to a tibble is
# part of the dtplyr timing).
microbenchmark::microbenchmark(
  tidyr = adjusted_prep %>%
    pivot_longer(
      cols = -Code,
      names_pattern = "DR0(.+)",
      values_to = "val"
    ),
  dtplyr = dtplyr::lazy_dt(adjusted_prep) %>%
    pivot_longer(
      cols = -Code,
      names_pattern = "DR0(.+)",
      values_to = "val"
    ) %>%
    as_tibble()
)

Are you using the current version of tidyr (i.e. 1.2.0)? I ask because many functions in tidyr recently got much faster.

system · April 8, 2022, 5:17am

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.