group_max performance

Dong · April 10, 2019, 6:31pm

Below is my solution, which accommodates an indeterminate number of data columns. Many thanks to @Col for suggesting the join approach and for the setNames example; the latest version satisfies all my requirements. While it is not as easy to read as the group_map solution, the performance gain is worth it for me.

library(data.table)

# Reproducible benchmark data: `times` rows of `cols` normal columns (mean 5).
set.seed(1)
times <- 1e5
cols <- 4
rows_per_grp <- 100
df3 <- as.data.frame(matrix(rnorm(times * cols, mean = 5), ncol = cols))

# Consecutive blocks of `rows_per_grp` rows share one group id. Deriving the
# number of groups from `times` keeps the id vector as long as the data.
grp_ids <- rep(seq_len(times / rows_per_grp), each = rows_per_grp)
df3 <- cbind(grp = grp_ids, df3)

# setDT() converts df3 in place, so DT and df3 refer to the same data.table.
DT <- setDT(df3)

setnames(DT, c("V1", "V2", "V3", "V4"), c("A", "B", "C", "X"))

#' First within-group row where each column falls below half its threshold
#'
#' For every group in `grp`, the threshold of each regular data column is that
#' column's group maximum; the threshold of column `X` is the mean of the
#' regular columns' group maxima. The result gives, per group and column, the
#' 1-based position within the group of the first value strictly below
#' 0.5 * threshold (`NA` when no value qualifies).
#'
#' @param DT1 A data.table with a grouping column `grp`. Every other column is
#'   treated as a data column; a column named `X` is compared against the mean
#'   of the other columns' maxima instead of its own maximum.
#' @return A data.table with one row per group: `grp` plus one integer column
#'   per data column of `DT1`.
DT_way2 <- function(DT1) {
  data_cols <- setdiff(colnames(DT1), "grp")
  normal_data_cols <- setdiff(data_cols, "X")
  # Per-group maximum of every regular data column (one row per group).
  group_max <- DT1[, lapply(.SD, max), .SDcols = normal_data_cols, by = grp]
  # group_max already holds one row per group, so one vectorised rowMeans()
  # call gives the same values as calling it separately for each group.
  group_max[, X := rowMeans(.SD), .SDcols = normal_data_cols]
  # Join each group's rows to its thresholds; inside j the threshold columns
  # are reached through the "i." prefix. setNames() makes the output columns
  # carry the same names as the input data columns.
  DT1[group_max, on = "grp", by = .EACHI,
      j = lapply(setNames(data_cols, data_cols),
                 function(x) match(TRUE, get(x) < 0.5 * get(paste0("i.", x))))]
}

# Each cell is the within-group position of the first value below half of that
# column's group threshold, as computed by DT_way2(); the printed result follows.
DT_way2(DT)
#>        grp  A  B  C  X
#>    1:    1 14 16  9 11
#>    2:    2 34  4 21  1
#>    3:    3  5 11  8  3
#>    4:    4 24 17  1  3
#>    5:    5  3 16 12 10
#>   ---                 
#>  996:  996  2  9  5  5
#>  997:  997  6 21 18 17
#>  998:  998  3  3 13 19
#>  999:  999  1  4  5  9
#> 1000: 1000  1  2  2 13