That solution may have issues. Since you lose the grouping after the summarize_at()
function, the rowMeans(.)
includes the grp in the calculation.
If your grp
is a character, you would be good to go. Otherwise, grp
is included in the calculation. This is one way to fix it:
df3%>%
group_by(grp) %>%
summarise_at(vars(A:C), max) %>%
mutate(X = rowMeans(.[, 2:4]))
I noticed then when trying to implement a data.table solution. It's about 3 times faster.
library(tidyverse)
library(data.table)
# Simulation setup --------------------------------------------------------
n_per_grp <- 100
grps <- 1000
cols <- 4
# Generate data -----------------------------------------------------------
DT <- as.data.table(matrix(rnorm(n_per_grp * grps * cols, 5), ncol = cols))
setnames(DT, c('A', 'B', 'C', 'X'))
DT[, grp := rep(seq_len(grps), each = 100)]
tib <- as_tibble(DT)
# data.Table Solution --------------------------------------------------------------
DT_Way <- function (DT1) {
# Make Summary Table ------------------------------------------------------
sum_DT <- DT1[ , lapply(.SD, max), .SDcols = c('A','B','C'), by = grp]
sum_DT[ , X := rowMeans(.SD), by = grp]
# Join back to original DT ------------------------------------------------
DT1[sum_DT,on = 'grp', by=.EACHI,
j = lapply(setNames(c('A','B','C','X'), c('A','B','C','X')),
function(x) get(x) - 0.5 * get(paste0("i.", x)))
][, lapply(.SD, function(x) (match(TRUE, x <0))), by = grp]
}
# dplyr solution ----------------------------------------------------------
dplyr_way <- function (df3) {
all_max <- df3%>%
group_by(grp) %>%
summarise_at(vars(A:C), max) %>%
mutate(X = rowMeans(.[, -1]))
left_join(df3, all_max, by = 'grp')%>%
group_by(grp)%>%
transmute(A = A.x - 0.5* A.y ,
B = B.x - 0.5* B.y ,
C = C.x - 0.5* C.y ,
X = X.x - 0.5* X.y )%>%
summarize_at(vars(A:X), ~match(TRUE, . < 0))
}
dplyr_way(tib)
DT_Way (DT)
identical(as.data.frame(dplyr_way(tib)), as.data.frame(DT_Way(DT)))
library(microbenchmark)
microbenchmark(dplyr_way(tib), DT_Way(DT), times = 20)
Performance:
> microbenchmark(dplyr_way(tib), DT_Way(DT), times = 20)
Unit: milliseconds
expr min lq mean median uq max neval
dplyr_way(tib) 456.4055 458.7190 464.7088 459.2215 460.4009 536.389 20
DT_Way(DT) 128.5559 129.7499 131.8996 131.0867 134.2225 136.661 20