For larger datasets, I'd recommend the data.table
package.
I'm currently looking at how you shaped the data.
@pavel , Does the [WHICH] column function as intended?
Here's my results based on my current assumption of [WHICH].
Code
# df -- Generate large dataset in data.table; create [Row] variable
dt_a <- data.table(
A = rep(c("I123", "I128", "I126", "I121", "I123", "I129", "X"), 10^6),
B = rep(c("I121", "I123", "I1855", "I163", "I129", "I1665", "Y"), 10^6),
C = rep(c("I1908", "I124", "I129", "F121", "I1536", "I128", "Z"), 10^6),
D = rep(c("I129", "I109", "I183", "I8773", "I121", "F843", "ZZ"), 10^6)
)[, Row := 1:.N]
# Long -- Reshape to long format; create [WHICH] variable based on conditionals
dt_b <-
melt(
dt_a,
id.vars = "Row",
variable.name = "name",
value.name = "value"
)[value == "I123" |
value == "I128" |
value == "I121" |
value == "I129", WHICH := 1
][,
.(
C1 = as.logical(sum(value == "I123") + sum(value == "I128")),
C2 = as.logical(sum(value == "I121")),
C3 = as.logical(sum(value == "I129")),
WHICH = sum(WHICH, na.rm = TRUE)
), Row]
# FINAL -- Join tables; output
dt_c <- dt_a[dt_b, on = .(Row = Row)]
dt_c
A B C D Row C1 C2 C3 WHICH
1: I123 I121 I1908 I129 1 TRUE TRUE TRUE 3
2: I128 I123 I124 I109 2 TRUE FALSE FALSE 2
3: I126 I1855 I129 I183 3 FALSE FALSE TRUE 1
4: I121 I163 F121 I8773 4 FALSE TRUE FALSE 1
5: I123 I129 I1536 I121 5 TRUE TRUE TRUE 3
---
6999996: I126 I1855 I129 I183 6999996 FALSE FALSE TRUE 1
6999997: I121 I163 F121 I8773 6999997 FALSE TRUE FALSE 1
6999998: I123 I129 I1536 I121 6999998 TRUE TRUE TRUE 3
6999999: I129 I1665 I128 F843 6999999 TRUE FALSE TRUE 2
7000000: X Y Z ZZ 7000000 FALSE FALSE FALSE 0
Time Report
# df
rt_a <- system.time(data.table(
A = rep(c("I123", "I128", "I126", "I121", "I123", "I129", "X"), 10 ^ 6),
B = rep(c("I121", "I123", "I1855", "I163", "I129", "I1665", "Y"), 10 ^ 6),
C = rep(c("I1908", "I124", "I129", "F121", "I1536", "I128", "Z"), 10 ^ 6),
D = rep(c("I129", "I109", "I183", "I8773", "I121", "F843", "ZZ"), 10 ^ 6)
)[, Row := 1:.N])
rt_a
user system elapsed
0.13 0.00 0.12
# Long -- This process uses the most memory
rt_b <- system.time(
melt(dt_a, id.vars = "Row", variable.name = "name", value.name = "value")[
value == "I123" | value == "I128" | value == "I121" | value == "I129",
WHICH := 1
][,
.(
C1 = as.logical(sum(value == "I123") + sum(value == "I128")),
C2 = as.logical(sum(value == "I121")),
C3 = as.logical(sum(value == "I129")),
WHICH = sum(WHICH, na.rm = TRUE)
), Row])
rt_b
user system elapsed
16.88 0.36 16.84
# FINAL
rt_c <- system.time(dt_a[dt_b, on = .(Row = Row)])
rt_c
user system elapsed
0.42 0.00 0.28
Forgive the multiple edits; doing dumb things while trying to optimize the already boiler-plate code.