Looking for faster solution

pavel · July 20, 2022, 6:38pm

Hello everybody, I'm looking for a faster solution to the problem below. The dataset contains more than 7M rows, and I have no idea how to make it faster. Thank you!

library(tidyverse)
#> Warning: package 'tibble' was built under R version 4.1.2
df<- tribble(~A,~B,~C,~D,
             "I123","I121","I1908","I129",
             "I128","I123","I124","I109",
             "I126","I1855","I129","I183",
             "I121","I163","F121","I8773",
             "I123","I129","I1563","I121",
             "I129","I1665","I128", "F843",
             "X","Y","Z","ZZ")


df <- df |> mutate(Row=row_number()) 
Long <- df |> pivot_longer(cols = A:D,names_to = "name")
Long <-  Long |> group_by(Row) |> 
  summarize(C1 = any(str_detect(value,"I123|I128")),
            C2 = any(str_detect(value,"I121")),
            C3 = any(str_detect(value,"I129"))) |> 
  rowwise() |> 
  mutate(WHICH=which(c_across(C1:C3))[1])
FINAL <- inner_join(df,Long,by="Row")
FINAL

Ven · July 20, 2022, 10:36pm

For larger datasets, I'd recommend the data.table package.

I'm currently looking at how you shaped the data.

@pavel , Does the [WHICH] column function as intended?

Here's my results based on my current assumption of [WHICH].

Code

# df -- Generate large dataset in data.table; create [Row] variable
dt_a <- data.table(
A = rep(c("I123", "I128", "I126", "I121", "I123", "I129", "X"), 10^6),
B = rep(c("I121", "I123", "I1855", "I163", "I129", "I1665", "Y"), 10^6),
C = rep(c("I1908", "I124", "I129", "F121", "I1536", "I128", "Z"), 10^6),
D = rep(c("I129", "I109", "I183", "I8773", "I121", "F843", "ZZ"), 10^6)
)[, Row := 1:.N]

# Long -- Reshape to long format; create [WHICH] variable based on conditionals
dt_b <-
  melt(
    dt_a,
    id.vars = "Row",
    variable.name = "name",
    value.name = "value"
  )[value == "I123" |
      value == "I128" |
      value == "I121" |
      value == "I129", WHICH := 1
  ][,
    .(
      C1 = as.logical(sum(value == "I123") + sum(value == "I128")),
      C2 = as.logical(sum(value == "I121")),
      C3 = as.logical(sum(value == "I129")),
      WHICH = sum(WHICH, na.rm = TRUE)
    ), Row]

# FINAL -- Join tables; output
dt_c <- dt_a[dt_b, on = .(Row = Row)]
dt_c

            A     B     C     D     Row    C1    C2    C3 WHICH
      1: I123  I121 I1908  I129       1  TRUE  TRUE  TRUE     3
      2: I128  I123  I124  I109       2  TRUE FALSE FALSE     2
      3: I126 I1855  I129  I183       3 FALSE FALSE  TRUE     1
      4: I121  I163  F121 I8773       4 FALSE  TRUE FALSE     1
      5: I123  I129 I1536  I121       5  TRUE  TRUE  TRUE     3
     ---                                                       
6999996: I126 I1855  I129  I183 6999996 FALSE FALSE  TRUE     1
6999997: I121  I163  F121 I8773 6999997 FALSE  TRUE FALSE     1
6999998: I123  I129 I1536  I121 6999998  TRUE  TRUE  TRUE     3
6999999: I129 I1665  I128  F843 6999999  TRUE FALSE  TRUE     2
7000000:    X     Y     Z    ZZ 7000000 FALSE FALSE FALSE     0

Time Report

# df 
rt_a <- system.time(data.table(
  A = rep(c("I123", "I128", "I126", "I121", "I123", "I129", "X"), 10 ^ 6),
  B = rep(c("I121", "I123", "I1855", "I163", "I129", "I1665", "Y"), 10 ^ 6),
  C = rep(c("I1908", "I124", "I129", "F121", "I1536", "I128", "Z"), 10 ^ 6),
  D = rep(c("I129", "I109", "I183", "I8773", "I121", "F843", "ZZ"), 10 ^ 6)
)[, Row := 1:.N])
rt_a

   user  system elapsed 
   0.13    0.00    0.12

# Long -- This process uses the most memory
rt_b <- system.time(
      melt(dt_a, id.vars = "Row", variable.name = "name", value.name = "value")[
        value == "I123" | value == "I128" | value == "I121" | value == "I129", 
        WHICH := 1
      ][,
        .(
          C1 = as.logical(sum(value == "I123") + sum(value == "I128")),
          C2 = as.logical(sum(value == "I121")),
          C3 = as.logical(sum(value == "I129")),
          WHICH = sum(WHICH, na.rm = TRUE)
        ), Row])
rt_b

   user  system elapsed 
  16.88    0.36   16.84

# FINAL
rt_c <- system.time(dt_a[dt_b, on = .(Row = Row)])
rt_c

   user  system elapsed 
   0.42    0.00    0.28

Forgive the multiple edits; doing dumb things while trying to optimize the already boiler-plate code.

mgirlich · July 21, 2022, 6:24am

I would do something along these lines:

make the table long
join with a table that maps your values to your "c number"
get the minimal "c number" per row

library(tidyverse)
df<- tribble(~A,~B,~C,~D,
             "I123","I121","I1908","I129",
             "I128","I123","I124","I109",
             "I126","I1855","I129","I183",
             "I121","I163","F121","I8773",
             "I123","I129","I1563","I121",
             "I129","I1665","I128", "F843",
             "X","Y","Z","ZZ")

df <- df |> mutate(row = row_number()) 
long <- df |> pivot_longer(cols = A:D, names_to = "name")

mapping_tbl <- tribble(
  ~ C_number, ~ value,
    1,  "I123",
    1,  "I128",
    2,  "I121",
    3,  "I129",
)

row_results <- long %>% 
  inner_join(mapping_tbl, by = "value") %>% 
  group_by(row) %>% 
  summarise(which = min(C_number))

^{Created on 2022-07-21 by the reprex package (v2.0.1)}

pavel · July 23, 2022, 1:59am

@Ven Thank you very much for your reply, sorry, I haven't had a chance to test the solution, I'm busy with my regular job. Thank you again.

@mgirlich Thank you! Again, I will test it as soon as possible.

system · July 30, 2022, 2:00am

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.