Run function on each row - based on multiple (but not all) columns

srowell · March 27, 2018, 1:13pm

I'm trying to find a more efficient way to run a function that counts the number of favorable responses across several columns for each row. As you can see in my reprex, right now I use a work around where I set a unique value for each row (using row_number), and then grouping by that value. While this works for this example set, it isn't very efficient, and is slow when I use on my real dataset of 25 variables and 200,000 cases.

Is there a more efficient way to get the same output? Maybe something with pmap? But I wasn't quite able to grasp how to do this, but I don't yet have experience in purrr. I'm at that stage in learning R where I can generally get the desired outcome, but not in the most efficient way.

library(tidyverse)
#> Warning: package 'stringr' was built under R version 3.4.4
#> Warning: package 'forcats' was built under R version 3.4.4

#simplfied example dataset
testing_by_row <- 
structure(list(group = c("a", "a", "b", "b", "c", "c"), Q01 = structure(c(1L, 
                                                                          1L, 2L, 3L, 3L, NA), .Label = c("F", "U", "N"), class = "factor"), 
               Q02 = structure(c(2L, 1L, NA, 3L, 1L, 1L), .Label = c("F", 
                                                                     "U", "N"), class = "factor"), Q03 = structure(c(3L, 1L, 1L, 
                                                                                                                     2L, 3L, 3L), .Label = c("F", "U", "N"), class = "factor"), 
               Q04 = structure(c(1L, 1L, 1L, 2L, 2L, 3L), .Label = c("F", 
                                                                     "U", "N"), class = "factor"), Q05 = structure(c(3L, 3L, NA, 
                                                                                                                     3L, 1L, 2L), .Label = c("F", "U", "N"), class = "factor"), 
               Q06 = structure(c(2L, 2L, 1L, 3L, 1L, 2L), .Label = c("F", 
                                                                     "U", "N"), class = "factor")), .Names = c("group", "Q01", 
                                                                                                               "Q02", "Q03", "Q04", "Q05", "Q06"), class = c("tbl_df", "tbl", 
                                                                                                                                                             "data.frame"), row.names = c(NA, -6L))


#Function to calc count fav for index
countF <- function(x) {
    sum(x == 1, na.rm = TRUE)
}

countN <- function(x) {
    sum(x == 1 | x == 2 | x == 3, na.rm = TRUE)
}

#Counts the #Fav for each index - and the total that answered Fav, Unfav, or Neutral.  Note the questions in the index are not always consecutive (e.g. index 1 is Q01, Q02, and Q04).
testing_by_row %>% 
    mutate(Par_ID = row_number()) %>% 
    group_by(Par_ID) %>% 
    mutate(Index1_Fav = countF(c(Q01, Q02, Q04)),
           Index1_N = countN(c(Q01, Q02, Q04)),
           Index2_Fav = countF(c(Q05, Q06)),
           Index2_N = countN(c(Q05, Q06)))
#> # A tibble: 6 x 12
#> # Groups:   Par_ID [6]
#>   group Q01   Q02   Q03   Q04   Q05   Q06   Par_ID Index1_Fav Index1_N
#>   <chr> <fct> <fct> <fct> <fct> <fct> <fct>  <int>      <int>    <int>
#> 1 a     F     U     N     F     N     U          1          2        3
#> 2 a     F     F     F     F     N     U          2          3        3
#> 3 b     U     <NA>  F     F     <NA>  F          3          1        2
#> 4 b     N     N     U     U     N     N          4          0        3
#> 5 c     N     F     N     U     F     F          5          1        3
#> 6 c     <NA>  F     N     N     U     U          6          1        2
#> # ... with 2 more variables: Index2_Fav <int>, Index2_N <int>

Created on 2018-03-27 by the reprex package (v0.2.0).

jdb · March 27, 2018, 3:58pm

Instead of creating the Par_ID variable, maybe you could use the rowwise() function:

# Using your original code
system.time(testing_by_row %>% 
              mutate(Par_ID = row_number()) %>% 
              group_by(Par_ID) %>% 
              mutate(Index1_Fav = countF(c(Q01, Q02, Q04)),
                     Index1_N = countN(c(Q01, Q02, Q04)),
                     Index2_Fav = countF(c(Q05, Q06)),
                     Index2_N = countN(c(Q05, Q06))))
# Time taken to run
#>    user  system elapsed 
#>   0.020   0.000   0.019

# Using rowwise function
system.time(testing_by_row %>% 
              rowwise() %>% 
              mutate(Index1_Fav = countF(c(Q01, Q02, Q04)),
                     Index1_N = countN(c(Q01, Q02, Q04)),
                     Index2_Fav = countF(c(Q05, Q06)),
                     Index2_N = countN(c(Q05, Q06))))
# Time taken to run (5x faster for user time)
#>    user  system elapsed 
#>   0.004   0.000   0.003

# Check system time for each method
df1 <- testing_by_row %>% 
  mutate(Par_ID = row_number()) %>% 
  group_by(Par_ID) %>% 
  mutate(Index1_Fav = countF(c(Q01, Q02, Q04)),
         Index1_N = countN(c(Q01, Q02, Q04)),
         Index2_Fav = countF(c(Q05, Q06)),
         Index2_N = countN(c(Q05, Q06))) %>% 
  ungroup() %>% 
  select(-Par_ID)


df2 <- testing_by_row %>% 
  rowwise() %>% 
  mutate(Index1_Fav = countF(c(Q01, Q02, Q04)),
         Index1_N = countN(c(Q01, Q02, Q04)),
         Index2_Fav = countF(c(Q05, Q06)),
         Index2_N = countN(c(Q05, Q06)))

# Both should be equal
all.equal(df1, df2)
#> [1] TRUE

Created on 2018-03-27 by the reprex package (v0.2.0).

markdly · March 27, 2018, 11:38pm

I like these sorts of examples to see the different ways other people solve these issues. For me, I would probably try and tackle this by first 'tidying' up the data a little bit using tidyr::gather and then leveraging the use of dplyr::case_when to create each Index required. I'm not sure this is suitable for your use case though as the factors get converted to character during the process.

# continuing on from OP reprex
library(tidyverse)
testing_by_row %>% 
  rowid_to_column() %>%
  gather(key, value, -c(group, rowid)) %>%
  mutate(Index = case_when(
    key %in% c("Q01", "Q02", "Q04") ~ 1L,
    key %in% c("Q05", "Q06")        ~ 2L)) %>% 
  group_by(rowid, Index) %>%
  summarise(Fav = sum(value == "F", na.rm = TRUE), N = sum(!is.na(value)))
#> # A tibble: 18 x 4
#> # Groups:   rowid [?]
#>    rowid Index   Fav     N
#>    <int> <int> <int> <int>
#>  1     1     1     2     3
#>  2     1     2     0     2
#>  3     1    NA     0     1
#>  4     2     1     3     3
#>  5     2     2     0     2
#>  6     2    NA     1     1
#>  7     3     1     1     2
#>  8     3     2     1     1
#>  9     3    NA     1     1
#> 10     4     1     0     3
#> 11     4     2     0     2
#> 12     4    NA     0     1
#> 13     5     1     1     3
#> 14     5     2     2     2
#> 15     5    NA     0     1
#> 16     6     1     1     2
#> 17     6     2     0     2
#> 18     6    NA     0     1

cderv · March 28, 2018, 4:51pm

As I agree on this (), and because you were looking for a solution with pmap, I came up with another way of doing this, without any gather and using purrr.

To not be bothered by factors, I took the liberty to rework the example with only characters.

What I try to show here is that pmap inside a mutate call allows to create some list column, and to iterate "by row" by definition. pmap with a dataframe can be use with a named function that allows to call elements by their respective name during the iteration. ... is needed if you do not use all the columns in the function, as in this example.

here is the code:

library(tidyverse)

#simplfied example dataset
testing_by_row <- tibble::tribble(
  ~group, ~Q01, ~Q02, ~Q03, ~Q04, ~Q05, ~Q06,
  "a",  "F",  "U",  "N",  "F",  "N",  "U",
  "a",  "F",  "F",  "F",  "F",  "N",  "U",
  "b",  "U",   NA,  "F",  "F",   NA,  "F",
  "b",  "N",  "N",  "U",  "U",  "N",  "N",
  "c",  "N",  "F",  "N",  "U",  "F",  "F",
  "c",   NA,  "F",  "N",  "N",  "U",  "U"
)

#Function to calc count fav for index
countF <- function(x) {
  sum(x == "F", na.rm = TRUE)
}

countN <- function(x) {
  sum(x == "F" | x == "U" | x == "N", na.rm = TRUE)
}

#Counts the `F` for each row of the dataframe - and the total that answered F, U, or N.  
testing_by_row %>% 
  mutate(
    Index1 = pmap(., function(Q01, Q02, Q04, ...) {
      val <- c(Q01, Q02, Q04)
      tibble(`F` = countF(val), N = countN(val))
      }),
    Index2 = pmap(., function(Q05, Q06, ...) {
      val <- c(Q06, Q05)
      tibble(`F` = countF(val), N = countN(val))
    })
  ) %>% unnest(.sep = "_")
#> # A tibble: 6 x 11
#>   group Q01   Q02   Q03   Q04   Q05   Q06   Index1_F Index1_N Index2_F
#>   <chr> <chr> <chr> <chr> <chr> <chr> <chr>    <int>    <int>    <int>
#> 1 a     F     U     N     F     N     U            2        3        0
#> 2 a     F     F     F     F     N     U            3        3        0
#> 3 b     U     <NA>  F     F     <NA>  F            1        2        1
#> 4 b     N     N     U     U     N     N            0        3        0
#> 5 c     N     F     N     U     F     F            1        3        2
#> 6 c     <NA>  F     N     N     U     U            1        2        0
#> # ... with 1 more variable: Index2_N <int>

Created on 2018-03-28 by the reprex package (v0.2.0).

aosmith · March 28, 2018, 5:33pm

The approach I took looks pretty complicated (I think), but it is based on the premise of not only making the code more efficient, but also making the time it takes to type out your "index" groups more efficient. I'm not sure how many index groups you have, but at the moment you have to type out the names of the columns multiple times.

My answer to that is to type these out one time, which I do by putting these groups into a list. I name the list elements here, which will be useful later.

groups = list( Index1 = c("Q01", "Q02", "Q04"),
               Index2 = c("Q05", "Q06") )

Then I change your functions slightly, so they take more than a single argument. This is because I'm going to use pmap, which passes all columns as arguments.

#Function to calc count fav for index
countF <- function(...) {
    x = c(...)
    sum(x == 1, na.rm = TRUE)
}

countN <- function(...) {
    x = c(...)
    sum(x == 1 | x == 2 | x == 3, na.rm = TRUE)
}

Now for some heavier lifting. You have upwards of 25 variables you are going to work with, and pmap() by default works with all columns. To avoid lots of pesky typing of variable names, selecting the variables in each "index" group prior to using pmap() seems warranted. I test this process using the first element of groups and the two edited functions (using pmap_int() to get an integer vector out).

# Test selecting columns and 
    # then using pmap_int() for counting across the selected columns
testing_by_row %>%
    select(groups[[1]]) %>%
    pmap_int(countF)
#> [1] 2 3 1 0 1 1

testing_by_row %>%
    select(groups[[1]]) %>%
    pmap_int(countN)
#> [1] 3 3 2 3 3 2

Since that worked, I'm ready to make this process into a function. I decide to return a data.frame with the two new columns of counts in it. After some testing (not shown ), I decided to set the names of the output dataset by adding the name of the index to the count-type name. This will make things more organized once we get to the step of looping through the list of index groups.

# Make a function to select the variables in the group,
    # calculate the two counts for those variables,
    # and define the names in the output
count_each = function(dat, groups, names) {
    d1 = select(dat, groups)
    output = data.frame(Fav = pmap_int(d1, countF),
               N = pmap_int(d1, countN) )
    set_names(output, ~paste(names, ., sep = "_") )
}

Before proceeding I test this function with the first element of groups and the name of that list element. It works!

# Testing function on individual groups
count_each(testing_by_row, groups[[1]], names(groups)[1])
#>   Index1_Fav Index1_N
#> 1          2        3
#> 2          3        3
#> 3          1        2
#> 4          0        3
#> 5          1        3
#> 6          1        2

Now I can loop through the groups list and use the function on each index group. I end up using imap_dfc() from purrr for this, as this conveniently loops through the list and the list names simultaneously (and remember I need those list names for naming the output columns). The dfc option is to column-bind the result into a single data.frame.

# Looping through all groups and checking output
imap_dfc(groups, ~count_each(testing_by_row, .x, .y) )
#>   Index1_Fav Index1_N Index2_Fav Index2_N
#> 1          2        3          0        2
#> 2          3        3          0        2
#> 3          1        2          1        1
#> 4          0        3          0        2
#> 5          1        3          2        2
#> 6          1        2          0        2

In the end, I use bind_cols() to get the output attached to the original dataset. This works fine as long as the dataset hasn't been re-arranged throughout this process (which it wasn't). I tend to prefer merging when possible for "safety", but didn't come up with a good option for that here.

bind_cols(testing_by_row, 
          imap_dfc(groups, ~count_each(testing_by_row, .x, .y) ) )
#> # A tibble: 6 x 11
#>   group Q01    Q02    Q03    Q04    Q05    Q06    Index~ Inde~ Inde~ Inde~
#>   <chr> <fctr> <fctr> <fctr> <fctr> <fctr> <fctr>  <int> <int> <int> <int>
#> 1 a     F      U      N      F      N      U           2     3     0     2
#> 2 a     F      F      F      F      N      U           3     3     0     2
#> 3 b     U      <NA>   F      F      <NA>   F           1     2     1     1
#> 4 b     N      N      U      U      N      N           0     3     0     2
#> 5 c     N      F      N      U      F      F           1     3     2     2
#> 6 c     <NA>   F      N      N      U      U           1     2     0     2

Created on 2018-03-28 by the reprex package (v0.2.0).

cderv · March 28, 2018, 5:36pm

It is also possible to go further into purrr use and adding a bit of tidyevaluation to programmatically create the columns' name.

It would look like this, using a summary function to create the count table based on a vector of column and a resulting column name. It is not the easiest but worth showing for usage of invoke_*.

library(tidyverse)

#simplfied example dataset
testing_by_row <- tibble::tribble(
  ~group, ~Q01, ~Q02, ~Q03, ~Q04, ~Q05, ~Q06,
  "a",  "F",  "U",  "N",  "F",  "N",  "U",
  "a",  "F",  "F",  "F",  "F",  "N",  "U",
  "b",  "U",   NA,  "F",  "F",   NA,  "F",
  "b",  "N",  "N",  "U",  "U",  "N",  "N",
  "c",  "N",  "F",  "N",  "U",  "F",  "F",
  "c",   NA,  "F",  "N",  "N",  "U",  "U"
)

#Function to calc count fav for index
countF <- function(x) {
  sum(x == "F", na.rm = TRUE)
}

countN <- function(x) {
  sum(x == "F" | x == "U" | x == "N", na.rm = TRUE)
}

count_table <- function(x, index_num = "Index") {
  # using tidyeval to create column name
  tibble( !! paste0(index_num, "_F") := countF(x), !! paste0(index_num, "_N") := countN(x))
}

testing_by_row %>% 
  mutate(
    tab = pmap(., function(Q01, Q02, Q04, Q06, Q05, ...) {
      # allows iteration on a fonction with different argument. Here, binding resulting tibble by column.
      invoke_map_dfc(count_table, 
                     list(
                       # first summary table for index1
                       list(x = c(Q01, Q02, Q04), index_num = "Index1"),
                       # second summary table for index2
                       list(x = c(Q06, Q05), index_num = "Index2")
                     )
      )
    })
  ) %>% unnest()
#> # A tibble: 6 x 11
#>   group Q01   Q02   Q03   Q04   Q05   Q06   Index1_F Index1_N Index2_F
#>   <chr> <chr> <chr> <chr> <chr> <chr> <chr>    <int>    <int>    <int>
#> 1 a     F     U     N     F     N     U            2        3        0
#> 2 a     F     F     F     F     N     U            3        3        0
#> 3 b     U     <NA>  F     F     <NA>  F            1        2        1
#> 4 b     N     N     U     U     N     N            0        3        0
#> 5 c     N     F     N     U     F     F            1        3        2
#> 6 c     <NA>  F     N     N     U     U            1        2        0
#> # ... with 1 more variable: Index2_N <int>

Created on 2018-03-28 by the reprex package (v0.2.0).

srowell · March 28, 2018, 7:52pm

I definitely appreciate all the solutions - I selected the final solution from @cderv (since it only lets you choose 1) but all of them taught me something. In my searching for info on pmap, I never saw it explained as clearly as it is here.

Also - just knowing about rowwise() will be useful for some of my other work.

Thank you!