Using Purrr to group by columns iteratively

mara · October 11, 2018, 12:54pm

With the reprex (below) you can see what's happening a bit better:

# Attach the tidyverse packages; this reprex relies on dplyr verbs, the
# magrittr pipe and purrr::pmap().
library(tidyverse)
# Keep strings as character vectors (not factors) when data frames are built.
options(stringsAsFactors = FALSE)
# Load the gss_cat example dataset into the workspace.
data(gss_cat)

# Keep only the columns used in the examples below.
mydf <- gss_cat %>% select(year, marital, race, age)
# Names (as strings) of the columns to group by, one at a time.
# NOTE: this vector shares its name with dplyr::group_by(). Calls such as
# group_by(...) still resolve to the function, because R skips non-function
# objects when looking up a name used in call position.
group_by <- c("year", "marital", "race")

# Count the rows of `mydf` within each group defined by `var`.
#
# `var` is captured with enquo() and unquoted inside group_by(), so it is
# meant to be supplied as a bare column name (e.g. `year`).
# Returns the summarised data: one row per group, with the row count in `n`.
test_func <- function(mydf, var) {
  # Capture the expression supplied for `var` instead of evaluating it.
  grouping_quo <- enquo(var)
  grouped_df <- group_by(mydf, !!grouping_quo)
  summarise(grouped_df, n = n())
}

# Test my function with bare (unquoted) column names: year, marital and race.
# Each call returns one row per level of the column, with its row count in n.
test_func(mydf, year)
#> # A tibble: 8 x 2
#>    year     n
#>   <int> <int>
#> 1  2000  2817
#> 2  2002  2765
#> 3  2004  2812
#> 4  2006  4510
#> 5  2008  2023
#> 6  2010  2044
#> 7  2012  1974
#> 8  2014  2538
test_func(mydf, marital)
#> # A tibble: 6 x 2
#>   marital           n
#>   <fct>         <int>
#> 1 No answer        17
#> 2 Never married  5416
#> 3 Separated       743
#> 4 Divorced       3383
#> 5 Widowed        1807
#> 6 Married       10117
test_func(mydf, race)
#> # A tibble: 4 x 2
#>   race               n
#>   <fct>          <int>
#> 1 Other           1959
#> 2 Black           3129
#> 3 White          16395
#> 4 Not applicable     0

# Bring purrr into the equation by trying to group by each name in the
# character vector `group_by`. pmap() calls test_func() once per element,
# passing the name as a string in `var`, so every call ends up with a single
# group containing all 21483 rows.
pmap(list(var = group_by), test_func, mydf = mydf)
#> [[1]]
#> # A tibble: 1 x 2
#>   `"year"`     n
#>   <chr>    <int>
#> 1 year     21483
#> 
#> [[2]]
#> # A tibble: 1 x 2
#>   `"marital"`     n
#>   <chr>       <int>
#> 1 marital     21483
#> 
#> [[3]]
#> # A tibble: 1 x 2
#>   `"race"`     n
#>   <chr>    <int>
#> 1 race     21483

Created on 2018-10-11 by the reprex package (v0.2.1.9000)

Note that in the list output, the grouping columns are named with the quotation marks included (e.g. "race" as opposed to race), which is why the column names are printed with backticks around them.

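Because dplyr::group_by() already quotes its input, passing it a string means you're (in effect) quoting twice: group_by() sees the string "year" itself rather than the column year.* Converting the string to a symbol with rlang::ensym() makes this work: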

# Count the rows of `mydf` within each group defined by the column `var`.
#
# `var` is turned into a symbol with rlang::ensym(), so the column may be
# given as a string (e.g. "year"), which is what makes the function usable
# when mapping over a character vector of column names.
# Returns the summarised data: one row per group, with the row count in `n`.
test_func2 <- function(mydf, var) {
  group_sym <- rlang::ensym(var)
  summarise(group_by(mydf, !!group_sym), n = n())
}

# Bring purrr into the equation by trying to group by each name in the
# character vector `group_by`. pmap() calls test_func2() once per element,
# passing the name as a string in `var`; this time each string is used as a
# column name, so the counts are per level of that column.
pmap(list(var = group_by), test_func2, mydf = mydf)
#> [[1]]
#> # A tibble: 8 x 2
#>    year     n
#>   <int> <int>
#> 1  2000  2817
#> 2  2002  2765
#> 3  2004  2812
#> 4  2006  4510
#> 5  2008  2023
#> 6  2010  2044
#> 7  2012  1974
#> 8  2014  2538
#> 
#> [[2]]
#> # A tibble: 6 x 2
#>   marital           n
#>   <fct>         <int>
#> 1 No answer        17
#> 2 Never married  5416
#> 3 Separated       743
#> 4 Divorced       3383
#> 5 Widowed        1807
#> 6 Married       10117
#> 
#> [[3]]
#> # A tibble: 4 x 2
#>   race               n
#>   <fct>          <int>
#> 1 Other           1959
#> 2 Black           3129
#> 3 White          16395
#> 4 Not applicable     0

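* Edit: With the quotation marks, you're effectively making a new column that holds the same string in every row (the single string is recycled to the length of the data, per R's recycling rules), so there is only one group. That is why you were getting the same n (the total row count) for all three variables before! You can see the new column by calling group_by() directly with a string: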

# Attach the tidyverse packages and load the gss_cat example dataset.
library(tidyverse)
data(gss_cat)

# Keep only the columns used in the example.
mydf <- gss_cat %>% select(year, marital, race, age)
# Group directly by the string "year" (not the column year): the result gains
# a constant character column named `"year"` and has a single group.
group_by(mydf, "year")
#> # A tibble: 21,483 x 5
#> # Groups:   "year" [1]
#>     year marital       race    age `"year"`
#>    <int> <fct>         <fct> <int> <chr>   
#>  1  2000 Never married White    26 year    
#>  2  2000 Divorced      White    48 year    
#>  3  2000 Widowed       White    67 year    
#>  4  2000 Never married White    39 year    
#>  5  2000 Divorced      White    25 year    
#>  6  2000 Married       White    25 year    
#>  7  2000 Never married White    36 year    
#>  8  2000 Divorced      White    44 year    
#>  9  2000 Married       White    44 year    
#> 10  2000 Married       White    47 year    
#> # … with 21,473 more rows

Created on 2018-10-11 by the reprex package (v0.2.1.9000)