I'll admit to using `do` to apply non-dplyr functions to database tables, e.g.
# Open a Drill source through sergeant's dplyr interface.
library(sergeant)
db <- src_drill()

# `do()` passes the selected columns to a non-dplyr function
# (here `tidyr::unite()`), which joins the two name columns with a space.
tbl(db, "cp.`employee.json`") %>%
  select(first_name, last_name, salary) %>%
  do(tidyr::unite(., name, first_name, last_name, sep = " "))
#> # A tibble: 1,155 x 2
#> name salary
#> * <chr> <dbl>
#> 1 Sheri Nowmer 80000
#> 2 Derrick Whelply 40000
#> 3 Michael Spence 40000
#> 4 Maya Gutierrez 35000
#> 5 Roberta Damstra 25000
#> 6 Rebecca Kanagaki 15000
#> 7 Kim Brunner 10000
#> 8 Brenda Blumberg 17000
#> 9 Darren Stanz 50000
#> 10 Jonathan Murraiin 15000
#> # ... with 1,145 more rows
though in this (every?) case it's really equivalent to calling `collect` beforehand, and is thus not really that useful. To keep the work in the database instead of bringing the data into memory, you've got to use SQL functions:
# Build `name` with the SQL function `concat()` inside `transmute()`;
# the result prints as a lazy query rather than a local tibble.
tbl(db, "cp.`employee.json`") %>%
  transmute(
    name = concat(first_name, " ", last_name),
    salary
  )
#> # Source: lazy query [?? x 2]
#> # Database: DrillConnection
#> name salary
#> <chr> <dbl>
#> 1 Sheri Nowmer 80000
#> 2 Derrick Whelply 40000
#> 3 Michael Spence 40000
#> 4 Maya Gutierrez 35000
#> 5 Roberta Damstra 25000
#> 6 Rebecca Kanagaki 15000
#> 7 Kim Brunner 10000
#> 8 Brenda Blumberg 17000
#> 9 Darren Stanz 50000
#> 10 Jonathan Murraiin 15000
#> # ... with more rows
For `do`'s list-column behavior, it's always possible to explicitly call `list`, e.g.
# Fit one linear model per `cyl` group. Naming the `do()` argument stores
# each fitted model in a list column called `mod`.
group_by(mtcars, cyl) %>%
  do(mod = lm(mpg ~ disp, .))
#> Source: local data frame [3 x 2]
#> Groups: <by row>
#>
#> # A tibble: 3 x 2
#> cyl mod
#> * <dbl> <list>
#> 1 4 <S3: lm>
#> 2 6 <S3: lm>
#> 3 8 <S3: lm>
# Same list column via `summarise()`: wrap each per-group fit in `list()`.
# Inside `summarise()` the pipe's `.` is the whole piped-in data frame, not
# the current group, so `lm(mpg ~ disp, .)` would fit the same model three
# times. Building the model data from the group's own columns keeps each fit
# restricted to its `cyl` group.
mtcars %>%
  group_by(cyl) %>%
  summarise(mod = list(lm(mpg ~ disp, data = data.frame(mpg, disp))))
#> # A tibble: 3 x 2
#> cyl mod
#> <dbl> <list>
#> 1 4 <S3: lm>
#> 2 6 <S3: lm>
#> 3 8 <S3: lm>
`do`'s data frame behavior has been superseded by the idiom of `nest`ing the non-grouping columns, iterating over them, and `unnest`ing.
library(tidyverse)

# The first `do()` stores one model per `cyl` group in the `model` list
# column; the second `do()` works row by row and returns the data frame
# produced by `broom::tidy()` for each stored model.
group_by(mtcars, cyl) %>%
  do(model = lm(mpg ~ disp, .)) %>%
  do(broom::tidy(.$model))
#> Source: local data frame [6 x 5]
#> Groups: <by row>
#>
#> # A tibble: 6 x 5
#> term estimate std.error statistic p.value
#> * <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 40.871955322 3.589605400 11.3861973 1.202715e-06
#> 2 disp -0.135141815 0.033171608 -4.0740206 2.782827e-03
#> 3 (Intercept) 19.081987419 2.913992892 6.5483988 1.243968e-03
#> 4 disp 0.003605119 0.015557115 0.2317344 8.259297e-01
#> 5 (Intercept) 22.032798914 3.345241115 6.5863112 2.588765e-05
#> 6 disp -0.019634095 0.009315926 -2.1075838 5.677488e-02
# Nest every column except `cyl`, fit and tidy one model per nested data
# frame, then unnest the tidy summaries back into regular columns.
mtcars %>%
  as_tibble() %>% # replaces `as_data_frame()`, a deprecated alias
  nest(-cyl) %>%
  mutate(
    model = map(data, ~lm(mpg ~ disp, .x)),
    summary = map(model, broom::tidy)
  ) %>%
  unnest(summary)
#> # A tibble: 6 x 6
#> cyl term estimate std.error statistic p.value
#> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 6 (Intercept) 19.081987419 2.913992892 6.5483988 1.243968e-03
#> 2 6 disp 0.003605119 0.015557115 0.2317344 8.259297e-01
#> 3 4 (Intercept) 40.871955322 3.589605400 11.3861973 1.202715e-06
#> 4 4 disp -0.135141815 0.033171608 -4.0740206 2.782827e-03
#> 5 8 (Intercept) 22.032798914 3.345241115 6.5863112 2.588765e-05
#> 6 8 disp -0.019634095 0.009315926 -2.1075838 5.677488e-02