Multiple summaries in data.table with .SDcols on some summaries

StatSteph · December 14, 2024, 3:45pm

I'm trying to learn data.table and apply to a large dataset. I can easily approach this problem in tidyverse but struggling in data.table without running the analysis twice. This is a very large dataset I'm trying to apply to so I'd like to only run through the summarization once. See examples below. The second approach doesn't work. How can this be done to get the same results as the other approaches?

library(tidyverse)
library(palmerpenguins)

# Want to concatenate all the comments and find the mean of all flipper and bill measurements by species and sex

penguins_comments <-  penguins_raw %>%
  janitor::clean_names() %>%
  mutate(species_short = word(species, 1),
         sex = tolower(sex),
         across(where(is.character) & !comments, as.factor),
         flipper_length_mm = as.integer(flipper_length_mm),
         body_mass_g = as.integer(body_mass_g)) %>%
  select(species=species_short,
         bill_length_mm = culmen_length_mm,
         bill_depth_mm = culmen_depth_mm,
         flipper_length_mm,
         sex,
         comments)

# Tidyverse way

tidy_summary <- penguins_comments %>%
  summarize(
    comment_tog = str_flatten_comma(comments, na.rm = TRUE),
    across(starts_with("bill")|starts_with("flipper"), ~mean(.x, na.rm=TRUE)),
    .by=c(species, sex)
  )

tidy_summary
#> # A tibble: 8 × 6
#>   species   sex    comment_tog    bill_length_mm bill_depth_mm flipper_length_mm
#>   <fct>     <fct>  <chr>                   <dbl>         <dbl>             <dbl>
#> 1 Adelie    male   Not enough bl…           40.4          19.1              192.
#> 2 Adelie    female Nest never ob…           37.3          17.6              188.
#> 3 Adelie    <NA>   Adult not sam…           37.8          18.3              186.
#> 4 Gentoo    female Nest never ob…           45.6          14.2              213.
#> 5 Gentoo    male   Nest never ob…           49.5          15.7              222.
#> 6 Gentoo    <NA>   Sexing primer…           45.6          14.6              216.
#> 7 Chinstrap female Nest never ob…           46.6          17.6              192.
#> 8 Chinstrap male   Nest never ob…           51.1          19.3              200.

# data.table way - do the mean and comment concat separately

library(data.table)
#> 
#> Attaching package: 'data.table'
#> The following objects are masked from 'package:lubridate':
#> 
#>     hour, isoweek, mday, minute, month, quarter, second, wday, week,
#>     yday, year
#> The following objects are masked from 'package:dplyr':
#> 
#>     between, first, last
#> The following object is masked from 'package:purrr':
#> 
#>     transpose
penguins_comments_dt <- data.table(penguins_comments)

(varmean <- setdiff(names(penguins_comments_dt), c("species", "sex", "comments")))
#> [1] "bill_length_mm"    "bill_depth_mm"     "flipper_length_mm"

dt_1_pt1 <- penguins_comments_dt[
  , .(comment_tog=str_flatten_comma(comments, na.rm=TRUE)),
  by=.(species, sex)
]

dt_1_pt2 <- penguins_comments_dt[,                      
  lapply(.SD, mean, na.rm=TRUE),## compute the mean
  by = .(species, sex),         ## for every 'origin,dest,month'
  .SDcols = varmean]            ## for just those specified in .SDcols

dt_1 <- dt_1_pt1 |>
  merge(dt_1_pt2, by=c("species", "sex"))

dt_1
#> Key: <species, sex>
#>      species    sex
#>       <fctr> <fctr>
#> 1:    Adelie   <NA>
#> 2:    Adelie female
#> 3:    Adelie   male
#> 4: Chinstrap female
#> 5: Chinstrap   male
#> 6:    Gentoo   <NA>
#> 7:    Gentoo female
#> 8:    Gentoo   male
#>                                                                                                                                                                                                                                                                                                                                                                                                                                       comment_tog
#>                                                                                                                                                                                                                                                                                                                                                                                                                                            <char>
#> 1:                                                                                                                                                                                                                           Adult not sampled., No blood sample obtained., No blood sample obtained for sexing., No blood sample obtained for sexing., No blood sample obtained., Sexing primers did not amplify. Not enough blood for isotopes.
#> 2:                                                                                                Nest never observed with full clutch., Not enough blood for isotopes., Not enough blood for isotopes., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch.
#> 3: Not enough blood for isotopes., Nest never observed with full clutch., Not enough blood for isotopes., Nest never observed with full clutch., Nest never observed with full clutch. Not enough blood for isotopes., Not enough blood for isotopes., Not enough blood for isotopes., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch.
#> 4:                                                                                                                                                                Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch.
#> 5:                                                                                                                           Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., No delta15N data received from lab., Nest never observed with full clutch.
#> 6:                                                                                                                                                                                                                                                   Sexing primers did not amplify., Sexing primers did not amplify., Sexing primers did not amplify., Sexing primers did not amplify., Adult not sampled. Nest never observed with full clutch.
#> 7:                                                                                                                                                                                                                                                                                     Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch.
#> 8:                                                                                                                                                                                                                                                                                            Nest never observed with full clutch., Not enough blood for isotopes., Nest never observed with full clutch., Nest never observed with full clutch.
#>    bill_length_mm bill_depth_mm flipper_length_mm
#>             <num>         <num>             <num>
#> 1:       37.84000      18.32000          185.6000
#> 2:       37.25753      17.62192          187.7945
#> 3:       40.39041      19.07260          192.4110
#> 4:       46.57353      17.58824          191.7353
#> 5:       51.09412      19.25294          199.9118
#> 6:       45.62500      14.55000          215.7500
#> 7:       45.56379      14.23793          212.7069
#> 8:       49.47377      15.71803          221.5410
# Comparing methods - essentially the same

waldo::compare(
  tidy_summary %>% arrange(species, sex),
  as_tibble(dt_1) %>% arrange(species, sex),
  x_arg="tidy",
  y_arg="dt",
  ignore_attr=TRUE
)
#>     tidy[[4]]          | dt[[4]]               
#> [1] 37.257534246575339 - 37.257534246575332 [1]
#> [2] 40.390410958904113 - 40.390410958904070 [2]
#> [3] 37.839999999999996 | 37.839999999999996 [3]
#> [4] 46.573529411764703 | 46.573529411764703 [4]
#> [5] 51.094117647058823 | 51.094117647058823 [5]
#> [6] 45.563793103448276 | 45.563793103448276 [6]
#> [7] 49.473770491803279 - 49.473770491803286 [7]
#> [8] 45.625000000000000 | 45.625000000000000 [8]
#> 
#>     tidy[[5]]          | dt[[5]]               
#> [1] 17.621917808219177 | 17.621917808219177 [1]
#> [2] 19.072602739726026 - 19.072602739726022 [2]
#> [3] 18.320000000000000 | 18.320000000000000 [3]
#> [4] 17.588235294117649 - 17.588235294117652 [4]
#> [5] 19.252941176470589 - 19.252941176470586 [5]
#> [6] 14.237931034482759 - 14.237931034482758 [6]
#> [7] 15.718032786885246 - 15.718032786885248 [7]
#> [8] 14.550000000000001 | 14.550000000000001 [8]

### Try to do data.table summarization in one step

dt_2 <- penguins_comments_dt[,                      
  list(comment_tog=str_flatten_comma(comments, na.rm=TRUE), # concatenate
       lapply(.SD, mean, na.rm=TRUE)),## compute the mean
  by = .(species, sex),         ## for every 'origin,dest,month'
  .SD = varmean]            ## for just those specified in .SDcols

names(dt_2)
#> [1] "species"     "sex"         "comment_tog" "V2"
dim(dt_2)
#> [1] 24  4
dim(dt_1)
#> [1] 8 6

^{Created on 2024-12-14 with reprex v2.1.0}

Session info

sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#>  setting  value
#>  version  R version 4.4.0 (2024-04-24 ucrt)
#>  os       Windows 11 x64 (build 22631)
#>  system   x86_64, mingw32
#>  ui       RTerm
#>  language (EN)
#>  collate  English_United States.utf8
#>  ctype    English_United States.utf8
#>  tz       America/New_York
#>  date     2024-12-14
#>  pandoc   3.5 @ C:/Users/steph/AppData/Local/Pandoc/ (via rmarkdown)
#> 
#> ─ Packages ───────────────────────────────────────────────────────────────────
#>  package        * version date (UTC) lib source
#>  cli              3.6.3   2024-06-21 [1] RSPM (R 4.4.0)
#>  colorspace       2.1-0   2023-01-23 [1] CRAN (R 4.4.0)
#>  crayon           1.5.2   2022-09-29 [1] CRAN (R 4.4.0)
#>  data.table     * 1.15.4  2024-03-30 [1] CRAN (R 4.4.0)
#>  diffobj          0.3.5   2021-10-05 [1] CRAN (R 4.4.0)
#>  digest           0.6.35  2024-03-11 [1] CRAN (R 4.4.0)
#>  dplyr          * 1.1.4   2023-11-17 [1] CRAN (R 4.4.0)
#>  evaluate         0.23    2023-11-01 [1] CRAN (R 4.4.0)
#>  fansi            1.0.6   2023-12-08 [1] CRAN (R 4.4.0)
#>  fastmap          1.1.1   2023-02-24 [1] CRAN (R 4.4.0)
#>  forcats        * 1.0.0   2023-01-29 [1] CRAN (R 4.4.0)
#>  fs               1.6.4   2024-04-25 [1] CRAN (R 4.4.0)
#>  generics         0.1.3   2022-07-05 [1] CRAN (R 4.4.0)
#>  ggplot2        * 3.5.1   2024-04-23 [1] CRAN (R 4.4.0)
#>  glue             1.8.0   2024-09-30 [1] RSPM (R 4.4.0)
#>  gtable           0.3.5   2024-04-22 [1] CRAN (R 4.4.0)
#>  hms              1.1.3   2023-03-21 [1] CRAN (R 4.4.0)
#>  htmltools        0.5.8.1 2024-04-04 [1] CRAN (R 4.4.0)
#>  janitor          2.2.0   2023-02-02 [1] RSPM (R 4.4.0)
#>  knitr            1.46    2024-04-06 [1] CRAN (R 4.4.0)
#>  lifecycle        1.0.4   2023-11-07 [1] CRAN (R 4.4.0)
#>  lubridate      * 1.9.3   2023-09-27 [1] CRAN (R 4.4.0)
#>  magrittr         2.0.3   2022-03-30 [1] CRAN (R 4.4.0)
#>  munsell          0.5.1   2024-04-01 [1] CRAN (R 4.4.0)
#>  palmerpenguins * 0.1.1   2022-08-15 [1] RSPM (R 4.4.0)
#>  pillar           1.9.0   2023-03-22 [1] CRAN (R 4.4.0)
#>  pkgconfig        2.0.3   2019-09-22 [1] CRAN (R 4.4.0)
#>  purrr          * 1.0.2   2023-08-10 [1] CRAN (R 4.4.0)
#>  R.cache          0.16.0  2022-07-21 [1] CRAN (R 4.4.0)
#>  R.methodsS3      1.8.2   2022-06-13 [1] CRAN (R 4.4.0)
#>  R.oo             1.26.0  2024-01-24 [1] CRAN (R 4.4.0)
#>  R.utils          2.12.3  2023-11-18 [1] CRAN (R 4.4.0)
#>  R6               2.5.1   2021-08-19 [1] CRAN (R 4.4.0)
#>  readr          * 2.1.5   2024-01-10 [1] CRAN (R 4.4.0)
#>  rematch2         2.1.2   2020-05-01 [1] CRAN (R 4.4.0)
#>  reprex           2.1.0   2024-01-11 [1] CRAN (R 4.4.0)
#>  rlang            1.1.3   2024-01-10 [1] CRAN (R 4.4.0)
#>  rmarkdown        2.28    2024-08-17 [1] RSPM (R 4.4.0)
#>  rstudioapi       0.16.0  2024-03-24 [1] CRAN (R 4.4.0)
#>  scales           1.3.0   2023-11-28 [1] CRAN (R 4.4.0)
#>  sessioninfo      1.2.2   2021-12-06 [1] CRAN (R 4.4.0)
#>  snakecase        0.11.1  2023-08-27 [1] RSPM (R 4.4.0)
#>  stringi          1.8.3   2023-12-11 [1] CRAN (R 4.4.0)
#>  stringr        * 1.5.1   2023-11-14 [1] CRAN (R 4.4.0)
#>  styler           1.10.3  2024-04-07 [1] CRAN (R 4.4.0)
#>  tibble         * 3.2.1   2023-03-20 [1] CRAN (R 4.4.0)
#>  tidyr          * 1.3.1   2024-01-24 [1] CRAN (R 4.4.0)
#>  tidyselect       1.2.1   2024-03-11 [1] CRAN (R 4.4.0)
#>  tidyverse      * 2.0.0   2023-02-22 [1] CRAN (R 4.4.0)
#>  timechange       0.3.0   2024-01-18 [1] CRAN (R 4.4.0)
#>  tzdb             0.4.0   2023-05-12 [1] CRAN (R 4.4.0)
#>  utf8             1.2.4   2023-10-22 [1] CRAN (R 4.4.0)
#>  vctrs            0.6.5   2023-12-01 [1] CRAN (R 4.4.0)
#>  waldo            0.5.2   2023-11-02 [1] CRAN (R 4.4.0)
#>  withr            3.0.0   2024-01-16 [1] CRAN (R 4.4.0)
#>  xfun             0.43    2024-03-25 [1] CRAN (R 4.4.0)
#>  yaml             2.3.8   2023-12-11 [1] CRAN (R 4.4.0)
#> 
#>  [1] C:/Users/steph/AppData/Local/R/win-library/4.4
#>  [2] C:/Program Files/R/R-4.4.0/library
#> 
#> ──────────────────────────────────────────────────────────────────────────────

vedoa · December 14, 2024, 4:43pm

Hi @StatSteph

since you did all the work here is my approach - use dtplyr to convert your code to the data.table equvailent.

library(data.table)
library(dtplyr)
library(tidyverse)
library(palmerpenguins)

# Want to concatenate all the comments and find the mean of all flipper and bill measurements by species and sex
penguins_comments <-  penguins_raw %>%
  janitor::clean_names() %>%
  mutate(species_short = word(species, 1),
         sex = tolower(sex),
         across(where(is.character) & !comments, as.factor),
         flipper_length_mm = as.integer(flipper_length_mm),
         body_mass_g = as.integer(body_mass_g)) %>%
  select(species=species_short,
         bill_length_mm = culmen_length_mm,
         bill_depth_mm = culmen_depth_mm,
         flipper_length_mm,
         sex,
         comments)

# Tidyverse way
tidy_summary <- penguins_comments %>%
  summarize(
    comment_tog = str_flatten_comma(comments, na.rm = TRUE),
    across(starts_with("bill")|starts_with("flipper"), ~mean(.x, na.rm=TRUE)),
    .by=c(species, sex)
  )

# Convert with package to data.table syntax

penguins_comments2 <- lazy_dt(penguins_comments)

penguins_comments2 %>%
  summarize(
    comment_tog = str_flatten_comma(comments, na.rm = TRUE),
    across(starts_with("bill")|starts_with("flipper"), ~mean(.x, na.rm=TRUE)),
    .by=c(species, sex)
  )

# this will give you the translation
#  `_DT1`[, .(comment_tog = str_flatten_comma(comments, na.rm = TRUE), 
# bill_length_mm = mean(bill_length_mm, na.rm = TRUE), bill_depth_mm = mean(bill_depth_mm, 
#                                                                           na.rm = TRUE), flipper_length_mm = mean(flipper_length_mm, 

# final call                                                                                                               na.rm = TRUE)), keyby = .(species, sex)]

penguins_comments_dt <- data.table(penguins_comments)
res <- penguins_comments_dt[, .(comment_tog = str_flatten_comma(comments, na.rm = TRUE), 
     bill_length_mm = mean(bill_length_mm, na.rm = TRUE), bill_depth_mm = mean(bill_depth_mm, 
                                                                               na.rm = TRUE), flipper_length_mm = mean(flipper_length_mm, 
                                                                                                                       na.rm = TRUE)), keyby = .(species, sex)]

dickoah · December 14, 2024, 4:51pm

Hi @StatSteph

You can concatenate the comments_tog and the lapply(.SD, ...) part, something along these lines

library(tidyverse)
library(data.table)
library(palmerpenguins)

penguins_comments <-  penguins_raw %>%
  janitor::clean_names() %>%
  mutate(species_short = word(species, 1),
         sex = tolower(sex),
         across(where(is.character) & !comments, as.factor),
         flipper_length_mm = as.integer(flipper_length_mm),
         body_mass_g = as.integer(body_mass_g)) %>%
  select(species=species_short,
         bill_length_mm = culmen_length_mm,
         bill_depth_mm = culmen_depth_mm,
         flipper_length_mm,
         sex,
         comments)
)

penguins_comments |>
 summarize(comment_tog = str_flatten_comma(comments, na.rm = TRUE),
           across(starts_with("bill")|starts_with("flipper"), ~mean(.x, na.rm=TRUE)),
           .by = c(species, sex))

penguins_comments_dt <- as.data.table(penguins_comments)
penguins_comments_dt[, c(list(comment_tog = str_flatten_comma(comments)),
                    lapply(.SD, mean, na.rm = TRUE)),
            .SDcols = patterns("^bill|^flipper"),
            by = list(species, sex)]

system · December 21, 2024, 4:51pm

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.