I'm trying to learn data.table and apply to a large dataset. I can easily approach this problem in tidyverse but struggling in data.table without running the analysis twice. This is a very large dataset I'm trying to apply to so I'd like to only run through the summarization once. See examples below. The second approach doesn't work. How can this be done to get the same results as the other approaches?
library(tidyverse)
library(palmerpenguins)
# Want to concatenate all the comments and find the mean of all flipper and bill measurements by species and sex
penguins_comments <-  penguins_raw %>%
  janitor::clean_names() %>%
  mutate(species_short = word(species, 1),
         sex = tolower(sex),
         across(where(is.character) & !comments, as.factor),
         flipper_length_mm = as.integer(flipper_length_mm),
         body_mass_g = as.integer(body_mass_g)) %>%
  select(species=species_short,
         bill_length_mm = culmen_length_mm,
         bill_depth_mm = culmen_depth_mm,
         flipper_length_mm,
         sex,
         comments)
# Tidyverse way
tidy_summary <- penguins_comments %>%
  summarize(
    comment_tog = str_flatten_comma(comments, na.rm = TRUE),
    across(starts_with("bill")|starts_with("flipper"), ~mean(.x, na.rm=TRUE)),
    .by=c(species, sex)
  )
tidy_summary
#> # A tibble: 8 Γ 6
#>   species   sex    comment_tog    bill_length_mm bill_depth_mm flipper_length_mm
#>   <fct>     <fct>  <chr>                   <dbl>         <dbl>             <dbl>
#> 1 Adelie    male   Not enough bl⦠          40.4          19.1              192.
#> 2 Adelie    female Nest never ob⦠          37.3          17.6              188.
#> 3 Adelie    <NA>   Adult not sam⦠          37.8          18.3              186.
#> 4 Gentoo    female Nest never ob⦠          45.6          14.2              213.
#> 5 Gentoo    male   Nest never ob⦠          49.5          15.7              222.
#> 6 Gentoo    <NA>   Sexing primer⦠          45.6          14.6              216.
#> 7 Chinstrap female Nest never ob⦠          46.6          17.6              192.
#> 8 Chinstrap male   Nest never ob⦠          51.1          19.3              200.
# data.table way - do the mean and comment concat separately
library(data.table)
#> 
#> Attaching package: 'data.table'
#> The following objects are masked from 'package:lubridate':
#> 
#>     hour, isoweek, mday, minute, month, quarter, second, wday, week,
#>     yday, year
#> The following objects are masked from 'package:dplyr':
#> 
#>     between, first, last
#> The following object is masked from 'package:purrr':
#> 
#>     transpose
penguins_comments_dt <- data.table(penguins_comments)
(varmean <- setdiff(names(penguins_comments_dt), c("species", "sex", "comments")))
#> [1] "bill_length_mm"    "bill_depth_mm"     "flipper_length_mm"
dt_1_pt1 <- penguins_comments_dt[
  , .(comment_tog=str_flatten_comma(comments, na.rm=TRUE)),
  by=.(species, sex)
]
dt_1_pt2 <- penguins_comments_dt[,                      
  lapply(.SD, mean, na.rm=TRUE),## compute the mean
  by = .(species, sex),         ## for every 'origin,dest,month'
  .SDcols = varmean]            ## for just those specified in .SDcols
dt_1 <- dt_1_pt1 |>
  merge(dt_1_pt2, by=c("species", "sex"))
dt_1
#> Key: <species, sex>
#>      species    sex
#>       <fctr> <fctr>
#> 1:    Adelie   <NA>
#> 2:    Adelie female
#> 3:    Adelie   male
#> 4: Chinstrap female
#> 5: Chinstrap   male
#> 6:    Gentoo   <NA>
#> 7:    Gentoo female
#> 8:    Gentoo   male
#>                                                                                                                                                                                                                                                                                                                                                                                                                                       comment_tog
#>                                                                                                                                                                                                                                                                                                                                                                                                                                            <char>
#> 1:                                                                                                                                                                                                                           Adult not sampled., No blood sample obtained., No blood sample obtained for sexing., No blood sample obtained for sexing., No blood sample obtained., Sexing primers did not amplify. Not enough blood for isotopes.
#> 2:                                                                                                Nest never observed with full clutch., Not enough blood for isotopes., Not enough blood for isotopes., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch.
#> 3: Not enough blood for isotopes., Nest never observed with full clutch., Not enough blood for isotopes., Nest never observed with full clutch., Nest never observed with full clutch. Not enough blood for isotopes., Not enough blood for isotopes., Not enough blood for isotopes., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch.
#> 4:                                                                                                                                                                Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch.
#> 5:                                                                                                                           Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., No delta15N data received from lab., Nest never observed with full clutch.
#> 6:                                                                                                                                                                                                                                                   Sexing primers did not amplify., Sexing primers did not amplify., Sexing primers did not amplify., Sexing primers did not amplify., Adult not sampled. Nest never observed with full clutch.
#> 7:                                                                                                                                                                                                                                                                                     Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch.
#> 8:                                                                                                                                                                                                                                                                                            Nest never observed with full clutch., Not enough blood for isotopes., Nest never observed with full clutch., Nest never observed with full clutch.
#>    bill_length_mm bill_depth_mm flipper_length_mm
#>             <num>         <num>             <num>
#> 1:       37.84000      18.32000          185.6000
#> 2:       37.25753      17.62192          187.7945
#> 3:       40.39041      19.07260          192.4110
#> 4:       46.57353      17.58824          191.7353
#> 5:       51.09412      19.25294          199.9118
#> 6:       45.62500      14.55000          215.7500
#> 7:       45.56379      14.23793          212.7069
#> 8:       49.47377      15.71803          221.5410
# Comparing methods - essentially the same
waldo::compare(
  tidy_summary %>% arrange(species, sex),
  as_tibble(dt_1) %>% arrange(species, sex),
  x_arg="tidy",
  y_arg="dt",
  ignore_attr=TRUE
)
#>     tidy[[4]]          | dt[[4]]               
#> [1] 37.257534246575339 - 37.257534246575332 [1]
#> [2] 40.390410958904113 - 40.390410958904070 [2]
#> [3] 37.839999999999996 | 37.839999999999996 [3]
#> [4] 46.573529411764703 | 46.573529411764703 [4]
#> [5] 51.094117647058823 | 51.094117647058823 [5]
#> [6] 45.563793103448276 | 45.563793103448276 [6]
#> [7] 49.473770491803279 - 49.473770491803286 [7]
#> [8] 45.625000000000000 | 45.625000000000000 [8]
#> 
#>     tidy[[5]]          | dt[[5]]               
#> [1] 17.621917808219177 | 17.621917808219177 [1]
#> [2] 19.072602739726026 - 19.072602739726022 [2]
#> [3] 18.320000000000000 | 18.320000000000000 [3]
#> [4] 17.588235294117649 - 17.588235294117652 [4]
#> [5] 19.252941176470589 - 19.252941176470586 [5]
#> [6] 14.237931034482759 - 14.237931034482758 [6]
#> [7] 15.718032786885246 - 15.718032786885248 [7]
#> [8] 14.550000000000001 | 14.550000000000001 [8]
### Try to do data.table summarization in one step
dt_2 <- penguins_comments_dt[,                      
  list(comment_tog=str_flatten_comma(comments, na.rm=TRUE), # concatenate
       lapply(.SD, mean, na.rm=TRUE)),## compute the mean
  by = .(species, sex),         ## for every 'origin,dest,month'
  .SD = varmean]            ## for just those specified in .SDcols
names(dt_2)
#> [1] "species"     "sex"         "comment_tog" "V2"
dim(dt_2)
#> [1] 24  4
dim(dt_1)
#> [1] 8 6
Created on 2024-12-14 with reprex v2.1.0
Session info
sessioninfo::session_info()
#> β Session info βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
#>  setting  value
#>  version  R version 4.4.0 (2024-04-24 ucrt)
#>  os       Windows 11 x64 (build 22631)
#>  system   x86_64, mingw32
#>  ui       RTerm
#>  language (EN)
#>  collate  English_United States.utf8
#>  ctype    English_United States.utf8
#>  tz       America/New_York
#>  date     2024-12-14
#>  pandoc   3.5 @ C:/Users/steph/AppData/Local/Pandoc/ (via rmarkdown)
#> 
#> β Packages βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
#>  package        * version date (UTC) lib source
#>  cli              3.6.3   2024-06-21 [1] RSPM (R 4.4.0)
#>  colorspace       2.1-0   2023-01-23 [1] CRAN (R 4.4.0)
#>  crayon           1.5.2   2022-09-29 [1] CRAN (R 4.4.0)
#>  data.table     * 1.15.4  2024-03-30 [1] CRAN (R 4.4.0)
#>  diffobj          0.3.5   2021-10-05 [1] CRAN (R 4.4.0)
#>  digest           0.6.35  2024-03-11 [1] CRAN (R 4.4.0)
#>  dplyr          * 1.1.4   2023-11-17 [1] CRAN (R 4.4.0)
#>  evaluate         0.23    2023-11-01 [1] CRAN (R 4.4.0)
#>  fansi            1.0.6   2023-12-08 [1] CRAN (R 4.4.0)
#>  fastmap          1.1.1   2023-02-24 [1] CRAN (R 4.4.0)
#>  forcats        * 1.0.0   2023-01-29 [1] CRAN (R 4.4.0)
#>  fs               1.6.4   2024-04-25 [1] CRAN (R 4.4.0)
#>  generics         0.1.3   2022-07-05 [1] CRAN (R 4.4.0)
#>  ggplot2        * 3.5.1   2024-04-23 [1] CRAN (R 4.4.0)
#>  glue             1.8.0   2024-09-30 [1] RSPM (R 4.4.0)
#>  gtable           0.3.5   2024-04-22 [1] CRAN (R 4.4.0)
#>  hms              1.1.3   2023-03-21 [1] CRAN (R 4.4.0)
#>  htmltools        0.5.8.1 2024-04-04 [1] CRAN (R 4.4.0)
#>  janitor          2.2.0   2023-02-02 [1] RSPM (R 4.4.0)
#>  knitr            1.46    2024-04-06 [1] CRAN (R 4.4.0)
#>  lifecycle        1.0.4   2023-11-07 [1] CRAN (R 4.4.0)
#>  lubridate      * 1.9.3   2023-09-27 [1] CRAN (R 4.4.0)
#>  magrittr         2.0.3   2022-03-30 [1] CRAN (R 4.4.0)
#>  munsell          0.5.1   2024-04-01 [1] CRAN (R 4.4.0)
#>  palmerpenguins * 0.1.1   2022-08-15 [1] RSPM (R 4.4.0)
#>  pillar           1.9.0   2023-03-22 [1] CRAN (R 4.4.0)
#>  pkgconfig        2.0.3   2019-09-22 [1] CRAN (R 4.4.0)
#>  purrr          * 1.0.2   2023-08-10 [1] CRAN (R 4.4.0)
#>  R.cache          0.16.0  2022-07-21 [1] CRAN (R 4.4.0)
#>  R.methodsS3      1.8.2   2022-06-13 [1] CRAN (R 4.4.0)
#>  R.oo             1.26.0  2024-01-24 [1] CRAN (R 4.4.0)
#>  R.utils          2.12.3  2023-11-18 [1] CRAN (R 4.4.0)
#>  R6               2.5.1   2021-08-19 [1] CRAN (R 4.4.0)
#>  readr          * 2.1.5   2024-01-10 [1] CRAN (R 4.4.0)
#>  rematch2         2.1.2   2020-05-01 [1] CRAN (R 4.4.0)
#>  reprex           2.1.0   2024-01-11 [1] CRAN (R 4.4.0)
#>  rlang            1.1.3   2024-01-10 [1] CRAN (R 4.4.0)
#>  rmarkdown        2.28    2024-08-17 [1] RSPM (R 4.4.0)
#>  rstudioapi       0.16.0  2024-03-24 [1] CRAN (R 4.4.0)
#>  scales           1.3.0   2023-11-28 [1] CRAN (R 4.4.0)
#>  sessioninfo      1.2.2   2021-12-06 [1] CRAN (R 4.4.0)
#>  snakecase        0.11.1  2023-08-27 [1] RSPM (R 4.4.0)
#>  stringi          1.8.3   2023-12-11 [1] CRAN (R 4.4.0)
#>  stringr        * 1.5.1   2023-11-14 [1] CRAN (R 4.4.0)
#>  styler           1.10.3  2024-04-07 [1] CRAN (R 4.4.0)
#>  tibble         * 3.2.1   2023-03-20 [1] CRAN (R 4.4.0)
#>  tidyr          * 1.3.1   2024-01-24 [1] CRAN (R 4.4.0)
#>  tidyselect       1.2.1   2024-03-11 [1] CRAN (R 4.4.0)
#>  tidyverse      * 2.0.0   2023-02-22 [1] CRAN (R 4.4.0)
#>  timechange       0.3.0   2024-01-18 [1] CRAN (R 4.4.0)
#>  tzdb             0.4.0   2023-05-12 [1] CRAN (R 4.4.0)
#>  utf8             1.2.4   2023-10-22 [1] CRAN (R 4.4.0)
#>  vctrs            0.6.5   2023-12-01 [1] CRAN (R 4.4.0)
#>  waldo            0.5.2   2023-11-02 [1] CRAN (R 4.4.0)
#>  withr            3.0.0   2024-01-16 [1] CRAN (R 4.4.0)
#>  xfun             0.43    2024-03-25 [1] CRAN (R 4.4.0)
#>  yaml             2.3.8   2023-12-11 [1] CRAN (R 4.4.0)
#> 
#>  [1] C:/Users/steph/AppData/Local/R/win-library/4.4
#>  [2] C:/Program Files/R/R-4.4.0/library
#> 
#> ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ