I'm trying to learn data.table and apply to a large dataset. I can easily approach this problem in tidyverse but struggling in data.table without running the analysis twice. This is a very large dataset I'm trying to apply to so I'd like to only run through the summarization once. See examples below. The second approach doesn't work. How can this be done to get the same results as the other approaches?
library(tidyverse)
library(palmerpenguins)
# Want to concatenate all the comments and find the mean of all flipper and bill measurements by species and sex
penguins_comments <- penguins_raw %>%
janitor::clean_names() %>%
mutate(species_short = word(species, 1),
sex = tolower(sex),
across(where(is.character) & !comments, as.factor),
flipper_length_mm = as.integer(flipper_length_mm),
body_mass_g = as.integer(body_mass_g)) %>%
select(species=species_short,
bill_length_mm = culmen_length_mm,
bill_depth_mm = culmen_depth_mm,
flipper_length_mm,
sex,
comments)
# Tidyverse way
tidy_summary <- penguins_comments %>%
summarize(
comment_tog = str_flatten_comma(comments, na.rm = TRUE),
across(starts_with("bill")|starts_with("flipper"), ~mean(.x, na.rm=TRUE)),
.by=c(species, sex)
)
tidy_summary
#> # A tibble: 8 Γ 6
#> species sex comment_tog bill_length_mm bill_depth_mm flipper_length_mm
#> <fct> <fct> <chr> <dbl> <dbl> <dbl>
#> 1 Adelie male Not enough bl⦠40.4 19.1 192.
#> 2 Adelie female Nest never ob⦠37.3 17.6 188.
#> 3 Adelie <NA> Adult not sam⦠37.8 18.3 186.
#> 4 Gentoo female Nest never ob⦠45.6 14.2 213.
#> 5 Gentoo male Nest never ob⦠49.5 15.7 222.
#> 6 Gentoo <NA> Sexing primer⦠45.6 14.6 216.
#> 7 Chinstrap female Nest never ob⦠46.6 17.6 192.
#> 8 Chinstrap male Nest never ob⦠51.1 19.3 200.
# data.table way - do the mean and comment concat separately
library(data.table)
#>
#> Attaching package: 'data.table'
#> The following objects are masked from 'package:lubridate':
#>
#> hour, isoweek, mday, minute, month, quarter, second, wday, week,
#> yday, year
#> The following objects are masked from 'package:dplyr':
#>
#> between, first, last
#> The following object is masked from 'package:purrr':
#>
#> transpose
penguins_comments_dt <- data.table(penguins_comments)
(varmean <- setdiff(names(penguins_comments_dt), c("species", "sex", "comments")))
#> [1] "bill_length_mm" "bill_depth_mm" "flipper_length_mm"
dt_1_pt1 <- penguins_comments_dt[
, .(comment_tog=str_flatten_comma(comments, na.rm=TRUE)),
by=.(species, sex)
]
dt_1_pt2 <- penguins_comments_dt[,
lapply(.SD, mean, na.rm=TRUE),## compute the mean
by = .(species, sex), ## for every 'origin,dest,month'
.SDcols = varmean] ## for just those specified in .SDcols
dt_1 <- dt_1_pt1 |>
merge(dt_1_pt2, by=c("species", "sex"))
dt_1
#> Key: <species, sex>
#> species sex
#> <fctr> <fctr>
#> 1: Adelie <NA>
#> 2: Adelie female
#> 3: Adelie male
#> 4: Chinstrap female
#> 5: Chinstrap male
#> 6: Gentoo <NA>
#> 7: Gentoo female
#> 8: Gentoo male
#> comment_tog
#> <char>
#> 1: Adult not sampled., No blood sample obtained., No blood sample obtained for sexing., No blood sample obtained for sexing., No blood sample obtained., Sexing primers did not amplify. Not enough blood for isotopes.
#> 2: Nest never observed with full clutch., Not enough blood for isotopes., Not enough blood for isotopes., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch.
#> 3: Not enough blood for isotopes., Nest never observed with full clutch., Not enough blood for isotopes., Nest never observed with full clutch., Nest never observed with full clutch. Not enough blood for isotopes., Not enough blood for isotopes., Not enough blood for isotopes., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch.
#> 4: Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch.
#> 5: Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., No delta15N data received from lab., Nest never observed with full clutch.
#> 6: Sexing primers did not amplify., Sexing primers did not amplify., Sexing primers did not amplify., Sexing primers did not amplify., Adult not sampled. Nest never observed with full clutch.
#> 7: Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch., Nest never observed with full clutch.
#> 8: Nest never observed with full clutch., Not enough blood for isotopes., Nest never observed with full clutch., Nest never observed with full clutch.
#> bill_length_mm bill_depth_mm flipper_length_mm
#> <num> <num> <num>
#> 1: 37.84000 18.32000 185.6000
#> 2: 37.25753 17.62192 187.7945
#> 3: 40.39041 19.07260 192.4110
#> 4: 46.57353 17.58824 191.7353
#> 5: 51.09412 19.25294 199.9118
#> 6: 45.62500 14.55000 215.7500
#> 7: 45.56379 14.23793 212.7069
#> 8: 49.47377 15.71803 221.5410
# Comparing methods - essentially the same
waldo::compare(
tidy_summary %>% arrange(species, sex),
as_tibble(dt_1) %>% arrange(species, sex),
x_arg="tidy",
y_arg="dt",
ignore_attr=TRUE
)
#> tidy[[4]] | dt[[4]]
#> [1] 37.257534246575339 - 37.257534246575332 [1]
#> [2] 40.390410958904113 - 40.390410958904070 [2]
#> [3] 37.839999999999996 | 37.839999999999996 [3]
#> [4] 46.573529411764703 | 46.573529411764703 [4]
#> [5] 51.094117647058823 | 51.094117647058823 [5]
#> [6] 45.563793103448276 | 45.563793103448276 [6]
#> [7] 49.473770491803279 - 49.473770491803286 [7]
#> [8] 45.625000000000000 | 45.625000000000000 [8]
#>
#> tidy[[5]] | dt[[5]]
#> [1] 17.621917808219177 | 17.621917808219177 [1]
#> [2] 19.072602739726026 - 19.072602739726022 [2]
#> [3] 18.320000000000000 | 18.320000000000000 [3]
#> [4] 17.588235294117649 - 17.588235294117652 [4]
#> [5] 19.252941176470589 - 19.252941176470586 [5]
#> [6] 14.237931034482759 - 14.237931034482758 [6]
#> [7] 15.718032786885246 - 15.718032786885248 [7]
#> [8] 14.550000000000001 | 14.550000000000001 [8]
### Try to do data.table summarization in one step
dt_2 <- penguins_comments_dt[,
list(comment_tog=str_flatten_comma(comments, na.rm=TRUE), # concatenate
lapply(.SD, mean, na.rm=TRUE)),## compute the mean
by = .(species, sex), ## for every 'origin,dest,month'
.SD = varmean] ## for just those specified in .SDcols
names(dt_2)
#> [1] "species" "sex" "comment_tog" "V2"
dim(dt_2)
#> [1] 24 4
dim(dt_1)
#> [1] 8 6
Created on 2024-12-14 with reprex v2.1.0
Session info
sessioninfo::session_info()
#> β Session info βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
#> setting value
#> version R version 4.4.0 (2024-04-24 ucrt)
#> os Windows 11 x64 (build 22631)
#> system x86_64, mingw32
#> ui RTerm
#> language (EN)
#> collate English_United States.utf8
#> ctype English_United States.utf8
#> tz America/New_York
#> date 2024-12-14
#> pandoc 3.5 @ C:/Users/steph/AppData/Local/Pandoc/ (via rmarkdown)
#>
#> β Packages βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
#> package * version date (UTC) lib source
#> cli 3.6.3 2024-06-21 [1] RSPM (R 4.4.0)
#> colorspace 2.1-0 2023-01-23 [1] CRAN (R 4.4.0)
#> crayon 1.5.2 2022-09-29 [1] CRAN (R 4.4.0)
#> data.table * 1.15.4 2024-03-30 [1] CRAN (R 4.4.0)
#> diffobj 0.3.5 2021-10-05 [1] CRAN (R 4.4.0)
#> digest 0.6.35 2024-03-11 [1] CRAN (R 4.4.0)
#> dplyr * 1.1.4 2023-11-17 [1] CRAN (R 4.4.0)
#> evaluate 0.23 2023-11-01 [1] CRAN (R 4.4.0)
#> fansi 1.0.6 2023-12-08 [1] CRAN (R 4.4.0)
#> fastmap 1.1.1 2023-02-24 [1] CRAN (R 4.4.0)
#> forcats * 1.0.0 2023-01-29 [1] CRAN (R 4.4.0)
#> fs 1.6.4 2024-04-25 [1] CRAN (R 4.4.0)
#> generics 0.1.3 2022-07-05 [1] CRAN (R 4.4.0)
#> ggplot2 * 3.5.1 2024-04-23 [1] CRAN (R 4.4.0)
#> glue 1.8.0 2024-09-30 [1] RSPM (R 4.4.0)
#> gtable 0.3.5 2024-04-22 [1] CRAN (R 4.4.0)
#> hms 1.1.3 2023-03-21 [1] CRAN (R 4.4.0)
#> htmltools 0.5.8.1 2024-04-04 [1] CRAN (R 4.4.0)
#> janitor 2.2.0 2023-02-02 [1] RSPM (R 4.4.0)
#> knitr 1.46 2024-04-06 [1] CRAN (R 4.4.0)
#> lifecycle 1.0.4 2023-11-07 [1] CRAN (R 4.4.0)
#> lubridate * 1.9.3 2023-09-27 [1] CRAN (R 4.4.0)
#> magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.4.0)
#> munsell 0.5.1 2024-04-01 [1] CRAN (R 4.4.0)
#> palmerpenguins * 0.1.1 2022-08-15 [1] RSPM (R 4.4.0)
#> pillar 1.9.0 2023-03-22 [1] CRAN (R 4.4.0)
#> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.4.0)
#> purrr * 1.0.2 2023-08-10 [1] CRAN (R 4.4.0)
#> R.cache 0.16.0 2022-07-21 [1] CRAN (R 4.4.0)
#> R.methodsS3 1.8.2 2022-06-13 [1] CRAN (R 4.4.0)
#> R.oo 1.26.0 2024-01-24 [1] CRAN (R 4.4.0)
#> R.utils 2.12.3 2023-11-18 [1] CRAN (R 4.4.0)
#> R6 2.5.1 2021-08-19 [1] CRAN (R 4.4.0)
#> readr * 2.1.5 2024-01-10 [1] CRAN (R 4.4.0)
#> rematch2 2.1.2 2020-05-01 [1] CRAN (R 4.4.0)
#> reprex 2.1.0 2024-01-11 [1] CRAN (R 4.4.0)
#> rlang 1.1.3 2024-01-10 [1] CRAN (R 4.4.0)
#> rmarkdown 2.28 2024-08-17 [1] RSPM (R 4.4.0)
#> rstudioapi 0.16.0 2024-03-24 [1] CRAN (R 4.4.0)
#> scales 1.3.0 2023-11-28 [1] CRAN (R 4.4.0)
#> sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.4.0)
#> snakecase 0.11.1 2023-08-27 [1] RSPM (R 4.4.0)
#> stringi 1.8.3 2023-12-11 [1] CRAN (R 4.4.0)
#> stringr * 1.5.1 2023-11-14 [1] CRAN (R 4.4.0)
#> styler 1.10.3 2024-04-07 [1] CRAN (R 4.4.0)
#> tibble * 3.2.1 2023-03-20 [1] CRAN (R 4.4.0)
#> tidyr * 1.3.1 2024-01-24 [1] CRAN (R 4.4.0)
#> tidyselect 1.2.1 2024-03-11 [1] CRAN (R 4.4.0)
#> tidyverse * 2.0.0 2023-02-22 [1] CRAN (R 4.4.0)
#> timechange 0.3.0 2024-01-18 [1] CRAN (R 4.4.0)
#> tzdb 0.4.0 2023-05-12 [1] CRAN (R 4.4.0)
#> utf8 1.2.4 2023-10-22 [1] CRAN (R 4.4.0)
#> vctrs 0.6.5 2023-12-01 [1] CRAN (R 4.4.0)
#> waldo 0.5.2 2023-11-02 [1] CRAN (R 4.4.0)
#> withr 3.0.0 2024-01-16 [1] CRAN (R 4.4.0)
#> xfun 0.43 2024-03-25 [1] CRAN (R 4.4.0)
#> yaml 2.3.8 2023-12-11 [1] CRAN (R 4.4.0)
#>
#> [1] C:/Users/steph/AppData/Local/R/win-library/4.4
#> [2] C:/Program Files/R/R-4.4.0/library
#>
#> ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ