I have a long dataset in kind of a weird format, where I need to kind of coalesce several columns (of different types) and pivot wider. Here's what the data looks like:
library(tibble)
library(tidyr)
library(dplyr)
long <- tibble(
row_id = c(1, 1, 1, 1, 2, 2, 2, 2),
type = c("a", "a", "b", "b", "a", "a", "b", "b"),
time = c(1, 2, 1, 2, 1, 2, 1, 2),
val_a = c(1000, 2000, NA, NA, 3000, 4000, NA, NA),
val_b = c(NA, NA, "x", "y", NA, NA, "z", "q")
)
long
#> # A tibble: 8 × 5
#> row_id type time val_a val_b
#> <dbl> <chr> <dbl> <dbl> <chr>
#> 1 1 a 1 1000 NA
#> 2 1 a 2 2000 NA
#> 3 1 b 1 NA x
#> 4 1 b 2 NA y
#> 5 2 a 1 3000 NA
#> 6 2 a 2 4000 NA
#> 7 2 b 1 NA z
#> 8 2 b 2 NA q
I want to have columns wide by type and time, but there's duplicate encoding of type in rows and the columns of the existing dataset. Here's what I tried at first:
spec <- build_wider_spec(long, names_from = c(type, time), values_from = c(val_a, val_b))
spec_unique <- spec |>
filter(.value == paste0("val_", type)) |>
dplyr::mutate(.name = paste0(.value, "_", time))
spec_unique
#> # A tibble: 4 × 4
#> .name .value type time
#> <chr> <chr> <chr> <dbl>
#> 1 val_a_1 val_a a 1
#> 2 val_a_2 val_a a 2
#> 3 val_b_1 val_b b 1
#> 4 val_b_2 val_b b 2
pivot_wider_spec(long, spec_unique, id_cols = row_id)
#> Warning: Values from `val_a` and `val_b` are not uniquely identified; output will
#> contain list-cols.
#> • Use `values_fn = list` to suppress this warning.
#> • Use `values_fn = {summary_fun}` to summarise duplicates.
#> • Use the following dplyr code to identify duplicates.
#> {data} |>
#> dplyr::summarise(n = dplyr::n(), .by = c(row_id, type, time)) |>
#> dplyr::filter(n > 1L)
#> # A tibble: 2 × 5
#> row_id val_a_1 val_a_2 val_b_1 val_b_2
#> <dbl> <list> <list> <list> <list>
#> 1 1 <dbl [1]> <dbl [1]> <chr [1]> <chr [1]>
#> 2 2 <dbl [1]> <dbl [1]> <chr [1]> <chr [1]>
Despite warning, all lists are length 1, and to my eyes the spec guarantees this. Is there some better way than dropping rows in the spec to indicate I don't care about the values from this combination of id variables?
Here are the 2 work arounds I've found. One is too slow and the other seems like ugly code to me.
Is there a better way?
# This is the cleanest code, but it's too slow
pivot_wider_spec(long, spec_unique, id_cols = row_id, values_fn = dplyr::first)
#> # A tibble: 2 × 5
#> row_id val_a_1 val_a_2 val_b_1 val_b_2
#> <dbl> <dbl> <dbl> <chr> <chr>
#> 1 1 1000 2000 x y
#> 2 2 3000 4000 z q
# Uglier, but maybe the best way to do this?
pivot_a <- long |> filter(type == "a") |> select(-type, -val_b) |>
pivot_wider(id_cols = row_id, names_from = time, values_from = val_a, names_prefix = "val_a_")
pivot_b <- long |> filter(type == "b") |> select(-type, -val_a) |>
pivot_wider(id_cols = row_id, names_from = time, values_from = val_b, names_prefix = "val_b_")
left_join(pivot_a, pivot_b, by = "row_id")
#> # A tibble: 2 × 5
#> row_id val_a_1 val_a_2 val_b_1 val_b_2
#> <dbl> <dbl> <dbl> <chr> <chr>
#> 1 1 1000 2000 x y
#> 2 2 3000 4000 z q
Here's some benchmarking showing that it's much faster to do the joined approach.
# Benchmarking
make_long <- function(n_ids) {
tibble(
row_id = rep(1:n_ids, each = 4),
type = rep(c("a", "a", "b", "b"), n_ids),
time = rep(c(1, 2, 1, 2), n_ids),
val_a = rep(c(1000, 2000, NA, NA), n_ids),
val_b = rep(c(NA, NA, "x", "y"), n_ids)
)
}
make_spec <- function(long) {
spec <- build_wider_spec(long, names_from = c(type, time), values_from = c(val_a, val_b))
spec |>
filter(.value == paste0("val_", type)) |>
mutate(.name = paste0(.value, "_", time))
}
benchmark_pivot <- function(n_ids) {
long <- make_long(n_ids)
spec <- make_spec(long)
bench::mark(
list_col = pivot_wider_spec(long, spec, id_cols = row_id, values_fn = dplyr::first),
join = {
pivot_a <- long |> filter(type == "a") |> select(-type, -val_b) |>
pivot_wider(id_cols = row_id, names_from = time, values_from = val_a, names_prefix = "val_a_")
pivot_b <- long |> filter(type == "b") |> select(-type, -val_a) |>
pivot_wider(id_cols = row_id, names_from = time, values_from = val_b, names_prefix = "val_b_")
left_join(pivot_a, pivot_b, by = "row_id")
},
relative = TRUE,
filter_gc = FALSE
)
}
benchmark_pivot(1e4)
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 list_col 77.1 72.3 1 1.18 1.90
#> 2 join 1 1 62.0 1 1
benchmark_pivot(1e5)
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 list_col 127. 110. 1 1.20 1
#> 2 join 1 1 81.4 1 1.82