Convincing `tidyr::pivot_wider()` that my spec is unique so that it doesn't create list columns

I have a long dataset in kind of a weird format, where I need to kind of coalesce several columns (of different types) and pivot wider. Here's what the data looks like:

library(tibble)
library(tidyr)
library(dplyr)

long <- tibble(
    row_id = c(1, 1, 1, 1, 2, 2, 2, 2),
    type = c("a", "a", "b", "b", "a", "a", "b", "b"),
    time = c(1, 2, 1, 2, 1, 2, 1, 2),
    val_a = c(1000, 2000, NA, NA, 3000, 4000, NA, NA),
    val_b = c(NA, NA, "x", "y", NA, NA, "z", "q")
)

long
#> # A tibble: 8 × 5
#>   row_id type   time val_a val_b
#>    <dbl> <chr> <dbl> <dbl> <chr>
#> 1      1 a         1  1000 NA   
#> 2      1 a         2  2000 NA   
#> 3      1 b         1    NA x    
#> 4      1 b         2    NA y    
#> 5      2 a         1  3000 NA   
#> 6      2 a         2  4000 NA   
#> 7      2 b         1    NA z    
#> 8      2 b         2    NA q   

I want to have columns wide by type and time, but there's duplicate encoding of type in rows and the columns of the existing dataset. Here's what I tried at first:

spec <- build_wider_spec(long, names_from = c(type, time), values_from = c(val_a, val_b))

spec_unique <- spec |>
    filter(.value == paste0("val_", type)) |>
    dplyr::mutate(.name = paste0(.value, "_", time))

spec_unique
#> # A tibble: 4 × 4
#>   .name   .value type   time
#>   <chr>   <chr>  <chr> <dbl>
#> 1 val_a_1 val_a  a         1
#> 2 val_a_2 val_a  a         2
#> 3 val_b_1 val_b  b         1
#> 4 val_b_2 val_b  b         2


pivot_wider_spec(long, spec_unique, id_cols = row_id)
#> Warning: Values from `val_a` and `val_b` are not uniquely identified; output will
#> contain list-cols.
#> • Use `values_fn = list` to suppress this warning.
#> • Use `values_fn = {summary_fun}` to summarise duplicates.
#> • Use the following dplyr code to identify duplicates.
#>   {data} |>
#>   dplyr::summarise(n = dplyr::n(), .by = c(row_id, type, time)) |>
#>   dplyr::filter(n > 1L)
#> # A tibble: 2 × 5
#>   row_id val_a_1   val_a_2   val_b_1   val_b_2  
#>    <dbl> <list>    <list>    <list>    <list>   
#> 1      1 <dbl [1]> <dbl [1]> <chr [1]> <chr [1]>
#> 2      2 <dbl [1]> <dbl [1]> <chr [1]> <chr [1]>

Despite warning, all lists are length 1, and to my eyes the spec guarantees this. Is there some better way than dropping rows in the spec to indicate I don't care about the values from this combination of id variables?

Here are the 2 work arounds I've found. One is too slow and the other seems like ugly code to me.

Is there a better way?

# This is the cleanest code, but it's too slow
pivot_wider_spec(long, spec_unique, id_cols = row_id, values_fn = dplyr::first)
#> # A tibble: 2 × 5
#>   row_id val_a_1 val_a_2 val_b_1 val_b_2
#>    <dbl>   <dbl>   <dbl> <chr>   <chr>  
#> 1      1    1000    2000 x       y      
#> 2      2    3000    4000 z       q


# Uglier, but maybe the best way to do this?
pivot_a <- long |> filter(type == "a") |> select(-type, -val_b) |>
    pivot_wider(id_cols = row_id, names_from = time, values_from = val_a, names_prefix = "val_a_")
pivot_b <- long |> filter(type == "b") |> select(-type, -val_a) |>
    pivot_wider(id_cols = row_id, names_from = time, values_from = val_b, names_prefix = "val_b_")
left_join(pivot_a, pivot_b, by = "row_id")
#> # A tibble: 2 × 5
#>   row_id val_a_1 val_a_2 val_b_1 val_b_2
#>    <dbl>   <dbl>   <dbl> <chr>   <chr>  
#> 1      1    1000    2000 x       y      
#> 2      2    3000    4000 z       q

Here's some benchmarking showing that it's much faster to do the joined approach.

# Benchmarking
make_long <- function(n_ids) {
    tibble(
        row_id = rep(1:n_ids, each = 4),
        type   = rep(c("a", "a", "b", "b"), n_ids),
        time   = rep(c(1, 2, 1, 2), n_ids),
        val_a  = rep(c(1000, 2000, NA, NA), n_ids),
        val_b  = rep(c(NA, NA, "x", "y"), n_ids)
    )
}

make_spec <- function(long) {
    spec <- build_wider_spec(long, names_from = c(type, time), values_from = c(val_a, val_b))
    spec |>
        filter(.value == paste0("val_", type)) |>
        mutate(.name = paste0(.value, "_", time))
}

benchmark_pivot <- function(n_ids) {
    long <- make_long(n_ids)
    spec <- make_spec(long)
    
    bench::mark(
        list_col = pivot_wider_spec(long, spec, id_cols = row_id, values_fn = dplyr::first),
        join = {
            pivot_a <- long |> filter(type == "a") |> select(-type, -val_b) |>
                pivot_wider(id_cols = row_id, names_from = time, values_from = val_a, names_prefix = "val_a_")
            pivot_b <- long |> filter(type == "b") |> select(-type, -val_a) |>
                pivot_wider(id_cols = row_id, names_from = time, values_from = val_b, names_prefix = "val_b_")
            left_join(pivot_a, pivot_b, by = "row_id")
        },
        relative = TRUE,
        filter_gc = FALSE
    )
}

benchmark_pivot(1e4)
#> # A tibble: 2 × 6
#>   expression   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 list_col    77.1   72.3       1        1.18     1.90
#> 2 join         1      1        62.0      1        1
benchmark_pivot(1e5)
#> # A tibble: 2 × 6
#>   expression   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 list_col    127.   110.       1        1.20     1   
#> 2 join          1      1       81.4      1        1.82

Have you tried coercing a and b to same type, using pivot_wider, and then changing the variables back if necessary?

long |> 
  mutate(values = if_else(is.na(val_a), val_b, as.character(val_a))) |>
  pivot_wider(id_cols = row_id, names_from = c(type, time), values_from = c(values))

Hm, I don't love doing type conversion like that, it seems very easy to lose precision or introduce a bug, and it's not as fast as the join.

benchmark_pivot <- function(n_ids) {
  long <- make_long(n_ids)
  spec <- make_spec(long)
  
  bench::mark(
    list_col = pivot_wider_spec(long, spec, id_cols = row_id, values_fn = dplyr::first),
    join = {
      pivot_a <- long |> filter(type == "a") |> select(-type, -val_b) |>
        pivot_wider(id_cols = row_id, names_from = time, values_from = val_a, names_prefix = "val_a_")
      pivot_b <- long |> filter(type == "b") |> select(-type, -val_a) |>
        pivot_wider(id_cols = row_id, names_from = time, values_from = val_b, names_prefix = "val_b_")
      left_join(pivot_a, pivot_b, by = "row_id")
    },
    convert_types = {
      long |> 
        mutate(values = if_else(is.na(val_a), val_b, as.character(val_a))) |>
        pivot_wider(id_cols = row_id, names_from = c(type, time), values_from = c(values), names_prefix = "val_") |>
        mutate(across(starts_with("val_a_"), as.numeric))
    },
    relative = TRUE,
    filter_gc = FALSE
  )
}

benchmark_pivot(1e4)
#> # A tibble: 3 × 13
#>   expression      min median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result                memory     time           gc      
#>   <bch:expr>    <dbl>  <dbl>     <dbl>     <dbl>    <dbl> <int> <dbl>   <bch:tm> <list>                <list>     <list>         <list>  
#> 1 list_col      78.8   79.1        1        1.84     1.09     2    11      1.01s <tibble [10,000 × 5]> <Rprofmem> <bench_tm [2]> <tibble>
#> 2 join           1      1         68.7      1.56     2.14    70    11   514.99ms <tibble [10,000 × 5]> <Rprofmem> <bench_tm>     <tibble>
#> 3 convert_types  1.37   1.34      55.6      1        1       55     5   500.09ms <tibble [10,000 × 5]> <Rprofmem> <bench_tm>     <tibble>

benchmark_pivot(1e5)
#> # A tibble: 3 × 13
#>   expression       min median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result                 memory     time       gc      
#>   <bch:expr>     <dbl>  <dbl>     <dbl>     <dbl>    <dbl> <int> <dbl>   <bch:tm> <list>                 <list>     <list>     <list>  
#> 1 list_col      129.   108.         1        1.92     1        1    28      4.12s <tibble [100,000 × 5]> <Rprofmem> <bench_tm> <tibble>
#> 2 join            1      1         73.6      1.60     3.80     9    13   503.76ms <tibble [100,000 × 5]> <Rprofmem> <bench_tm> <tibble>
#> 3 convert_types   2.08   1.87      58.4      1        1.57     8     6   563.71ms <tibble [100,000 × 5]> <Rprofmem> <bench_tm> <tibble>