How do I collapse rows in a tibble?

Hi all,

I have a large tibble and would like to collapse rows


library(tidyverse)

# Tibble I have:
tib_1 <- tibble(
  tissue = c('Duodenum', 'Duodenum', 'Duodenum', 'Duodenum', 'Ileum',  'Ileum',  'Ileum',  'Ileum', 'Jejunum', 'Jejunum', 'Jejunum', 'Jejunum'),
  rfi = c('high', 'high', 'low', 'low', 'high', 'high', 'low', 'low', 'high', 'high', 'low', 'low'),
  trial = c(1,2,1,2,1,2,1,2,1,2,1,2),
  sample_ids = c("1,2,3", "4,5,6", "7,8", "9,10", "11,12,13", "14,15", "16,17,18,19", "20,21,22", "23,24,25,26", "27,28,29,30", "31,32,33", "34,35,36,37")
) |>
  mutate_at(c('tissue', 'rfi', 'trial'), as.factor)
tib_1

# Tibble I want:
tib_2 <- tibble(
  tissue = c('Duodenum', 'Ileum', 'Jejunum'),
  sample_ids = c("1,2,3,4,5,6,7,8,9,10", "11,12,13,14,15,16,17,18,19,20,21,22", "23,24,25,26,27,28,29,30,31,32,33,34,35,36,37")
) |>
  mutate_at(c('tissue'), as.factor)
tib_2

# My best attempt is a workaround (still not working) by looping through the data of another tibble not shown here. 

# create empty tibble
tissue_sample_lists <- tibble(
  tissue = character(),
  sample_ids = list()
)

# loop, and populate tibble
tissues <- c('Duodenum', 'Ileum', 'Jejunum')
for (x in tissues) {
  temp <- fastq_annotations_joined_cleaned |> filter(tissue == x) |> select(sample_id)
  id_list <- (sort(unique(temp$sample_id)))
  my_tib <- tibble(tissue = x, sample_ids = list(id_list))
  rbind(tissue_sample_lists, my_tib)
}
tissue_sample_lists

I have tried group_by, filter, select, and extracting the sample_ids column and creating a new tibble with a loop, but it all seems very complex that I'm sure there must be a simple way to do this.

Thanks all in advance,
Kenneth

Hi @bioinfguru ,

tib_1 |> group_by(tissue) 
|> summarise(sample_ids = paste0(sort(as.numeric(unlist(strsplit(x=paste0(sample_ids, collapse = ","), ",")))), collapse = ","))

I assume the real dataset is far more complex then your example so here the steps:

  1. paste(sample_ids, collapse = ",") will use all found IDs for per group and make a single string out of them
  2. strsplit will split by "," and give back a list of entries
  3. unlist - will make a regular vector from the list
  4. as.numeric - since sorting string values will lead to the ordering of 1,10,2,etc a numeric one is better
  5. sort - sort the numeric vector
  6. same as step 1 to get back to a single entry.

Hope it helps.

1 Like

Here is a baby version of the same sort of strategy @vedoa used.

library(tidyverse)
tib_1 <- tibble(
  tissue = c('Duodenum', 'Duodenum', 'Duodenum', 'Duodenum', 'Ileum',  'Ileum',  'Ileum',  'Ileum', 'Jejunum', 'Jejunum', 'Jejunum', 'Jejunum'),
  rfi = c('high', 'high', 'low', 'low', 'high', 'high', 'low', 'low', 'high', 'high', 'low', 'low'),
  trial = c(1,2,1,2,1,2,1,2,1,2,1,2),
  sample_ids = c("1,2,3", "4,5,6", "7,8", "9,10", "11,12,13", "14,15", "16,17,18,19", "20,21,22", "23,24,25,26", "27,28,29,30", "31,32,33", "34,35,36,37")
) |>
  mutate_at(c('tissue', 'rfi', 'trial'), as.factor)
tib_1
#> # A tibble: 12 × 4
#>    tissue   rfi   trial sample_ids 
#>    <fct>    <fct> <fct> <chr>      
#>  1 Duodenum high  1     1,2,3      
#>  2 Duodenum high  2     4,5,6      
#>  3 Duodenum low   1     7,8        
#>  4 Duodenum low   2     9,10       
#>  5 Ileum    high  1     11,12,13   
#>  6 Ileum    high  2     14,15      
#>  7 Ileum    low   1     16,17,18,19
#>  8 Ileum    low   2     20,21,22   
#>  9 Jejunum  high  1     23,24,25,26
#> 10 Jejunum  high  2     27,28,29,30
#> 11 Jejunum  low   1     31,32,33   
#> 12 Jejunum  low   2     34,35,36,37

# Tibble I want:
tib_2 <- tibble(
  tissue = c('Duodenum', 'Ileum', 'Jejunum'),
  sample_ids = c("1,2,3,4,5,6,7,8,9,10", "11,12,13,14,15,16,17,18,19,20,21,22", "23,24,25,26,27,28,29,30,31,32,33,34,35,36,37")
) |>
  mutate_at(c('tissue'), as.factor)
tib_2
#> # A tibble: 3 × 2
#>   tissue   sample_ids                                  
#>   <fct>    <chr>                                       
#> 1 Duodenum 1,2,3,4,5,6,7,8,9,10                        
#> 2 Ileum    11,12,13,14,15,16,17,18,19,20,21,22         
#> 3 Jejunum  23,24,25,26,27,28,29,30,31,32,33,34,35,36,37

tib_1 |> group_by(tissue) |> summarize(S = paste(sample_ids, collapse = ","))
#> # A tibble: 3 × 2
#>   tissue   S                                           
#>   <fct>    <chr>                                       
#> 1 Duodenum 1,2,3,4,5,6,7,8,9,10                        
#> 2 Ileum    11,12,13,14,15,16,17,18,19,20,21,22         
#> 3 Jejunum  23,24,25,26,27,28,29,30,31,32,33,34,35,36,37

Created on 2024-03-15 with reprex v2.0.2

1 Like

Here's another approach.

tib_3 <- tib_1 |> 
          pivot_wider(id_cols = tissue, names_from = c(rfi, trial), values_from = sample_ids) |>
            unite(sample_ids, -tissue, sep = ",")

Since it seems to me that the orginal table, tib_1, should likely be converted to a more fundamental table for later manipulation, my first suggestion is to create tib_0 from tib_1, which I've first recreated using more up-to-date tidyverse functions and deliberately using table order to define the levels of the factor columns:

library(tidyverse)

# original starting tibble
tib_1 <- 
  expand_grid(
    tissue = c('Duodenum', 'Ileum', 'Jejunum'),
    rfi = c('high', 'low'),
    trial = 1:2
  ) |> 
  mutate(
    sample_ids = 
      c("1,2,3", "4,5,6", "7,8", "9,10", "11,12,13", "14,15", "16,17,18,19", 
        "20,21,22", "23,24,25,26", "27,28,29,30", "31,32,33", "34,35,36,37")
  ) |>
  # use across() to apply anonymous function to each of a set of columns
  # anonymous function first converts to character since fct_inorder requires it
  # fct_inoder uses table order to establish order of factor levels
  mutate(
    across(
      c(tissue, rfi, trial), 
      \(column) column |> as.character() |> fct_inorder()
      )
    ) 
tib_1
#> # A tibble: 12 × 4
#>    tissue   rfi   trial sample_ids 
#>    <fct>    <fct> <fct> <chr>      
#>  1 Duodenum high  1     1,2,3      
#>  2 Duodenum high  2     4,5,6      
#>  3 Duodenum low   1     7,8        
#>  4 Duodenum low   2     9,10       
#>  5 Ileum    high  1     11,12,13   
#>  6 Ileum    high  2     14,15      
#>  7 Ileum    low   1     16,17,18,19
#>  8 Ileum    low   2     20,21,22   
#>  9 Jejunum  high  1     23,24,25,26
#> 10 Jejunum  high  2     27,28,29,30
#> 11 Jejunum  low   1     31,32,33   
#> 12 Jejunum  low   2     34,35,36,37
# more fundamental version of tib_1 data for later manipulation
tib_0 <- 
  tib_1 |> 
  separate_longer_delim(sample_ids, ',') |> 
  # convert sample IDs to numbers so can later use numerical order
  mutate(sample_ids = parse_number(sample_ids))
tib_0
#> # A tibble: 37 × 4
#>    tissue   rfi   trial sample_ids
#>    <fct>    <fct> <fct>      <dbl>
#>  1 Duodenum high  1              1
#>  2 Duodenum high  1              2
#>  3 Duodenum high  1              3
#>  4 Duodenum high  2              4
#>  5 Duodenum high  2              5
#>  6 Duodenum high  2              6
#>  7 Duodenum low   1              7
#>  8 Duodenum low   1              8
#>  9 Duodenum low   2              9
#> 10 Duodenum low   2             10
#> # ℹ 27 more rows

With tib_0 in hand, we can now proceed similarly to @vedoa and @FJCC

# desired table created without depending on order in which tib_1 strings occur
tib_0 |> 
  group_by(tissue) |> 
  arrange(sample_ids) |> 
  summarise(sample_ids  = sample_ids |> str_flatten(collapse = ','))
#> # A tibble: 3 × 2
#>   tissue   sample_ids                                  
#>   <fct>    <chr>                                       
#> 1 Duodenum 1,2,3,4,5,6,7,8,9,10                        
#> 2 Ileum    11,12,13,14,15,16,17,18,19,20,21,22         
#> 3 Jejunum  23,24,25,26,27,28,29,30,31,32,33,34,35,36,37
(Edit)

I originally had the following code for reproducing tib_1, but then remembered expand_grid():

# original starting tibble
tib_1 <- 
  tibble(
    tissue = c('Duodenum', 'Ileum', 'Jejunum') |> rep(each = 4),
    rfi = c('high', 'low') |> rep(each = 2) |> rep(3),
    trial = c(1,2) |>  rep(6),
    sample_ids = 
      c("1,2,3", "4,5,6", "7,8", "9,10", "11,12,13", "14,15", "16,17,18,19", 
        "20,21,22", "23,24,25,26", "27,28,29,30", "31,32,33", "34,35,36,37")
  ) |>
  # use across() to apply anonymous function to each of a set of columns
  # anonymous function first converts to character since fct_inorder requires it
  # fct_inoder uses table order to establish order of factor levels
  mutate(
    across(
      c(tissue, rfi, trial), 
      \(column) column |> as.character() |> fct_inorder()
      )
    ) 
1 Like

Hi all,

Thank you all for your help and explanations. Yes, the real dataset is considerably more complex. The combined solution from you all turned out to be as follows:

tib_1 |>
  separate_longer_delim(sample_ids, delim=",") |>
  group_by(tissue) |>
  summarize(sample_ids=paste(sort(as.numeric(sample_ids)), collapse=","), .groups="drop")

@dromano The purpose of this was to create a short easy reference summary table, so no need to save the intermediate lengthened table or be concerned with factors here... I already have the "fundamental" table all this was extracted from. But thanks for providing the missing link to make it more simple.

Thanks again,
Kenneth

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.