How do I collapse rows in a tibble?

bioinfguru · March 15, 2024, 1:21pm

Hi all,

I have a large tibble and would like to collapse rows


library(tidyverse)

# Tibble I have:
tib_1 <- tibble(
  tissue = c('Duodenum', 'Duodenum', 'Duodenum', 'Duodenum', 'Ileum',  'Ileum',  'Ileum',  'Ileum', 'Jejunum', 'Jejunum', 'Jejunum', 'Jejunum'),
  rfi = c('high', 'high', 'low', 'low', 'high', 'high', 'low', 'low', 'high', 'high', 'low', 'low'),
  trial = c(1,2,1,2,1,2,1,2,1,2,1,2),
  sample_ids = c("1,2,3", "4,5,6", "7,8", "9,10", "11,12,13", "14,15", "16,17,18,19", "20,21,22", "23,24,25,26", "27,28,29,30", "31,32,33", "34,35,36,37")
) |>
  mutate_at(c('tissue', 'rfi', 'trial'), as.factor)
tib_1

# Tibble I want:
tib_2 <- tibble(
  tissue = c('Duodenum', 'Ileum', 'Jejunum'),
  sample_ids = c("1,2,3,4,5,6,7,8,9,10", "11,12,13,14,15,16,17,18,19,20,21,22", "23,24,25,26,27,28,29,30,31,32,33,34,35,36,37")
) |>
  mutate_at(c('tissue'), as.factor)
tib_2

# My best attempt is a workaround (still not working) by looping through the data of another tibble not shown here. 

# create empty tibble
tissue_sample_lists <- tibble(
  tissue = character(),
  sample_ids = list()
)

# loop, and populate tibble
tissues <- c('Duodenum', 'Ileum', 'Jejunum')
for (x in tissues) {
  temp <- fastq_annotations_joined_cleaned |> filter(tissue == x) |> select(sample_id)
  id_list <- (sort(unique(temp$sample_id)))
  my_tib <- tibble(tissue = x, sample_ids = list(id_list))
  rbind(tissue_sample_lists, my_tib)
}
tissue_sample_lists

I have tried group_by, filter, select, and extracting the sample_ids column and creating a new tibble with a loop, but it all seems very complex that I'm sure there must be a simple way to do this.

Thanks all in advance,
Kenneth

vedoa · March 15, 2024, 2:09pm

Hi @bioinfguru ,

tib_1 |> group_by(tissue) 
|> summarise(sample_ids = paste0(sort(as.numeric(unlist(strsplit(x=paste0(sample_ids, collapse = ","), ",")))), collapse = ","))

I assume the real dataset is far more complex then your example so here the steps:

paste(sample_ids, collapse = ",") will use all found IDs for per group and make a single string out of them
strsplit will split by "," and give back a list of entries
unlist - will make a regular vector from the list
as.numeric - since sorting string values will lead to the ordering of 1,10,2,etc a numeric one is better
sort - sort the numeric vector
same as step 1 to get back to a single entry.

Hope it helps.

FJCC · March 15, 2024, 2:53pm

Here is a baby version of the same sort of strategy @vedoa used.

library(tidyverse)
tib_1 <- tibble(
  tissue = c('Duodenum', 'Duodenum', 'Duodenum', 'Duodenum', 'Ileum',  'Ileum',  'Ileum',  'Ileum', 'Jejunum', 'Jejunum', 'Jejunum', 'Jejunum'),
  rfi = c('high', 'high', 'low', 'low', 'high', 'high', 'low', 'low', 'high', 'high', 'low', 'low'),
  trial = c(1,2,1,2,1,2,1,2,1,2,1,2),
  sample_ids = c("1,2,3", "4,5,6", "7,8", "9,10", "11,12,13", "14,15", "16,17,18,19", "20,21,22", "23,24,25,26", "27,28,29,30", "31,32,33", "34,35,36,37")
) |>
  mutate_at(c('tissue', 'rfi', 'trial'), as.factor)
tib_1
#> # A tibble: 12 × 4
#>    tissue   rfi   trial sample_ids 
#>    <fct>    <fct> <fct> <chr>      
#>  1 Duodenum high  1     1,2,3      
#>  2 Duodenum high  2     4,5,6      
#>  3 Duodenum low   1     7,8        
#>  4 Duodenum low   2     9,10       
#>  5 Ileum    high  1     11,12,13   
#>  6 Ileum    high  2     14,15      
#>  7 Ileum    low   1     16,17,18,19
#>  8 Ileum    low   2     20,21,22   
#>  9 Jejunum  high  1     23,24,25,26
#> 10 Jejunum  high  2     27,28,29,30
#> 11 Jejunum  low   1     31,32,33   
#> 12 Jejunum  low   2     34,35,36,37

# Tibble I want:
tib_2 <- tibble(
  tissue = c('Duodenum', 'Ileum', 'Jejunum'),
  sample_ids = c("1,2,3,4,5,6,7,8,9,10", "11,12,13,14,15,16,17,18,19,20,21,22", "23,24,25,26,27,28,29,30,31,32,33,34,35,36,37")
) |>
  mutate_at(c('tissue'), as.factor)
tib_2
#> # A tibble: 3 × 2
#>   tissue   sample_ids                                  
#>   <fct>    <chr>                                       
#> 1 Duodenum 1,2,3,4,5,6,7,8,9,10                        
#> 2 Ileum    11,12,13,14,15,16,17,18,19,20,21,22         
#> 3 Jejunum  23,24,25,26,27,28,29,30,31,32,33,34,35,36,37

tib_1 |> group_by(tissue) |> summarize(S = paste(sample_ids, collapse = ","))
#> # A tibble: 3 × 2
#>   tissue   S                                           
#>   <fct>    <chr>                                       
#> 1 Duodenum 1,2,3,4,5,6,7,8,9,10                        
#> 2 Ileum    11,12,13,14,15,16,17,18,19,20,21,22         
#> 3 Jejunum  23,24,25,26,27,28,29,30,31,32,33,34,35,36,37

^{Created on 2024-03-15 with reprex v2.0.2}

prubin · March 15, 2024, 4:57pm

Here's another approach.

tib_3 <- tib_1 |> 
          pivot_wider(id_cols = tissue, names_from = c(rfi, trial), values_from = sample_ids) |>
            unite(sample_ids, -tissue, sep = ",")

dromano · March 16, 2024, 7:25pm

Since it seems to me that the orginal table, tib_1, should likely be converted to a more fundamental table for later manipulation, my first suggestion is to create tib_0 from tib_1, which I've first recreated using more up-to-date tidyverse functions and deliberately using table order to define the levels of the factor columns:

library(tidyverse)

# original starting tibble
tib_1 <- 
  expand_grid(
    tissue = c('Duodenum', 'Ileum', 'Jejunum'),
    rfi = c('high', 'low'),
    trial = 1:2
  ) |> 
  mutate(
    sample_ids = 
      c("1,2,3", "4,5,6", "7,8", "9,10", "11,12,13", "14,15", "16,17,18,19", 
        "20,21,22", "23,24,25,26", "27,28,29,30", "31,32,33", "34,35,36,37")
  ) |>
  # use across() to apply anonymous function to each of a set of columns
  # anonymous function first converts to character since fct_inorder requires it
  # fct_inoder uses table order to establish order of factor levels
  mutate(
    across(
      c(tissue, rfi, trial), 
      \(column) column |> as.character() |> fct_inorder()
      )
    )

tib_1
#> # A tibble: 12 × 4
#>    tissue   rfi   trial sample_ids 
#>    <fct>    <fct> <fct> <chr>      
#>  1 Duodenum high  1     1,2,3      
#>  2 Duodenum high  2     4,5,6      
#>  3 Duodenum low   1     7,8        
#>  4 Duodenum low   2     9,10       
#>  5 Ileum    high  1     11,12,13   
#>  6 Ileum    high  2     14,15      
#>  7 Ileum    low   1     16,17,18,19
#>  8 Ileum    low   2     20,21,22   
#>  9 Jejunum  high  1     23,24,25,26
#> 10 Jejunum  high  2     27,28,29,30
#> 11 Jejunum  low   1     31,32,33   
#> 12 Jejunum  low   2     34,35,36,37

# more fundamental version of tib_1 data for later manipulation
tib_0 <- 
  tib_1 |> 
  separate_longer_delim(sample_ids, ',') |> 
  # convert sample IDs to numbers so can later use numerical order
  mutate(sample_ids = parse_number(sample_ids))
tib_0
#> # A tibble: 37 × 4
#>    tissue   rfi   trial sample_ids
#>    <fct>    <fct> <fct>      <dbl>
#>  1 Duodenum high  1              1
#>  2 Duodenum high  1              2
#>  3 Duodenum high  1              3
#>  4 Duodenum high  2              4
#>  5 Duodenum high  2              5
#>  6 Duodenum high  2              6
#>  7 Duodenum low   1              7
#>  8 Duodenum low   1              8
#>  9 Duodenum low   2              9
#> 10 Duodenum low   2             10
#> # ℹ 27 more rows

With tib_0 in hand, we can now proceed similarly to @vedoa and @FJCC

# desired table created without depending on order in which tib_1 strings occur
tib_0 |> 
  group_by(tissue) |> 
  arrange(sample_ids) |> 
  summarise(sample_ids  = sample_ids |> str_flatten(collapse = ','))
#> # A tibble: 3 × 2
#>   tissue   sample_ids                                  
#>   <fct>    <chr>                                       
#> 1 Duodenum 1,2,3,4,5,6,7,8,9,10                        
#> 2 Ileum    11,12,13,14,15,16,17,18,19,20,21,22         
#> 3 Jejunum  23,24,25,26,27,28,29,30,31,32,33,34,35,36,37

(Edit)

I originally had the following code for reproducing tib_1, but then remembered expand_grid():

# original starting tibble
tib_1 <- 
  tibble(
    tissue = c('Duodenum', 'Ileum', 'Jejunum') |> rep(each = 4),
    rfi = c('high', 'low') |> rep(each = 2) |> rep(3),
    trial = c(1,2) |>  rep(6),
    sample_ids = 
      c("1,2,3", "4,5,6", "7,8", "9,10", "11,12,13", "14,15", "16,17,18,19", 
        "20,21,22", "23,24,25,26", "27,28,29,30", "31,32,33", "34,35,36,37")
  ) |>
  # use across() to apply anonymous function to each of a set of columns
  # anonymous function first converts to character since fct_inorder requires it
  # fct_inoder uses table order to establish order of factor levels
  mutate(
    across(
      c(tissue, rfi, trial), 
      \(column) column |> as.character() |> fct_inorder()
      )
    )

bioinfguru · March 17, 2024, 12:46am

Hi all,

Thank you all for your help and explanations. Yes, the real dataset is considerably more complex. The combined solution from you all turned out to be as follows:

tib_1 |>
  separate_longer_delim(sample_ids, delim=",") |>
  group_by(tissue) |>
  summarize(sample_ids=paste(sort(as.numeric(sample_ids)), collapse=","), .groups="drop")

@dromano The purpose of this was to create a short easy reference summary table, so no need to save the intermediate lengthened table or be concerned with factors here... I already have the "fundamental" table all this was extracted from. But thanks for providing the missing link to make it more simple.

Thanks again,
Kenneth

system · March 24, 2024, 12:47am

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.