Data Wrangling with the Tidyverse

BB1464 · July 26, 2023, 4:58pm

Good Evening Dear All, I have a genotyping dataset in which I want to find all the chromosomes number that can be found across the four column and I will specify that that can be found across the four column, but if that can be found in only two columns I will specify that that can be found in only two columns ...., I know that this can be achieved with across along side with case_when but I have tried it but my code is still giving me error. Here is the sampled dataset below.

library(tidyverse)

dataset

dat_geno <-
  structure(
    list(
      SN...1 = c(
        374,
        100,
        155,
        362,
        609,
        534,
        502,
        371,
        94,
        504,
        355,
        640,
        264,
        310,
        637,
        20,
        594,
        531,
        432,
        564
      ),
      `chrom_pos F1` = c(
        "13_1056059",
        "13_11817560",
        "13_11873028",
        "13_1195790",
        "13_12005749",
        "13_12019437",
        "13_1211923",
        "13_1212374",
        "13_12209997",
        "13_1244605",
        "13_1250227",
        "13_1250260",
        "13_1254842",
        "13_1254880",
        "13_1273697",
        "13_1282054",
        "13_1282089",
        "13_1309511",
        "13_1318162",
        "13_1321216"
      ),
      `chrom_pos F2` = c(
        "13_1056059",
        "13_11823950",
        "13_1195790",
        "13_12005749",
        "13_1211923",
        "13_1212374",
        "13_12172917",
        "13_12224072",
        "13_1244605",
        "13_1250227",
        "13_1250260",
        "13_1254880",
        "13_1282089",
        "13_1283220",
        "13_1309511",
        "13_1318162",
        "13_1321216",
        "13_134542",
        "13_134612",
        "13_1354008"
      ),
      `Chrom_pos F3` = c(
        "13_1056059",
        "13_1132788",
        "13_1195790",
        "13_12005749",
        "13_12019437",
        "13_1211923",
        "13_1212374",
        "13_1231476",
        "13_1244605",
        "13_1244635",
        "13_1250227",
        "13_1250260",
        "13_1250265",
        "13_1254880",
        "13_1282089",
        "13_1283220",
        "13_134612",
        "13_143187",
        "13_1455602",
        "13_1455638"
      ),
      `chrom_pos F4` = c(
        "13_1056059",
        "13_11817560",
        "13_11873028",
        "13_1195790",
        "13_12005749",
        "13_12019437",
        "13_1211923", "13_1212374", "13_12209997", "13_1244605", "13_1250227",
                                                                                                                                                            "13_1250260", "13_1254842", "13_1254880", "13_1273697", "13_1282054",
                                                                                                                                                            "13_1282089", "13_1309511", "13_1318162", "13_1321216")), row.names = c(NA,
                                                                                                                                                                                                                                    -20L), class = c("tbl_df", "tbl", "data.frame"))

technocrat · July 27, 2023, 12:22am

Just testing four or two columns match leaves unaddressed if three columns match.

dat_geno <-
  data.frame(
      SN...1 = c(
        374,
        100,
        155,
        362,
        609,
        534,
        502,
        371,
        94,
        504,
        355,
        640,
        264,
        310,
        637,
        20,
        594,
        531,
        432,
        564
      ),
      `chrom_pos F1` = c(
        "13_1056059",
        "13_11817560",
        "13_11873028",
        "13_1195790",
        "13_12005749",
        "13_12019437",
        "13_1211923",
        "13_1212374",
        "13_12209997",
        "13_1244605",
        "13_1250227",
        "13_1250260",
        "13_1254842",
        "13_1254880",
        "13_1273697",
        "13_1282054",
        "13_1282089",
        "13_1309511",
        "13_1318162",
        "13_1321216"
      ),
      `chrom_pos F2` = c(
        "13_1056059",
        "13_11823950",
        "13_1195790",
        "13_12005749",
        "13_1211923",
        "13_1212374",
        "13_12172917",
        "13_12224072",
        "13_1244605",
        "13_1250227",
        "13_1250260",
        "13_1254880",
        "13_1282089",
        "13_1283220",
        "13_1309511",
        "13_1318162",
        "13_1321216",
        "13_134542",
        "13_134612",
        "13_1354008"
      ),
      `Chrom_pos F3` = c(
        "13_1056059",
        "13_1132788",
        "13_1195790",
        "13_12005749",
        "13_12019437",
        "13_1211923",
        "13_1212374",
        "13_1231476",
        "13_1244605",
        "13_1244635",
        "13_1250227",
        "13_1250260",
        "13_1250265",
        "13_1254880",
        "13_1282089",
        "13_1283220",
        "13_134612",
        "13_143187",
        "13_1455602",
        "13_1455638"
      ),
      `chrom_pos F4` = c(
        "13_1056059",
        "13_11817560",
        "13_11873028",
        "13_1195790",
        "13_12005749",
        "13_12019437",
        "13_1211923", "13_1212374", "13_12209997", "13_1244605", "13_1250227",
        "13_1250260", "13_1254842", "13_1254880", "13_1273697", "13_1282054",
        "13_1282089", "13_1309511", "13_1318162", "13_1321216"))

# Function to count identical strings in a row
count_identical <- function(row) {
  sum(duplicated(row) | duplicated(row, fromLast = TRUE))
}

# Apply the function to each row and create a new column
dat_geno$count_identical <- apply(dat_geno, 1, count_identical)

dat_geno
#>    SN...1 chrom_pos.F1 chrom_pos.F2 Chrom_pos.F3 chrom_pos.F4 count_identical
#> 1     374   13_1056059   13_1056059   13_1056059   13_1056059               4
#> 2     100  13_11817560  13_11823950   13_1132788  13_11817560               2
#> 3     155  13_11873028   13_1195790   13_1195790  13_11873028               4
#> 4     362   13_1195790  13_12005749  13_12005749   13_1195790               4
#> 5     609  13_12005749   13_1211923  13_12019437  13_12005749               2
#> 6     534  13_12019437   13_1212374   13_1211923  13_12019437               2
#> 7     502   13_1211923  13_12172917   13_1212374   13_1211923               2
#> 8     371   13_1212374  13_12224072   13_1231476   13_1212374               2
#> 9      94  13_12209997   13_1244605   13_1244605  13_12209997               4
#> 10    504   13_1244605   13_1250227   13_1244635   13_1244605               2
#> 11    355   13_1250227   13_1250260   13_1250227   13_1250227               3
#> 12    640   13_1250260   13_1254880   13_1250260   13_1250260               3
#> 13    264   13_1254842   13_1282089   13_1250265   13_1254842               2
#> 14    310   13_1254880   13_1283220   13_1254880   13_1254880               3
#> 15    637   13_1273697   13_1309511   13_1282089   13_1273697               2
#> 16     20   13_1282054   13_1318162   13_1283220   13_1282054               2
#> 17    594   13_1282089   13_1321216    13_134612   13_1282089               2
#> 18    531   13_1309511    13_134542    13_143187   13_1309511               2
#> 19    432   13_1318162    13_134612   13_1455602   13_1318162               2
#> 20    564   13_1321216   13_1354008   13_1455638   13_1321216               2

^{Created on 2023-07-26 with reprex v2.0.2}

BB1464 · July 27, 2023, 3:25am

technocrat:

# Function to count identical strings in a row
count_identical <- function(row) {
  sum(duplicated(row) | duplicated(row, fromLast = TRUE))
}

# Apply the function to each row and create a new column
dat_geno$count_identical <- apply(dat_geno, 1, count_identical)

dat_geno

Thank you very much @technocrat but in the end I want to be able to specify where the duplicated row occur e.g if this can be found across the four column I will say that this can be found across all. I have also attach a screen shot of the expected output.

MarekGierlinski · July 27, 2023, 9:18am

This should do the job:

dat_geno <- dat_geno |>
  rename(SN = SN...1)

get_duplicates <- function(name, value) {
  dup_vals <- unique(value[duplicated(value)])
  map(dup_vals, function(v) {
    name[value == v] |> 
      str_c(collapse = ",")
  }) |> 
    unlist() |> 
    str_c(collapse = "; ")
}

dg <- dat_geno |> 
  pivot_longer(-SN) |> 
  mutate(name = str_extract(name, "F\\d")) |> 
  group_by(SN) |> 
  summarise(dups = get_duplicates(name, value))

full_join(dat_geno, dg, by = "SN")

The result indicates duplicated columns (separated by commas). When there is more than one duplicated value, the duplicated groups are separated by semicolons.

# A tibble: 20 × 6
      SN `chrom_pos F1` `chrom_pos F2` `Chrom_pos F3` `chrom_pos F4` dups       
   <dbl> <chr>          <chr>          <chr>          <chr>          <chr>      
 1   374 13_1056059     13_1056059     13_1056059     13_1056059     F1,F2,F3,F4
 2   100 13_11817560    13_11823950    13_1132788     13_11817560    F1,F4      
 3   155 13_11873028    13_1195790     13_1195790     13_11873028    F2,F3; F1,F4
 4   362 13_1195790     13_12005749    13_12005749    13_1195790     F2,F3; F1,F4
 5   609 13_12005749    13_1211923     13_12019437    13_12005749    F1,F4      
 6   534 13_12019437    13_1212374     13_1211923     13_12019437    F1,F4      
 7   502 13_1211923     13_12172917    13_1212374     13_1211923     F1,F4      
 8   371 13_1212374     13_12224072    13_1231476     13_1212374     F1,F4      
 9    94 13_12209997    13_1244605     13_1244605     13_12209997    F2,F3; F1,F4
10   504 13_1244605     13_1250227     13_1244635     13_1244605     F1,F4      
11   355 13_1250227     13_1250260     13_1250227     13_1250227     F1,F3,F4   
12   640 13_1250260     13_1254880     13_1250260     13_1250260     F1,F3,F4   
13   264 13_1254842     13_1282089     13_1250265     13_1254842     F1,F4      
14   310 13_1254880     13_1283220     13_1254880     13_1254880     F1,F3,F4   
15   637 13_1273697     13_1309511     13_1282089     13_1273697     F1,F4      
16    20 13_1282054     13_1318162     13_1283220     13_1282054     F1,F4      
17   594 13_1282089     13_1321216     13_134612      13_1282089     F1,F4      
18   531 13_1309511     13_134542      13_143187      13_1309511     F1,F4      
19   432 13_1318162     13_134612      13_1455602     13_1318162     F1,F4      
20   564 13_1321216     13_1354008     13_1455638     13_1321216     F1,F4

BB1464 · July 27, 2023, 10:25am

MarekGierlinski:

dat_geno <- dat_geno |>
  rename(SN = SN...1)

get_duplicates <- function(name, value) {
  dup_vals <- unique(value[duplicated(value)])
  map(dup_vals, function(v) {
    name[value == v] |> 
      str_c(collapse = ",")
  }) |> 
    unlist() |> 
    str_c(collapse = "; ")
}

dg <- dat_geno |> 
  pivot_longer(-SN) |> 
  mutate(name = str_extract(name, "F\\d")) |> 
  group_by(SN) |> 
  summarise(dups = get_duplicates(name, value))

full_join(dat_geno, dg, by = "SN")

Thank you very much for this @MarekGierlinski , please sorry to disturb you again what about for each chromosome position e.g. 13_1056059 and I want to check across the four column where I can get a match for all the four combinations. Please how can I achieve that with this your elegant solution. Thanks!!!

MarekGierlinski · July 27, 2023, 11:53am

for each chromosome position e.g. 13_1056059 and I want to check across the four column where I can get a match for all the four combinations. Please how can I achieve that with this your elegant solution

I'm sorry, I don't understand the request. Can you show an example of the required output?

BB1464 · July 27, 2023, 12:13pm

Here is a screenshot of the required output @MarekGierlinski , what I mean is that for each chromosome position I want to check in all the rows for the for column when that chromosome is present in the four column regardless of their row position then just create a new column and indicate that it can be found across the four column. Here is the screenshot below. Thank you very much for your time.

MarekGierlinski · July 27, 2023, 12:42pm

When you say "the four column", do you mean "the fourth column" or "all four columns"?

BB1464 · July 27, 2023, 1:06pm

Yes all four columns where I have the chromosome position @MarekGierlinski

MarekGierlinski · July 27, 2023, 1:19pm

So, you want to find occurrence of chromosome positions across four columns, regardless of the row?

oc <- dat_geno |> 
  pivot_longer(-SN, values_to = "chr_pos") |> 
  mutate(name = str_extract(name, "F\\d")) |> 
  group_by(chr_pos) |> 
  summarise(columns = sort(unique(name)) |> str_c(collapse = ","))

This will give you occurrence per column per chromosome position:

> oc
# A tibble: 34 × 2
   chr_pos     columns    
   <chr>       <chr>      
 1 13_1056059  F1,F2,F3,F4
 2 13_1132788  F3         
 3 13_11817560 F1,F4      
 4 13_11823950 F2         
 5 13_11873028 F1,F4      
 6 13_1195790  F1,F2,F3,F4
 7 13_12005749 F1,F2,F3,F4
 8 13_12019437 F1,F3,F4   
 9 13_1211923  F1,F2,F3,F4
10 13_1212374  F1,F2,F3,F4
# ℹ 24 more rows

Now we need to filter it for occurrences across all columns:

oc |> filter(columns == "F1,F2,F3,F4")
# A tibble: 10 × 2
   chr_pos     columns    
   <chr>       <chr>      
 1 13_1056059  F1,F2,F3,F4
 2 13_1195790  F1,F2,F3,F4
 3 13_12005749 F1,F2,F3,F4
 4 13_1211923  F1,F2,F3,F4
 5 13_1212374  F1,F2,F3,F4
 6 13_1244605  F1,F2,F3,F4
 7 13_1250227  F1,F2,F3,F4
 8 13_1250260  F1,F2,F3,F4
 9 13_1254880  F1,F2,F3,F4
10 13_1282089  F1,F2,F3,F4

Or, you can count columns found per chromosome directly:

dat_geno |> 
  pivot_longer(-SN, values_to = "chr_pos") |> 
  group_by(chr_pos) |> 
  summarise(n = length(unique(name))) |> 
  filter(n == 4)

# A tibble: 10 × 2
   chr_pos         n
   <chr>       <int>
 1 13_1056059      4
 2 13_1195790      4
 3 13_12005749     4
 4 13_1211923      4
 5 13_1212374      4
 6 13_1244605      4
 7 13_1250227      4
 8 13_1250260      4
 9 13_1254880      4
10 13_1282089      4

MarekGierlinski · July 27, 2023, 1:27pm

A more general comment: it is usually easier to work with data in long format, as we can do grouping, counting, summarising etc. Hence, the pivot_longer function.

BB1464 · July 27, 2023, 1:40pm

Thank you very much for the help today. I really appreciate this @MarekGierlinski

system · August 17, 2023, 1:41pm

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.