Data Wrangling with the Tidyverse

Good Evening Dear All, I have a genotyping dataset in which I want to find all the chromosomes number that can be found across the four column and I will specify that that can be found across the four column, but if that can be found in only two columns I will specify that that can be found in only two columns ...., I know that this can be achieved with across along side with case_when but I have tried it but my code is still giving me error. Here is the sampled dataset below.

library(tidyverse)

dataset

dat_geno <-
  structure(
    list(
      SN...1 = c(
        374,
        100,
        155,
        362,
        609,
        534,
        502,
        371,
        94,
        504,
        355,
        640,
        264,
        310,
        637,
        20,
        594,
        531,
        432,
        564
      ),
      `chrom_pos F1` = c(
        "13_1056059",
        "13_11817560",
        "13_11873028",
        "13_1195790",
        "13_12005749",
        "13_12019437",
        "13_1211923",
        "13_1212374",
        "13_12209997",
        "13_1244605",
        "13_1250227",
        "13_1250260",
        "13_1254842",
        "13_1254880",
        "13_1273697",
        "13_1282054",
        "13_1282089",
        "13_1309511",
        "13_1318162",
        "13_1321216"
      ),
      `chrom_pos F2` = c(
        "13_1056059",
        "13_11823950",
        "13_1195790",
        "13_12005749",
        "13_1211923",
        "13_1212374",
        "13_12172917",
        "13_12224072",
        "13_1244605",
        "13_1250227",
        "13_1250260",
        "13_1254880",
        "13_1282089",
        "13_1283220",
        "13_1309511",
        "13_1318162",
        "13_1321216",
        "13_134542",
        "13_134612",
        "13_1354008"
      ),
      `Chrom_pos F3` = c(
        "13_1056059",
        "13_1132788",
        "13_1195790",
        "13_12005749",
        "13_12019437",
        "13_1211923",
        "13_1212374",
        "13_1231476",
        "13_1244605",
        "13_1244635",
        "13_1250227",
        "13_1250260",
        "13_1250265",
        "13_1254880",
        "13_1282089",
        "13_1283220",
        "13_134612",
        "13_143187",
        "13_1455602",
        "13_1455638"
      ),
      `chrom_pos F4` = c(
        "13_1056059",
        "13_11817560",
        "13_11873028",
        "13_1195790",
        "13_12005749",
        "13_12019437",
        "13_1211923", "13_1212374", "13_12209997", "13_1244605", "13_1250227",
                                                                                                                                                            "13_1250260", "13_1254842", "13_1254880", "13_1273697", "13_1282054",
                                                                                                                                                            "13_1282089", "13_1309511", "13_1318162", "13_1321216")), row.names = c(NA,
                                                                                                                                                                                                                                    -20L), class = c("tbl_df", "tbl", "data.frame"))

Just testing four or two columns match leaves unaddressed if three columns match.

dat_geno <-
  data.frame(
      SN...1 = c(
        374,
        100,
        155,
        362,
        609,
        534,
        502,
        371,
        94,
        504,
        355,
        640,
        264,
        310,
        637,
        20,
        594,
        531,
        432,
        564
      ),
      `chrom_pos F1` = c(
        "13_1056059",
        "13_11817560",
        "13_11873028",
        "13_1195790",
        "13_12005749",
        "13_12019437",
        "13_1211923",
        "13_1212374",
        "13_12209997",
        "13_1244605",
        "13_1250227",
        "13_1250260",
        "13_1254842",
        "13_1254880",
        "13_1273697",
        "13_1282054",
        "13_1282089",
        "13_1309511",
        "13_1318162",
        "13_1321216"
      ),
      `chrom_pos F2` = c(
        "13_1056059",
        "13_11823950",
        "13_1195790",
        "13_12005749",
        "13_1211923",
        "13_1212374",
        "13_12172917",
        "13_12224072",
        "13_1244605",
        "13_1250227",
        "13_1250260",
        "13_1254880",
        "13_1282089",
        "13_1283220",
        "13_1309511",
        "13_1318162",
        "13_1321216",
        "13_134542",
        "13_134612",
        "13_1354008"
      ),
      `Chrom_pos F3` = c(
        "13_1056059",
        "13_1132788",
        "13_1195790",
        "13_12005749",
        "13_12019437",
        "13_1211923",
        "13_1212374",
        "13_1231476",
        "13_1244605",
        "13_1244635",
        "13_1250227",
        "13_1250260",
        "13_1250265",
        "13_1254880",
        "13_1282089",
        "13_1283220",
        "13_134612",
        "13_143187",
        "13_1455602",
        "13_1455638"
      ),
      `chrom_pos F4` = c(
        "13_1056059",
        "13_11817560",
        "13_11873028",
        "13_1195790",
        "13_12005749",
        "13_12019437",
        "13_1211923", "13_1212374", "13_12209997", "13_1244605", "13_1250227",
        "13_1250260", "13_1254842", "13_1254880", "13_1273697", "13_1282054",
        "13_1282089", "13_1309511", "13_1318162", "13_1321216"))

# Function to count identical strings in a row
count_identical <- function(row) {
  sum(duplicated(row) | duplicated(row, fromLast = TRUE))
}

# Apply the function to each row and create a new column
dat_geno$count_identical <- apply(dat_geno, 1, count_identical)

dat_geno
#>    SN...1 chrom_pos.F1 chrom_pos.F2 Chrom_pos.F3 chrom_pos.F4 count_identical
#> 1     374   13_1056059   13_1056059   13_1056059   13_1056059               4
#> 2     100  13_11817560  13_11823950   13_1132788  13_11817560               2
#> 3     155  13_11873028   13_1195790   13_1195790  13_11873028               4
#> 4     362   13_1195790  13_12005749  13_12005749   13_1195790               4
#> 5     609  13_12005749   13_1211923  13_12019437  13_12005749               2
#> 6     534  13_12019437   13_1212374   13_1211923  13_12019437               2
#> 7     502   13_1211923  13_12172917   13_1212374   13_1211923               2
#> 8     371   13_1212374  13_12224072   13_1231476   13_1212374               2
#> 9      94  13_12209997   13_1244605   13_1244605  13_12209997               4
#> 10    504   13_1244605   13_1250227   13_1244635   13_1244605               2
#> 11    355   13_1250227   13_1250260   13_1250227   13_1250227               3
#> 12    640   13_1250260   13_1254880   13_1250260   13_1250260               3
#> 13    264   13_1254842   13_1282089   13_1250265   13_1254842               2
#> 14    310   13_1254880   13_1283220   13_1254880   13_1254880               3
#> 15    637   13_1273697   13_1309511   13_1282089   13_1273697               2
#> 16     20   13_1282054   13_1318162   13_1283220   13_1282054               2
#> 17    594   13_1282089   13_1321216    13_134612   13_1282089               2
#> 18    531   13_1309511    13_134542    13_143187   13_1309511               2
#> 19    432   13_1318162    13_134612   13_1455602   13_1318162               2
#> 20    564   13_1321216   13_1354008   13_1455638   13_1321216               2

Created on 2023-07-26 with reprex v2.0.2

Thank you very much @technocrat but in the end I want to be able to specify where the duplicated row occur e.g if this can be found across the four column I will say that this can be found across all. I have also attach a screen shot of the expected output.

This should do the job:

dat_geno <- dat_geno |>
  rename(SN = SN...1)

get_duplicates <- function(name, value) {
  dup_vals <- unique(value[duplicated(value)])
  map(dup_vals, function(v) {
    name[value == v] |> 
      str_c(collapse = ",")
  }) |> 
    unlist() |> 
    str_c(collapse = "; ")
}

dg <- dat_geno |> 
  pivot_longer(-SN) |> 
  mutate(name = str_extract(name, "F\\d")) |> 
  group_by(SN) |> 
  summarise(dups = get_duplicates(name, value))

full_join(dat_geno, dg, by = "SN")

The result indicates duplicated columns (separated by commas). When there is more than one duplicated value, the duplicated groups are separated by semicolons.

# A tibble: 20 × 6
      SN `chrom_pos F1` `chrom_pos F2` `Chrom_pos F3` `chrom_pos F4` dups       
   <dbl> <chr>          <chr>          <chr>          <chr>          <chr>      
 1   374 13_1056059     13_1056059     13_1056059     13_1056059     F1,F2,F3,F4
 2   100 13_11817560    13_11823950    13_1132788     13_11817560    F1,F4      
 3   155 13_11873028    13_1195790     13_1195790     13_11873028    F2,F3; F1,F4
 4   362 13_1195790     13_12005749    13_12005749    13_1195790     F2,F3; F1,F4
 5   609 13_12005749    13_1211923     13_12019437    13_12005749    F1,F4      
 6   534 13_12019437    13_1212374     13_1211923     13_12019437    F1,F4      
 7   502 13_1211923     13_12172917    13_1212374     13_1211923     F1,F4      
 8   371 13_1212374     13_12224072    13_1231476     13_1212374     F1,F4      
 9    94 13_12209997    13_1244605     13_1244605     13_12209997    F2,F3; F1,F4
10   504 13_1244605     13_1250227     13_1244635     13_1244605     F1,F4      
11   355 13_1250227     13_1250260     13_1250227     13_1250227     F1,F3,F4   
12   640 13_1250260     13_1254880     13_1250260     13_1250260     F1,F3,F4   
13   264 13_1254842     13_1282089     13_1250265     13_1254842     F1,F4      
14   310 13_1254880     13_1283220     13_1254880     13_1254880     F1,F3,F4   
15   637 13_1273697     13_1309511     13_1282089     13_1273697     F1,F4      
16    20 13_1282054     13_1318162     13_1283220     13_1282054     F1,F4      
17   594 13_1282089     13_1321216     13_134612      13_1282089     F1,F4      
18   531 13_1309511     13_134542      13_143187      13_1309511     F1,F4      
19   432 13_1318162     13_134612      13_1455602     13_1318162     F1,F4      
20   564 13_1321216     13_1354008     13_1455638     13_1321216     F1,F4      
2 Likes

Thank you very much for this @MarekGierlinski , please sorry to disturb you again what about for each chromosome position e.g. 13_1056059 and I want to check across the four column where I can get a match for all the four combinations. Please how can I achieve that with this your elegant solution. Thanks!!!

for each chromosome position e.g. 13_1056059 and I want to check across the four column where I can get a match for all the four combinations. Please how can I achieve that with this your elegant solution

I'm sorry, I don't understand the request. Can you show an example of the required output?

Here is a screenshot of the required output @MarekGierlinski , what I mean is that for each chromosome position I want to check in all the rows for the for column when that chromosome is present in the four column regardless of their row position then just create a new column and indicate that it can be found across the four column. Here is the screenshot below. Thank you very much for your time.

When you say "the four column", do you mean "the fourth column" or "all four columns"?

Yes all four columns where I have the chromosome position @MarekGierlinski

So, you want to find occurrence of chromosome positions across four columns, regardless of the row?

oc <- dat_geno |> 
  pivot_longer(-SN, values_to = "chr_pos") |> 
  mutate(name = str_extract(name, "F\\d")) |> 
  group_by(chr_pos) |> 
  summarise(columns = sort(unique(name)) |> str_c(collapse = ","))

This will give you occurrence per column per chromosome position:

> oc
# A tibble: 34 × 2
   chr_pos     columns    
   <chr>       <chr>      
 1 13_1056059  F1,F2,F3,F4
 2 13_1132788  F3         
 3 13_11817560 F1,F4      
 4 13_11823950 F2         
 5 13_11873028 F1,F4      
 6 13_1195790  F1,F2,F3,F4
 7 13_12005749 F1,F2,F3,F4
 8 13_12019437 F1,F3,F4   
 9 13_1211923  F1,F2,F3,F4
10 13_1212374  F1,F2,F3,F4
# ℹ 24 more rows

Now we need to filter it for occurrences across all columns:

oc |> filter(columns == "F1,F2,F3,F4")
# A tibble: 10 × 2
   chr_pos     columns    
   <chr>       <chr>      
 1 13_1056059  F1,F2,F3,F4
 2 13_1195790  F1,F2,F3,F4
 3 13_12005749 F1,F2,F3,F4
 4 13_1211923  F1,F2,F3,F4
 5 13_1212374  F1,F2,F3,F4
 6 13_1244605  F1,F2,F3,F4
 7 13_1250227  F1,F2,F3,F4
 8 13_1250260  F1,F2,F3,F4
 9 13_1254880  F1,F2,F3,F4
10 13_1282089  F1,F2,F3,F4

Or, you can count columns found per chromosome directly:

dat_geno |> 
  pivot_longer(-SN, values_to = "chr_pos") |> 
  group_by(chr_pos) |> 
  summarise(n = length(unique(name))) |> 
  filter(n == 4)

# A tibble: 10 × 2
   chr_pos         n
   <chr>       <int>
 1 13_1056059      4
 2 13_1195790      4
 3 13_12005749     4
 4 13_1211923      4
 5 13_1212374      4
 6 13_1244605      4
 7 13_1250227      4
 8 13_1250260      4
 9 13_1254880      4
10 13_1282089      4

A more general comment: it is usually easier to work with data in long format, as we can do grouping, counting, summarising etc. Hence, the pivot_longer function.

Thank you very much for the help today. I really appreciate this @MarekGierlinski

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.