I feel that's easier to work with if:
- you separate the reference (P1/P2) from the individuals
- you work with long data
library(tidyverse)
# read data
dat <- read.table(text = "IND SNP1 SNP2 SNP3
P1 A/A G/G T/T
P2 T/T C/C G/G
IND1 T/A G/C MISSING
IND2 A/T G/G G/T
IND3 T/T C/G T/T
IND4 NA MISSING T/G",
header = TRUE)
# separate the reference P1/P2...
ref <- dat |>
filter(IND %in% c("P1", "P2"))
ref
#> IND SNP1 SNP2 SNP3
#> 1 P1 A/A G/G T/T
#> 2 P2 T/T C/C G/G
# ...from the test subjects
inds <- dat |>
filter(startsWith(IND, "IND"))
inds
#> IND SNP1 SNP2 SNP3
#> 1 IND1 T/A G/C MISSING
#> 2 IND2 A/T G/G G/T
#> 3 IND3 T/T C/G T/T
#> 4 IND4 <NA> MISSING T/G
# make ref to long format and code what we want
ref_long <- pivot_longer(ref,
cols = -IND,
names_to = "SNP_number",
values_to = "SNP") |>
mutate(if_match = if_else(IND == "P1", "A", "B")) |>
rename(IND_ref = IND)
ref_long
#> # A tibble: 6 × 4
#> IND_ref SNP_number SNP if_match
#> <chr> <chr> <chr> <chr>
#> 1 P1 SNP1 A/A A
#> 2 P1 SNP2 G/G A
#> 3 P1 SNP3 T/T A
#> 4 P2 SNP1 T/T B
#> 5 P2 SNP2 C/C B
#> 6 P2 SNP3 G/G B
# long format
pivot_longer(inds,
cols = -IND,
names_to = "SNP_number",
values_to = "SNP")
#> # A tibble: 12 × 3
#> IND SNP_number SNP
#> <chr> <chr> <chr>
#> 1 IND1 SNP1 T/A
#> 2 IND1 SNP2 G/C
#> 3 IND1 SNP3 MISSING
#> 4 IND2 SNP1 A/T
#> 5 IND2 SNP2 G/G
#> 6 IND2 SNP3 G/T
#> 7 IND3 SNP1 T/T
#> 8 IND3 SNP2 C/G
#> 9 IND3 SNP3 T/T
#> 10 IND4 SNP1 <NA>
#> 11 IND4 SNP2 MISSING
#> 12 IND4 SNP3 T/G
# Properly identify all NA
pivot_longer(inds,
cols = -IND,
names_to = "SNP_number",
values_to = "SNP") |>
mutate(SNP = if_else(SNP == "MISSING", NA_character_, SNP))
#> # A tibble: 12 × 3
#> IND SNP_number SNP
#> <chr> <chr> <chr>
#> 1 IND1 SNP1 T/A
#> 2 IND1 SNP2 G/C
#> 3 IND1 SNP3 <NA>
#> 4 IND2 SNP1 A/T
#> 5 IND2 SNP2 G/G
#> 6 IND2 SNP3 G/T
#> 7 IND3 SNP1 T/T
#> 8 IND3 SNP2 C/G
#> 9 IND3 SNP3 T/T
#> 10 IND4 SNP1 <NA>
#> 11 IND4 SNP2 <NA>
#> 12 IND4 SNP3 T/G
# Assemble this data with the ref
pivot_longer(inds,
cols = -IND,
names_to = "SNP_number",
values_to = "SNP") |>
mutate(SNP = if_else(SNP == "MISSING", NA_character_, SNP)) |>
left_join(ref_long,
by = c("SNP_number", "SNP"))
#> # A tibble: 12 × 5
#> IND SNP_number SNP IND_ref if_match
#> <chr> <chr> <chr> <chr> <chr>
#> 1 IND1 SNP1 T/A <NA> <NA>
#> 2 IND1 SNP2 G/C <NA> <NA>
#> 3 IND1 SNP3 <NA> <NA> <NA>
#> 4 IND2 SNP1 A/T <NA> <NA>
#> 5 IND2 SNP2 G/G P1 A
#> 6 IND2 SNP3 G/T <NA> <NA>
#> 7 IND3 SNP1 T/T P2 B
#> 8 IND3 SNP2 C/G <NA> <NA>
#> 9 IND3 SNP3 T/T P1 A
#> 10 IND4 SNP1 <NA> <NA> <NA>
#> 11 IND4 SNP2 <NA> <NA> <NA>
#> 12 IND4 SNP3 T/G <NA> <NA>
# Note that IND1-SNP2, IND3-SNP1 and IND3-SNP3 have one match,
# and the others have no matches, so they have a NA that we'll replace with H
# In this data, doesn't seem we have any SNP that matches both P1 and P2!
# if that were the case, you might need to do something about it.
# So, now we can put "H" instead of NA
pivot_longer(inds,
cols = -IND,
names_to = "SNP_number",
values_to = "SNP") |>
mutate(SNP = if_else(SNP == "MISSING", NA_character_, SNP)) |>
left_join(ref_long,
by = c("SNP_number", "SNP")) |>
mutate(if_match = if_else(is.na(if_match), "H", if_match))
#> # A tibble: 12 × 5
#> IND SNP_number SNP IND_ref if_match
#> <chr> <chr> <chr> <chr> <chr>
#> 1 IND1 SNP1 T/A <NA> H
#> 2 IND1 SNP2 G/C <NA> H
#> 3 IND1 SNP3 <NA> <NA> H
#> 4 IND2 SNP1 A/T <NA> H
#> 5 IND2 SNP2 G/G P1 A
#> 6 IND2 SNP3 G/T <NA> H
#> 7 IND3 SNP1 T/T P2 B
#> 8 IND3 SNP2 C/G <NA> H
#> 9 IND3 SNP3 T/T P1 A
#> 10 IND4 SNP1 <NA> <NA> H
#> 11 IND4 SNP2 <NA> <NA> H
#> 12 IND4 SNP3 T/G <NA> H
# Oh no! We are forgetting about the fact that the SNP can be NA itself!
pivot_longer(inds,
cols = -IND,
names_to = "SNP_number",
values_to = "SNP") |>
mutate(SNP = if_else(SNP == "MISSING", NA_character_, SNP)) |>
left_join(ref_long,
by = c("SNP_number", "SNP")) |>
mutate(if_match = if_else(is.na(if_match), "H", if_match),
if_match = if_else(is.na(SNP), "-", if_match))
#> # A tibble: 12 × 5
#> IND SNP_number SNP IND_ref if_match
#> <chr> <chr> <chr> <chr> <chr>
#> 1 IND1 SNP1 T/A <NA> H
#> 2 IND1 SNP2 G/C <NA> H
#> 3 IND1 SNP3 <NA> <NA> -
#> 4 IND2 SNP1 A/T <NA> H
#> 5 IND2 SNP2 G/G P1 A
#> 6 IND2 SNP3 G/T <NA> H
#> 7 IND3 SNP1 T/T P2 B
#> 8 IND3 SNP2 C/G <NA> H
#> 9 IND3 SNP3 T/T P1 A
#> 10 IND4 SNP1 <NA> <NA> -
#> 11 IND4 SNP2 <NA> <NA> -
#> 12 IND4 SNP3 T/G <NA> H
# And put it back in wide format if that's what you need later
pivot_longer(inds,
cols = -IND,
names_to = "SNP_number",
values_to = "SNP") |>
mutate(SNP = if_else(SNP == "MISSING", NA_character_, SNP)) |>
left_join(ref_long,
by = c("SNP_number", "SNP")) |>
mutate(if_match = if_else(is.na(if_match), "H", if_match),
if_match = if_else(is.na(SNP), "-", if_match)) |>
pivot_wider(id_cols = IND,
names_from = "SNP_number",
values_from = "if_match")
#> # A tibble: 4 × 4
#> IND SNP1 SNP2 SNP3
#> <chr> <chr> <chr> <chr>
#> 1 IND1 H H -
#> 2 IND2 H A H
#> 3 IND3 B H A
#> 4 IND4 - - H
# Finally the percentage of "-" is easier on a long format,
# or can be done rowwise on the wide format (with rowwise and c_across)
prop_dash <- pivot_longer(inds,
cols = -IND,
names_to = "SNP_number",
values_to = "SNP") |>
mutate(SNP = if_else(SNP == "MISSING", NA_character_, SNP)) |>
left_join(ref_long,
by = c("SNP_number", "SNP")) |>
mutate(if_match = if_else(is.na(if_match), "H", if_match),
if_match = if_else(is.na(SNP), "-", if_match)) |>
group_by(IND) |>
summarize(prop_dash = 100 * mean(if_match == "-"))
prop_dash
#> # A tibble: 4 × 2
#> IND prop_dash
#> <chr> <dbl>
#> 1 IND1 33.3
#> 2 IND2 0
#> 3 IND3 0
#> 4 IND4 66.7
# And we can reassemble them
pivot_longer(inds,
cols = -IND,
names_to = "SNP_number",
values_to = "SNP") |>
mutate(SNP = if_else(SNP == "MISSING", NA_character_, SNP)) |>
left_join(ref_long,
by = c("SNP_number", "SNP")) |>
mutate(if_match = if_else(is.na(if_match), "H", if_match),
if_match = if_else(is.na(SNP), "-", if_match)) |>
pivot_wider(id_cols = IND,
names_from = "SNP_number",
values_from = "if_match") |>
left_join(prop_dash,
by = "IND")
#> # A tibble: 4 × 5
#> IND SNP1 SNP2 SNP3 prop_dash
#> <chr> <chr> <chr> <chr> <dbl>
#> 1 IND1 H H - 33.3
#> 2 IND2 H A H 0
#> 3 IND3 B H A 0
#> 4 IND4 - - H 66.7
Created on 2023-05-18 with reprex v2.0.2
The filtering of INDs with too many dashes is trivial with mutate()
, if you also want to filter out SNPs you might want to do it separately.