Hi,
I have a reference data frame with gene coordinates, like this:
# Reference table: one row per gene with its start and end coordinates.
df <- data.frame(
  Gene_name = c("geneA", "geneB", "geneC", "geneD"),
  Gene_start = c(20684560, 22971177, 31944853, 32074946),
  Gene_end = c(20701216, 22982551, 31950382, 32076793)
)
and another data frame of positions:
# Query table: named positions to be tested against the gene intervals.
df2 <- data.frame(
  name = paste0("position", 1:6),
  position = c(20684565, 31944858, 33076793, 52076793, 62076793, 72076793)
)
I want to keep in df2 only the positions that fall within a gene of the reference df, i.e. only the rows of df2 whose position lies inside one of the intervals that start at Gene_start and end at Gene_end in df.
The ivs package was created for problems like this one. You can turn your start/end columns into a single column representing the interval. ivs intervals are half-open, `[start, end)`, so if Gene_end itself should be counted as part of the gene, you need to add 1 to it.
library(tidyverse)
library(ivs)
# Reference genes, written row-wise: one gene per line.
df <- tribble(
  ~Gene_name, ~Gene_start, ~Gene_end,
  "geneA",    20684560,    20701216,
  "geneB",    22971177,    22982551,
  "geneC",    31944853,    31950382,
  "geneD",    32074946,    32076793
)
# Positions to test, written row-wise: one position per line.
df2 <- tribble(
  ~name,       ~position,
  "position1", 20684565,
  "position2", 31944858,
  "position3", 33076793,
  "position4", 52076793,
  "position5", 62076793,
  "position6", 72076793
)
# `iv()` builds half-open intervals `[start, end)`, so `Gene_end` itself is
# treated as lying outside the gene. If `Gene_end` should count as part of
# the gene, pass `Gene_end + 1` instead.
# `.keep = "unused"` drops `Gene_start` and `Gene_end` once they have been
# combined into `Gene_range`.
df <- mutate(
  df,
  Gene_range = iv(Gene_start, Gene_end),
  .keep = "unused"
)
df
#> # A tibble: 4 × 2
#> Gene_name Gene_range
#> <chr> <iv<dbl>>
#> 1 geneA [20684560, 20701216)
#> 2 geneB [22971177, 22982551)
#> 3 geneC [31944853, 31950382)
#> 4 geneD [32074946, 32076793)
# Flag each position that lies inside at least one gene interval:
# `iv_between()` tests every position against all intervals in `Gene_range`.
df2 <- mutate(df2, in_any_range = iv_between(position, df$Gene_range))
df2
#> # A tibble: 6 × 3
#> name position in_any_range
#> <chr> <dbl> <lgl>
#> 1 position1 20684565 TRUE
#> 2 position2 31944858 TRUE
#> 3 position3 33076793 FALSE
#> 4 position4 52076793 FALSE
#> 5 position5 62076793 FALSE
#> 6 position6 72076793 FALSE
# Keep only the positions that were flagged as falling inside a gene.
df2 %>%
  filter(in_any_range)
#> # A tibble: 2 × 3
#> name position in_any_range
#> <chr> <dbl> <lgl>
#> 1 position1 20684565 TRUE
#> 2 position2 31944858 TRUE