Selecting Rows Based on Conditions Stored in Another Column

I have the following data:

num_var_1 <- rnorm(1000, 10, 1)
num_var_2 <- rnorm(1000, 10, 5)
num_var_3 <- rnorm(1000, 10, 10)
num_var_4 <- rnorm(1000, 10, 10)
num_var_5 <- rnorm(1000, 10, 10)

factor_1 <- c("A","B", "C")
factor_2 <- c("AA","BB", "CC")
factor_3 <- c("AAA","BBB", "CCC", "DDD")
factor_4 <- c("AAAA","BBBB", "CCCC", "DDDD", "EEEE")
factor_5 <- c("AAAAA","BBBBB", "CCCCC", "DDDDD", "EEEEE", "FFFFFF")

factor_var_1 <- as.factor(sample(factor_1, 1000, replace=TRUE, prob=c(0.3, 0.5, 0.2)))
factor_var_2 <-  as.factor(sample(factor_2, 1000, replace=TRUE, prob=c(0.5, 0.3, 0.2)))
factor_var_3 <-  as.factor(sample(factor_3, 1000, replace=TRUE, prob=c(0.5, 0.2, 0.2, 0.1)))
factor_var_4 <-  as.factor(sample(factor_4, 1000, replace=TRUE, prob=c(0.5, 0.2, 0.1, 0.1, 0.1)))
factor_var_5 <-  as.factor(sample(factor_4, 1000, replace=TRUE, prob=c(0.3, 0.2, 0.1, 0.1, 0.1)))

id = 1:1000

my_data = data.frame(id,num_var_1, num_var_2, num_var_3, num_var_4, num_var_5, factor_var_1, factor_var_2, factor_var_3, factor_var_4, factor_var_5)

I also have another data frame that stores "conditions that can be used to select rows from my_data" - each row acts as a set of conditions:

res2 <- structure(list(num_var_1 = c(13.0594020922422, NA, 11.4862445222133, 
9.61245560356254, 9.09042162012824, 11.2655306773389, 8.17775208814389, 
10.3368491267469, NA, NA), num_var_2 = c(NA, 15.5787995721324, 
0.0478277041848836, 18.2224315520234, NA, 7.96911807784013, NA, 
8.05245674365955, 4.27070199562321, NA), num_var_3 = c(32.5008410879472, 
40.8727408367019, NA, 46.8308850184899, 0.356266194135838, 20.6076522107065, 
-22.1119350917156, NA, 47.4219529058096, 47.7495176181095), num_var_4 = c(-10.5323504910751, 
NA, -21.9504025438917, NA, -8.07098853672244, 36.6252011463549, 
-10.729434726805, NA, NA, 14.5063601192342), num_var_5 = c(25.736287930052, 
4.7650948325899, NA, -8.57811306991066, 3.36041241809802, -14.8050577655299, 
-18.0775264320681, NA, NA, NA), factor_var_1 = c("A B", "B B", 
NA, NA, "C C C", "A B", "B C", "B B C", "A B", NA), factor_var_2 = c("AA AA", 
"AA CC CC", "AA CC", "AA CC CC", "BB", "AA AA", "CC", NA, NA, 
"AA AA CC"), factor_var_3 = c(NA, NA, "AAA AAA CCC", NA, "AAA AAA AAA AAA", 
"AAA DDD DDD", NA, NA, "BBB CCC", NA), factor_var_4 = c("AAAA EEEE", 
"AAAA AAAA", NA, NA, NA, "BBBB", "AAAA AAAA BBBB DDDD DDDD", 
"AAAA CCCC", "AAAA AAAA", "AAAA BBBB"), factor_var_5 = c("AAAA BBBB CCCC DDDD", 
NA, NA, NA, "AAAA AAAA", NA, NA, "BBBB", "AAAA AAAA AAAA EEEE", 
NA)), class = "data.frame", row.names = c(NA, -10L))

   num_var_1  num_var_2   num_var_3  num_var_4  num_var_5 factor_var_1 factor_var_2    factor_var_3             factor_var_4        factor_var_5
1  13.059402         NA  32.5008411 -10.532350  25.736288          A B        AA AA            <NA>                AAAA EEEE AAAA BBBB CCCC DDDD
2         NA 15.5787996  40.8727408         NA   4.765095          B B     AA CC CC            <NA>                AAAA AAAA                <NA>
3  11.486245  0.0478277          NA -21.950403         NA         <NA>        AA CC     AAA AAA CCC                     <NA>                <NA>
4   9.612456 18.2224316  46.8308850         NA  -8.578113         <NA>     AA CC CC            <NA>                     <NA>                <NA>
5   9.090422         NA   0.3562662  -8.070989   3.360412        C C C           BB AAA AAA AAA AAA                     <NA>           AAAA AAAA
6  11.265531  7.9691181  20.6076522  36.625201 -14.805058          A B        AA AA     AAA DDD DDD                     BBBB                <NA>
7   8.177752         NA -22.1119351 -10.729435 -18.077526          B C           CC            <NA> AAAA AAAA BBBB DDDD DDDD                <NA>
8  10.336849  8.0524567          NA         NA         NA        B B C         <NA>            <NA>                AAAA CCCC                BBBB
9         NA  4.2707020  47.4219529         NA         NA          A B         <NA>         BBB CCC                AAAA AAAA AAAA AAAA AAAA EEEE
10        NA         NA  47.7495176  14.506360         NA         <NA>     AA AA CC            <NA>                AAAA BBBB                <NA>

I would like to (directly) select rows from "my_data" using the "conditions" from the 10th row of "res2" - in theory, this should look something like this:

# code 1
final_result <- my_data[which(my_data$num_var_1 < res2[10,1] & my_data$num_var_2 < res2[10,2] & my_data$num_var_3 < res2[10,3] & my_data$num_var_4 < res2[10,4] & my_data$num_var_5 < res2[10,5] & my_data$factor_var_1 == res2[10,6] & my_data$factor_var_2 == res2[10,7] & my_data$factor_var_3 == res2[10,8] & my_data$factor_var_4 == res2[10,9] & my_data$factor_var_5 == res2[10,10]), ]

However, the above code does not work.

I was able to write this code by explicitly specifying the conditions:

#code 2
final_result <- my_data[which(my_data$num_var_2 < 4.66 & my_data$num_var_3 < 22.33 &  my_data$factor_var_1 %in% c( "B", "C") & my_data$factor_var_2 %in% c( "AA", "BB") & my_data$factor_var_3 %in% c( "AAA", "DDD") & my_data$factor_var_4 %in% c( "AAAA", "CCCC", "EEEE" )), ]

But is there a way to write "code 1" in the same format as "code 2"?

Thanks!

There are a couple of problems with your "code 1". It does not handle NA values in res2. Comparisons like my_data$num_var_1 < res2[10,1] will never be TRUE when res2[10, 1] is NA. Another problem is that my_data$factor_var_2 == res2[10,7] is not TRUE if my_data$factor_var_2 is "AA" and res2[10,7] is "AA CC". Even my_data$factor_var_2 %in% res2[10,7] is not TRUE. This code shows that

> "AA" == "AA CC"
[1] FALSE
> "AA" %in% "AA CC"
[1] FALSE

Your code 2 worked because your comparison was my_data$factor_var_2 %in% c("AA", "CC") but "AA CC" is not the same thing as c("AA", "CC"). You can use code like stringr::str_detect("AA CC", "AA") to see if one string is a sub string of another. But you need to be careful that you only get the matches you want. For example, if you are checking for "the", you can match "hypothesis".

str_detect("hypothesis", "the")
[1] TRUE

I wrote code that I think does what you want with your toy data but may be very fragile.

num_var_1 <- rnorm(1000, 10, 1)
num_var_2 <- rnorm(1000, 10, 5)
num_var_3 <- rnorm(1000, 10, 10)
num_var_4 <- rnorm(1000, 10, 10)
num_var_5 <- rnorm(1000, 10, 10)

factor_1 <- c("A","B", "C")
factor_2 <- c("AA","BB", "CC")
factor_3 <- c("AAA","BBB", "CCC", "DDD")
factor_4 <- c("AAAA","BBBB", "CCCC", "DDDD", "EEEE")
factor_5 <- c("AAAAA","BBBBB", "CCCCC", "DDDDD", "EEEEE", "FFFFFF")

factor_var_1 <- as.factor(sample(factor_1, 1000, replace=TRUE, prob=c(0.3, 0.5, 0.2)))
factor_var_2 <-  as.factor(sample(factor_2, 1000, replace=TRUE, prob=c(0.5, 0.3, 0.2)))
factor_var_3 <-  as.factor(sample(factor_3, 1000, replace=TRUE, prob=c(0.5, 0.2, 0.2, 0.1)))
factor_var_4 <-  as.factor(sample(factor_4, 1000, replace=TRUE, prob=c(0.5, 0.2, 0.1, 0.1, 0.1)))
factor_var_5 <-  as.factor(sample(factor_4, 1000, replace=TRUE, prob=c(0.3, 0.2, 0.1, 0.1, 0.1)))

id = 1:1000

my_data = data.frame(id,num_var_1, num_var_2, num_var_3, num_var_4, num_var_5, factor_var_1, factor_var_2, factor_var_3, factor_var_4, factor_var_5)

res2 <- structure(list(num_var_1 = c(13.0594020922422, NA, 11.4862445222133, 
                                     9.61245560356254, 9.09042162012824, 11.2655306773389, 8.17775208814389, 
                                     10.3368491267469, NA, NA), 
                       num_var_2 = c(NA, 15.5787995721324, 
                                     0.0478277041848836, 18.2224315520234, NA, 7.96911807784013, NA, 
                                     8.05245674365955, 4.27070199562321, NA), 
                       num_var_3 = c(32.5008410879472, 
                                     40.8727408367019, NA, 46.8308850184899, 0.356266194135838, 20.6076522107065, 
                                     -22.1119350917156, NA, 47.4219529058096, 47.7495176181095), 
                       num_var_4 = c(-10.5323504910751, 
                                     NA, -21.9504025438917, NA, -8.07098853672244, 36.6252011463549, 
                                     -10.729434726805, NA, NA, 14.5063601192342), 
                       num_var_5 = c(25.736287930052, 
                                     4.7650948325899, NA, -8.57811306991066, 3.36041241809802, -14.8050577655299, 
                                     -18.0775264320681, NA, NA, NA), 
                       factor_var_1 = c("A B", "B B", 
                                        NA, NA, "C C C", "A B", "B C", "B B C", "A B", NA), 
                       factor_var_2 = c("AA AA", 
                                        "AA CC CC", "AA CC", "AA CC CC", "BB", "AA AA", "CC", NA, NA, 
                                        "AA AA CC"), 
                       factor_var_3 = c(NA, NA, "AAA AAA CCC", NA, "AAA AAA AAA AAA", 
                                        "AAA DDD DDD", NA, NA, "BBB CCC", NA), 
                       factor_var_4 = c("AAAA EEEE", 
                                        "AAAA AAAA", NA, NA, NA, "BBBB", "AAAA AAAA BBBB DDDD DDDD", 
                                        "AAAA CCCC", "AAAA AAAA", "AAAA BBBB"), 
                       factor_var_5 = c("AAAA BBBB CCCC DDDD", 
                                        NA, NA, NA, "AAAA AAAA", NA, NA, "BBBB", "AAAA AAAA AAAA EEEE", 
                                        NA)), 
                  class = "data.frame", row.names = c(NA, -10L))


library(dplyr)
library(fuzzyjoin)
library(stringr)
SelectedRes2 <- select(res2[10, ], where(~!is.na(.))) #select columns in res2 that are not NA in the desired row
NMs <- colnames(SelectedRes2)
grepFunc <- function(x, y) str_detect(as.character(y), as.character(x)) 
match_functions <- lapply(NMs, function(NM) if(grepl("^num", NM)) `<` else grepFunc) #If column names starts with 'num' use <, otherwise use grepFunc
OUT <- fuzzy_semi_join(my_data, SelectedRes2, by = NMs, match_fun = match_functions)
nrow(OUT)
#> [1] 329

Created on 2022-04-04 by the reprex package (v0.2.1)

1 Like

Thank you! I think this has worked for now! I will have to keep that comment in mind about "the" and "hypothesis"!

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.