Selecting Rows of Data Based on Conditions in a List

I am working with the R programming language.

Part 1: I have the following data set ("my_data"):

# Simulated example data: 1000 rows, five numeric and five categorical columns.
# No seed is set, so the values differ from run to run.

# Numeric columns: absolute values of normal draws centred on 10, so every
# value is non-negative.
num_var_1 <- abs(rnorm(n = 1000, mean = 10, sd = 1))
num_var_2 <- abs(rnorm(n = 1000, mean = 10, sd = 5))
num_var_3 <- abs(rnorm(n = 1000, mean = 10, sd = 10))
num_var_4 <- abs(rnorm(n = 1000, mean = 10, sd = 10))
num_var_5 <- abs(rnorm(n = 1000, mean = 10, sd = 10))

# Candidate levels for each categorical column.
factor_1 <- c("0", "B", "C")
factor_2 <- c("0", "BB", "CC")
factor_3 <- c("0", "BBB", "CCC", "DDD")
factor_4 <- c("0", "BBBB", "CCCC", "DDDD", "EEEE")
factor_5 <- c("BBBBB", "CCCCC", "DDDDD", "EEEEE", "FFFFFF")

# Categorical columns: levels sampled with replacement using the given
# weights (sample() treats `prob` as weights, so they need not sum to one).
factor_var_1 <- as.factor(
  sample(factor_1, size = 1000, replace = TRUE, prob = c(0.3, 0.5, 0.2))
)
factor_var_2 <- as.factor(
  sample(factor_2, size = 1000, replace = TRUE, prob = c(0.5, 0.3, 0.2))
)
factor_var_3 <- as.factor(
  sample(factor_3, size = 1000, replace = TRUE, prob = c(0.5, 0.2, 0.2, 0.1))
)
factor_var_4 <- as.factor(
  sample(
    factor_4,
    size = 1000, replace = TRUE, prob = c(0.5, 0.2, 0.1, 0.1, 0.1)
  )
)
factor_var_5 <- as.factor(
  sample(
    factor_5,
    size = 1000, replace = TRUE, prob = c(0.3, 0.2, 0.1, 0.1, 0.1)
  )
)

# Row identifier.
id <- seq_len(1000)

# Column names are taken from the variable names passed in.
my_data <- data.frame(
  id,
  num_var_1, num_var_2, num_var_3, num_var_4, num_var_5,
  factor_var_1, factor_var_2, factor_var_3, factor_var_4, factor_var_5
)

> head(my_data)

  id num_var_1 num_var_2  num_var_3   num_var_4 num_var_5 factor_var_1 factor_var_2 factor_var_3 factor_var_4 factor_var_5
1  1  9.427387 13.367001 19.5313274 19.40222609  8.084350            C           CC            0            0       FFFFFF
2  2 10.885915  6.763050  1.6141172  7.85581615 33.810473            0           CC          DDD         EEEE       FFFFFF
3  3 10.782206 15.779463 11.6990473 19.62717981 19.623990            0           BB          BBB            0        EEEEE
4  4  9.483538  6.574688  4.7517693  4.71974413  5.273193            C           BB          CCC         EEEE        EEEEE
5  5 11.554266 10.141942  9.9191620  0.06409279 12.613053            C           CC            0            0        BBBBB
6  6  7.909920  2.391006  0.2375911 20.86201685 10.246962            B            0          CCC            0        EEEEE

Part 2: I have the following list ("my_list"):

> my_list

$x1
[1] 10.99873

$x2
[1] 14.4339

$x3
[1] 31.6395

$x4
[1] 25.10335

$x5
[1] 5.566093

[[6]]
[1] "B" "C"

[[7]]
[1] "BB" "CC"

[[8]]
[1] "0"   "CCC"

[[9]]
[1] "DDDD" "EEEE"

[[10]]
[1] "0"     "CCCCC" "DDDDD"

> str(my_list)
List of 10
 $ x1: num 11
 $ x2: num 14.4
 $ x3: num 31.6
 $ x4: num 25.1
 $ x5: num 5.57
 $   : chr [1:2] "B" "C"
 $   : chr [1:2] "BB" "CC"
 $   : chr [1:2] "0" "CCC"
 $   : chr [1:2] "DDDD" "EEEE"
 $   : chr [1:3] "0" "CCCCC" "DDDDD"

My Question: There are 10 elements in the list and there are 10 variables in the data frame (excluding ID) - I am trying to select rows from the data frame based on the conditions in the list. For example, here I do this manually:

library(dplyr)
# Manual version of the selection: numeric thresholds are read from my_list,
# while the allowed factor levels are typed out by hand.
# filter() combines separate condition arguments with a logical AND.
select <- filter(
  my_data[, -1],  # drop the id column before filtering
  num_var_1 < my_list[1],
  num_var_2 < my_list[2],
  num_var_3 < my_list[3],
  num_var_4 < my_list[4],
  num_var_5 < my_list[5],
  factor_var_1 %in% c("B", "C"),
  factor_var_2 %in% c("BB", "CC"),
  factor_var_3 %in% c("0", "CCC"),
  factor_var_4 %in% c("DDDD", "EEEE"),
  factor_var_5 %in% c("0", "CCCCC", "DDDDD")
)

>select

  num_var_1 num_var_2 num_var_3 num_var_4 num_var_5 factor_var_1 factor_var_2 factor_var_3 factor_var_4 factor_var_5
1  9.399423 12.366828  8.267739 21.996184  3.064587            B           BB            0         EEEE        CCCCC
2 10.172698  1.722735 25.165994 10.733476  4.541144            C           BB            0         DDDD        DDDDD
3 10.079877 11.674709 24.133511  6.970897  2.234517            B           BB          CCC         DDDD        CCCCC

Problem: But this does not work when I take the allowed values for the factor variables from the list instead of typing them out (no rows are returned):

  # Failing variant, kept as-is to reproduce the problem.
  # Single brackets keep the list wrapper: my_list[6] is a one-element list,
  # not the character vector stored inside it. The factor columns are therefore
  # matched against the list itself rather than against "B", "C", etc., every
  # %in% test is FALSE, and filter() returns zero rows.
  select =  filter(my_data[,-1],
                        num_var_1 < my_list[1] &
                          num_var_2 < my_list[2] &
                          num_var_3 < my_list[3] &
                          num_var_4 < my_list[4] &
                          num_var_5 < my_list[5] & 
                          factor_var_1 %in%  my_list[6] &
                          factor_var_2 %in%  my_list[7] &
                          factor_var_3 %in%  my_list[8] &
                          factor_var_4 %in%  my_list[9] &
                          factor_var_5 %in%  my_list[10])

 >select
 [1] num_var_1    num_var_2    num_var_3    num_var_4    num_var_5    factor_var_1 factor_var_2 factor_var_3 factor_var_4 factor_var_5
<0 rows> (or 0-length row.names)

Can someone please show me how to fix this problem?

Thanks!

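Use double brackets (`[[ ]]`) to extract the contents of a list element. `my_list[6]` returns a list of length one, whereas `my_list[[6]]` returns the character vector stored inside it, which is what `%in%` needs to match against.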

# Select the rows of my_data that satisfy every condition stored in my_list:
# elements 1-5 are upper bounds for the numeric columns, elements 6-10 are the
# allowed levels for the factor columns.
#
# `[[ ]]` is used for every element so each comparison sees the stored value
# (a number or a character vector) rather than a one-element list. With single
# brackets the %in% tests never match, and the `<` tests only work through
# implicit coercion of the list.
selected_data <- filter(
  my_data[, -1],  # drop the id column before filtering
  num_var_1 < my_list[[1]] &
    num_var_2 < my_list[[2]] &
    num_var_3 < my_list[[3]] &
    num_var_4 < my_list[[4]] &
    num_var_5 < my_list[[5]] &
    factor_var_1 %in% my_list[[6]] &
    factor_var_2 %in% my_list[[7]] &
    factor_var_3 %in% my_list[[8]] &
    factor_var_4 %in% my_list[[9]] &
    factor_var_5 %in% my_list[[10]]
)

You should also avoid using select as a variable name, as this is already a (commonly used) function exported by dplyr.
Best,
Valentin

1 Like

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.