I am working with the R programming language.
Part 1: I have the following data set ("my_data"):
num_var_1 <- abs(rnorm(1000, 10, 1))
num_var_2 <- abs(rnorm(1000, 10, 5))
num_var_3 <- abs(rnorm(1000, 10, 10))
num_var_4 <- abs(rnorm(1000, 10, 10))
num_var_5 <- abs(rnorm(1000, 10, 10))
factor_1 <- c("0","B", "C")
factor_2 <- c("0","BB", "CC")
factor_3 <- c("0","BBB", "CCC", "DDD")
factor_4 <- c("0","BBBB", "CCCC", "DDDD", "EEEE")
factor_5 <- c("BBBBB", "CCCCC", "DDDDD", "EEEEE", "FFFFFF")
factor_var_1 <- as.factor(sample(factor_1, 1000, replace=TRUE, prob=c(0.3, 0.5, 0.2)))
factor_var_2 <- as.factor(sample(factor_2, 1000, replace=TRUE, prob=c(0.5, 0.3, 0.2)))
factor_var_3 <- as.factor(sample(factor_3, 1000, replace=TRUE, prob=c(0.5, 0.2, 0.2, 0.1)))
factor_var_4 <- as.factor(sample(factor_4, 1000, replace=TRUE, prob=c(0.5, 0.2, 0.1, 0.1, 0.1)))
factor_var_5 <- as.factor(sample(factor_5, 1000, replace=TRUE, prob=c(0.3, 0.2, 0.1, 0.1, 0.1)))
id = 1:1000
my_data = data.frame(id,num_var_1, num_var_2, num_var_3, num_var_4, num_var_5, factor_var_1, factor_var_2, factor_var_3, factor_var_4, factor_var_5)
> head(my_data)
id num_var_1 num_var_2 num_var_3 num_var_4 num_var_5 factor_var_1 factor_var_2 factor_var_3 factor_var_4 factor_var_5
1 1 9.427387 13.367001 19.5313274 19.40222609 8.084350 C CC 0 0 FFFFFF
2 2 10.885915 6.763050 1.6141172 7.85581615 33.810473 0 CC DDD EEEE FFFFFF
3 3 10.782206 15.779463 11.6990473 19.62717981 19.623990 0 BB BBB 0 EEEEE
4 4 9.483538 6.574688 4.7517693 4.71974413 5.273193 C BB CCC EEEE EEEEE
5 5 11.554266 10.141942 9.9191620 0.06409279 12.613053 C CC 0 0 BBBBB
6 6 7.909920 2.391006 0.2375911 20.86201685 10.246962 B 0 CCC 0 EEEEE
Part 2: I have the following list ("my_list"):
> my_list
$x1
[1] 10.99873
$x2
[1] 14.4339
$x3
[1] 31.6395
$x4
[1] 25.10335
$x5
[1] 5.566093
[[6]]
[1] "B" "C"
[[7]]
[1] "BB" "CC"
[[8]]
[1] "0" "CCC"
[[9]]
[1] "DDDD" "EEEE"
[[10]]
[1] "0" "CCCCC" "DDDDD"
> str(my_list)
List of 10
$ x1: num 11
$ x2: num 14.4
$ x3: num 31.6
$ x4: num 25.1
$ x5: num 5.57
$ : chr [1:2] "B" "C"
$ : chr [1:2] "BB" "CC"
$ : chr [1:2] "0" "CCC"
$ : chr [1:2] "DDDD" "EEEE"
$ : chr [1:3] "0" "CCCCC" "DDDDD"
My Question: There are 10 elements in the list and there are 10 variables in the data frame (excluding ID) - I am trying to select rows from the data frame based on the conditions in the list. For example, here I do this manually:
library(dplyr)
select = filter(my_data[,-1],
num_var_1 < my_list[1] &
num_var_2 < my_list[2] &
num_var_3 < my_list[3] &
num_var_4 < my_list[4] &
num_var_5 < my_list[5] &
factor_var_1 %in% c("B","C") &
factor_var_2 %in% c("BB","CC") &
factor_var_3 %in% c("0","CCC") &
factor_var_4 %in% c("DDDD","EEEE") &
factor_var_5 %in% c("0","CCCCC", "DDDDD"))
>select
num_var_1 num_var_2 num_var_3 num_var_4 num_var_5 factor_var_1 factor_var_2 factor_var_3 factor_var_4 factor_var_5
1 9.399423 12.366828 8.267739 21.996184 3.064587 B BB 0 EEEE CCCCC
2 10.172698 1.722735 25.165994 10.733476 4.541144 C BB 0 DDDD DDDDD
3 10.079877 11.674709 24.133511 6.970897 2.234517 B BB CCC DDDD CCCCC
Problem: But this does not work for the factor variables:
select = filter(my_data[,-1],
num_var_1 < my_list[1] &
num_var_2 < my_list[2] &
num_var_3 < my_list[3] &
num_var_4 < my_list[4] &
num_var_5 < my_list[5] &
factor_var_1 %in% my_list[6] &
factor_var_2 %in% my_list[7] &
factor_var_3 %in% my_list[8] &
factor_var_4 %in% my_list[9] &
factor_var_5 %in% my_list[10])
>select
[1] num_var_1 num_var_2 num_var_3 num_var_4 num_var_5 factor_var_1 factor_var_2 factor_var_3 factor_var_4 factor_var_5
<0 rows> (or 0-length row.names)
Can someone please show me how to fix this problem?
Thanks!