Apply lists of conditional statements to a dataset. RStudio

Hello Community,

I have a large dataset to which I need to apply a list of hundreds of pre-defined edits. The edit conditions are stored in a table and need to be run sequentially.

I have tried unsuccessfully to paste the conditions as strings into nested IFELSE functions. It works if I hardcode each one manually, but I need it to run from the central list of edit conditions.

What is a good method of importing the list of edits automatically and then applying them to the data iteratively?

Thanks for any help.


##########################################################################
# Here are some examples of edit conditions.
# 1. IF var1 == 0 THEN var1 = var2 ELSE do nothing
# 2. IF var2 > 0 & var3 > 0 THEN var1 = var2/var3 ELSE var2 = var3
# 3. IF abs(var2-var3)/var4 > 0.2 THEN var1 = 0 & var2 = var3 ELSE var2 = 0 & var3 = var1
# The format of the table can be changed if need be.
edit_conditions <- 
  tibble::tribble(
    ~EditID,    ~IF_Condition,               ~THEN_Function,                ~ELSE_Function,
    "E1",       "var1 == 0",                 "var1 = sum(var2, var3)",      "var1 = var1", 
    "E2",       "var2 > 0 & var3 > 0",       "var1 = var2/var3",            "var2 = var3",
    "E3",       "abs(var2-var3)/var4 > 0.2", "var1 = 0 & var2 = var3",      "var2 = 0 & var3 = var1"
  )
##########################################################################
# Here is some test data.
data <- 
  tibble::tribble(
    ~UnitID, ~var1, ~var2, ~var3, ~var4,
    1L,      0L,    0L,    20L,    9L,
    2L,      1L,    1L,    21L,    8L,
    3L,      0L,    2L,    22L,    7L,
    4L,      1L,    3L,    23L,    6L,
    5L,      0L,    4L,    24L,    5L,
    6L,      1L,    5L,    25L,    4L,
    7L,      0L,    6L,    26L,    6L,
    8L,      1L,    7L,    27L,    7L,
    9L,      0L,    8L,    28L,    8L,
    10L,     1L,    9L,    29L,    9L
  )

Making ifelse statements from the edit_conditions table isn't hard

# conditions
ec <- 
tibble::tribble(
  ~EditID,    ~IF_Condition,               ~THEN_Function,                ~ESE_Function,
  "E1",       "var1 == 0",                 "var1 = sum(var2, var3)",      "var1 = var1", 
  "E2",       "var2 > 0 & var3 > 0",       "var1 = var2/var3",            "var2 = var3",
  "E3",       "abs(var2-var3)/var4 > 0.2", "var1 = 0 & var2 = var3",      "var2 = 0 & var3 = var1"
)

# none of the features of a tibble are
# needed

ec <- as.data.frame(ec[-1])

# Fashion ifelse statement as strings
f <- function(row) {
  paste0("ifelse(", row[1], ",", row[2], ",", row[3], ")")
}

# Apply the function to each row of ec
result <- apply(ec, 1, f)
result
#> [1] "ifelse(var1 == 0,var1 = sum(var2, var3),var1 = var1)"                           
#> [2] "ifelse(var2 > 0 & var3 > 0,var1 = var2/var3,var2 = var3)"                       
#> [3] "ifelse(abs(var2-var3)/var4 > 0.2,var1 = 0 & var2 = var3,var2 = 0 & var3 = var1)"

and applying those to the test data tibble can be done.

But the statements strings can be converted to expression and applied to the test data tibble. Before that, however, test E1:E3 by hand and see what you get.

With your suggestion, I am able to use that to create the statement as strings but am still stuck when it comes to applying them to the test data. I adjusted the THEN-ELSE functions to be in line with the IFELSE function parameters.

Here is what I get:

edit_conditions <- 
  tibble::tribble(
    ~EditID,    ~Variable,    ~IF_Condition,               ~THEN_Function,         ~ELSE_Function,
    "E1",       "var1",       "var1 == 0",                 "sum(var2, var3)",      "var1", 
    "E2",       "var1",       "var2 > 0 & var3 > 0",       "var2/var3",            "var3",
    "E3a",      "var1",       "abs(var2-var3)/var4 > 0.2", "0",                    "var1",
    "E3b",      "var2",       "abs(var2-var3)/var4 > 0.2", "var3",                 "0",
    "E3c",      "var3",       "abs(var2-var3)/var4 > 0.2", "var3",                 "var1"
  )

# Fashion ifelse statement as strings
f <- function(row) {
  paste0("ifelse(", row[3], ",", row[4], ",", row[5], ")")
}

# Apply the function to each row of ec
result <- apply(edit_conditions, 1, f)

# Applying the first edit to var1
data$var1 <- result[1]
data

   UnitID var1                                    var2  var3  var4
    <int> <chr>                                  <int> <int> <int>
 1      1 ifelse(var1 == 0,sum(var2, var3),var1)     0    20     9
 2      2 ifelse(var1 == 0,sum(var2, var3),var1)     1    21     8
 3      3 ifelse(var1 == 0,sum(var2, var3),var1)     2    22     7
 4      4 ifelse(var1 == 0,sum(var2, var3),var1)     3    23     6
 5      5 ifelse(var1 == 0,sum(var2, var3),var1)     4    24     5
 6      6 ifelse(var1 == 0,sum(var2, var3),var1)     5    25     4
 7      7 ifelse(var1 == 0,sum(var2, var3),var1)     6    26     6
 8      8 ifelse(var1 == 0,sum(var2, var3),var1)     7    27     7
 9      9 ifelse(var1 == 0,sum(var2, var3),var1)     8    28     8
10     10 ifelse(var1 == 0,sum(var2, var3),var1)     9    29     9

Here are a couple of the statements applied manually.

# Apply edits manually
data$var1 <- ifelse(data$var1 == 0, data$var2 + data$var3, data$var1)
data$var1 <- ifelse(data$var2 > 0 & data$var3 > 0, data$var2/data$var3, data$var3)

The manual method gives the correct results, but I need a way of automating the process. Please let me know what I am doing wrong.

Thanks for your help @technocrat .

Does this do what you need?

# Define the data frames ec and d
ec <- data.frame(
  UnitID = 1:10, var1 = c(
    "ifelse(var1==0,sum(var2,var3),var1)",
    "ifelse(var1==0,sum(var2,var3),var1)", "ifelse(var1==0,sum(var2,var3),var1)",
    "ifelse(var1==0,sum(var2,var3),var1)", "ifelse(var1==0,sum(var2,var3),var1)",
    "ifelse(var1==0,sum(var2,var3),var1)", "ifelse(var1==0,sum(var2,var3),var1)",
    "ifelse(var1==0,sum(var2,var3),var1)", "ifelse(var1==0,sum(var2,var3),var1)",
    "ifelse(var1==0,sum(var2,var3),var1)"
  ), var2 = 0:9, var3 = 20:29,
  var4 = c(9, 8, 7, 6, 5, 4, 6, 7, 8, 9)
)

d <- data.frame(
  UnitID = 1:10,
  var1 = c(
    0, 1, 0, 1, 0, 1,
    0, 1, 0, 1
  ),
  var2 = 0:9,
  var3 = 20:29,
  var4 = c(
    9, 8, 7,
    6, 5, 4, 6, 7, 8, 9
  )
)

# Create a function to apply the expressions in ec$var1 to the data frame d
apply_ec_var1 <- function(row, expr) {
  eval(parse(text = expr), 
       envir = list2env(as.list(row), 
               parent = globalenv()))
}

# Apply the function to each row of d
d_result <- d
for (i in 1:nrow(d)) {
  d_result[i, "var1"] =
    apply_ec_var1(d[i, ], ec$var1[i])
}

# display the result
d_result
#>    UnitID var1 var2 var3 var4
#> 1       1   20    0   20    9
#> 2       2    1    1   21    8
#> 3       3   24    2   22    7
#> 4       4    1    3   23    6
#> 5       5   28    4   24    5
#> 6       6    1    5   25    4
#> 7       7   32    6   26    6
#> 8       8    1    7   27    7
#> 9       9   36    8   28    8
#> 10     10    1    9   29    9

Created on 2023-09-09 with reprex v2.0.2

Thanks @technocrat

That is perfect!

The missing key for me was the "eval(parse(...".

Thanks so much for your quick responses.

1 Like

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.