[dplyr] Generate columns from a list using mutate

Hi Posit users.

I have a list of about 47 individual variables I need to generate and I would rather not have to make individual mutate calls for each of them.

I think have a method that can be called as a function to do the operations I need for each one, but I can't find anything in the threads or the documentation describing how to run mutate to generate new columns from a list of variables or criteria, nor do I see any ways to do this as part of a for loop.

A template on where to start, or any other help on how to accomplish this task would be appreciated!

I think I found that using the glue syntax will work, based on this Stack thread I found:

Seems to do the job.

Could you give a small example of what you mean?

Apologies.

This took a minute to generate.

Imagine the following data:

df <- structure(list(ID = c("001", "002", "003", "004", "005", "006", 
"007", "008", "009", "010", "011", "012", "013", "014", "015", 
"016", "017", "018", "019", "020", "021", "022", "023", "024", 
"025", "026", "027", "028", "029", "030", "031", "032", "033", 
"034", "035", "036", "037", "038", "039", "040", "041", "042", 
"043", "044", "045", "046", "047", "048", "049", "050", "051", 
"052", "053", "054", "055", "056", "057", "058", "059", "060", 
"061", "062", "063", "064", "065", "066", "067", "068", "069", 
"070", "071", "072", "073", "074", "075", "076", "077", "078", 
"079", "080", "081", "082", "083", "084", "085", "086", "087", 
"088", "089", "090", "091", "092", "093", "094", "095", "096", 
"097", "098", "099", "100"), String = c("00H1", "00E7", "00C5", 
"00C3", "00I1", "00C", "00E6", "00M5", "00D3", "00B1", "00C1", 
"00G7", "00A5", "00J3", "00G1", "00J", "00F6", "00A5", "00D3", 
"00E1", "00J1", "00J7", "00C5", "00G3", "00H1", "00M", "00E6", 
"00B5", "00H3", "00D1", "00H1", "00G7", "00J5", "00F3", "00H1", 
"00J", "00M6", "00G5", "00L3", "00C1", "00H1", "00E7", "00F5", 
"00A3", "00K1", "00J", "00G6", "00F5", "00L3", "00F1", "00H1", 
"00L7", "00A5", "00G3", "00E1", "00B", "00J6", "00C5", "00I3", 
"00C1", "00B1", "00A7", "00F5", "00K3", "00M1", "00C", "00K6", 
"00H5", "00K3", "00I1", "00H1", "00D7", "00K5", "00D3", "00M1", 
"00I", "00K6", "00L5", "00A3", "00M1", "00I1", "00F7", "00E5", 
"00H3", "00E1", "00I", "00D6", "00J5", "00K3", "00D1", "00A1", 
"00D7", "00E5", "00D3", "00H1", "00J", "00C6", "00M5", "00E3", 
"00I1")), row.names = c(NA, -100L), class = "data.frame")

With 0/1 binarization based upon the presence or absence of a string, with either an exact match or beginning with said string (wildcard in this case == 0 for exact matches and 1 for starts_with):

filtercrit <- structure(list(Var_name = c("Flag_A", "Flag_A", "Flag_A", "Flag_A", 
"Flag_B", "Flag_B", "Flag_B", "Flag_B", "Flag_B", "Flag_C", "Flag_C", 
"Flag_C", "Flag_C"), String = c("00A", "00B", "00C", "00D", "00E", 
"00F", "00G", "00H", "00I", "00J", "00K", "00L", "00M"), wildcard = c(0L, 
1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 1L)), class = "data.frame", row.names = c(NA, 
-13L))

In theory, I would make a command like this one to individually call each Flag_X variable and set it to 1 or 0 based on the presence or absence of the partial or exact strings like so:

df2 <- df |> mutate(
  Flag_A = if_else(((
    String %in% (
      filtercrit |> filter(Var_name == "Flag_A", wildcard == 0) |> pull(String)
    )
  ) |
    (
      str_starts(String, paste((
        filtercrit |> filter(Var_name == "Flag_A", wildcard != 0) |> pull(String)
      ), collapse = "|"))
    )), 1, 0, missing = 0),
  Flag_B = if_else(((
    String %in% (
      filtercrit |> filter(Var_name == "Flag_B", wildcard == 0) |> pull(String)
    )
  ) |
    (
      str_starts(String, paste((
        filtercrit |> filter(Var_name == "Flag_B", wildcard != 0) |> pull(String)
      ), collapse = "|"))
    )), 1, 0, missing = 0),
  Flag_C = if_else(((
    String %in% (
      filtercrit |> filter(Var_name == "Flag_C", wildcard == 0) |> pull(String)
    )
  ) |
    (
      str_starts(String, paste((
        filtercrit |> filter(Var_name == "Flag_C", wildcard != 0) |> pull(String)
      ), collapse = "|"))
    )), 1, 0, missing = 0)
)

But in reality I have well over 70 variables I need to do this with so hand-writing every if_else is not ideal.

I tried the solution in the Stack link of using the glue syntax, and this works in base R/tidyr:

df2 <- df

for(n in 1:length(unique(filtercrit$Var_name))) {
  Temp <- filtercrit |> filter(Var_name == unique(filtercrit$Var_name)[n])
  Wildcrit <- paste((Temp |> filter(wildcard == 1) |> pull(String)), collapse = "|")
  Hardcrit <- Temp |> filter(wildcard != 1) |> pull(String)
  
  if (Wildcrit == "") {
    df2 <- df2 |> mutate("{unique(filtercrit$Var_name)[n]}" := if_else((String %in% Hardcrit), 1, 0, missing =
                                                                        0))
  } else
    df2 <- df2 |> mutate("{unique(filtercrit$Var_name)[n]}" := if_else(((
      String %in% Hardcrit
    ) | (
      str_starts(String, Wildcrit)
    )), 1, 0, missing = 0))
}

And even in the arrow workflow I run it seems to work:

df_arrow <- as_arrow_table(df)

for(n in 1:length(unique(filtercrit$Var_name))) {
  Temp <- filtercrit |> filter(Var_name == unique(filtercrit$Var_name)[n])
  Wildcrit <- paste((Temp |> filter(wildcard == 1) |> pull(String)), collapse = "|")
  Hardcrit <- Temp |> filter(wildcard != 1) |> pull(String)
  
  if (Wildcrit == "") {
    df_arrow <- df_arrow |> mutate("{unique(filtercrit$Var_name)[n]}" := if_else((String %in% Hardcrit), 1, 0, missing =
                                                                              0))
  } else
    df_arrow <- df_arrow |> mutate("{unique(filtercrit$Var_name)[n]}" := if_else(((
      String %in% Hardcrit
    ) | (
      str_starts(String, Wildcrit)
    )), 1, 0, missing = 0))
}

However, in my actual workflow, it keeps throwing an error:


subsample_data <- reference_data |> mutate("{unique(Vartable$variable)[n]}" := if_else((Class == "A" &
                                                                                          String %in% HardA) |
                                                                                         (Class == "B" &
                                                                                            String %in% HardB) |
                                                                                         (Class == "C" &
                                                                                            String %in% HardC) |
                                                                                         (Class == "D" &
                                                                                            String %in% HardD),
                                                                                       1,
                                                                                       0,
                                                                                       missing = 0
)) |> group_by(ID1, ID2) |> summarize("{unique(Vartable$variable)[n]}" := max("{unique(Vartable$variable)[n]}")) |> ungroup() |> right_join(subsample_data, by = c("ID1", "ID2"))

> rlang::last_trace()
<error/purrr_error_indexed>
Error in `map()`:
ℹ In index: 5.
ℹ With name: Flag_A.
Caused by error:
! Type error: Array type doesn't match type of values set: string vs struct<variable: string, Class: string, String: string, wildcard: double>
---
Backtrace:
     ▆
  1. ├─dplyr::right_join(...)
  2. ├─dplyr::ungroup(...)
  3. ├─dplyr::summarize(...)
  4. ├─dplyr::group_by(...)
  5. └─arrow:::group_by.arrow_dplyr_query(...)
  6.   ├─dplyr::mutate(.data, !!!final_groups)
  7.   └─arrow:::mutate.arrow_dplyr_query(.data, !!!final_groups)
  8.     └─arrow:::compute_by(...)
  9.       └─arrow:::eval_select_by(by, data, error_call = error_call)
 10.         ├─base::as.data.frame(implicit_schema(data))
 11.         └─arrow:::implicit_schema(data)
 12.           └─purrr::map(.data$selected_columns, ~.$type(old_schm))
 13.             └─purrr:::map_("list", .x, .f, ..., .progress = .progress)
 14.               ├─purrr:::with_indexed_errors(...)
 15.               │ └─base::withCallingHandlers(...)
 16.               ├─purrr:::call_with_cleanup(...)
 17.               └─arrow (local) .f(.x[[i]], ...)
 18.                 └─.$type(old_schm)
 19.                   └─arrow:::compute___expr__type(self, schema)
Run rlang::last_trace(drop = FALSE) to see 4 hidden frames.

This looks like an arrow specific schema disagreement of some kind.

Having gone through their git issues, schema mismatch seems to be the issue:

Basically, %in% is not being processed the same way due to the missmatch.

Nope.

Syntax issue on my part.

That said, the glue syntax is killing me:

<other stuff> |> summarize("{unique(Vartable$variable)[n]}" := max("{unique(Vartable$variable)[n]}")) |> as_arrow_table()

Error: cols must be a list of "Expression"

Would anyone happen to know how to pass this correctly?

This looks like you're trying to apply max() to a string — is that what you intend?

Yes and no.

The full command uses the glue syntax (see: Transforming user-derived variables: Programming with dplyr) to generate some variables from a vector of information from another dataframe called Vartable, where the name of the variable I want generated is the value for variable under Vartable, grade represents vector of strings filtered from the table using pull:

output <- Omnibus

for(n in 1:length(unique(Vartable$variable))){
  Temp <- Vartable |> filter(variable == unique(Vartable$variable)[n])
  Hard9 <- Temp |> filter(class == "09") |> pull (grade)
  Hard10 <- Temp |> filter(class == "10") |> pull (grade)
  
  output <- Omnibus |> mutate("{unique(Vartable$variable)[n]}" := if_else((class == "09" & arrow_is_in(grade, options = list(value_set = Array$create(Hard9), skip_nulls = TRUE))) | (class == "10" & arrow_is_in(grade, options = list(value_set = Array$create(Hard10), skip_nulls = TRUE))),1,0,missing=0)) |> group_by(PatId,DeliveryDt) |> summarize("{unique(Vartable$variable)[n]}" := max("{unique(Vartable$variable)[n]}")) |> ungroup() |> right_join(output, by = c("PatId","DeliveryDt"))  
}

The mutate call to create the variable works just fine with "{unique(Vartable$variable)[n]}" := , but it fails to evaluate the mutated output once it gets to summarize and max whether I use the glue syntax, !!, as.name, eval(), or other similar calls, regardless of whether the string is saved as a separate variable or called directly with the glue syntax.

It's not arrow specific either as pulling it in as a tibble throws the same error.

its fair enough that you are using glue syntax to make the variable names, yet you would need to use additional syntax to have your constructed text from glue be understood as a symbolic name to be used indirectly rather than to use the literal text you pass.

Consider this artificial example

library(tidyverse)
library(glue)

nlist <- c("foo","bar")

for (i in 1) {
  
  print("string not symbol")
  iris |> mutate( "{nlist[i]}_{i}" := i*Petal.Width) |>
    summarise("{nlist[i]}_{i}" := max("{nlist[i]}_{i}") , .by=Species) |> print()
  
  print("string to symbol")
  iris |> mutate( "{nlist[i]}_{i}" := i*Petal.Width) |>
    summarise("{nlist[i]}_{i}" := max(!!sym(glue("{nlist[i]}_{i}"))) , .by=Species) |> print()
  
  print("string up front")
  varname <- glue("{nlist[i]}_{i}")
  iris |> mutate( {{varname}} := i*Petal.Width) |>
    summarise({{varname}} := max(!!sym(varname)) , .by=Species) |> print()
  
}
2 Likes

Wonderful example that makes perfect sense!

Thanks @nirgrahamuk!

I think I'd take a different approach here.

using your filtercrit and df as defined above, get your regex search strings combined together, then one mutate and a pivot

fc <-
  filtercrit %>%
  mutate(string = 
           if_else(
             as.logical(wildcard),
             # wildcard 1 == starts with
             paste0("^", String), 
             # wildcard 0 == complete string
             paste0("^", String, "$") 
             )
         ) %>%
  group_by(Var_name) %>%
  # get individual regex strings per variable
  summarise(string = paste0(string, collapse = "|")) 

# get that as a vector
criteria <- setNames(fc$string, fc$Var_name)
criteria
#>                       Flag_A                       Flag_B 
#>      "^00A$|^00B|^00C|^00D$" "^00E$|^00F$|^00G|^00H|^00I" 
#>                       Flag_C 
#>     "^00J$|^00K$|^00L$|^00M"


# one mutate call
df %>%
  mutate(
    results = map(String, ~set_names(str_detect(.x, criteria), names(criteria)))
  ) %>%
  # reshape
  unnest_wider(results) %>% 
  head()
#> # A tibble: 6 × 5
#>   ID    String Flag_A Flag_B Flag_C
#>   <chr> <chr>  <lgl>  <lgl>  <lgl> 
#> 1 001   00H1   FALSE  TRUE   FALSE 
#> 2 002   00E7   FALSE  FALSE  FALSE 
#> 3 003   00C5   TRUE   FALSE  FALSE 
#> 4 004   00C3   TRUE   FALSE  FALSE 
#> 5 005   00I1   FALSE  TRUE   FALSE 
#> 6 006   00C    TRUE   FALSE  FALSE
1 Like

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.