Apologies.
This took a minute to generate.
Imagine the following data:
df <- structure(list(ID = c("001", "002", "003", "004", "005", "006",
"007", "008", "009", "010", "011", "012", "013", "014", "015",
"016", "017", "018", "019", "020", "021", "022", "023", "024",
"025", "026", "027", "028", "029", "030", "031", "032", "033",
"034", "035", "036", "037", "038", "039", "040", "041", "042",
"043", "044", "045", "046", "047", "048", "049", "050", "051",
"052", "053", "054", "055", "056", "057", "058", "059", "060",
"061", "062", "063", "064", "065", "066", "067", "068", "069",
"070", "071", "072", "073", "074", "075", "076", "077", "078",
"079", "080", "081", "082", "083", "084", "085", "086", "087",
"088", "089", "090", "091", "092", "093", "094", "095", "096",
"097", "098", "099", "100"), String = c("00H1", "00E7", "00C5",
"00C3", "00I1", "00C", "00E6", "00M5", "00D3", "00B1", "00C1",
"00G7", "00A5", "00J3", "00G1", "00J", "00F6", "00A5", "00D3",
"00E1", "00J1", "00J7", "00C5", "00G3", "00H1", "00M", "00E6",
"00B5", "00H3", "00D1", "00H1", "00G7", "00J5", "00F3", "00H1",
"00J", "00M6", "00G5", "00L3", "00C1", "00H1", "00E7", "00F5",
"00A3", "00K1", "00J", "00G6", "00F5", "00L3", "00F1", "00H1",
"00L7", "00A5", "00G3", "00E1", "00B", "00J6", "00C5", "00I3",
"00C1", "00B1", "00A7", "00F5", "00K3", "00M1", "00C", "00K6",
"00H5", "00K3", "00I1", "00H1", "00D7", "00K5", "00D3", "00M1",
"00I", "00K6", "00L5", "00A3", "00M1", "00I1", "00F7", "00E5",
"00H3", "00E1", "00I", "00D6", "00J5", "00K3", "00D1", "00A1",
"00D7", "00E5", "00D3", "00H1", "00J", "00C6", "00M5", "00E3",
"00I1")), row.names = c(NA, -100L), class = "data.frame")
With 0/1 binarization based upon the presence or absence of a string, with either an exact match or beginning with said string (wildcard in this case == 0 for exact matches and 1 for starts_with):
filtercrit <- structure(list(Var_name = c("Flag_A", "Flag_A", "Flag_A", "Flag_A",
"Flag_B", "Flag_B", "Flag_B", "Flag_B", "Flag_B", "Flag_C", "Flag_C",
"Flag_C", "Flag_C"), String = c("00A", "00B", "00C", "00D", "00E",
"00F", "00G", "00H", "00I", "00J", "00K", "00L", "00M"), wildcard = c(0L,
1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 1L)), class = "data.frame", row.names = c(NA,
-13L))
In theory, I would make a command like this one to individually call each Flag_X variable and set it to 1 or 0 based on the presence or absence of the partial or exact strings like so:
df2 <- df |> mutate(
Flag_A = if_else(((
String %in% (
filtercrit |> filter(Var_name == "Flag_A", wildcard == 0) |> pull(String)
)
) |
(
str_starts(String, paste((
filtercrit |> filter(Var_name == "Flag_A", wildcard != 0) |> pull(String)
), collapse = "|"))
)), 1, 0, missing = 0),
Flag_B = if_else(((
String %in% (
filtercrit |> filter(Var_name == "Flag_B", wildcard == 0) |> pull(String)
)
) |
(
str_starts(String, paste((
filtercrit |> filter(Var_name == "Flag_B", wildcard != 0) |> pull(String)
), collapse = "|"))
)), 1, 0, missing = 0),
Flag_C = if_else(((
String %in% (
filtercrit |> filter(Var_name == "Flag_C", wildcard == 0) |> pull(String)
)
) |
(
str_starts(String, paste((
filtercrit |> filter(Var_name == "Flag_C", wildcard != 0) |> pull(String)
), collapse = "|"))
)), 1, 0, missing = 0)
)
But in reality I have well over 70 variables I need to do this with so hand-writing every if_else
is not ideal.
I tried the solution in the Stack link of using the glue
syntax, and this works in base R
/tidyr
:
df2 <- df
for(n in 1:length(unique(filtercrit$Var_name))) {
Temp <- filtercrit |> filter(Var_name == unique(filtercrit$Var_name)[n])
Wildcrit <- paste((Temp |> filter(wildcard == 1) |> pull(String)), collapse = "|")
Hardcrit <- Temp |> filter(wildcard != 1) |> pull(String)
if (Wildcrit == "") {
df2 <- df2 |> mutate("{unique(filtercrit$Var_name)[n]}" := if_else((String %in% Hardcrit), 1, 0, missing =
0))
} else
df2 <- df2 |> mutate("{unique(filtercrit$Var_name)[n]}" := if_else(((
String %in% Hardcrit
) | (
str_starts(String, Wildcrit)
)), 1, 0, missing = 0))
}
And even in the arrow workflow I run it seems to work:
df_arrow <- as_arrow_table(df)
for(n in 1:length(unique(filtercrit$Var_name))) {
Temp <- filtercrit |> filter(Var_name == unique(filtercrit$Var_name)[n])
Wildcrit <- paste((Temp |> filter(wildcard == 1) |> pull(String)), collapse = "|")
Hardcrit <- Temp |> filter(wildcard != 1) |> pull(String)
if (Wildcrit == "") {
df_arrow <- df_arrow |> mutate("{unique(filtercrit$Var_name)[n]}" := if_else((String %in% Hardcrit), 1, 0, missing =
0))
} else
df_arrow <- df_arrow |> mutate("{unique(filtercrit$Var_name)[n]}" := if_else(((
String %in% Hardcrit
) | (
str_starts(String, Wildcrit)
)), 1, 0, missing = 0))
}
However, in my actual workflow, it keeps throwing an error:
subsample_data <- reference_data |> mutate("{unique(Vartable$variable)[n]}" := if_else((Class == "A" &
String %in% HardA) |
(Class == "B" &
String %in% HardB) |
(Class == "C" &
String %in% HardC) |
(Class == "D" &
String %in% HardD),
1,
0,
missing = 0
)) |> group_by(ID1, ID2) |> summarize("{unique(Vartable$variable)[n]}" := max("{unique(Vartable$variable)[n]}")) |> ungroup() |> right_join(subsample_data, by = c("ID1", "ID2"))
> rlang::last_trace()
<error/purrr_error_indexed>
Error in `map()`:
ℹ In index: 5.
ℹ With name: Flag_A.
Caused by error:
! Type error: Array type doesn't match type of values set: string vs struct<variable: string, Class: string, String: string, wildcard: double>
---
Backtrace:
▆
1. ├─dplyr::right_join(...)
2. ├─dplyr::ungroup(...)
3. ├─dplyr::summarize(...)
4. ├─dplyr::group_by(...)
5. └─arrow:::group_by.arrow_dplyr_query(...)
6. ├─dplyr::mutate(.data, !!!final_groups)
7. └─arrow:::mutate.arrow_dplyr_query(.data, !!!final_groups)
8. └─arrow:::compute_by(...)
9. └─arrow:::eval_select_by(by, data, error_call = error_call)
10. ├─base::as.data.frame(implicit_schema(data))
11. └─arrow:::implicit_schema(data)
12. └─purrr::map(.data$selected_columns, ~.$type(old_schm))
13. └─purrr:::map_("list", .x, .f, ..., .progress = .progress)
14. ├─purrr:::with_indexed_errors(...)
15. │ └─base::withCallingHandlers(...)
16. ├─purrr:::call_with_cleanup(...)
17. └─arrow (local) .f(.x[[i]], ...)
18. └─.$type(old_schm)
19. └─arrow:::compute___expr__type(self, schema)
Run rlang::last_trace(drop = FALSE) to see 4 hidden frames.
This looks like an arrow
specific schema disagreement of some kind.