[Arrow/dplyr] `case_when` fails when `if_else` succeeds

Hi Posit users.

I submitted an issue about this to the arrow devs:

But this is not dissimilar to something I asked in one of my previous questions and I found it interesting that it was behaving one way with if_else and another with case_when and was wondering if anyone had any advice or had seen this before.

I have a large amount of data that needs to be queried in a specific way using a combination of hard (exact matches) and soft (begins with) string matches based on another condition (e.g. value of "class" in another column).

I was recently able to get this working via if_else with many other singular instances where the queries were smaller, but with this larger dataset I need to recruit case_when for a set of larger, conditional mutations based on which "database" the query belongs to.

However, every time I do this, the package seems to throw an error that it cannot be run in arrow and needs to be pulled into R:

FilteredOutput <- LargeDataset |> 
select(ID, BeginDate, EndDate, class, identity_string, database) |> 
distinct() |> 
mutate(Foundate = as.Date("1700-01-01"), Logdate = case_when((((class == "A" & arrow_is_in(identity_string,options = list(value_set = Array$create(HardA), skip_nulls = TRUE))) | (class == "A" & str_starts(identity_string, WildA)) | (class == "B" & arrow_is_in( identity_string, options = list(value_set = Array$create(HardB),skip_nulls = TRUE))) | (class == "C" & arrow_is_in(identity_string, options = list(value_set = Array$create(CHard),skip_nulls = TRUE)))) & database == "secondary") ~ EndDate, database == "primary" ~ as.Date(as.integer(BeginDate) + 30), .default = Foundate))

Error: Expression case_when((((arrow_is_in(identity_string, options = list(value_set = Array$create(HardA), ... not supported in Arrow
Call collect() first to pull data into R

I assumed this was some kind of issue with doing a date conversion in arrow, which I realize does not have hard-mapped tidyverse commands for doing mathematical operations on dates (e.g. I need to add 30 days to BeginDate and the like, hence why I used Foundate as an imputed placeholder that would not be possible to remove later.

However, oddly enough, every single one of these other options seem to work:

#Works (adding to date permitted, no case_when)
FilteredOutput <- LargeDataset |> 
select(ID, BeginDate, EndDate, class, identity_string, database) |> 
distinct() |> 
mutate(Logdate = if_else(database == "primary", as.Date(as.integer(BeginDate) + 30), EndDate)) |> as_arrow_table()

#Works (no case_when, no math performed on date)
FilteredOutput <- LargeDataset |> 
select(ID, BeginDate, EndDate, class, identity_string, database) |> 
distinct() |> 
mutate(Logdate = if_else((((class == "A" & arrow_is_in(identity_string,options = list(value_set = Array$create(HardA), skip_nulls = TRUE))) | (class == "A" & str_starts(identity_string, WildA)) | (class == "B" & arrow_is_in(identity_string, options = list(value_set = Array$create(HardB),skip_nulls = TRUE))) | (class == "C" & arrow_is_in(identity_string, options = list(value_set = Array$create(CHard),skip_nulls = TRUE)))) & database == "secondary"), EndDate, NA)) |> as_arrow_table()

#Works (no case when, adding to date, permits operations on date)
FilteredOutput <- LargeDataset |> 
select(ID, BeginDate, EndDate, class, identity_string, database) |> 
distinct() |> 
mutate(Foundate = as.Date("1700-01-01"), Logdate = if_else(database == "primary", as.Date(as.integer(BeginDate) + 30), Foundate)) |> as_arrow_table()


#This Works(HAS case_when, but value for met criteria is an integer, not a date)
> FilteredOutput <- LargeDataset |> 
select(ID, BeginDate, EndDate, class, identity_string, database) |> 
distinct() |> 
mutate(Foundate = as.Date("1700-01-01"), Logdate = case_when((((class == "A" & arrow_is_in(identity_string,options = list(value_set = Array$create(HardA), skip_nulls = TRUE))) | (class == "A" & str_starts(identity_string, WildA)) | (class == "B" & arrow_is_in( identity_string, options = list(value_set = Array$create(HardB),skip_nulls = TRUE))) | (class == "C" & arrow_is_in(identity_string, options = list(value_set = Array$create(CHard),skip_nulls = TRUE)))) & database == "secondary") ~ 1, database == "primary" ~ 0, .default = 2)) |> as_arrow_table()


#This Works (no case_when, saves as date with no math)
FilteredOutput <- LargeDataset |> 
select(ID, BeginDate, EndDate, class, identity_string, database) |> 
distinct() |> 
mutate(Foundate = as.Date("1700-01-01"), Logdate = if_else((((class == "A" & arrow_is_in(identity_string,options = list(value_set = Array$create(HardA), skip_nulls = TRUE))) | (class == "A" & str_starts(identity_string, WildA)) | (class == "B" & arrow_is_in(identity_string, options = list(value_set = Array$create(HardB),skip_nulls = TRUE))) | (class == "C" & arrow_is_in(identity_string, options = list(value_set = Array$create(CHard),skip_nulls = TRUE)))) & database == "secondary"), EndDate, Foundate)) |> as_arrow_table()

Am I facing some sort of syntax issue or is there a limitation when attempting to set case_when conditions with regards to dates?

Based on a previous thread:

I think it might be another SQL limitation?

Thank you in advance for any suggestions!

It appears this one didn't run either:

FilteredOutput <- LargeDataset |> 
select(ID, BeginDate, EndDate, class, identity_string, database) |> 
distinct() |> 
mutate(Foundate = as.Date("1700-01-01"), Logflag = case_when((((class == "A" & arrow_is_in(identity_string,options = list(value_set = Array$create(HardA), skip_nulls = TRUE))) | (class == "A" & str_starts(identity_string, WildA)) | (class == "B" & arrow_is_in(identity_string, options = list(value_set = Array$create(HardB),skip_nulls = TRUE))) | (class == "C" & arrow_is_in(identity_string, options = list(value_set = Array$create(HardC),skip_nulls = TRUE)))) & database == "secondary") ~ 1, database == "primary" ~ 2, .default = 0)) |> filter(Logflag > 0) |> mutate(Logdate = if_else(Logflag == 2 ~ (as.numeric(BeginDate)+30), as.numeric(EndDate)))
Error: Expression if_else(Logflag == 2 ~ (as.numeric(BeginDate) + 30), as.numeric(EndDate)) not supported in Arrow
Call collect() first to pull data into R.

In this case, I specifically attempted to cast the date as a numeric value and then convert it later after flagging what needs to be filtered and converted outside of arrow, but there were some subsequent issues when I tried to trim down the size of the dataset. Specifically, every time I tried to either re-cast or pull the date-to-numeric transformed data to do operations on it, it failed to run:

#Works
Filtertab <- LargeDataset |> 
select(ID, BeginDate, EndDate, class, identity_string, database) |> 
distinct() |> 
mutate(BeginDate = as.numeric(BeginDate), EndDate = as.numeric(EndDate), Logdate = case_when((((class == "A" & arrow_is_in(identity_string,options = list(value_set = Array$create(HardA), skip_nulls = TRUE))) | (class == "A" & str_starts(identity_string, WildA)) | (class == "B" & arrow_is_in( identity_string, options = list(value_set = Array$create(HardB),skip_nulls = TRUE))) | (class == "C" & arrow_is_in(identity_string, options = list(value_set = Array$create(HardC),skip_nulls = TRUE)))) & database == "primary") ~ EndDate, database == "secondary" ~ (BeginDate+30), .default = NA))


#Fails
Filtertab <- LargeDataset |> 
select(ID, BeginDate, EndDate, class, identity_string, database) |> 
distinct() |> 
mutate(BeginDate = as.numeric(BeginDate), EndDate = as.numeric(EndDate), Logdate = case_when((((class == "A" & arrow_is_in(identity_string,options = list(value_set = Array$create(HardA), skip_nulls = TRUE))) | (class == "A" & str_starts(identity_string, WildA)) | (class == "B" & arrow_is_in( identity_string, options = list(value_set = Array$create(HardB),skip_nulls = TRUE))) | (class == "C" & arrow_is_in(identity_string, options = list(value_set = Array$create(HardC),skip_nulls = TRUE)))) & database == "primary") ~ EndDate, database == "secondary" ~ (BeginDate+30), .default = NA)) |>
mutate(BeginDate = as.date(BeginDate), EndDate = as.date(EndDate), Logdate = as.date(Logdate))
ℹ In index: 2.
ℹ With name: BeginDate.
Caused by error:
! NotImplemented: Unsupported cast from date32[day] to double using function cast_double
Run `rlang::last_trace()` to see where the error occurred.

#Fails
Filtertab <- LargeDataset |> 
select(ID, BeginDate, EndDate, class, identity_string, database) |> 
distinct() |> 
mutate(BeginDate = as.numeric(BeginDate), EndDate = as.numeric(EndDate), Logdate = case_when((((class == "A" & arrow_is_in(identity_string,options = list(value_set = Array$create(HardA), skip_nulls = TRUE))) | (class == "A" & str_starts(identity_string, WildA)) | (class == "B" & arrow_is_in( identity_string, options = list(value_set = Array$create(HardB),skip_nulls = TRUE))) | (class == "C" & arrow_is_in(identity_string, options = list(value_set = Array$create(HardC),skip_nulls = TRUE)))) & database == "primary") ~ EndDate, database == "secondary" ~ (BeginDate+30), .default = NA)) |>
filter((BeginDate <= EndDate) & (EndDate <= Logdate)) |>
group_by(ID,BeginDate) |>
summarize(Logdate = max(Logdate))
ℹ In index: 2.
ℹ With name: BeginDate.
Caused by error:
! NotImplemented: Unsupported cast from date32[day] to double using function cast_double
Run `rlang::last_trace()` to see where the error occurred.
	
#Fails
Filtertab <- LargeDataset |> 
select(ID, BeginDate, EndDate, class, identity_string, database) |> 
distinct() |> 
mutate(DDt = as.numeric(BeginDate), BDate = as.numeric(EndDate), Logdate = case_when((((class == "A" & arrow_is_in(identity_string,options = list(value_set = Array$create(HardA), skip_nulls = TRUE))) | (class == "A" & str_starts(identity_string, WildA)) | (class == "B" & arrow_is_in( identity_string, options = list(value_set = Array$create(HardB),skip_nulls = TRUE))) | (class == "C" & arrow_is_in(identity_string, options = list(value_set = Array$create(HardC),skip_nulls = TRUE)))) & database == "primary") ~ BDate, database == "secondary" ~ (DDt+30), .default = NA)) |>
filter((DDt <= BDate) & (BDate <= Logdate)) |>
group_by(ID,BeginDate) |>
summarize(Logdate = max(Logdate))
Error in `map()`:
ℹ In index: 7.
ℹ With name: DDt.
Caused by error:
! NotImplemented: Unsupported cast from date32[day] to double using function cast_double
Run `rlang::last_trace()` to see where the error occurred.

I think assigning/pulling dates in general may be an issue.

Going to see if I can do a conversion in the original LargeDataset pointer and see if that works.