Okay, I gave this some thought and arrived at this not very elegant but working(?) solution. There is definitely no ready-made dplyr function for this, and this is the best I have so far. Note that you will have to collect()
everything first, because this is non-standard filtering.
Basically we define two functions, `exists_duplicate()` and `exists_neighbor()`, that check for a given `x`, `y` and `date` whether there is a duplicate on the previous or next date, or a neighbor on the current or next date. We then apply these functions rowwise to each row of your dataset. Note that the `.` in the respective function call passes the whole dataset to the function, where it is then filtered to the relevant date to check for neighbors/duplicates. This also means that the approach is not very efficient, so I doubt you can easily run it on very large datasets.
I am also not completely sure that this solves your problem correctly, but it might get you started.
Best,
Valentin
# Rows selection by the rule
# function for duplicate detection
#
# Reports whether `df` contains a row with exactly the coordinates (`x`, `y`)
# on the date directly before or after `date` in `dates`.
#
# Arguments:
#   x, y   coordinates to look for (compared with `==`)
#   date   reference date; it is looked up in `dates`
#   dates  vector of dates; the adjacent entry is picked by position, so the
#          vector is expected to be sorted and free of repeats
#   df     in-memory data frame with columns `x`, `y` and `date`
#   dir    -1 to inspect the previous date, 1 (default) to inspect the next
#
# Returns TRUE or FALSE, or NA when there is no adjacent date to inspect
# (`date` is the first/last entry of `dates`, or is not found in `dates`).
# Stops with an error when `dir` is neither -1 nor 1.
exists_duplicate <- function(x, y, date, dates, df, dir = 1) {
  # dir is either -1 for the previous, or 1 for the next date
  if (!(dir %in% c(-1, 1))) {
    stop("dir must be -1 or 1")
  }

  # position of `date` within `dates`; which() drops NA comparisons
  idx <- which(date == dates)
  if (length(idx) == 0) {
    return(NA)
  }

  # choose either next or previous position depending on dir; only the first
  # hit is used so that repeated entries in `dates` cannot yield a vector
  target <- idx[1] + dir

  # dir == -1 at the first date, or dir == 1 at the last date
  if (target < 1 || target > length(dates)) {
    return(NA)
  }
  other_date <- dates[target]
  if (is.na(other_date)) {
    return(NA)
  }

  # One vectorised comparison over the columns instead of a filter pipeline
  # per call. Rows where the comparison is NA never count as a match.
  is_match <- df[["date"]] == other_date &
    df[["x"]] == x &
    df[["y"]] == y

  any(is_match, na.rm = TRUE) # if there are any duplicates, returns TRUE
}
# function for neighbor detection
#
# Reports whether `df` contains a row close to, but not exactly at, the
# coordinates (`x`, `y`) on the current or the next date. "Close" means that
# both the x and the y distance are at most `tol`.
#
# Arguments:
#   x, y   reference coordinates
#   date   reference date; it is looked up in `dates`
#   dates  vector of dates; the next entry is picked by position, so the
#          vector is expected to be sorted and free of repeats
#   df     in-memory data frame with columns `x`, `y` and `date`
#   dir    1 (default) to inspect the next date, 0 to inspect `date` itself
#   tol    largest absolute coordinate difference that still counts as a
#          neighbor (default 0.00015)
#
# Returns TRUE or FALSE, or NA when there is no date to inspect (`date` is
# the last entry of `dates` and dir == 1, or `date` is not found in `dates`).
# Stops with an error when `dir` is neither 1 nor 0.
exists_neighbor <- function(x, y, date, dates, df, dir = 1, tol = 0.00015) {
  # dir is either 1 for the next date or 0 for the current date
  if (!(dir %in% c(1, 0))) {
    stop("dir must be 1 or 0")
  }

  # position of `date` within `dates`; which() drops NA comparisons.
  # Without this guard an unknown or NA `date` would reach `if (is.na(...))`
  # with a zero-length value and fail.
  idx <- which(date == dates)
  if (length(idx) == 0) {
    return(NA)
  }

  # only the first hit is used so that repeated entries in `dates` cannot
  # yield a vector
  target_date <- dates[idx[1] + dir]

  # exception for dir == 1 and date == max(dates)
  if (is.na(target_date)) {
    return(NA)
  }

  on_date <- df[["date"]] == target_date
  close_by <- abs(df[["x"]] - x) <= tol & abs(df[["y"]] - y) <= tol
  # exact coordinate matches are duplicates (or the row itself), not neighbors
  same_spot <- df[["x"]] == x & df[["y"]] == y

  # Rows where the comparison is NA never count as a neighbor.
  any(on_date & close_by & !same_spot, na.rm = TRUE)
}
# Materialise the data locally; the row-by-row checks below run in R.
data_collected <- dataset %>%
  collect()

# Sorted unique dates, taken from the collected copy so that `dataset` does
# not have to be queried a second time.
dates <- data_collected %>%
  pull(date) %>%
  unique() %>%
  sort()

# Flag every row. The plain collected table (not the rowwise one) is handed
# to the helpers as the lookup table, and the result is ungrouped so later
# verbs do not silently keep operating row by row.
data_neighbor <-
  data_collected %>%
  rowwise() %>%
  mutate(
    has_duplicate =
      exists_duplicate(x, y, date, !!dates, data_collected, dir = 1),
    has_neighbor =
      exists_neighbor(x, y, date, !!dates, data_collected, dir = 1),
    is_duplicate =
      exists_duplicate(x, y, date, !!dates, data_collected, dir = -1),
    is_neighbor =
      exists_neighbor(x, y, date, !!dates, data_collected, dir = 0)
  ) %>%
  ungroup()

data_neighbor

# Rows selection by the rule. Flags that are NA (no adjacent date) do not
# select a row on their own, because filter() drops rows whose condition
# is NA.
data_neighbor %>%
  filter(
    !has_duplicate | # if there are no duplicates
      has_neighbor | # if there is a neighbor at date == date + 1
      is_neighbor # if there is a neighbor at date == date
  )