Quickly eliminate rows in a data frame grouped by variable X, based on "change" points in variable Y

Andrea · September 10, 2018, 3:32pm

My problem is a little bit difficult to explain. I have a dataframe my_df, grouped by variable character. I also have some more columns: in particular, I have a last_event column which retains the last event happened, and a column date_time. For each character, I want to find the "chain of events" which happened to her/him. This means looking at the last_event column and store only the rows where last_event changes values, i.e., the times where an event happens, while quickly removing all the other rows. Hopefully the example will make things more clear:

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following object is masked from 'package:base':
#> 
#>     date
library(tibble)

# set seed for reproducibility
set.seed(2)

create_input_dataframe <- function(){
  # generate input data frame
  character <- c("Jake", "Elwood", "Sister Mary")
  character <- factor(character, levels = character)
  m <- length(character)
  n <- 10^4
  ntot <- n * m
  stop_date <- now()
  period <- minutes(ntot) 
  start_date <- stop_date - period
  date_time <- seq(start_date, stop_date, length.out = ntot)
  event <- c("There was an earthquake", "A terrible flood", "Locusts")
  event <- factor(event, levels = event)
  x <- runif(ntot)
  y <- rnorm(ntot)
  probabilities <- rev(seq(0.1, 0.5, length.out = m))
  
  my_df <- data.frame(character  = sample(character, ntot, replace = TRUE, 
                                          prob = probabilities), 
                      last_event = sample(event, ntot, replace = TRUE),
                      x = x, 
                      y = y,
                      date_time = date_time)
  my_df$last_event[sample(seq_len(ntot), n/10)] <- NA
  return(my_df)
}
create_first_10_rows_of_output <- function(){
  top <- structure(list(rowname = c("1", "2", "4", "7", "10", "12", "16", 
  "17", "19", "20"), character = structure(c(1L, 1L, 1L, 1L, 1L, 
  1L, 1L, 1L, 1L, 1L), .Label = c("Jake", "Elwood", "Sister Mary"
  ), class = "factor"), last_event = structure(c(1L, 3L, 1L, 2L, 
  1L, 2L, 3L, 2L, NA, 3L), .Label = c("There was an earthquake", 
  "A terrible flood", "Locusts"), class = "factor"), x = c(0.18488225992769, 
  0.943839338840917, 0.833448815625161, 0.23889475944452, 0.976398489437997, 
  0.387549542589113, 0.962644048267975, 0.132372003281489, 0.868861036840826, 
  0.514281762996688), y = c(-0.820686814466583, -0.143422434763756, 
  -0.264525547407185, -1.47082074646914, -0.142277039055211, 0.0521512918828832, 
  0.336463740931448, -0.5464779779466, 1.82319608312785, -0.218062867744612
  ), date_time = structure(c(1534790107.19217, 1534790347.20017, 
  1534790527.20617, 1534790767.21417, 1534791067.22417, 1534791367.23417, 
  1534791787.24817, 1534791847.25017, 1534792087.25818, 1534792147.26018
  ), class = c("POSIXct", "POSIXt"), tzone = "")), row.names = c(1L, 
  2L, 4L, 7L, 10L, 12L, 16L, 17L, 19L, 20L), class = "data.frame")
  return(top)
}

# create dataframe
my_df <- create_input_dataframe() 
# show the first 20 rows of the test dataframe
head(my_df, 20)
#>      character              last_event          x          y
#> 1         Jake There was an earthquake 0.18488226 -0.8206868
#> 2  Sister Mary        A terrible flood 0.70237404  0.1662410
#> 3  Sister Mary        A terrible flood 0.57332633  0.1747081
#> 4       Elwood                    <NA> 0.16805192  1.0416555
#> 5         Jake                 Locusts 0.94383934 -0.1434224
#> 6       Elwood        A terrible flood 0.94347496  1.2078019
#> 7         Jake There was an earthquake 0.12915898  0.1982122
#> 8         Jake There was an earthquake 0.83344882 -0.2645255
#> 9         Jake        A terrible flood 0.46801852 -0.1216778
#> 10        Jake        A terrible flood 0.54998374  1.9110023
#> 11      Elwood There was an earthquake 0.55267407 -0.4122948
#> 12        Jake        A terrible flood 0.23889476 -1.4708207
#> 13      Elwood There was an earthquake 0.76051331 -1.1537385
#> 14        Jake There was an earthquake 0.18082010  2.2176152
#> 15      Elwood        A terrible flood 0.40528218 -1.2535423
#> 16        Jake There was an earthquake 0.85354845  0.9300886
#> 17        Jake There was an earthquake 0.97639849 -0.1422770
#> 18      Elwood        A terrible flood 0.22582546  1.6098654
#> 19 Sister Mary        A terrible flood 0.44480923 -1.0891327
#> 20      Elwood        A terrible flood 0.07497942  1.4692903
#>              date_time
#> 1  2018-08-20 21:36:24
#> 2  2018-08-20 21:37:24
#> 3  2018-08-20 21:38:24
#> 4  2018-08-20 21:39:24
#> 5  2018-08-20 21:40:24
#> 6  2018-08-20 21:41:24
#> 7  2018-08-20 21:42:24
#> 8  2018-08-20 21:43:24
#> 9  2018-08-20 21:44:24
#> 10 2018-08-20 21:45:24
#> 11 2018-08-20 21:46:24
#> 12 2018-08-20 21:47:24
#> 13 2018-08-20 21:48:24
#> 14 2018-08-20 21:49:24
#> 15 2018-08-20 21:50:24
#> 16 2018-08-20 21:51:24
#> 17 2018-08-20 21:52:24
#> 18 2018-08-20 21:53:24
#> 19 2018-08-20 21:54:24
#> 20 2018-08-20 21:55:24

# show the first 10 rows of the desired output
(top <- create_first_10_rows_of_output())
#>    rowname character              last_event         x           y
#> 1        1      Jake There was an earthquake 0.1848823 -0.82068681
#> 2        2      Jake                 Locusts 0.9438393 -0.14342243
#> 4        4      Jake There was an earthquake 0.8334488 -0.26452555
#> 7        7      Jake        A terrible flood 0.2388948 -1.47082075
#> 10      10      Jake There was an earthquake 0.9763985 -0.14227704
#> 12      12      Jake        A terrible flood 0.3875495  0.05215129
#> 16      16      Jake                 Locusts 0.9626440  0.33646374
#> 17      17      Jake        A terrible flood 0.1323720 -0.54647798
#> 19      19      Jake                    <NA> 0.8688610  1.82319608
#> 20      20      Jake                 Locusts 0.5142818 -0.21806287
#>              date_time
#> 1  2018-08-20 20:35:07
#> 2  2018-08-20 20:39:07
#> 4  2018-08-20 20:42:07
#> 7  2018-08-20 20:46:07
#> 10 2018-08-20 20:51:07
#> 12 2018-08-20 20:56:07
#> 16 2018-08-20 21:03:07
#> 17 2018-08-20 21:04:07
#> 19 2018-08-20 21:08:07
#> 20 2018-08-20 21:09:07

Created on 2018-09-10 by the reprex package (v0.2.0).

You don't need to look at the functions create_input_dataframe and create_first_10_rows_of_output. You can just concentrate on the input dataframe my_df (the first 20 rows are also printed) and on the output dataframe: since building the whole output dataframe would have been too long, I only show the first 10 rows (dataframe top). As you can see, the output has the following characteristics:

it's ordered by character: this is not strictly a requirement, I can always arrange(character)
for each character, two consecutive rows in last_event never have the same value: only the rows corresponding to a change in last_event have been retained, while all the others have been removed

This is the output I need (of course, not just the first 10 lines, but all of it). Thanks!

PS the reason why I didn't build a smaller example is that the solution must be fast - I need to work on larg-ish dataframes ( ~ 10^6 x 100 ), thus a slow solution won't cut it.

cderv · September 10, 2018, 4:19pm

Unfortunately, there is something off in your example as it don't seem that the date and time are coherent - I can't locate the rows you want to keep.

However, based on what you described, here is a proposition

my_df %>%
  # for nice printing
  as_tibble() %>%
  # operate by character
  group_by(character) %>%
  # order the event date by character
  arrange(date_time, .by_group = TRUE) %>%
  # locate change of event
  mutate(change = last_event != lead(last_event, default = TRUE)) %>%
  # keep only change point
  filter(change) %>%
  # do not keep dummy variable
  select(-change)

lead and lag functions can be useful to locate rows where previous or following value are not equal - so where point are changing.

Can you see if it fits what you want to do ? thanks.

aosmith · September 10, 2018, 4:31pm

I also couldn't quite get your example to work.

I've used diff() for this sort of work before, to calculate the difference between the observation and the prior observation. This is very similar to what @cderv did. You'd discard everything where the difference is 0.

The factor is treated as an integer by diff().

I did everything directly in filter(), but you could also do this by adding a new variable in mutate(), filtering, and then removing the extra variable via select().

my_df %>%
     group_by(character) %>%
     arrange(character, date_time) %>%
     filter(c(NA, diff(last_event)) != 0)

Andrea · September 10, 2018, 4:34pm

@cderv thanks! Preparing the example required a lot of manual cut & paste, because I don't have an algorithm to produce the output I want, so it's possible there are a few errors. I'll check again and let you know.

Andrea · September 10, 2018, 4:58pm

@cderv and @aosmith - I modified my example. Please see if it works now!

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following object is masked from 'package:base':
#> 
#>     date
library(tibble)

# set seed for reproducibility
set.seed(2)

create_input_dataframe <- function(){
  # generate input data frame
  character <- c("Jake", "Elwood", "Sister Mary")
  character <- factor(character, levels = character)
  m <- length(character)
  n <- 10^4
  ntot <- n * m
  stop_date <- now()
  period <- minutes(ntot) 
  start_date <- stop_date - period
  date_time <- seq(start_date, stop_date, length.out = ntot)
  event <- c("There was an earthquake", "A terrible flood", "Locusts")
  event <- factor(event, levels = event)
  x <- runif(ntot)
  y <- rnorm(ntot)
  probabilities <- rev(seq(0.1, 0.5, length.out = m))
  
  my_df <- data.frame(character  = sample(character, ntot, replace = TRUE, 
                                          prob = probabilities), 
                      last_event = sample(event, ntot, replace = TRUE),
                      x = x, 
                      y = y,
                      date_time = date_time)
  my_df$last_event[sample(seq_len(ntot), n/10)] <- NA
  return(my_df)
}

# create dataframe
my_df <- create_input_dataframe() 
# show the first 20 rows of the test dataframe
head(my_df, 20)
#>      character              last_event          x          y
#> 1         Jake There was an earthquake 0.18488226 -0.8206868
#> 2  Sister Mary        A terrible flood 0.70237404  0.1662410
#> 3  Sister Mary        A terrible flood 0.57332633  0.1747081
#> 4       Elwood                    <NA> 0.16805192  1.0416555
#> 5         Jake                 Locusts 0.94383934 -0.1434224
#> 6       Elwood        A terrible flood 0.94347496  1.2078019
#> 7         Jake There was an earthquake 0.12915898  0.1982122
#> 8         Jake There was an earthquake 0.83344882 -0.2645255
#> 9         Jake        A terrible flood 0.46801852 -0.1216778
#> 10        Jake        A terrible flood 0.54998374  1.9110023
#> 11      Elwood There was an earthquake 0.55267407 -0.4122948
#> 12        Jake        A terrible flood 0.23889476 -1.4708207
#> 13      Elwood There was an earthquake 0.76051331 -1.1537385
#> 14        Jake There was an earthquake 0.18082010  2.2176152
#> 15      Elwood        A terrible flood 0.40528218 -1.2535423
#> 16        Jake There was an earthquake 0.85354845  0.9300886
#> 17        Jake There was an earthquake 0.97639849 -0.1422770
#> 18      Elwood        A terrible flood 0.22582546  1.6098654
#> 19 Sister Mary        A terrible flood 0.44480923 -1.0891327
#> 20      Elwood        A terrible flood 0.07497942  1.4692903
#>              date_time
#> 1  2018-08-20 22:57:07
#> 2  2018-08-20 22:58:07
#> 3  2018-08-20 22:59:07
#> 4  2018-08-20 23:00:07
#> 5  2018-08-20 23:01:07
#> 6  2018-08-20 23:02:07
#> 7  2018-08-20 23:03:07
#> 8  2018-08-20 23:04:07
#> 9  2018-08-20 23:05:07
#> 10 2018-08-20 23:06:07
#> 11 2018-08-20 23:07:07
#> 12 2018-08-20 23:08:07
#> 13 2018-08-20 23:09:07
#> 14 2018-08-20 23:10:07
#> 15 2018-08-20 23:11:07
#> 16 2018-08-20 23:12:07
#> 17 2018-08-20 23:13:07
#> 18 2018-08-20 23:14:07
#> 19 2018-08-20 23:15:07
#> 20 2018-08-20 23:16:07

# this is the output I would desire (first 10 rows)
top <- my_df %>% 
  arrange(character) %>%
  slice(1:30)

(top <- top[-c(4, 6, 7, 9, 10, 12, 14, 15, 16, 19, 22, 26), ])
#>    character              last_event          x           y
#> 1       Jake There was an earthquake 0.18488226 -0.82068681
#> 2       Jake                 Locusts 0.94383934 -0.14342243
#> 3       Jake There was an earthquake 0.12915898  0.19821219
#> 5       Jake        A terrible flood 0.46801852 -0.12167783
#> 8       Jake There was an earthquake 0.18082010  2.21761525
#> 11      Jake        A terrible flood 0.66189876  0.19113983
#> 13      Jake                 Locusts 0.48877323 -1.88515790
#> 17      Jake        A terrible flood 0.13237200 -0.54647798
#> 18      Jake                    <NA> 0.01041453 -2.23395693
#> 20      Jake                 Locusts 0.51428176 -0.21806287
#> 21      Jake There was an earthquake 0.62719629  0.60132136
#> 23      Jake                 Locusts 0.98172786 -1.41643590
#> 24      Jake There was an earthquake 0.29701074  1.12819700
#> 25      Jake                 Locusts 0.16320087  0.07986385
#> 27      Jake There was an earthquake 0.81039726 -0.73493265
#> 28      Jake        A terrible flood 0.68340342 -0.61868532
#> 29      Jake There was an earthquake 0.92972022  1.09741911
#> 30      Jake                 Locusts 0.27540120  0.37115056
#>              date_time
#> 1  2018-08-20 22:57:07
#> 2  2018-08-20 23:01:07
#> 3  2018-08-20 23:03:07
#> 5  2018-08-20 23:05:07
#> 8  2018-08-20 23:10:07
#> 11 2018-08-20 23:17:07
#> 13 2018-08-20 23:22:07
#> 17 2018-08-20 23:26:07
#> 18 2018-08-20 23:27:07
#> 20 2018-08-20 23:31:07
#> 21 2018-08-20 23:32:07
#> 23 2018-08-20 23:37:07
#> 24 2018-08-20 23:38:07
#> 25 2018-08-20 23:40:07
#> 27 2018-08-20 23:46:07
#> 28 2018-08-20 23:49:07
#> 29 2018-08-20 23:50:07
#> 30 2018-08-20 23:51:07

Created on 2018-09-10 by the reprex package (v0.2.0).

Andrea · September 10, 2018, 9:57pm

@aosmith I love the simplicity of your solution! Note that I added a ungroup() final command, because I think it's good practice to always remove any grouping created in a data frame, when it's not needed anymore. I tested your approach and it nearly works! But it still needs some fixes:

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following object is masked from 'package:base':
#> 
#>     date
library(tibble)

# set seed for reproducibility
set.seed(2)

create_input_dataframe <- function(){
  # generate input data frame
  character <- c("Jake", "Elwood", "Sister Mary")
  character <- factor(character, levels = character)
  m <- length(character)
  n <- 10^4
  ntot <- n * m
  stop_date <- now()
  period <- minutes(ntot) 
  start_date <- stop_date - period
  date_time <- seq(start_date, stop_date, length.out = ntot)
  event <- c("There was an earthquake", "A terrible flood", "Locusts")
  event <- factor(event, levels = event)
  x <- runif(ntot)
  y <- rnorm(ntot)
  probabilities <- rev(seq(0.1, 0.5, length.out = m))
  
  my_df <- data.frame(character  = sample(character, ntot, replace = TRUE, 
                                          prob = probabilities), 
                      last_event = sample(event, ntot, replace = TRUE),
                      x = x, 
                      y = y,
                      date_time = date_time)
  my_df$last_event[sample(seq_len(ntot), n/10)] <- NA
  return(my_df)
}

# create dataframe
my_df <- create_input_dataframe() 


top <- my_df %>% 
  arrange(character) %>%
  slice(1:30) %>%
  as_tibble()
# this is the output I would desire (first 20 rows)
(top <- top[-c(4, 6, 7, 9, 10, 12, 14, 15, 16, 19, 22, 26), ])
#> # A tibble: 18 x 5
#>    character last_event                   x       y date_time          
#>    <fct>     <fct>                    <dbl>   <dbl> <dttm>             
#>  1 Jake      There was an earthquake 0.185  -0.821  2018-08-21 03:54:42
#>  2 Jake      Locusts                 0.944  -0.143  2018-08-21 03:58:42
#>  3 Jake      There was an earthquake 0.129   0.198  2018-08-21 04:00:42
#>  4 Jake      A terrible flood        0.468  -0.122  2018-08-21 04:02:42
#>  5 Jake      There was an earthquake 0.181   2.22   2018-08-21 04:07:42
#>  6 Jake      A terrible flood        0.662   0.191  2018-08-21 04:14:42
#>  7 Jake      Locusts                 0.489  -1.89   2018-08-21 04:19:42
#>  8 Jake      A terrible flood        0.132  -0.546  2018-08-21 04:23:42
#>  9 Jake      <NA>                    0.0104 -2.23   2018-08-21 04:24:42
#> 10 Jake      Locusts                 0.514  -0.218  2018-08-21 04:28:42
#> 11 Jake      There was an earthquake 0.627   0.601  2018-08-21 04:29:42
#> 12 Jake      Locusts                 0.982  -1.42   2018-08-21 04:34:42
#> 13 Jake      There was an earthquake 0.297   1.13   2018-08-21 04:35:42
#> 14 Jake      Locusts                 0.163   0.0799 2018-08-21 04:37:42
#> 15 Jake      There was an earthquake 0.810  -0.735  2018-08-21 04:43:42
#> 16 Jake      A terrible flood        0.683  -0.619  2018-08-21 04:46:42
#> 17 Jake      There was an earthquake 0.930   1.10   2018-08-21 04:47:42
#> 18 Jake      Locusts                 0.275   0.371  2018-08-21 04:48:42

# aosmith solution 
out_df <- my_df %>%
  group_by(character) %>%
  arrange(character, date_time) %>%
  filter(c(NA, diff(last_event)) != 0) %>%
  ungroup()

# compare aosmith solution and desired result
out_df[1:20, ]
#> # A tibble: 20 x 5
#>    character last_event                  x       y date_time          
#>    <fct>     <fct>                   <dbl>   <dbl> <dttm>             
#>  1 Jake      Locusts                 0.944 -0.143  2018-08-21 03:58:42
#>  2 Jake      There was an earthquake 0.129  0.198  2018-08-21 04:00:42
#>  3 Jake      A terrible flood        0.468 -0.122  2018-08-21 04:02:42
#>  4 Jake      There was an earthquake 0.181  2.22   2018-08-21 04:07:42
#>  5 Jake      A terrible flood        0.662  0.191  2018-08-21 04:14:42
#>  6 Jake      Locusts                 0.489 -1.89   2018-08-21 04:19:42
#>  7 Jake      A terrible flood        0.132 -0.546  2018-08-21 04:23:42
#>  8 Jake      There was an earthquake 0.627  0.601  2018-08-21 04:29:42
#>  9 Jake      Locusts                 0.982 -1.42   2018-08-21 04:34:42
#> 10 Jake      There was an earthquake 0.297  1.13   2018-08-21 04:35:42
#> 11 Jake      Locusts                 0.163  0.0799 2018-08-21 04:37:42
#> 12 Jake      There was an earthquake 0.810 -0.735  2018-08-21 04:43:42
#> 13 Jake      A terrible flood        0.683 -0.619  2018-08-21 04:46:42
#> 14 Jake      There was an earthquake 0.930  1.10   2018-08-21 04:47:42
#> 15 Jake      Locusts                 0.275  0.371  2018-08-21 04:48:42
#> 16 Jake      A terrible flood        0.786 -1.09   2018-08-21 04:50:42
#> 17 Jake      There was an earthquake 0.989  2.01   2018-08-21 04:51:42
#> 18 Jake      A terrible flood        0.770 -1.46   2018-08-21 04:54:42
#> 19 Jake      There was an earthquake 0.260 -0.960  2018-08-21 04:57:42
#> 20 Jake      Locusts                 0.388 -0.596  2018-08-21 05:00:42
top
#> # A tibble: 18 x 5
#>    character last_event                   x       y date_time          
#>    <fct>     <fct>                    <dbl>   <dbl> <dttm>             
#>  1 Jake      There was an earthquake 0.185  -0.821  2018-08-21 03:54:42
#>  2 Jake      Locusts                 0.944  -0.143  2018-08-21 03:58:42
#>  3 Jake      There was an earthquake 0.129   0.198  2018-08-21 04:00:42
#>  4 Jake      A terrible flood        0.468  -0.122  2018-08-21 04:02:42
#>  5 Jake      There was an earthquake 0.181   2.22   2018-08-21 04:07:42
#>  6 Jake      A terrible flood        0.662   0.191  2018-08-21 04:14:42
#>  7 Jake      Locusts                 0.489  -1.89   2018-08-21 04:19:42
#>  8 Jake      A terrible flood        0.132  -0.546  2018-08-21 04:23:42
#>  9 Jake      <NA>                    0.0104 -2.23   2018-08-21 04:24:42
#> 10 Jake      Locusts                 0.514  -0.218  2018-08-21 04:28:42
#> 11 Jake      There was an earthquake 0.627   0.601  2018-08-21 04:29:42
#> 12 Jake      Locusts                 0.982  -1.42   2018-08-21 04:34:42
#> 13 Jake      There was an earthquake 0.297   1.13   2018-08-21 04:35:42
#> 14 Jake      Locusts                 0.163   0.0799 2018-08-21 04:37:42
#> 15 Jake      There was an earthquake 0.810  -0.735  2018-08-21 04:43:42
#> 16 Jake      A terrible flood        0.683  -0.619  2018-08-21 04:46:42
#> 17 Jake      There was an earthquake 0.930   1.10   2018-08-21 04:47:42
#> 18 Jake      Locusts                 0.275   0.371  2018-08-21 04:48:42

Created on 2018-09-10 by the reprex package (v0.2.0).

As you can see, the two results are very similar. There are only two issues:

the first row is missing: actually, the first row for each group, i.e., for each character. However, this is not a big issue: of course, it would be better if the first row was retained, but given the complexity of the task, I can accept an approximate solution.
NA results are removed! This is not ok: I need them. You can see that rows 9 and 10 of my desired output are missing in your solution.
How can I fix this?

@cderv your solution seems to return a completely different result from that of @aosmith (and from my desired output). I'll try to understand why tomorrow.

Thanks a lot to both of you for the support!

cderv · September 10, 2018, 10:06pm

I think this is because I misunderstood your first example as I used a lead and not a lag. It semés you need to keep in a sequence the first event not the last. If needed I will try to start from your second example. However the approach of @aosmith is exactly the same but use the base diff function. So if it suits you, this is fine.

Dealing with NA is tricky and you may need to deal with them separately.

aosmith · September 10, 2018, 10:29pm

Oh, yes, I probably should have not used NA for the first value of the diff() vector and used something like 1 instead.

In terms of your NA values, I was wondering if they would be a problem. One option is to add the NA as a factor level so they are easier to work with with diff() or lag(). You can do this with addNA() as shown here.

Andrea · September 11, 2018, 9:12am

Forget my last post (now deleted), it was nonsense. Writing a better reply soon.

Andrea · September 11, 2018, 9:19am

Check it out! I was able to fix the NA issue and the missing first row: now my expected output and the actual output are identical

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following object is masked from 'package:base':
#> 
#>     date
library(tibble)
library(forcats)

# set seed for reproducibility
set.seed(2)

create_input_dataframe <- function(){
  # generate input data frame
  character <- c("Jake", "Elwood", "Sister Mary")
  character <- factor(character, levels = character)
  m <- length(character)
  n <- 10^4
  ntot <- n * m
  stop_date <- now()
  period <- minutes(ntot) 
  start_date <- stop_date - period
  date_time <- seq(start_date, stop_date, length.out = ntot)
  event <- c("There was an earthquake", "A terrible flood", "Locusts")
  event <- factor(event, levels = event)
  x <- runif(ntot)
  y <- rnorm(ntot)
  probabilities <- rev(seq(0.1, 0.5, length.out = m))
  
  my_df <- data.frame(character  = sample(character, ntot, replace = TRUE, 
                                          prob = probabilities), 
                      last_event = sample(event, ntot, replace = TRUE),
                      x = x, 
                      y = y,
                      date_time = date_time)
  my_df$last_event[sample(seq_len(ntot), n/10)] <- NA
  return(my_df)
}

# create dataframe
my_df <- create_input_dataframe() 


top <- my_df %>%
  arrange(character) %>%
  slice(1:30) %>%
  as_tibble()
# this is the output I would desire (first 20 rows)
(top <- top[-c(4, 6, 7, 9, 10, 12, 14, 15, 16, 19, 22, 26), ])
#> # A tibble: 18 x 5
#>    character last_event                   x       y date_time          
#>    <fct>     <fct>                    <dbl>   <dbl> <dttm>             
#>  1 Jake      There was an earthquake 0.185  -0.821  2018-08-21 15:17:50
#>  2 Jake      Locusts                 0.944  -0.143  2018-08-21 15:21:50
#>  3 Jake      There was an earthquake 0.129   0.198  2018-08-21 15:23:50
#>  4 Jake      A terrible flood        0.468  -0.122  2018-08-21 15:25:50
#>  5 Jake      There was an earthquake 0.181   2.22   2018-08-21 15:30:50
#>  6 Jake      A terrible flood        0.662   0.191  2018-08-21 15:37:50
#>  7 Jake      Locusts                 0.489  -1.89   2018-08-21 15:42:50
#>  8 Jake      A terrible flood        0.132  -0.546  2018-08-21 15:46:50
#>  9 Jake      <NA>                    0.0104 -2.23   2018-08-21 15:47:50
#> 10 Jake      Locusts                 0.514  -0.218  2018-08-21 15:51:50
#> 11 Jake      There was an earthquake 0.627   0.601  2018-08-21 15:52:50
#> 12 Jake      Locusts                 0.982  -1.42   2018-08-21 15:57:50
#> 13 Jake      There was an earthquake 0.297   1.13   2018-08-21 15:58:50
#> 14 Jake      Locusts                 0.163   0.0799 2018-08-21 16:00:50
#> 15 Jake      There was an earthquake 0.810  -0.735  2018-08-21 16:06:50
#> 16 Jake      A terrible flood        0.683  -0.619  2018-08-21 16:09:50
#> 17 Jake      There was an earthquake 0.930   1.10   2018-08-21 16:10:50
#> 18 Jake      Locusts                 0.275   0.371  2018-08-21 16:11:50

# aosmith solution 
out_df <- my_df %>%
  mutate(last_event = fct_explicit_na(last_event)) %>%
  group_by(character) %>%
  arrange(date_time, .by_group = TRUE) %>%
  filter(c(1, diff(last_event)) != 0) %>%
  ungroup() %>%
  identity()

# compare aosmith solution and desired result
print(out_df[1:18, ])
#> # A tibble: 18 x 5
#>    character last_event                   x       y date_time          
#>    <fct>     <fct>                    <dbl>   <dbl> <dttm>             
#>  1 Jake      There was an earthquake 0.185  -0.821  2018-08-21 15:17:50
#>  2 Jake      Locusts                 0.944  -0.143  2018-08-21 15:21:50
#>  3 Jake      There was an earthquake 0.129   0.198  2018-08-21 15:23:50
#>  4 Jake      A terrible flood        0.468  -0.122  2018-08-21 15:25:50
#>  5 Jake      There was an earthquake 0.181   2.22   2018-08-21 15:30:50
#>  6 Jake      A terrible flood        0.662   0.191  2018-08-21 15:37:50
#>  7 Jake      Locusts                 0.489  -1.89   2018-08-21 15:42:50
#>  8 Jake      A terrible flood        0.132  -0.546  2018-08-21 15:46:50
#>  9 Jake      (Missing)               0.0104 -2.23   2018-08-21 15:47:50
#> 10 Jake      Locusts                 0.514  -0.218  2018-08-21 15:51:50
#> 11 Jake      There was an earthquake 0.627   0.601  2018-08-21 15:52:50
#> 12 Jake      Locusts                 0.982  -1.42   2018-08-21 15:57:50
#> 13 Jake      There was an earthquake 0.297   1.13   2018-08-21 15:58:50
#> 14 Jake      Locusts                 0.163   0.0799 2018-08-21 16:00:50
#> 15 Jake      There was an earthquake 0.810  -0.735  2018-08-21 16:06:50
#> 16 Jake      A terrible flood        0.683  -0.619  2018-08-21 16:09:50
#> 17 Jake      There was an earthquake 0.930   1.10   2018-08-21 16:10:50
#> 18 Jake      Locusts                 0.275   0.371  2018-08-21 16:11:50
print(top)
#> # A tibble: 18 x 5
#>    character last_event                   x       y date_time          
#>    <fct>     <fct>                    <dbl>   <dbl> <dttm>             
#>  1 Jake      There was an earthquake 0.185  -0.821  2018-08-21 15:17:50
#>  2 Jake      Locusts                 0.944  -0.143  2018-08-21 15:21:50
#>  3 Jake      There was an earthquake 0.129   0.198  2018-08-21 15:23:50
#>  4 Jake      A terrible flood        0.468  -0.122  2018-08-21 15:25:50
#>  5 Jake      There was an earthquake 0.181   2.22   2018-08-21 15:30:50
#>  6 Jake      A terrible flood        0.662   0.191  2018-08-21 15:37:50
#>  7 Jake      Locusts                 0.489  -1.89   2018-08-21 15:42:50
#>  8 Jake      A terrible flood        0.132  -0.546  2018-08-21 15:46:50
#>  9 Jake      <NA>                    0.0104 -2.23   2018-08-21 15:47:50
#> 10 Jake      Locusts                 0.514  -0.218  2018-08-21 15:51:50
#> 11 Jake      There was an earthquake 0.627   0.601  2018-08-21 15:52:50
#> 12 Jake      Locusts                 0.982  -1.42   2018-08-21 15:57:50
#> 13 Jake      There was an earthquake 0.297   1.13   2018-08-21 15:58:50
#> 14 Jake      Locusts                 0.163   0.0799 2018-08-21 16:00:50
#> 15 Jake      There was an earthquake 0.810  -0.735  2018-08-21 16:06:50
#> 16 Jake      A terrible flood        0.683  -0.619  2018-08-21 16:09:50
#> 17 Jake      There was an earthquake 0.930   1.10   2018-08-21 16:10:50
#> 18 Jake      Locusts                 0.275   0.371  2018-08-21 16:11:50

Andrea · September 11, 2018, 9:24am

PS I hope the Blues Brother reference in my example was not lost!