Ok, I considerably simplified my Minimal Reproducible Example:
- only 2 engines, the minimum necessary
- simpler/more readable dates: 10 days of operation for both engines, same time stamps for both engines.
- At a sampling rate of 3 samples/day, 10 days of operation would make 30 observations for engine, but we have 10 skipped samples -> 20 observations for engine. The shorter data frame makes it easier to double check the solution by hand
- "days" here indicates a period of time (not calendar dates), rounded down to the closest integer (as indicated by the
floor
function)
- the days counter should be only reset on rows 12 (>1 day break), 17 (>2 days break), 21 (change from engine A to engine B), 32 and 37 (same dates as for engine A, but for engine B)
Here's the calculated_output
and the expected_output
: as you can see, they don't correspond
library(lubridate, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)
library(tibble, warn.conflicts = FALSE)
# tibble printing options
options(tibble.print_min = 100, tibble.width = Inf)
# build input dataframe
create_input_df <- function(){
m <- 2
a_day <- 3
n <- a_day*10
lvl <- paste0("engine_", LETTERS[1:m])
start_date <- as_datetime("2018-09-13 00:00:00")
end_date <- start_date + n * hours(8)
date_time <- seq(start_date, end_date, length.out = n)
# short stops don't restart the running days count
short_stops <- seq(2, n, by = n/5)
# long stops, however, do
center <- floor(n/2)
medium_stop <- seq(center-1, center)
long_stop <- seq(22,26)
# merge stop indices
index <- sort(unique(c(short_stops, medium_stop, long_stop)))
# remove the rows corresponding to the stops
date_time <- date_time[-index]
# build data frame
n <- length(date_time)
ntot <- n * m
engines <- factor(rep(lvl, each = n), levels = lvl)
x <- runif(ntot)
y <- rnorm(ntot)
my_df <- data_frame(engines, date_time = rep(date_time, m), x, y)
return(list(my_df, start_date))
}
foo <- create_input_df()
input_df <- foo[[1]]
start_date <- foo[[2]]
rm(foo)
stop_threshold <- 1
calculated_output <- input_df %>%
group_by(engines) %>%
mutate(prev_dt = lag(date_time, order_by = engines),
dt_diff = ifelse(is.na(date_time - prev_dt), 0, (date_time - prev_dt)/ddays(1))) %>%
group_by(engines, grp = cumsum(dt_diff > stop_threshold)) %>%
mutate(run_days = floor(cumsum(dt_diff))) %>%
ungroup
calculated_output
#> # A tibble: 40 x 8
#> engines date_time x y prev_dt dt_diff
#> <fct> <dttm> <dbl> <dbl> <dttm> <dbl>
#> 1 engine_A 2018-09-13 00:00:00 0.196 -1.30 NA 0
#> 2 engine_A 2018-09-13 16:33:06 0.0333 -0.108 2018-09-13 00:00:00 0.690
#> 3 engine_A 2018-09-14 00:49:39 0.693 -1.29 2018-09-13 16:33:06 0.345
#> 4 engine_A 2018-09-14 09:06:12 0.735 1.56 2018-09-14 00:49:39 0.345
#> 5 engine_A 2018-09-14 17:22:45 0.651 -2.11 2018-09-14 09:06:12 0.345
#> 6 engine_A 2018-09-15 01:39:18 0.837 -0.152 2018-09-14 17:22:45 0.345
#> 7 engine_A 2018-09-15 18:12:24 0.492 -0.189 2018-09-15 01:39:18 0.690
#> 8 engine_A 2018-09-16 02:28:57 0.664 -0.553 2018-09-15 18:12:24 0.345
#> 9 engine_A 2018-09-16 10:45:31 0.180 0.600 2018-09-16 02:28:57 0.345
#> 10 engine_A 2018-09-16 19:02:04 0.0507 -0.697 2018-09-16 10:45:31 0.345
#> 11 engine_A 2018-09-17 03:18:37 0.837 -0.326 2018-09-16 19:02:04 0.345
#> 12 engine_A 2018-09-18 04:08:16 0.0239 0.0353 2018-09-17 03:18:37 1.03
#> 13 engine_A 2018-09-18 12:24:49 0.237 -0.195 2018-09-18 04:08:16 0.345
#> 14 engine_A 2018-09-18 20:41:22 0.941 -0.170 2018-09-18 12:24:49 0.345
#> 15 engine_A 2018-09-19 04:57:55 0.120 -0.338 2018-09-18 20:41:22 0.345
#> 16 engine_A 2018-09-19 21:31:02 0.902 -0.0708 2018-09-19 04:57:55 0.690
#> 17 engine_A 2018-09-21 23:10:20 0.853 -0.174 2018-09-19 21:31:02 2.07
#> 18 engine_A 2018-09-22 07:26:53 0.807 0.0990 2018-09-21 23:10:20 0.345
#> 19 engine_A 2018-09-22 15:43:26 0.799 0.865 2018-09-22 07:26:53 0.345
#> 20 engine_A 2018-09-23 00:00:00 0.237 0.757 2018-09-22 15:43:26 0.345
#> 21 engine_B 2018-09-13 00:00:00 0.424 -0.0950 NA 0
#> 22 engine_B 2018-09-13 16:33:06 0.597 -0.0994 2018-09-13 00:00:00 0.690
#> 23 engine_B 2018-09-14 00:49:39 0.589 0.0720 2018-09-13 16:33:06 0.345
#> 24 engine_B 2018-09-14 09:06:12 0.471 -0.884 2018-09-14 00:49:39 0.345
#> 25 engine_B 2018-09-14 17:22:45 0.755 -0.688 2018-09-14 09:06:12 0.345
#> 26 engine_B 2018-09-15 01:39:18 0.497 -0.0582 2018-09-14 17:22:45 0.345
#> 27 engine_B 2018-09-15 18:12:24 0.809 -0.839 2018-09-15 01:39:18 0.690
#> 28 engine_B 2018-09-16 02:28:57 0.0380 -1.31 2018-09-15 18:12:24 0.345
#> 29 engine_B 2018-09-16 10:45:31 0.193 -0.645 2018-09-16 02:28:57 0.345
#> 30 engine_B 2018-09-16 19:02:04 0.991 0.0701 2018-09-16 10:45:31 0.345
#> 31 engine_B 2018-09-17 03:18:37 0.422 -0.835 2018-09-16 19:02:04 0.345
#> 32 engine_B 2018-09-18 04:08:16 0.222 -1.60 2018-09-17 03:18:37 1.03
#> 33 engine_B 2018-09-18 12:24:49 0.207 0.814 2018-09-18 04:08:16 0.345
#> 34 engine_B 2018-09-18 20:41:22 0.957 -0.458 2018-09-18 12:24:49 0.345
#> 35 engine_B 2018-09-19 04:57:55 0.00400 0.932 2018-09-18 20:41:22 0.345
#> 36 engine_B 2018-09-19 21:31:02 0.482 -0.774 2018-09-19 04:57:55 0.690
#> 37 engine_B 2018-09-21 23:10:20 0.0872 1.16 2018-09-19 21:31:02 2.07
#> 38 engine_B 2018-09-22 07:26:53 0.908 1.68 2018-09-21 23:10:20 0.345
#> 39 engine_B 2018-09-22 15:43:26 0.696 -0.383 2018-09-22 07:26:53 0.345
#> 40 engine_B 2018-09-23 00:00:00 0.451 0.922 2018-09-22 15:43:26 0.345
#> grp run_days
#> <int> <dbl>
#> 1 0 0
#> 2 0 0
#> 3 0 1
#> 4 0 1
#> 5 0 1
#> 6 0 2
#> 7 0 2
#> 8 0 3
#> 9 0 3
#> 10 0 3
#> 11 0 4
#> 12 1 1
#> 13 1 1
#> 14 1 1
#> 15 1 2
#> 16 1 2
#> 17 2 2
#> 18 2 2
#> 19 2 2
#> 20 2 3
#> 21 0 0
#> 22 0 0
#> 23 0 1
#> 24 0 1
#> 25 0 1
#> 26 0 2
#> 27 0 2
#> 28 0 3
#> 29 0 3
#> 30 0 3
#> 31 0 4
#> 32 1 1
#> 33 1 1
#> 34 1 1
#> 35 1 2
#> 36 1 2
#> 37 2 2
#> 38 2 2
#> 39 2 2
#> 40 2 3
# this is the data frame I would like to generate, with the added run_days column
expected_output <- input_df %>%
mutate(run_days = rep(c(0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 4, 0, 0, 0, 1, 1, 0, 0, 0, 1), times = 2))
expected_output
#> # A tibble: 40 x 5
#> engines date_time x y run_days
#> <fct> <dttm> <dbl> <dbl> <dbl>
#> 1 engine_A 2018-09-13 00:00:00 0.196 -1.30 0
#> 2 engine_A 2018-09-13 16:33:06 0.0333 -0.108 0
#> 3 engine_A 2018-09-14 00:49:39 0.693 -1.29 1
#> 4 engine_A 2018-09-14 09:06:12 0.735 1.56 1
#> 5 engine_A 2018-09-14 17:22:45 0.651 -2.11 1
#> 6 engine_A 2018-09-15 01:39:18 0.837 -0.152 2
#> 7 engine_A 2018-09-15 18:12:24 0.492 -0.189 2
#> 8 engine_A 2018-09-16 02:28:57 0.664 -0.553 3
#> 9 engine_A 2018-09-16 10:45:31 0.180 0.600 3
#> 10 engine_A 2018-09-16 19:02:04 0.0507 -0.697 3
#> 11 engine_A 2018-09-17 03:18:37 0.837 -0.326 4
#> 12 engine_A 2018-09-18 04:08:16 0.0239 0.0353 0
#> 13 engine_A 2018-09-18 12:24:49 0.237 -0.195 0
#> 14 engine_A 2018-09-18 20:41:22 0.941 -0.170 0
#> 15 engine_A 2018-09-19 04:57:55 0.120 -0.338 1
#> 16 engine_A 2018-09-19 21:31:02 0.902 -0.0708 1
#> 17 engine_A 2018-09-21 23:10:20 0.853 -0.174 0
#> 18 engine_A 2018-09-22 07:26:53 0.807 0.0990 0
#> 19 engine_A 2018-09-22 15:43:26 0.799 0.865 0
#> 20 engine_A 2018-09-23 00:00:00 0.237 0.757 1
#> 21 engine_B 2018-09-13 00:00:00 0.424 -0.0950 0
#> 22 engine_B 2018-09-13 16:33:06 0.597 -0.0994 0
#> 23 engine_B 2018-09-14 00:49:39 0.589 0.0720 1
#> 24 engine_B 2018-09-14 09:06:12 0.471 -0.884 1
#> 25 engine_B 2018-09-14 17:22:45 0.755 -0.688 1
#> 26 engine_B 2018-09-15 01:39:18 0.497 -0.0582 2
#> 27 engine_B 2018-09-15 18:12:24 0.809 -0.839 2
#> 28 engine_B 2018-09-16 02:28:57 0.0380 -1.31 3
#> 29 engine_B 2018-09-16 10:45:31 0.193 -0.645 3
#> 30 engine_B 2018-09-16 19:02:04 0.991 0.0701 3
#> 31 engine_B 2018-09-17 03:18:37 0.422 -0.835 4
#> 32 engine_B 2018-09-18 04:08:16 0.222 -1.60 0
#> 33 engine_B 2018-09-18 12:24:49 0.207 0.814 0
#> 34 engine_B 2018-09-18 20:41:22 0.957 -0.458 0
#> 35 engine_B 2018-09-19 04:57:55 0.00400 0.932 1
#> 36 engine_B 2018-09-19 21:31:02 0.482 -0.774 1
#> 37 engine_B 2018-09-21 23:10:20 0.0872 1.16 0
#> 38 engine_B 2018-09-22 07:26:53 0.908 1.68 0
#> 39 engine_B 2018-09-22 15:43:26 0.696 -0.383 0
#> 40 engine_B 2018-09-23 00:00:00 0.451 0.922 1
The problem with my "solution" is that the counter is not reset to 0 - it's reset to the "restarting" value of dt_diff
, which is larger than stop_threshold
, thus it's always 1 at least.