Hi,
Need help to modify time stamps based on a condition.
I had 2 datasets and while combining the two, some of the timestamps are overlapping. I am interested in adjusting these overlaps. Combining datasets created NAs in ID and that shows this data comes from another dataframe. We do want to retain timestamps coming from this dataframe with NA IDs and adjust other timestamps accordingly.
How do I remove rows with some kind of condition to avoid overlaps in time stamps START_TIME & END_TIME?
# Data with overlaps in timestamps such as in Row 5th & 6th.
library(tidyverse)
df <- data.frame(
stringsAsFactors = FALSE,
START_TIME = c("2023-08-14 12:07:40","2023-08-14 12:08:48","2023-08-14 12:09:49",
"2023-08-14 12:11:22","2023-08-14 12:13:17",
"2023-08-14 12:14:24","2023-08-14 12:14:56",
"2023-08-14 12:15:05","2023-08-14 12:17:04",
"2023-08-14 12:17:04","2023-08-14 12:17:05","2023-08-14 12:17:25",
"2023-08-14 12:17:26","2023-08-14 12:17:27",
"2023-08-14 12:25:54","2023-08-14 12:34:48",
"2023-08-14 12:35:24","2023-08-14 12:40:19",
"2023-08-14 12:40:38","2023-08-14 12:46:28",
"2023-08-14 12:46:50","2023-08-14 12:55:44","2023-08-14 12:55:53",
"2023-08-14 12:57:10","2023-08-14 12:59:25",
"2023-08-14 13:01:04","2023-08-14 13:08:45",
"2023-08-14 13:10:50","2023-08-14 13:13:28",
"2023-08-14 13:15:50","2023-08-14 13:22:01"),
END_TIME = c("2023-08-14 12:08:48","2023-08-14 12:09:49","2023-08-14 12:11:22",
"2023-08-14 12:13:17","2023-08-14 12:14:56",
"2023-08-14 13:10:50","2023-08-14 12:15:05",
"2023-08-14 12:17:04","2023-08-14 12:17:04",
"2023-08-14 12:17:05","2023-08-14 12:17:25","2023-08-14 12:17:26",
"2023-08-14 12:17:27","2023-08-14 12:25:54",
"2023-08-14 12:34:48","2023-08-14 12:35:24",
"2023-08-14 12:40:19","2023-08-14 12:40:38",
"2023-08-14 12:46:28","2023-08-14 12:46:50",
"2023-08-14 12:55:44","2023-08-14 12:55:53","2023-08-14 12:57:10",
"2023-08-14 12:59:25","2023-08-14 13:01:04",
"2023-08-14 13:08:45","2023-08-14 13:13:28",
"2023-08-14 13:14:54","2023-08-14 13:22:01",
"2023-08-14 13:18:36","2023-08-14 13:24:10"),
ELAPSED_TIME = c(1.1326,1.0197,
1.5486,1.915,1.65,56.4333,0.15,1.9833,0,0.0138,
0.3362,0.0167,0.0167,8.455,8.893,0.6012,
4.9153,0.3241,5.8319,0.3597,8.9031,0.15,1.2892,
2.2385,1.6556,7.6833,4.7208,4.06662,8.5425,
2.76666,2.1566),
DATE_ID = c("2023-08-14 00:00:00","2023-08-14 00:00:00","2023-08-14 00:00:00",
"2023-08-14 00:00:00","2023-08-14 00:00:00",
"2023-08-14 00:00:00","2023-08-14 00:00:00",
"2023-08-14 00:00:00","2023-08-14 00:00:00",
"2023-08-14 00:00:00","2023-08-14 00:00:00","2023-08-14 00:00:00",
"2023-08-14 00:00:00","2023-08-14 00:00:00",
"2023-08-14 00:00:00","2023-08-14 00:00:00",
"2023-08-14 00:00:00","2023-08-14 00:00:00",
"2023-08-14 00:00:00","2023-08-14 00:00:00",
"2023-08-14 00:00:00","2023-08-14 00:00:00","2023-08-14 00:00:00",
"2023-08-14 00:00:00","2023-08-14 00:00:00",
"2023-08-14 00:00:00","2023-08-14 00:00:00",
"2023-08-14 00:00:00","2023-08-14 00:00:00",
"2023-08-14 00:00:00","2023-08-14 00:00:00"),
ID = c(237214724L,
237214724L,237214724L,237214724L,235565710L,NA,
235565710L,232849697L,232849697L,232849697L,
232849697L,232849697L,232849697L,233163426L,234067478L,
234067478L,234067478L,234067478L,235266765L,
235266765L,239167461L,239167461L,239167461L,
234182936L,238535268L,236468967L,236468967L,NA,
236019152L,NA,236918867L),
Category = c("Cat1","Cat1",
"Cat1","Cat7","Cat1","Cat4","Cat3","Cat3","Cat1",
"Cat2","Cat3","Cat1","Cat1","Cat5","Cat1",
"Cat1","Cat5","Cat3","Cat5","Cat3","Cat1",
"Cat3","Cat5","Cat1","Cat2","Cat1","Cat1","Cat6",
"Cat1","Cat6","Cat3")
) %>%
mutate(START_TIME = as.POSIXct(START_TIME), END_TIME = as.POSIXct(END_TIME))
# I would like to retain time stamps where IDs are NAs and adjust the other rows accordingly.
# I tried to modify as follows:
mod_df <- df %>%
mutate(END_TIME = case_when(
END_TIME > lead(START_TIME) & str_detect(Category, "Cat1|Cat2|Cat3|Cat5|Cat7") ~ lead(START_TIME),
TRUE ~ END_TIME
)) %>%
mutate(START_TIME = case_when(
START_TIME < lag(END_TIME) & str_detect(Category, "Cat1|Cat2|Cat3|Cat5|Cat7") ~ lag(END_TIME),
TRUE ~ START_TIME
))
# Above approach helps achieve right time stamps in 6th row and row 27th.
# However, I would like to now remove these extra rows in between,
# i.e. from 7th to 27th row as these time stamps are already captured by 6th and 28th row. How do I achieve this?
How do I remove rows with some kind of condition such as if END_TIME < lag(END_TIME), remove that row and repeat until END_TIME >= lag(END_TIME). Hoping this could remove the rows with overlapping time stamps START_TIME & END_TIME.
Thanks for your help!
``