# GOAL:
# REMOVE DUPLICATED ROWS BASED ON COLUMNNS/VARIABLES "SN", "PN" AND "TIME"
# FROM Dataframe (DF) "dup"
# (My dataframe has more columns/variables, I am not including all those in this DF)
# CONDITION FOR REMOVAL OF DUPLICATES:
# 1. "PN" AND "SN" ARE DUPLICATED.
# 2. KEEP THE ROW WITH THE LATEST OBSERVATION IN THE COLUMNM/VARIABLE "TIME."
# 3. KEEP ROWS THAT HAVE DUPLICATES FOR "SN" AND NON DUPLICATES FOR "PN."
# THESE ROWS ARE VALID DATA IN MY ORIGINAL DATAFRAME.
# COMPLETED: I HAVE SORTED THE DATASET BY TIME FROM EARLIEST TO LATEST.
# NEEDED HELP: THE UN-DUPLICATION PROCESS AS MENTIONED ABOVE. I HAVE TRIED
# MANY DIFFERENT SCRIPTS AND NONE HAVE WORKED. DESPERATE FOR SOME HELP.
#NOTE: Model of desired dataframe is at the end called "deduped_ordered" *******************
library(tidyverse)
library(lubridate)
#>
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#>
#> date, intersect, setdiff, union
#Create dataframe called "dup" with three variables/columns called PN, SN, and TIME
# This is my duplicated and unsorted dataframe.
PN <- c("661_50379",
"661_50379",
"661_50379",
"661_08935",
"661_10857",
"661_13261",
"661_13268",
"661_13268",
"661_15672",
"661_13519"
)
SN <- c("XXXF20J4KXKV",
"XXXF20J4KXKV",
"XXXF20J4KXKV",
"DMXD90ORE72R",
"DMXD90ORE72R",
"3PC000305928ZZZZ",
"3PC000305928ZZZZ",
"H6RHNB2B1059",
"F71DP3V00GRD",
"F2LF908B0D3Y"
)
TIME <- c("23:53:36",
"11:27:31",
"13:56:34",
"13:03:15",
"10:30:59",
"11:19:47",
"11:06:22",
"11:22:17",
"11:22:17",
"21:17:48"
)
#CREATE DATAFRAME NAMED "dup"
dup <- data.frame(PN, SN, TIME)
view(dup)
# Check class (CHAR)
class(dup$TIME)
#> [1] "character"
#Change class of "TIME" variable from "CHAR" to "DURATION" for sorting purposes
#I am not sure if this even needed
dup$TIME.DIFF <- as.duration(hms(dup$TIME))
class(dup$TIME.DIFF)
#> [1] "Duration"
#> attr(,"package")
#> [1] "lubridate"
view(dup)
#Confirm class for variable/column "TIME.DIFF" is now duration
class(dup$TIME.DIFF)
#> [1] "Duration"
#> attr(,"package")
#> [1] "lubridate"
# Create datafame, "dup_ordered" Sorted "SN" according to "TIME.DIFF" by ascending order
attach(dup)
#> The following objects are masked _by_ .GlobalEnv:
#>
#> PN, SN, TIME
dup_ordered <- dup[order(SN,TIME.DIFF),]
view(dup_ordered)
# NEEDED: Remove duplicates if PN and SN are both duplicated, AND keep the unique row with latest TIME.
# MODEL OF DESIRED DATASET
PN <- c("661_50379",
"661_08935",
"661_10857",
"661_13261",
"661_13268",
"661_13268",
"661_15672",
"661_13519"
)
SN <- c("XXXF20J4KXKV",
"DMXD90ORE72R",
"DMXD90ORE72R",
"3PC000305928ZZZZ",
"3PC000305928ZZZZ",
"H6RHNB2B1059",
"F71DP3V00GRD",
"F2LF908B0D3Y"
)
TIME <- c("23:53:36",
"10:30:59",
"13:03:15",
"11:06:22",
"11:19:47",
"11:22:17",
"11:17:48",
"21:17:48"
)
dedup_ordered <- data.frame(PN, SN, TIME)
view(dedup_ordered)
Created on 2022-09-27 with reprex v2.0.2