De-duplicating (removing duplicate rows) in a column conditional on other columns

# GOAL: 

# REMOVE DUPLICATED ROWS BASED ON COLUMNS/VARIABLES "SN", "PN" AND "TIME" 
# FROM Dataframe (DF) "dup"
# (My dataframe has more columns/variables, I am not including all those in this DF)

# CONDITION FOR REMOVAL OF DUPLICATES: 

# 1. "PN" AND "SN" ARE DUPLICATED. 
# 2. KEEP THE ROW WITH THE LATEST OBSERVATION IN THE COLUMN/VARIABLE "TIME." 
# 3. KEEP ROWS THAT HAVE DUPLICATES FOR "SN" AND NON-DUPLICATES FOR "PN." 
#    THESE ROWS ARE VALID DATA IN MY ORIGINAL DATAFRAME.

# COMPLETED: I HAVE SORTED THE DATASET BY TIME FROM EARLIEST TO LATEST. 

# HELP NEEDED: THE DE-DUPLICATION PROCESS DESCRIBED ABOVE. I HAVE TRIED 
# MANY DIFFERENT SCRIPTS AND NONE HAVE WORKED. DESPERATE FOR SOME HELP. 

#NOTE: A model of the desired dataframe, called "dedup_ordered", is at the end.

library(tidyverse)
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#> 
#>     date, intersect, setdiff, union


#Create dataframe called "dup" with three variables/columns called PN, SN, and TIME
# This is my duplicated and unsorted dataframe. 

PN <- c("661_50379",
        "661_50379",
        "661_50379",
        "661_08935",
        "661_10857",
        "661_13261",
        "661_13268",
        "661_13268",
        "661_15672",
        "661_13519"
)

SN <- c("XXXF20J4KXKV",
        "XXXF20J4KXKV",
        "XXXF20J4KXKV",
        "DMXD90ORE72R",
        "DMXD90ORE72R",
        "3PC000305928ZZZZ",
        "3PC000305928ZZZZ",
        "H6RHNB2B1059",
        "F71DP3V00GRD",
        "F2LF908B0D3Y"
        
)

TIME <- c("23:53:36",
          "11:27:31",
          "13:56:34",
          "13:03:15",
          "10:30:59",
          "11:19:47",
          "11:06:22",
          "11:22:17", 
          "11:22:17",
          "21:17:48" 
)


#CREATE DATAFRAME NAMED "dup" 
dup <- data.frame(PN, SN, TIME)
view(dup)

# Check class (CHAR)
class(dup$TIME)
#> [1] "character"

#Change the class of the "TIME" variable from character to Duration for sorting purposes.
#I am not sure if this is even needed.

dup$TIME.DIFF <- as.duration(hms(dup$TIME))

#Confirm the class of the new variable/column "TIME.DIFF" is now Duration
class(dup$TIME.DIFF)
#> [1] "Duration"
#> attr(,"package")
#> [1] "lubridate"
view(dup)


# Create dataframe "dup_ordered": rows sorted by "SN", then by "TIME.DIFF", in ascending order
# (indexing with dup$ avoids attach(), which masks objects in the global environment)
dup_ordered <- dup[order(dup$SN, dup$TIME.DIFF), ]
view(dup_ordered)




# NEEDED: Where both PN and SN are duplicated, remove the duplicates and keep only the row with the latest TIME.
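
# For reference, a minimal base-R sketch of that step (an addition, not part
# of the original question; the replies below give full solutions):
# sort latest-first within SN, then duplicated() flags every repeat of a
# PN/SN pair after its first (i.e. latest) row.
dup_desc <- dup[order(dup$SN, -as.numeric(dup$TIME.DIFF)), ]
dedup_sketch <- dup_desc[!duplicated(dup_desc[c("PN", "SN")]), ]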

# MODEL OF DESIRED DATASET 

PN <- c("661_50379",
        "661_08935",
        "661_10857",
        "661_13261",
        "661_13268",
        "661_13268",
        "661_15672",
        "661_13519"
        
)

SN <- c("XXXF20J4KXKV",
        "DMXD90ORE72R",
        "DMXD90ORE72R",
        "3PC000305928ZZZZ",
        "3PC000305928ZZZZ",
        "H6RHNB2B1059",
        "F71DP3V00GRD",
        "F2LF908B0D3Y"
)



TIME <- c("23:53:36",
          "10:30:59",
          "13:03:15",
          "11:06:22",
          "11:19:47",
          "11:22:17",
          "11:17:48",
          "21:17:48"
)


dedup_ordered <- data.frame(PN, SN, TIME)
view(dedup_ordered)

Created on 2022-09-27 with reprex v2.0.2

@andresrcs Help please? You helped me about 4 years ago with a project I had at another job. TY!

Hello,

since I think it is (a) faster and (b) more convenient, I decided to rely on the fastverse (in this case collapse and data.table) for my solution. Below that is a version with dplyr and lubridate which produces the same output, followed by a speed comparison showing why I would use collapse and data.table over dplyr every time.

library(fastverse)

dup |>
  # transform to ITime (integer time of day)
  fmutate(TIME = as.ITime(TIME)) |>
  # order by all columns, TIME decreasing
  roworder(PN,SN,-TIME) |>
  # add a counter and keep only if the counter is equal to one
  fmutate(
    counter = 1L
  ) |>
  ftransform(
    counter = fcumsum(counter, list(PN,SN))
  ) |>
  fsubset(counter == 1L) |>
  fselect(-counter)
#>          PN               SN     TIME
#> 1 661_08935     DMXD90ORE72R 13:03:15
#> 2 661_10857     DMXD90ORE72R 10:30:59
#> 3 661_13261 3PC000305928ZZZZ 11:19:47
#> 4 661_13268 3PC000305928ZZZZ 11:06:22
#> 5 661_13268     H6RHNB2B1059 11:22:17
#> 6 661_13519     F2LF908B0D3Y 21:17:48
#> 7 661_15672     F71DP3V00GRD 11:22:17
#> 8 661_50379     XXXF20J4KXKV 23:53:36
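
For completeness, here is a sketch of the same deduplication directly in data.table (added for reference, assuming the dup dataframe from above): order latest-first, then unique() keeps the first row per PN/SN pair.

library(data.table)

dt <- as.data.table(dup[c("PN", "SN", "TIME")])
dt[, TIME := as.ITime(TIME)]                 # integer time of day, cheap to sort
setorder(dt, PN, SN, -TIME)                  # latest TIME first within each PN/SN
dedup_dt <- unique(dt, by = c("PN", "SN"))   # keeps the first (= latest) row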

# the same, but in dplyr and lubridate
dup |>
  dplyr::mutate(TIME = lubridate::hms(TIME)) |>
  dplyr::arrange(PN,SN,-TIME) |>
  dplyr::group_by(PN,SN) |>
  dplyr::mutate(counter = 1:dplyr::n()) |>
  dplyr::filter(counter == 1) |>
  dplyr::select(-counter)
#> # A tibble: 8 × 3
#> # Groups:   PN, SN [8]
#>   PN        SN               TIME       
#>   <chr>     <chr>            <Period>   
#> 1 661_08935 DMXD90ORE72R     13H 3M 15S 
#> 2 661_10857 DMXD90ORE72R     10H 30M 59S
#> 3 661_13261 3PC000305928ZZZZ 11H 19M 47S
#> 4 661_13268 3PC000305928ZZZZ 11H 6M 22S 
#> 5 661_13268 H6RHNB2B1059     11H 22M 17S
#> 6 661_13519 F2LF908B0D3Y     21H 17M 48S
#> 7 661_15672 F71DP3V00GRD     11H 22M 17S
#> 8 661_50379 XXXF20J4KXKV     23H 53M 36S
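
A side note (a sketch added for reference, not part of the original reply): dplyr's slice_max() expresses "keep the latest row per group" directly, without a manual counter:

dup |>
  dplyr::mutate(secs = lubridate::period_to_seconds(lubridate::hms(TIME))) |>
  dplyr::group_by(PN, SN) |>
  dplyr::slice_max(secs, n = 1, with_ties = FALSE) |>
  dplyr::ungroup() |>
  dplyr::select(-secs)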

# speed comparison
microbenchmark::microbenchmark(
  dup |>
    # transform to ITime (integer time of day)
    fmutate(TIME = as.ITime(TIME)) |>
    # order by all columns, TIME decreasing
    roworder(PN,SN,-TIME) |>
    # add a counter and keep only if the counter is equal to one
    fmutate(
      counter = 1L
    ) |>
    ftransform(
      counter = fcumsum(counter, list(PN,SN))
    ) |>
    fsubset(counter == 1L) |>
    fselect(-counter),
  dup |>
    dplyr::mutate(TIME = lubridate::hms(TIME)) |>
    dplyr::arrange(PN,SN,-TIME) |>
    dplyr::group_by(PN,SN) |>
    dplyr::mutate(counter = 1:dplyr::n()) |>
    dplyr::filter(counter == 1) |>
    dplyr::select(-counter),
  times = 50
)
#> Unit: microseconds
#>                                                                                                                                                                                                            expr
#>                  fselect(fsubset(ftransform(fmutate(roworder(fmutate(dup, TIME = as.ITime(TIME)),      PN, SN, -TIME), counter = 1L), counter = fcumsum(counter,      list(PN, SN))), counter == 1L), -counter)
#>  dplyr::select(dplyr::filter(dplyr::mutate(dplyr::group_by(dplyr::arrange(dplyr::mutate(dup,      TIME = lubridate::hms(TIME)), PN, SN, -TIME), PN, SN), counter = 1:dplyr::n()),      counter == 1), -counter)
#>        min        lq      mean    median        uq        max neval
#>    453.301   629.301  1005.667   848.601  1110.101   3273.801    50
#>  22522.702 33590.201 46353.481 43666.051 55882.901 102340.100    50

Created on 2022-09-27 with reprex v2.0.2

As a side note: the sample data is not even a data.table object, so if you converted it beforehand, the speed gain would be even greater.
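
For example, a minimal sketch of that conversion:

data.table::setDT(dup)                       # convert in place, by reference
# or, keeping dup untouched:
dup_dt <- data.table::as.data.table(dup)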

Kind regards

Thank you! I will try this out on my data first thing in the morning when I return to work and then provide an update. I don't understand the data.table distinction right now, but I will do some research into it. I am a beginner, so please bear with me.

Update: I used your more efficient fastverse code on my original large dataset with success: 83 dupes removed out of 21,005 rows. I also converted my datasets to data.table. TY! :smiley:
