Identifying Fuzzy Duplicates from a column

andresrcs · July 13, 2019, 3:23pm

Here is one way to do it, using the tidystringdist package to compare every pair of names:

library(tidystringdist)
library(tidyverse)

# Sample data: ten records with a Name and a City column. The strings carry
# leading/trailing spaces, and some names are exact repeats while others
# differ only in case, punctuation or a prefix. Both columns are stored as
# factors.
df <- data.frame(
  Name = factor(c(
    " CANON PVT. LTD ",
    " Antila,Thomas ",
    " Greg ",
    " St.Luke's Hospital ",
    " Z_SANDSTONE COOLING LTD ",
    " St.Luke's Hospital ",
    " CANON PVT. LTD. ",
    " SANDSTONE COOLING LTD ",
    " Greg ",
    " ANTILA,THOMAS "
  )),
  # Every record shares the same city.
  City = factor(rep(" Georgia ", times = 10L)),
  stringsAsFactors = FALSE
)


# Collect every Name that is a fuzzy duplicate of some other Name.
# tidy_comb_all() builds the pairwise combinations of Name (columns V1, V2)
# and tidy_stringdist() adds one distance column per method. A soundex
# distance of 0 means both strings share the same Soundex code.
# NOTE: the variable name `match` is kept because the filter step that
# follows refers to it.
match <- df %>%
  tidy_comb_all(Name) %>%
  tidy_stringdist() %>%
  filter(soundex == 0) %>% # Set a threshold
  # Stack the two pair columns into a single column of matched names.
  # pivot_longer() replaces the superseded gather(); the row order differs,
  # which is irrelevant because the result is only used for membership tests.
  pivot_longer(starts_with("V"), names_to = "x", values_to = "match") %>%
  pull(match)
  

# Show only the rows whose Name appears in `match`, sorted by Name.
arrange(filter(df, Name %in% match), Name)
#>                        Name      City
#> 1            Antila,Thomas   Georgia 
#> 2            ANTILA,THOMAS   Georgia 
#> 3           CANON PVT. LTD   Georgia 
#> 4          CANON PVT. LTD.   Georgia 
#> 5    SANDSTONE COOLING LTD   Georgia 
#> 6  Z_SANDSTONE COOLING LTD   Georgia

Created on 2019-07-13 by the reprex package (v0.2.1)