Here is one way to do it, using the tidystringdist package to find names that sound alike:
library(tidystringdist)
library(tidyverse)
# Example input: ten records, all in the same city, whose names contain
# exact repeats as well as case, punctuation and prefix variants
df <- data.frame(
  Name = factor(c(
    " CANON PVT. LTD ",
    " Antila,Thomas ",
    " Greg ",
    " St.Luke's Hospital ",
    " Z_SANDSTONE COOLING LTD ",
    " St.Luke's Hospital ",
    " CANON PVT. LTD. ",
    " SANDSTONE COOLING LTD ",
    " Greg ",
    " ANTILA,THOMAS "
  )),
  City = factor(rep(" Georgia ", 10)),
  stringsAsFactors = FALSE
)
# Collect every name that has at least one sound-alike partner.
#   1. tidy_comb_all() pairs up the values of Name (pair columns start
#      with "V").
#   2. tidy_stringdist() adds string-distance columns, including `soundex`.
#   3. Keep only pairs whose soundex distance is 0, i.e. both names share
#      the same Soundex code.
#   4. Stack the pair columns into one `match` column and extract it.
# pivot_longer() replaces the superseded gather(), and pull() replaces the
# `.$match` idiom. The resulting vector holds the same values as before;
# only their order differs, which is irrelevant because the vector is used
# purely for membership tests with %in%.
match <- df %>%
  tidy_comb_all(Name) %>%
  tidy_stringdist() %>%
  filter(soundex == 0) %>% # Set a threshold
  pivot_longer(starts_with("V"), names_to = "x", values_to = "match") %>%
  pull(match)
# Show only the rows whose Name is among the matched names, sorted by Name
arrange(filter(df, Name %in% match), Name)
#> Name City
#> 1 Antila,Thomas Georgia
#> 2 ANTILA,THOMAS Georgia
#> 3 CANON PVT. LTD Georgia
#> 4 CANON PVT. LTD. Georgia
#> 5 SANDSTONE COOLING LTD Georgia
#> 6 Z_SANDSTONE COOLING LTD Georgia
Created on 2019-07-13 by the reprex package (v0.2.1)