Hello
I have the following columns in my data table: surname, first name, postcode, street, ...
I found the duplicates in my data table with:
# Row indexes of records whose surname/first_name pair already occurred in an
# earlier row. duplicated() flags only the second and later occurrences, so the
# first record of each duplicate set is not part of the result.
duplicate_indexes <- which(duplicated(my_data[c("surname", "first_name")]))
# The repeated records themselves, with all columns of my_data kept.
duplicates <- my_data[duplicate_indexes, ]
Currently, `duplicates` contains only records that have exactly the same information in each column of my data table (surname, first name, postcode and street are all exactly the same).
Now I would also like to find the records in my data table that are similar, e.g. the surname or the first name is the same and the postcode is also the same. How can I do that?
# Small example table: 11 records with a surname, a first name and a postcode.
mydata <- data.frame(
  surname = c(rep(LETTERS[1:5], times = 2), "E"),
  firstname = c(rep(letters[1:4], times = 2), "d", "a", "x"),
  postcode = c(rep(1:3, times = 3), 1, 1)
)
# Show the example data.
mydata
library(dplyr)
# Duplicates over the named variables only, i.e. surname & firstname.
# Each record gets `n`, the number of records sharing its surname/firstname
# pair; only records with n > 1 are kept. The grouping is dropped at the end so
# it does not silently affect any later operation on the result.
mydata %>%
  group_by(surname, firstname) %>%
  mutate(n = n()) %>%
  filter(n > 1) %>%
  ungroup()
# Duplicates over postcode.
# Each record gets `n`, the number of records sharing its postcode; only
# records with n > 1 are kept. The grouping is dropped at the end so it does
# not silently affect any later operation on the result.
mydata %>%
  group_by(postcode) %>%
  mutate(n = n()) %>%
  filter(n > 1) %>%
  ungroup()
# Surname or first name are the same and postcode is also the same.
# First part: records that share both surname and postcode with at least one
# other record. `n` is the size of the surname/postcode group. The grouping is
# dropped at the end so the stored result is a plain, ungrouped table.
(surname_and_postcode_dup <- mydata %>%
  group_by(surname, postcode) %>%
  mutate(n = n()) %>%
  filter(n > 1) %>%
  ungroup())
# Records that share both first name and postcode with at least one other
# record. `n` is the size of the firstname/postcode group. The grouping is
# dropped at the end so the stored result is a plain, ungrouped table.
(firstname_and_postcode_dup <- mydata %>%
  group_by(firstname, postcode) %>%
  mutate(n = n()) %>%
  filter(n > 1) %>%
  ungroup())
# Records that share a postcode AND either a surname or a first name with at
# least one other record.
# Both group sizes are counted on the original rows and combined in a single OR
# filter. Stacking the two duplicate tables with bind_rows() would list a
# record twice whenever it matches on both criteria, and a single `n` column
# would then mean different things in different rows; the two counts are
# therefore kept in separately named columns.
(sur_or_first_and_post <- mydata %>%
  group_by(surname, postcode) %>%
  mutate(n_surname_postcode = n()) %>%
  # group_by() replaces the previous grouping, so this count is per
  # firstname/postcode pair only.
  group_by(firstname, postcode) %>%
  mutate(n_firstname_postcode = n()) %>%
  ungroup() %>%
  filter(n_surname_postcode > 1 | n_firstname_postcode > 1))