Below is my attempt which works. But this can't seem to scale to my project data sets that have records of 4500 and 3000 respectively.
I have two questions:
- How do I make the code efficient
- How do I return a dataset instead of a list.
library(dplyr)
df1 <- data.frame(id=1:8, ch= c("a","b","c","f","r","i","k","o"), p=9:16)
df2 <- data.frame(id=c(1,2,9,3,8,6,9,10), ch= c("a","b","c","c","r","i","k","o"), p=17:24)
data <- vector("list")
for(i in 1:nrow(df1)){
for(k in 1:nrow(df2)){
#Merge eligibility form with exit for but only when the date of interview match
if((df1$id[i]== df2$id[k] & df1$ch[i]== df2$ch[k]) &
!is.na(df1$id[i]== df2$id[k] & df1$ch[i]== df2$ch[k])){
data[[i]] <-inner_join(df1[i,],df2[k,], by = "id") }
}
}
data
#> [[1]]
#> id ch.x p.x ch.y p.y
#> 1 1 a 9 a 17
#>
#> [[2]]
#> id ch.x p.x ch.y p.y
#> 1 2 b 10 b 18
#>
#> [[3]]
#> id ch.x p.x ch.y p.y
#> 1 3 c 11 c 20
#>
#> [[4]]
#> NULL
#>
#> [[5]]
#> NULL
#>
#> [[6]]
#> id ch.x p.x ch.y p.y
#> 1 6 i 14 i 22
Created on 2018-10-03 by the reprex package (v0.2.1)