I have 2 dataframes. auto and manual. I need to create matching instances between 2 data frames when auto_size and manual_size falls within +/- 25 range within each and when the loactions are matched, while the ID is matching. Location matching: right to right and middle, middle to right,middle and left left to left and middle
- the actual datsets are more bigger than this containing thousands of records.
auto <- data.frame(ID = c("1", "1", "1", "1", "1", "1", "1", "1",
"2", "2", "2", "2", "2", "2", "2", "2", "3", "4", "4", "5", "6",
"6", "6", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7",
"7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7",
"7", "7"),
auto_repeat_tag = c(8, 1, 3, 7, 5, 4, 6, 2, 3, 1, 2, 4, 5, 6, 8, 7, 1, 1, 2,
1, 3, 1, 2, 1, 2, 3, 4, 5, 6, 7, 8,9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24,25, 26),
auto_size = c(11, 10, 8, 8, 7, 6, 4, 4, 4, 4, 4, 20,17, 8, 6, 4, 3, 13, 11, 8,
8, 5, 2, 11, 10, 10, 9, 9, 9, 8, 8,7, 7, 7, 7, 7, 6, 6, 6, 6, 5,
5, 5, 5, 5, 5, 5, 3, 3),
auto_location = c("left", "middle", "left", "left", "left", "left", "left",
"left", "left","middle", "left", "left", "left", "left","left", "left", "left","left", "left", "left", "left",
"middle", "left", "middle", "middle", "left", "middle", "left", "left", "middle", "left", "middle",
"middle", "middle", "left", "left", "left", "left", "left", "left","left", "left", "middle", "left", "left",
"left", "left", "left","left"))
manual <- data.frame(ID = c("1", "1", "1", "1", "1", "2", "3", "3", "3", "4", "5", "6", "7", "7"),
manual_repeat_tag = c(2, 1, 5, 4, 3, 1, 1, 3, 2, 1, 1, 1, 1, 2),
manual_size = c(6, 6, 4, 4,4, 3, 3, 25, 17, 3, 10, 3, 3, 2),
manual_location = c("left","middle", "left", "left", "left", "middle", "left", "left", "left",
"left", "left", "left", "left", "left"))
Upto now first I merged 2 files so that I can create all the possible instances between 2 dataframes. Then made separate columns for +/- 25% columns for each size.
auto_manual<-merge(auto,manual,by="ID")
auto_manual<- auto_manual%>%
mutate(auto_plus50size=auto_size+(auto_size/4),
auto_minus50size=auto_size-(auto_size/4),
manual_plus50size=manual_size+(manual_size/4),
manual_minus50size=manual_size-(manual_size/4))
I have 2 dataframes. auto and manual. I need to create matching instances between 2 data frames when auto_size and manual_size falls within +/- 25 range within each and when the loactions are matched, while the ID is matching. Location matching: right to right and middle, middle to right,middle and left left to left and middle
- the actual datsets are more bigger than this containing thousands of records.
auto <- data.frame(ID = c("1", "1", "1", "1", "1", "1", "1", "1",
"2", "2", "2", "2", "2", "2", "2", "2", "3", "4", "4", "5", "6",
"6", "6", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7",
"7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7",
"7", "7"),
auto_repeat_tag = c(8, 1, 3, 7, 5, 4, 6, 2, 3, 1, 2, 4, 5, 6, 8, 7, 1, 1, 2,
1, 3, 1, 2, 1, 2, 3, 4, 5, 6, 7, 8,9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24,25, 26),
auto_size = c(11, 10, 8, 8, 7, 6, 4, 4, 4, 4, 4, 20,17, 8, 6, 4, 3, 13, 11, 8,
8, 5, 2, 11, 10, 10, 9, 9, 9, 8, 8,7, 7, 7, 7, 7, 6, 6, 6, 6, 5,
5, 5, 5, 5, 5, 5, 3, 3),
auto_location = c("left", "middle", "left", "left", "left", "left", "left",
"left", "left","middle", "left", "left", "left", "left","left", "left", "left","left", "left", "left", "left", "middle", "left", "middle", "middle",
"left", "middle", "left", "left", "middle", "left", "middle",
"middle", "middle", "left", "left", "left", "left", "left", "left","left", "left", "middle", "left", "left", "left", "left", "left","left"))
manual <- data.frame(ID = c("1", "1", "1", "1", "1", "2", "3", "3", "3", "4", "5", "6", "7", "7"),
manual_repeat_tag = c(2, 1, 5, 4, 3, 1, 1, 3, 2, 1, 1, 1, 1, 2),
manual_size = c(6, 6, 4, 4,4, 3, 3, 25, 17, 3, 10, 3, 3, 2),
manual_location = c("left","middle", "left", "left", "left", "middle", "left", "left", "left",
"left", "left", "left", "left", "left"))
Upto now first I merged 2 files so that I can create all the possible instances between 2 dataframes. Then created seperate columns for +/- 25% columns for each size.
auto_manual<-merge(auto,manual,by="ID")
auto_manual<- auto_manual%>%
mutate(auto_plus50size=auto_size+(auto_size/4),
auto_minus50size=auto_size-(auto_size/4),
manual_plus50size=manual_size+(manual_size/4),
manual_minus50size=manual_size-(manual_size/4))
Then I created a match size and match location column depending on the criteria.
## Matching auto and manual SIZE (matching auto range with manual size range and vice versa)
auto_manual%<>%
mutate(match_size= case_when(manual_minus50size>=auto_minus50size&
manual_minus50size<=auto_plus50size~paste0("auto",auto_size,"man",manual_size),
manual_plus50size>=auto_minus50size&
manual_plus50size<=auto_plus50size~paste0("auto",auto_size,"man",manual_size),
auto_minus50size>=manual_minus50size&
auto_minus50size<=manual_plus50size~paste0("auto",auto_size,"man",manual_size),
auto_plus50size>=manual_minus50size&
auto_plus50size<=manual_plus50size~paste0("auto",auto_size,"man",manual_size),
TRUE~"not_matching")) %>%
relocate(match_size,.after = ID)
## Matching auto and manual location
#left can be matched to left and middle
#middle to right,left,middle
#right to right and middle
auto_manual%<>%
mutate(match_location=case_when(auto_location =="right"&
(manual_location %in% c("right","middle"))~"matched",
auto_location=="middle" &
( manual_location %in% c("right","middle","left"))~"matched",
auto_location =="left" &
( manual_location %in% c("middle","left"))~"matched",
manual_location=="right" &
(auto_location %in% c("right","middle"))~"matched",
manual_location=="middle" &
(auto_location %in% c("right","middle","left"))~"matched",
manual_location=="left" &
(auto_location %in% c("middle","left"))~"matched",
TRUE~"not_matching")) %>%
relocate(match_location,.after = match_size)
Finally I extracted all the instances both location and sizes are matching
#### Both size and location matched
matched<-auto_manual %>%
filter(match_size!="not_matching" & match_location!="not_matching") %>%
select(ID,match_size,match_location,auto_repeat_tag,manual_repeat_tag,auto_size,manual_size)
matched <- data.frame(ID = c("1", "1", "1", "1", "1", "1", "1", "1",
"1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1",
"1", "1", "2", "2", "2", "2", "3", "5", "6", "6", "7", "7", "7",
"7", "7", "7", "7", "7", "7", "7", "7"),
match_size = c("auto10man6", "auto10man6", "auto8man6", "auto8man6", "auto8man6", "auto8man6","auto7man6", "auto7man6", "auto6man6", "auto6man6", "auto6man4",
"auto6man4", "auto6man4", "auto4man6", "auto4man6", "auto4man4",
"auto4man4", "auto4man4", "auto4man6", "auto4man6", "auto4man4",
"auto4man4", "auto4man4", "auto4man3", "auto4man3", "auto4man3",
"auto4man3", "auto3man3", "auto8man10", "auto5man3", "auto2man3",
"auto5man3", "auto5man3", "auto5man3", "auto5man3", "auto5man3",
"auto5man3", "auto5man3", "auto3man3", "auto3man2", "auto3man3","auto3man2"),
match_location = c("matched", "matched", "matched", "matched", "matched", "matched", "matched","matched", "matched", "matched", "matched", "matched", "matched", "matched",
"matched", "matched", "matched", "matched", "matched", "matched", "matched",
"matched", "matched", "matched", "matched", "matched", "matched",
"matched", "matched", "matched", "matched", "matched", "matched",
"matched", "matched", "matched", "matched", "matched", "matched","matched", "matched", "matched"),
auto_repeat_tag = c(1, 1, 3, 3, 7, 7, 5, 5, 4, 4, 4, 4, 4, 6, 6, 6, 6, 6, 2, 2, 2, 2, 2, 3,1, 2, 7, 1, 1, 1, 2, 18, 19, 20, 21, 22, 23, 24, 25, 25, 26,26),
manual_repeat_tag = c(2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 5, 4, 3, 2, 1, 5, 4, 3, 2, 1, 5, 4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,1, 1, 1, 1, 1, 1, 2, 1, 2),
auto_size = c(10, 10, 8, 8, 8, 8, 7, 7, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 8, 5, 2, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 3),
manual_size = c(6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 6, 6, 4, 4, 4, 6, 6, 4, 4, 4, 3, 3, 3, 3, 3, 10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 2))
As all the above in the matched dataframe are matching instances I want to extract best matches according to the criterion that one repeat instance within one ID should only matched once and the highest size from auto or manual should match first and then in descending order. Can someone help me writing a piece of function or way to do this using dplyr.
Thank you in advance