complex merge between two datasets

first dataset.

merge_field is the field without any tidyr::fill operations done. the name of column represents which fill operation is done.

name country merge_field merge_field_down merge_field_downup merge_field_up
Anna USA       CA               CA                CA            CA
Anna USA       NA               CA                CA            NA
Anna BRA       NA               CA                CA            NA
Anna BRA       NA               CA                CA            NA
Ryan USA       NA               NA                PT            PT
Ryan USA       PT               PT                PT            PT
Ryan USA       NA               PT                PT            NA
Liam MEX       NA               NA                OX            OX 
Liam MEX       OX               OX                OX            OX 
Liam MEX       NA               OX                OX            NA
Liam MEX       NA               OX                OX            NA
Liam MEX       NA               OX                OX            NA

second dataset

name country merge_field merge_field_right
Anna USA       CA         merge_field_down   
Ryan USA       PT         merge_field_up
Liam MEX       OX         merge_field_downup

the second dataset tells me what tidyr:fill grouping we want to keep for the name and country combination in the first dataset. we group by name in this dataset.

for example for "Anne USA CA" we need to use the grouping in merge_field_down column.. For "Ryan USA PT" we need to use the grouping in merge_field_up.

desired output data set

name country merge_field_right_value merge_field_right
Anna USA       CA                     merge_field_down
Anna USA       CA                     merge_field_down
Anna BRA       CA                     merge_field_down
Anna BRA       CA                     merge_field_down
Ryan USA       PT                     merge_field_up    
Ryan USA       PT                     merge_field_up
Ryan USA       NA                     merge_field_up
Liam MEX       OX                     merge_field_downup
Liam MEX       OX                     merge_field_downup
Liam MEX       OX                     merge_field_downup
Liam MEX       OX                     merge_field_downup
Liam MEX       OX                     merge_field_downup
Liam MEX       OX                     merge_field_downup

is there any easy way to do this. I know this is quite complicated. thanks.

DPUTS OF DATAFRAME:

dataframe1

structure(list(name = c("Anna", "Anna", "Anna", "Anna", "Ryan",
"Ryan", "Ryan", "Liam", "Liam", "Liam", "Liam", "Liam"), country = c("USA",
"USA", "BRA", "BRA", "USA", "USA", "USA", "MEX", "MEX", "MEX",
"MEX", "MEX"), merge_field = c("CA", NA, NA, NA, NA, "PT", NA,
NA, "OX", NA, NA, NA), merge_field_down = c("CA", "CA", "CA",
"CA", NA, "PT", "PT", NA, "OX", "OX", "OX", "OX"), merge_field_downup = c("CA",
"CA", "CA", "CA", "PT", "PT", "PT", "OX", "OX", "OX", "OX", "OX"
), merge_field_up = c("CA", NA, NA, NA, "PT", "PT", NA, "OX",
"OX", NA, NA, NA)), row.names = c(NA, -12L), class = c("tbl_df",
"tbl", "data.frame"))

dataframe2

structure(list(name = c("Anna", "Anna", "Anna", "Anna", "Ryan",
"Ryan", "Ryan", "Liam", "Liam", "Liam", "Liam", "Liam"), country = c("USA",
"USA", "BRA", "BRA", "USA", "USA", "USA", "MEX", "MEX", "MEX",
"MEX", "MEX"), merge_field = c("CA", NA, NA, NA, NA, "PT", NA,
NA, "OX", NA, NA, NA), merge_field_down = c("CA", "CA", "CA",
"CA", NA, "PT", "PT", NA, "OX", "OX", "OX", "OX"), merge_field_downup = c("CA",
"CA", "CA", "CA", "PT", "PT", "PT", "OX", "OX", "OX", "OX", "OX"
), merge_field_up = c("CA", NA, NA, NA, "PT", "PT", NA, "OX",
"OX", NA, NA, NA)), row.names = c(NA, -12L), class = c("tbl_df",
"tbl", "data.frame"))

Hello,

first of all: You didn't correctly specify your second dataset (it is the same as the first one in your comment).

library(data.table)
library(collapse)
#> collapse 1.8.6, see ?`collapse-package` or ?`collapse-documentation`
#> 
#> Attache Paket: 'collapse'
#> Das folgende Objekt ist maskiert 'package:stats':
#> 
#>     D
# note the class attribute "data.table"
dataframe1 <- structure(
  list(name = c(rep('Anna',4), rep('Ryan',3),rep('Liam',5)),
       country = c("USA","USA","BRA","BRA","USA","USA","USA",
                   "MEX","MEX","MEX","MEX", "MEX"),
       merge_field = c("CA",NA,NA,NA,NA,"PT",NA,NA,"OX",NA,NA,NA),
       merge_field_down = c("CA","CA","CA","CA",NA,"PT","PT",NA,"OX","OX","OX","OX"),
       merge_field_downup = c("CA","CA","CA","CA","PT","PT","PT","OX","OX","OX","OX","OX"),
       merge_field_up = c("CA",NA,NA,NA,"PT","PT",NA,"OX","OX",NA,NA,NA)),
  row.names = c(NA, -12L),class = c("data.table","data.frame"))
dataframe2 <- structure(
  list(name = c('Anna','Ryan','Liam'),
       country = c('USA','USA','MEX'),
       merge_field = c('CA','PT','OX'),
       merge_field_right = c('merge_field_down','merge_field_up','merge_field_downup')
  ),
  row.names = c(NA, -3L),class = c("data.table","data.frame"))

result <- dataframe1 |>
  # long format
  melt.data.table(id.vars = c('name','country')) |>
  # merge with data.table syntax (you could use dplyr *_joins as well)
  (\(x) x[dataframe2, on = 'name'])() |>
  # keep, if condition is met
  fsubset(variable == merge_field_right) |>
  # keep relevant columns
  fselect(name, country, 'merge_field_right_value' = value,merge_field_right)

result
#>     name country merge_field_right_value  merge_field_right
#>  1: Anna     USA                      CA   merge_field_down
#>  2: Anna     USA                      CA   merge_field_down
#>  3: Anna     BRA                      CA   merge_field_down
#>  4: Anna     BRA                      CA   merge_field_down
#>  5: Ryan     USA                      PT     merge_field_up
#>  6: Ryan     USA                      PT     merge_field_up
#>  7: Ryan     USA                    <NA>     merge_field_up
#>  8: Liam     MEX                      OX merge_field_downup
#>  9: Liam     MEX                      OX merge_field_downup
#> 10: Liam     MEX                      OX merge_field_downup
#> 11: Liam     MEX                      OX merge_field_downup
#> 12: Liam     MEX                      OX merge_field_downup

Created on 2022-08-17 by the reprex package (v2.0.1)

I hope this does what you want (at least with the given data it works).

Kind regards

PS: If you wish, I can change the functions to use tidyverse only. But collapse is way more performant and uses pretty similar syntax, so I thought I can give it a shot for this question. :slight_smile:

1 Like

this is so helpful thanks. is it possible to switch to tidy verse instead of collapse?thank u!

Hey,

for sure:

library('tidyverse')

dataframe1 <- structure(
  list(name = c(rep('Anna',4), rep('Ryan',3),rep('Liam',5)),
       country = c("USA","USA","BRA","BRA","USA","USA","USA",
                   "MEX","MEX","MEX","MEX", "MEX"),
       merge_field = c("CA",NA,NA,NA,NA,"PT",NA,NA,"OX",NA,NA,NA),
       merge_field_down = c("CA","CA","CA","CA",NA,"PT","PT",NA,"OX","OX","OX","OX"),
       merge_field_downup = c("CA","CA","CA","CA","PT","PT","PT","OX","OX","OX","OX","OX"),
       merge_field_up = c("CA",NA,NA,NA,"PT","PT",NA,"OX","OX",NA,NA,NA)),
  row.names = c(NA, -12L),class = c("tbl_df","tbl","data.frame"))
dataframe2 <- structure(
  list(name = c('Anna','Ryan','Liam'),
       country = c('USA','USA','MEX'),
       merge_field = c('CA','PT','OX'),
       merge_field_right = c('merge_field_down','merge_field_up','merge_field_downup')
  ),
  row.names = c(NA, -3L),class = c("tbl_df","tbl","data.frame"))

result <- dataframe1 %>%
  # long format
  pivot_longer(cols = 3:6, values_to = 'value', names_to = 'variable') %>%
  # merge
  left_join(dataframe2, by = 'name') %>%
  # keep, if condition is met
  filter(variable == merge_field_right) %>%
  # keep relevant columns
  select(name,'country' = country.x,'merge_field_right_value' = value,merge_field_right)

result
#> # A tibble: 12 × 4
#>    name  country merge_field_right_value merge_field_right 
#>    <chr> <chr>   <chr>                   <chr>             
#>  1 Anna  USA     CA                      merge_field_down  
#>  2 Anna  USA     CA                      merge_field_down  
#>  3 Anna  BRA     CA                      merge_field_down  
#>  4 Anna  BRA     CA                      merge_field_down  
#>  5 Ryan  USA     PT                      merge_field_up    
#>  6 Ryan  USA     PT                      merge_field_up    
#>  7 Ryan  USA     <NA>                    merge_field_up    
#>  8 Liam  MEX     OX                      merge_field_downup
#>  9 Liam  MEX     OX                      merge_field_downup
#> 10 Liam  MEX     OX                      merge_field_downup
#> 11 Liam  MEX     OX                      merge_field_downup
#> 12 Liam  MEX     OX                      merge_field_downup

Created on 2022-08-17 by the reprex package (v2.0.1)

You're welcome. :slight_smile:

Kind regards

1 Like

thank u OREO <3. I appreciate it.

Would you mind accepting the answer, to mark a solution was found? Thanks. :slight_smile:

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.