I have 2 datasets (df1 and df2) with the following shared columns: "tic", "cusip", "sedol", "isin", "datadate_year". All except "datadate_year" are unique identifier columns. "datadate_year" is a year value (YYYY).
I need to join the two datasets using the available identifiers. Note that not every record will have an identifier within a given column. The merging must happen within the same year. For example: I would want to try to join df1$tic on df2$tic only in 2020. If there is no match on “tic”, I would want the code to iterate across the different identifier options until (hopefully) one is found—always staying within the same year.
BONUS: If there is a match, I would like a new column created called "matching_variable" that lists which identifier was successful.
Here are reproducible examples for df1, df2, and what I'd like the results to look like (merged_dfs).
df1 <- data.frame(
company_name = c("Acme Corp", "Beta Industries", "Delta Ltd", "Epsilon PLC", "Eta Enterprises",
"Gamma Inc", "Iota Holdings", "Kappa Co", "Lambda Systems", "Mu Industries",
"Nu Innovations", "Omicron Ventures", "Phi Holdings", "Pi Enterprises", "Rho Ltd",
"Sigma Solutions", "Tau PLC", "Theta Technologies", "Upsilon Inc", "Xi Corp", "Zeta Solutions"),
tic = c("ACME", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
cusip = c("123456789", "234567890", "456789012", "567890123", "789012345", "345678901", "901234567",
"12345678", "123456780", NA, "345678902", "567890234", "123456789", "678901345", "789012456",
"890123456", "901234567", "890123456", "12345678", NA, NA),
sedol = c("123456", "1234567", NA, "4567890", "6789012", "2345678", "8901234", "9012345", "123457",
"1234568", "2345679", "4567891", "123458", "5678902", "6789013", "7890124", "8901235",
"7890123", "9012346", NA, "5678901"),
isin = c("US1234567890", "US2345678901", NA, "US5678901234", "US7890123456", "US3456789012",
"US9012345678", "US0123456789", "US1234567800", "US2345678910", "US3456789021",
"US5678902343", "US1234567899", "US6789013454", "US7890124565", "US8901234566",
"US9012345677", "US8901234567", "US0123456788", "US4567890132", NA),
datadate_year = c(2021, 2022, 2021, 2022, 2021, 2023, 2023, 2021, 2022, 2023, 2021, 2023, 2023,
2021, 2022, 2023, 2021, 2022, 2022, 2022, 2023)
)
df2 <- data.frame(
company_name = c("Acme Corp", "Beta Industries", "Delta Ltd", "Epsilon PLC", "Eta Enterprises",
"Gamma Inc", "Iota Holdings", "Kappa Co", "Lambda Systems", "Mu Industries",
"Nu Innovations", "Omicron Ventures", "Phi Holdings", "Pi Enterprises", "Sigma Solutions",
"Tau PLC", "Theta Technologies", "Upsilon Inc", "Xi Corp", "Zeta Solutions"),
tic = c("ACME", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
cusip = c("123456789", "234567890", NA, "567890123", "789012345", "345678901", "901234567",
"12345678", NA, "234567891", "345678902", NA, "123456789", "678901345", "890123456",
"901234567", NA, NA, NA, "678901234"),
sedol = c("123456", "1234567", "3456789", "4567890", "6789012", NA, NA, "9012345", NA, "1234568",
"2345679", "4567891", "123458", "5678902", "7890124", "8901235", "7890123", "9012346", NA, NA),
isin = c("US1234567890", NA, "US4567890123", "US5678901234", "US7890123456", "US3456789012",
"US9012345678", "US0123456789", "US1234567800", "US2345678910", NA, NA, NA,
"US6789013454", "US8901234566", "US9012345677", NA, "US0123456788", "US4567890132", "US6789012345"),
datadate_year = c(2021, 2022, 2021, 2022, 2021, 2023, 2023, 2021, 2022, 2023, 2021, 2023, 2023,
2021, 2023, 2021, 2022, 2022, 2022, 2023)
)
merged_dfs <- data.frame(
company_name.x = c("Acme Corp", "Beta Industries", "Epsilon PLC", "Eta Enterprises", "Gamma Inc", "Iota Holdings", "Kappa Co", "Lambda Systems", "Mu Industries", "Nu Innovations", "Omicron Ventures", "Phi Holdings", "Pi Enterprises", "Sigma Solutions", "Tau PLC", "Theta Technologies", "Upsilon Inc", "Xi Corp"),
tic.x = c("ACME", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "XI"),
cusip.x = c("123456789", "234567890", "567890123", "789012345", "345678901", "901234567", "12345678", "123456780", NA, "345678902", "567890234", "123456789", "678901345", "890123456", "901234567", "890123456", "12345678", NA),
sedol.x = c("123456", "1234567", "4567890", "6789012", "2345678", "8901234", "9012345", "123457", "1234568", "2345679", "4567891", "123458", "5678902", "7890124", "8901235", "7890123", "9012346", NA),
isin.x = c("US1234567890", "US2345678901", "US5678901234", "US7890123456", "US3456789012", "US9012345678", "US0123456789", "US1234567800", "US2345678910", "US3456789021", "US5678902343", "US1234567899", "US6789013454", "US8901234566", "US9012345677", "US8901234567", "US0123456788", "US4567890132"),
datadate_year.x = c(2021, 2022, 2022, 2021, 2023, 2023, 2021, 2022, 2023, 2021, 2023, 2023, 2021, 2023, 2021, 2022, 2022, 2022),
company_name.y = c("Acme Corp", "Beta Industries", "Epsilon PLC", "Eta Enterprises", "Gamma Inc", "Iota Holdings", "Kappa Co", "Lambda Systems", "Mu Industries", "Nu Innovations", "Omicron Ventures", "Phi Holdings", "Pi Enterprises", "Sigma Solutions", "Tau PLC", "Theta Technologies", "Upsilon Inc", "Xi Corp"),
tic.y = c("ACME", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "XI"),
cusip.y = c("123456789", "234567890", "567890123", "789012345", "345678901", "901234567", "12345678", NA, "234567891", "345678902", NA, "123456789", "678901345", "890123456", "901234567", NA, NA, NA),
sedol.y = c("123456", "1234567", "4567890", "6789012", NA, NA, "9012345", NA, "1234568", "2345679", "4567891", "123458", "5678902", "7890124", "8901235", "7890123", "9012346", NA),
isin.y = c("US1234567890", NA, "US5678901234", "US7890123456", "US3456789012", "US9012345678", "US0123456789", "US1234567800", "US2345678910", NA, NA, NA, "US6789013454", "US8901234566", "US9012345677", NA, "US0123456788", "US4567890132"),
datadate_year.y = c(2021, 2022, 2022, 2021, 2023, 2023, 2021, 2022, 2023, 2021, 2023, 2023, 2021, 2023, 2021, 2022, 2022, 2022),
matching_variable = c("tic", "cusip", "cusip", "cusip", "cusip", "cusip", "cusip", "isin", "sedol", "cusip", "sedol", "cusip", "cusip", "cusip", "cusip", "sedol", "sedol", "tic")
)