Problem combining two datasets in R

anotherhoward · October 14, 2024, 7:23pm

I am having a problem merging two datasets. Each is extracting data from the same file.

Here is the output from the first part of the code. It is for one season.

Here is the code:

library("Lahman")
WS_Losers <- Teams %>%
  filter(yearID >= 1901 & lgID %in% c("AL","NL")) |>
  filter(LgWin == "Y" & WSWin == "N") %>%
  mutate(next_year = yearID + 1) %>%
  select(yearID, teamID, LgWin, WSWin, W, L, Rank, next_year)

Here is the output from the second part of the code. The data I am interested in is for the same team but one season later.

Here is my code:

  Won_Lost_Next <- Teams %>%
  filter(yearID >= 1901 & lgID %in% c("AL", "NL")) %>%
  mutate(W_Lpct = round(W / (W + L), 3)) %>%
  select(yearID, teamID, W, L, W_Lpct, Rank, LgWin, WSWin, next_year)

Here is the rest of the code to merge the two datasets:

merged_df <- inner_join(WS_Losers, Won_Lost_Next, by = c("next_year" = "yearID", "teamID"))
write.csv(merged_df, "merged_file.csv", row.names = FALSE)

Here are the output of each dataset.

dput(head(WS_Losers, 10))
structure(list(yearID = c(1903L, 1905L, 1906L, 1907L, 1908L,
1909L, 1910L, 1911L, 1912L, 1913L), teamID = structure(c(106L,
101L, 35L, 52L, 52L, 52L, 35L, 89L, 89L, 89L), levels = c("ALT",
"ANA", "ARI", "ATL", "BAL", "BFN", "BFP", "BL1", "BL2", "BL3",
"BL4", "BLA", "BLF", "BLN", "BLU", "BOS", "BR1", "BR2", "BR3",
"BR4", "BRF", "BRO", "BRP", "BS1", "BS2", "BSN", "BSP", "BSU",
"BUF", "CAL", "CH1", "CH2", "CHA", "CHF", "CHN", "CHP", "CHU",
"CIN", "CL1", "CL2", "CL3", "CL4", "CL5", "CL6", "CLE", "CLP",
"CN1", "CN2", "CN3", "CNU", "COL", "DET", "DTN", "ELI", "FLO",
"FW1", "HAR", "HOU", "HR1", "IN1", "IN2", "IN3", "IND", "KC1",
"KC2", "KCA", "KCF", "KCN", "KCU", "KEO", "LAA", "LAN", "LS1",
"LS2", "LS3", "MIA", "MID", "MIL", "MIN", "ML1", "ML2", "ML3",
"ML4", "MLA", "MLU", "MON", "NEW", "NH1", "NY1", "NY2", "NY3",
"NY4", "NYA", "NYN", "NYP", "OAK", "PH1", "PH2", "PH3", "PH4",
"PHA", "PHI", "PHN", "PHP", "PHU", "PIT", "PRO", "PT1", "PTF",
"PTP", "RC1", "RC2", "RIC", "SDN", "SE1", "SEA", "SFN", "SL1",
"SL2", "SL3", "SL4", "SL5", "SLA", "SLF", "SLN", "SLU", "SPU",
"SR1", "SR2", "TBA", "TEX", "TL1", "TL2", "TOR", "TRN", "TRO",
"WAS", "WIL", "WOR", "WS1", "WS2", "WS3", "WS4", "WS5", "WS6",
"WS7", "WS8", "WS9", "WSU"), class = "factor"), LgWin = c("Y",
"Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y"), WSWin = c("N",
"N", "N", "N", "N", "N", "N", "N", "N", "N"), W = c(91L, 92L,
116L, 92L, 90L, 98L, 104L, 99L, 103L, 101L), L = c(49L, 56L,
36L, 58L, 63L, 54L, 50L, 54L, 48L, 51L), Rank = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), next_year = c(1904, 1906, 1907,
1908, 1909, 1910, 1911, 1912, 1913, 1914)), row.names = c(NA,
10L), class = "data.frame")

dput(head(Won_Lost_Next, 40))
structure(list(yearID = c(1901L, 1901L, 1901L, 1901L, 1901L,
1901L, 1901L, 1901L, 1901L, 1901L, 1901L, 1901L, 1901L, 1901L,
1901L, 1901L, 1902L, 1902L, 1902L, 1902L, 1902L, 1902L, 1902L,
1902L, 1902L, 1902L, 1902L, 1902L, 1902L, 1902L, 1902L, 1902L,
1903L, 1903L, 1903L, 1903L, 1903L, 1903L, 1903L, 1903L), teamID = structure(c(12L,
16L, 33L, 45L, 52L, 84L, 101L, 140L, 22L, 26L, 35L, 38L, 89L,
102L, 106L, 125L, 12L, 16L, 33L, 45L, 52L, 101L, 123L, 140L,
22L, 26L, 35L, 38L, 89L, 102L, 106L, 125L, 16L, 33L, 45L, 52L,
93L, 101L, 123L, 140L), levels = c("ALT", "ANA", "ARI", "ATL",
"BAL", "BFN", "BFP", "BL1", "BL2", "BL3", "BL4", "BLA", "BLF",
"BLN", "BLU", "BOS", "BR1", "BR2", "BR3", "BR4", "BRF", "BRO",
"BRP", "BS1", "BS2", "BSN", "BSP", "BSU", "BUF", "CAL", "CH1",
"CH2", "CHA", "CHF", "CHN", "CHP", "CHU", "CIN", "CL1", "CL2",
"CL3", "CL4", "CL5", "CL6", "CLE", "CLP", "CN1", "CN2", "CN3",
"CNU", "COL", "DET", "DTN", "ELI", "FLO", "FW1", "HAR", "HOU",
"HR1", "IN1", "IN2", "IN3", "IND", "KC1", "KC2", "KCA", "KCF",
"KCN", "KCU", "KEO", "LAA", "LAN", "LS1", "LS2", "LS3", "MIA",
"MID", "MIL", "MIN", "ML1", "ML2", "ML3", "ML4", "MLA", "MLU",
"MON", "NEW", "NH1", "NY1", "NY2", "NY3", "NY4", "NYA", "NYN",
"NYP", "OAK", "PH1", "PH2", "PH3", "PH4", "PHA", "PHI", "PHN",
"PHP", "PHU", "PIT", "PRO", "PT1", "PTF", "PTP", "RC1", "RC2",
"RIC", "SDN", "SE1", "SEA", "SFN", "SL1", "SL2", "SL3", "SL4",
"SL5", "SLA", "SLF", "SLN", "SLU", "SPU", "SR1", "SR2", "TBA",
"TEX", "TL1", "TL2", "TOR", "TRN", "TRO", "WAS", "WIL", "WOR",
"WS1", "WS2", "WS3", "WS4", "WS5", "WS6", "WS7", "WS8", "WS9",
"WSU"), class = "factor"), W = c(68L, 79L, 83L, 54L, 74L, 48L,
74L, 61L, 79L, 69L, 53L, 52L, 52L, 83L, 90L, 76L, 50L, 77L, 74L,
69L, 52L, 83L, 78L, 61L, 75L, 73L, 68L, 70L, 48L, 56L, 103L,
56L, 91L, 60L, 77L, 65L, 72L, 75L, 65L, 43L), L = c(65L, 57L,
53L, 82L, 61L, 89L, 62L, 72L, 57L, 69L, 86L, 87L, 85L, 57L, 49L,
64L, 88L, 60L, 60L, 67L, 83L, 53L, 58L, 75L, 63L, 64L, 69L, 70L,
88L, 81L, 36L, 78L, 47L, 77L, 63L, 71L, 62L, 60L, 74L, 94L),
W_Lpct = c(0.511, 0.581, 0.61, 0.397, 0.548, 0.35, 0.544,
0.459, 0.581, 0.5, 0.381, 0.374, 0.38, 0.593, 0.647, 0.543,
0.362, 0.562, 0.552, 0.507, 0.385, 0.61, 0.574, 0.449, 0.543,
0.533, 0.496, 0.5, 0.353, 0.409, 0.741, 0.418, 0.659, 0.438,
0.55, 0.478, 0.537, 0.556, 0.468, 0.314), Rank = c(5L, 2L,
1L, 7L, 3L, 8L, 4L, 6L, 3L, 5L, 6L, 8L, 7L, 2L, 1L, 4L, 8L,
3L, 4L, 5L, 7L, 1L, 2L, 6L, 2L, 3L, 5L, 4L, 8L, 7L, 1L, 6L,
1L, 7L, 3L, 5L, 4L, 2L, 6L, 8L), LgWin = c("N", "N", "Y",
"N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "Y",
"N", "N", "N", "N", "N", "N", "Y", "N", "N", "N", "N", "N",
"N", "N", "N", "Y", "N", "Y", "N", "N", "N", "N", "N", "N",
"N"), WSWin = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, "Y", "N", "N", "N", "N", "N", "N",
"N"), next_year = c(1902, 1902, 1902, 1902, 1902, 1902, 1902,
1902, 1902, 1902, 1902, 1902, 1902, 1902, 1902, 1902, 1903,
1903, 1903, 1903, 1903, 1903, 1903, 1903, 1903, 1903, 1903,
1903, 1903, 1903, 1903, 1903, 1904, 1904, 1904, 1904, 1904,
1904, 1904, 1904)), row.names = c(NA, 40L), class = "data.frame")

What I want to do is to merge what is in the second image with what is in the first image with the corresponding rows aligning. One problem is that in the second image, the next_year column should be one year higher than the yearID,as it is in the first image. Further, the data that is added should be the data for the same team but one year later, so for the 1901 Boston team, the data should be that for the same team but in the 1902 season. I am comparing how a team that played in the World Series in one season performed during the following season.

I would appreciate help in getting this code to produce the desired result.
Howard

FJCC · October 14, 2024, 9:12pm

Your code for Won_Lost_Next does not include the definition of next_year.

Please post the output

dput(head(WS_Losers, 10))
dput(head(Won_Lost_Next, 40))

That will allow others to recreate a subset of your data.

anotherhoward · October 14, 2024, 9:39pm

next_year refers to the next baseball season. It is found by adding one to the yearID variable.

FJCC · October 14, 2024, 10:32pm

I don't see any problem with your code.
When I asked for 40 rows of Won_Lost_Next, I failed to notice that data set starts in 1901, so 40 rows only retrieves data up to 1903. Since WS_Losers starts in 1903, there were no matches between its next_year column and the yearID of Won_Lost_Next. I worked around that by subtracting three years from next_year of WS_Losers. Having done that, I get a successful merge. Do you not get any rows in the merged data with the full data sets?

WS_Losers <- structure(list(yearID = c(1903L, 1905L, 1906L, 1907L, 1908L,
                          1909L, 1910L, 1911L, 1912L, 1913L), teamID = structure(c(106L,
                                                                                   101L, 35L, 52L, 52L, 52L, 35L, 89L, 89L, 89L), levels = c("ALT",
                                                                                                                                             "ANA", "ARI", "ATL", "BAL", "BFN", "BFP", "BL1", "BL2", "BL3",
                                                                                                                                             "BL4", "BLA", "BLF", "BLN", "BLU", "BOS", "BR1", "BR2", "BR3",
                                                                                                                                             "BR4", "BRF", "BRO", "BRP", "BS1", "BS2", "BSN", "BSP", "BSU",
                                                                                                                                             "BUF", "CAL", "CH1", "CH2", "CHA", "CHF", "CHN", "CHP", "CHU",
                                                                                                                                             "CIN", "CL1", "CL2", "CL3", "CL4", "CL5", "CL6", "CLE", "CLP",
                                                                                                                                             "CN1", "CN2", "CN3", "CNU", "COL", "DET", "DTN", "ELI", "FLO",
                                                                                                                                             "FW1", "HAR", "HOU", "HR1", "IN1", "IN2", "IN3", "IND", "KC1",
                                                                                                                                             "KC2", "KCA", "KCF", "KCN", "KCU", "KEO", "LAA", "LAN", "LS1",
                                                                                                                                             "LS2", "LS3", "MIA", "MID", "MIL", "MIN", "ML1", "ML2", "ML3",
                                                                                                                                             "ML4", "MLA", "MLU", "MON", "NEW", "NH1", "NY1", "NY2", "NY3",
                                                                                                                                             "NY4", "NYA", "NYN", "NYP", "OAK", "PH1", "PH2", "PH3", "PH4",
                                                                                                                                             "PHA", "PHI", "PHN", "PHP", "PHU", "PIT", "PRO", "PT1", "PTF",
                                                                                                                                             "PTP", "RC1", "RC2", "RIC", "SDN", "SE1", "SEA", "SFN", "SL1",
                                                                                                                                             "SL2", "SL3", "SL4", "SL5", "SLA", "SLF", "SLN", "SLU", "SPU",
                                                                                                                                             "SR1", "SR2", "TBA", "TEX", "TL1", "TL2", "TOR", "TRN", "TRO",
                                                                                                                                             "WAS", "WIL", "WOR", "WS1", "WS2", "WS3", "WS4", "WS5", "WS6",
                                                                                                                                             "WS7", "WS8", "WS9", "WSU"), class = "factor"), LgWin = c("Y",
                                                                                                                                                                                                       "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y"), WSWin = c("N",
                                                                                                                                                                                                                                                               "N", "N", "N", "N", "N", "N", "N", "N", "N"), W = c(91L, 92L,
                                                                                                                                                                                                                                                                                                                   116L, 92L, 90L, 98L, 104L, 99L, 103L, 101L), L = c(49L, 56L,
                                                                                                                                                                                                                                                                                                                                                                      36L, 58L, 63L, 54L, 50L, 54L, 48L, 51L), Rank = c(1L, 1L, 1L,
                                                                                                                                                                                                                                                                                                                                                                                                                        1L, 1L, 1L, 1L, 1L, 1L, 1L), next_year = c(1904, 1906, 1907,
                                                                                                                                                                                                                                                                                                                                                                                                                                                                   1908, 1909, 1910, 1911, 1912, 1913, 1914)), row.names = c(NA,
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             10L), class = "data.frame")

Won_Lost_Next <- structure(list(yearID = c(1901L, 1901L, 1901L, 1901L, 1901L,
                          1901L, 1901L, 1901L, 1901L, 1901L, 1901L, 1901L, 1901L, 1901L,
                          1901L, 1901L, 1902L, 1902L, 1902L, 1902L, 1902L, 1902L, 1902L,
                          1902L, 1902L, 1902L, 1902L, 1902L, 1902L, 1902L, 1902L, 1902L,
                          1903L, 1903L, 1903L, 1903L, 1903L, 1903L, 1903L, 1903L), teamID = structure(c(12L,
                                                                                                        16L, 33L, 45L, 52L, 84L, 101L, 140L, 22L, 26L, 35L, 38L, 89L,
                                                                                                        102L, 106L, 125L, 12L, 16L, 33L, 45L, 52L, 101L, 123L, 140L,
                                                                                                        22L, 26L, 35L, 38L, 89L, 102L, 106L, 125L, 16L, 33L, 45L, 52L,
                                                                                                        93L, 101L, 123L, 140L), levels = c("ALT", "ANA", "ARI", "ATL",
                                                                                                                                           "BAL", "BFN", "BFP", "BL1", "BL2", "BL3", "BL4", "BLA", "BLF",
                                                                                                                                           "BLN", "BLU", "BOS", "BR1", "BR2", "BR3", "BR4", "BRF", "BRO",
                                                                                                                                           "BRP", "BS1", "BS2", "BSN", "BSP", "BSU", "BUF", "CAL", "CH1",
                                                                                                                                           "CH2", "CHA", "CHF", "CHN", "CHP", "CHU", "CIN", "CL1", "CL2",
                                                                                                                                           "CL3", "CL4", "CL5", "CL6", "CLE", "CLP", "CN1", "CN2", "CN3",
                                                                                                                                           "CNU", "COL", "DET", "DTN", "ELI", "FLO", "FW1", "HAR", "HOU",
                                                                                                                                           "HR1", "IN1", "IN2", "IN3", "IND", "KC1", "KC2", "KCA", "KCF",
                                                                                                                                           "KCN", "KCU", "KEO", "LAA", "LAN", "LS1", "LS2", "LS3", "MIA",
                                                                                                                                           "MID", "MIL", "MIN", "ML1", "ML2", "ML3", "ML4", "MLA", "MLU",
                                                                                                                                           "MON", "NEW", "NH1", "NY1", "NY2", "NY3", "NY4", "NYA", "NYN",
                                                                                                                                           "NYP", "OAK", "PH1", "PH2", "PH3", "PH4", "PHA", "PHI", "PHN",
                                                                                                                                           "PHP", "PHU", "PIT", "PRO", "PT1", "PTF", "PTP", "RC1", "RC2",
                                                                                                                                           "RIC", "SDN", "SE1", "SEA", "SFN", "SL1", "SL2", "SL3", "SL4",
                                                                                                                                           "SL5", "SLA", "SLF", "SLN", "SLU", "SPU", "SR1", "SR2", "TBA",
                                                                                                                                           "TEX", "TL1", "TL2", "TOR", "TRN", "TRO", "WAS", "WIL", "WOR",
                                                                                                                                           "WS1", "WS2", "WS3", "WS4", "WS5", "WS6", "WS7", "WS8", "WS9",
                                                                                                                                           "WSU"), class = "factor"), W = c(68L, 79L, 83L, 54L, 74L, 48L,
                                                                                                                                                                            74L, 61L, 79L, 69L, 53L, 52L, 52L, 83L, 90L, 76L, 50L, 77L, 74L,
                                                                                                                                                                            69L, 52L, 83L, 78L, 61L, 75L, 73L, 68L, 70L, 48L, 56L, 103L,
                                                                                                                                                                            56L, 91L, 60L, 77L, 65L, 72L, 75L, 65L, 43L), L = c(65L, 57L,
                                                                                                                                                                                                                                53L, 82L, 61L, 89L, 62L, 72L, 57L, 69L, 86L, 87L, 85L, 57L, 49L,
                                                                                                                                                                                                                                64L, 88L, 60L, 60L, 67L, 83L, 53L, 58L, 75L, 63L, 64L, 69L, 70L,
                                                                                                                                                                                                                                88L, 81L, 36L, 78L, 47L, 77L, 63L, 71L, 62L, 60L, 74L, 94L),
               W_Lpct = c(0.511, 0.581, 0.61, 0.397, 0.548, 0.35, 0.544,
                          0.459, 0.581, 0.5, 0.381, 0.374, 0.38, 0.593, 0.647, 0.543,
                          0.362, 0.562, 0.552, 0.507, 0.385, 0.61, 0.574, 0.449, 0.543,
                          0.533, 0.496, 0.5, 0.353, 0.409, 0.741, 0.418, 0.659, 0.438,
                          0.55, 0.478, 0.537, 0.556, 0.468, 0.314), Rank = c(5L, 2L,
                                                                             1L, 7L, 3L, 8L, 4L, 6L, 3L, 5L, 6L, 8L, 7L, 2L, 1L, 4L, 8L,
                                                                             3L, 4L, 5L, 7L, 1L, 2L, 6L, 2L, 3L, 5L, 4L, 8L, 7L, 1L, 6L,
                                                                             1L, 7L, 3L, 5L, 4L, 2L, 6L, 8L), LgWin = c("N", "N", "Y",
                                                                                                                        "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "Y",
                                                                                                                        "N", "N", "N", "N", "N", "N", "Y", "N", "N", "N", "N", "N",
                                                                                                                        "N", "N", "N", "Y", "N", "Y", "N", "N", "N", "N", "N", "N",
                                                                                                                        "N"), WSWin = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
                                                                                                                                        NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
                                                                                                                                        NA, NA, NA, NA, NA, NA, "Y", "N", "N", "N", "N", "N", "N",
                                                                                                                                        "N"), next_year = c(1902, 1902, 1902, 1902, 1902, 1902, 1902,
                                                                                                                                                            1902, 1902, 1902, 1902, 1902, 1902, 1902, 1902, 1902, 1903,
                                                                                                                                                            1903, 1903, 1903, 1903, 1903, 1903, 1903, 1903, 1903, 1903,
                                                                                                                                                            1903, 1903, 1903, 1903, 1903, 1904, 1904, 1904, 1904, 1904,
                                                                                                                                                            1904, 1904, 1904)), row.names = c(NA, 40L), class = "data.frame")

library(tidyverse)
WS_Losers <- WS_Losers |> mutate(next_year = next_year - 3)
merged_df <- inner_join(WS_Losers, Won_Lost_Next, by = c("next_year" = "yearID", "teamID"))
nrow(merged_df)
#> [1] 2

^{Created on 2024-10-14 with reprex v2.1.1}

anotherhoward · October 15, 2024, 2:40pm

I get all the rows in the merged data with the full data sets. But I have a problem. Why is the last column next_year.y appearing in the merged_df output? It is not necessary.

FJCC · October 15, 2024, 11:34pm

You have a next_year column in the Won_Lost_Next data frame. When you join that to WS_Losers, which also has a next_year column, R has to distinguish the to columns, so it appends a .y to the one from Won_Lost_Next. You can drop the column with the select() function from dplyr or you can just avoid calculating it in the first place.

anotherhoward · October 16, 2024, 11:11am

I added the next_year column in the Won_Lost_Next data frame so that I could add the columns in that data frame to those in WS_Losers. If I didn’t put a next_year column in the Won_Lost_Next data frame, is there another way I could have added its columns to WS_Losers without appending the next_year.y column?

FJCC · October 16, 2024, 12:44pm

If you are doing a join, there is no need to have identical columns in the two data frames.

anotherhoward · October 16, 2024, 7:03pm

If there is no need to have identical columns in the two data frames, how would you change the code?

FJCC · October 17, 2024, 1:12am

Well, I don't know your overall goal, so I can't say. What specific problem are you facing with the merged data frame as it is? Is the next_year.y column causing a problem? Are there other changes needed?
You can drop the next_year.y column with

library(dplyr)
merged_df <- merged_df |> select(-next_year.y)

anotherhoward · October 18, 2024, 4:50pm

My questions have been answered.

R-Guru · October 19, 2024, 4:59pm

You may want to consider creating a new variable using the lag() function to store the yearID value for record 1 in record 2, ex. yearID_next = lag(yearID) and then you can merge the two data frames by yearID variables. This method helps to assure matching key values between the two data frames. Below is an example. Hope this helps.

mtcars2 <- mutate(subset(mtcars, cyl== 6, select=c(mpg, cyl, disp)), prev_mpg = lag( mpg , 1 ))
mtcars2

merged_df <- inner_join(mtcars, mtcars2, by = c("mpg" = "prev_mpg"))
merged_df

anotherhoward · October 19, 2024, 6:33pm

Could you please insert your changes into the complete code. Thanks

system · January 17, 2025, 6:34pm

This topic was automatically closed 90 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.