Problem combining two datasets in R

I am having a problem merging two datasets. Each is extracting data from the same file.

Here is the output from the first part of the code. It is for one season.

Here is the code:

library("Lahman")
WS_Losers <- Teams %>%
  filter(yearID >= 1901 & lgID %in% c("AL","NL")) |>
  filter(LgWin == "Y" & WSWin == "N") %>%
  mutate(next_year = yearID + 1) %>%
  select(yearID, teamID, LgWin, WSWin, W, L, Rank, next_year)

Here is the output from the second part of the code. The data I am interested in is for the same team but one season later.

Here is my code:

  Won_Lost_Next <- Teams %>%
  filter(yearID >= 1901 & lgID %in% c("AL", "NL")) %>%
  mutate(W_Lpct = round(W / (W + L), 3)) %>%
  select(yearID, teamID, W, L, W_Lpct, Rank, LgWin, WSWin, next_year)

Here is the rest of the code to merge the two datasets:

merged_df <- inner_join(WS_Losers, Won_Lost_Next, by = c("next_year" = "yearID", "teamID"))
write.csv(merged_df, "merged_file.csv", row.names = FALSE)

Here are the output of each dataset.

dput(head(WS_Losers, 10))
structure(list(yearID = c(1903L, 1905L, 1906L, 1907L, 1908L,
1909L, 1910L, 1911L, 1912L, 1913L), teamID = structure(c(106L,
101L, 35L, 52L, 52L, 52L, 35L, 89L, 89L, 89L), levels = c("ALT",
"ANA", "ARI", "ATL", "BAL", "BFN", "BFP", "BL1", "BL2", "BL3",
"BL4", "BLA", "BLF", "BLN", "BLU", "BOS", "BR1", "BR2", "BR3",
"BR4", "BRF", "BRO", "BRP", "BS1", "BS2", "BSN", "BSP", "BSU",
"BUF", "CAL", "CH1", "CH2", "CHA", "CHF", "CHN", "CHP", "CHU",
"CIN", "CL1", "CL2", "CL3", "CL4", "CL5", "CL6", "CLE", "CLP",
"CN1", "CN2", "CN3", "CNU", "COL", "DET", "DTN", "ELI", "FLO",
"FW1", "HAR", "HOU", "HR1", "IN1", "IN2", "IN3", "IND", "KC1",
"KC2", "KCA", "KCF", "KCN", "KCU", "KEO", "LAA", "LAN", "LS1",
"LS2", "LS3", "MIA", "MID", "MIL", "MIN", "ML1", "ML2", "ML3",
"ML4", "MLA", "MLU", "MON", "NEW", "NH1", "NY1", "NY2", "NY3",
"NY4", "NYA", "NYN", "NYP", "OAK", "PH1", "PH2", "PH3", "PH4",
"PHA", "PHI", "PHN", "PHP", "PHU", "PIT", "PRO", "PT1", "PTF",
"PTP", "RC1", "RC2", "RIC", "SDN", "SE1", "SEA", "SFN", "SL1",
"SL2", "SL3", "SL4", "SL5", "SLA", "SLF", "SLN", "SLU", "SPU",
"SR1", "SR2", "TBA", "TEX", "TL1", "TL2", "TOR", "TRN", "TRO",
"WAS", "WIL", "WOR", "WS1", "WS2", "WS3", "WS4", "WS5", "WS6",
"WS7", "WS8", "WS9", "WSU"), class = "factor"), LgWin = c("Y",
"Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y"), WSWin = c("N",
"N", "N", "N", "N", "N", "N", "N", "N", "N"), W = c(91L, 92L,
116L, 92L, 90L, 98L, 104L, 99L, 103L, 101L), L = c(49L, 56L,
36L, 58L, 63L, 54L, 50L, 54L, 48L, 51L), Rank = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), next_year = c(1904, 1906, 1907,
1908, 1909, 1910, 1911, 1912, 1913, 1914)), row.names = c(NA,
10L), class = "data.frame")

dput(head(Won_Lost_Next, 40))
structure(list(yearID = c(1901L, 1901L, 1901L, 1901L, 1901L,
1901L, 1901L, 1901L, 1901L, 1901L, 1901L, 1901L, 1901L, 1901L,
1901L, 1901L, 1902L, 1902L, 1902L, 1902L, 1902L, 1902L, 1902L,
1902L, 1902L, 1902L, 1902L, 1902L, 1902L, 1902L, 1902L, 1902L,
1903L, 1903L, 1903L, 1903L, 1903L, 1903L, 1903L, 1903L), teamID = structure(c(12L,
16L, 33L, 45L, 52L, 84L, 101L, 140L, 22L, 26L, 35L, 38L, 89L,
102L, 106L, 125L, 12L, 16L, 33L, 45L, 52L, 101L, 123L, 140L,
22L, 26L, 35L, 38L, 89L, 102L, 106L, 125L, 16L, 33L, 45L, 52L,
93L, 101L, 123L, 140L), levels = c("ALT", "ANA", "ARI", "ATL",
"BAL", "BFN", "BFP", "BL1", "BL2", "BL3", "BL4", "BLA", "BLF",
"BLN", "BLU", "BOS", "BR1", "BR2", "BR3", "BR4", "BRF", "BRO",
"BRP", "BS1", "BS2", "BSN", "BSP", "BSU", "BUF", "CAL", "CH1",
"CH2", "CHA", "CHF", "CHN", "CHP", "CHU", "CIN", "CL1", "CL2",
"CL3", "CL4", "CL5", "CL6", "CLE", "CLP", "CN1", "CN2", "CN3",
"CNU", "COL", "DET", "DTN", "ELI", "FLO", "FW1", "HAR", "HOU",
"HR1", "IN1", "IN2", "IN3", "IND", "KC1", "KC2", "KCA", "KCF",
"KCN", "KCU", "KEO", "LAA", "LAN", "LS1", "LS2", "LS3", "MIA",
"MID", "MIL", "MIN", "ML1", "ML2", "ML3", "ML4", "MLA", "MLU",
"MON", "NEW", "NH1", "NY1", "NY2", "NY3", "NY4", "NYA", "NYN",
"NYP", "OAK", "PH1", "PH2", "PH3", "PH4", "PHA", "PHI", "PHN",
"PHP", "PHU", "PIT", "PRO", "PT1", "PTF", "PTP", "RC1", "RC2",
"RIC", "SDN", "SE1", "SEA", "SFN", "SL1", "SL2", "SL3", "SL4",
"SL5", "SLA", "SLF", "SLN", "SLU", "SPU", "SR1", "SR2", "TBA",
"TEX", "TL1", "TL2", "TOR", "TRN", "TRO", "WAS", "WIL", "WOR",
"WS1", "WS2", "WS3", "WS4", "WS5", "WS6", "WS7", "WS8", "WS9",
"WSU"), class = "factor"), W = c(68L, 79L, 83L, 54L, 74L, 48L,
74L, 61L, 79L, 69L, 53L, 52L, 52L, 83L, 90L, 76L, 50L, 77L, 74L,
69L, 52L, 83L, 78L, 61L, 75L, 73L, 68L, 70L, 48L, 56L, 103L,
56L, 91L, 60L, 77L, 65L, 72L, 75L, 65L, 43L), L = c(65L, 57L,
53L, 82L, 61L, 89L, 62L, 72L, 57L, 69L, 86L, 87L, 85L, 57L, 49L,
64L, 88L, 60L, 60L, 67L, 83L, 53L, 58L, 75L, 63L, 64L, 69L, 70L,
88L, 81L, 36L, 78L, 47L, 77L, 63L, 71L, 62L, 60L, 74L, 94L),
W_Lpct = c(0.511, 0.581, 0.61, 0.397, 0.548, 0.35, 0.544,
0.459, 0.581, 0.5, 0.381, 0.374, 0.38, 0.593, 0.647, 0.543,
0.362, 0.562, 0.552, 0.507, 0.385, 0.61, 0.574, 0.449, 0.543,
0.533, 0.496, 0.5, 0.353, 0.409, 0.741, 0.418, 0.659, 0.438,
0.55, 0.478, 0.537, 0.556, 0.468, 0.314), Rank = c(5L, 2L,
1L, 7L, 3L, 8L, 4L, 6L, 3L, 5L, 6L, 8L, 7L, 2L, 1L, 4L, 8L,
3L, 4L, 5L, 7L, 1L, 2L, 6L, 2L, 3L, 5L, 4L, 8L, 7L, 1L, 6L,
1L, 7L, 3L, 5L, 4L, 2L, 6L, 8L), LgWin = c("N", "N", "Y",
"N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "Y",
"N", "N", "N", "N", "N", "N", "Y", "N", "N", "N", "N", "N",
"N", "N", "N", "Y", "N", "Y", "N", "N", "N", "N", "N", "N",
"N"), WSWin = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, "Y", "N", "N", "N", "N", "N", "N",
"N"), next_year = c(1902, 1902, 1902, 1902, 1902, 1902, 1902,
1902, 1902, 1902, 1902, 1902, 1902, 1902, 1902, 1902, 1903,
1903, 1903, 1903, 1903, 1903, 1903, 1903, 1903, 1903, 1903,
1903, 1903, 1903, 1903, 1903, 1904, 1904, 1904, 1904, 1904,
1904, 1904, 1904)), row.names = c(NA, 40L), class = "data.frame")

What I want to do is to merge what is in the second image with what is in the first image with the corresponding rows aligning. One problem is that in the second image, the next_year column should be one year higher than the yearID,as it is in the first image. Further, the data that is added should be the data for the same team but one year later, so for the 1901 Boston team, the data should be that for the same team but in the 1902 season. I am comparing how a team that played in the World Series in one season performed during the following season.

I would appreciate help in getting this code to produce the desired result.
Howard

Your code for Won_Lost_Next does not include the definition of next_year.

Please post the output

dput(head(WS_Losers, 10))
dput(head(Won_Lost_Next, 40))

That will allow others to recreate a subset of your data.

next_year refers to the next baseball season. It is found by adding one to the yearID variable.

I don't see any problem with your code.
When I asked for 40 rows of Won_Lost_Next, I failed to notice that data set starts in 1901, so 40 rows only retrieves data up to 1903. Since WS_Losers starts in 1903, there were no matches between its next_year column and the yearID of Won_Lost_Next. I worked around that by subtracting three years from next_year of WS_Losers. Having done that, I get a successful merge. Do you not get any rows in the merged data with the full data sets?

WS_Losers <- structure(list(yearID = c(1903L, 1905L, 1906L, 1907L, 1908L,
                          1909L, 1910L, 1911L, 1912L, 1913L), teamID = structure(c(106L,
                                                                                   101L, 35L, 52L, 52L, 52L, 35L, 89L, 89L, 89L), levels = c("ALT",
                                                                                                                                             "ANA", "ARI", "ATL", "BAL", "BFN", "BFP", "BL1", "BL2", "BL3",
                                                                                                                                             "BL4", "BLA", "BLF", "BLN", "BLU", "BOS", "BR1", "BR2", "BR3",
                                                                                                                                             "BR4", "BRF", "BRO", "BRP", "BS1", "BS2", "BSN", "BSP", "BSU",
                                                                                                                                             "BUF", "CAL", "CH1", "CH2", "CHA", "CHF", "CHN", "CHP", "CHU",
                                                                                                                                             "CIN", "CL1", "CL2", "CL3", "CL4", "CL5", "CL6", "CLE", "CLP",
                                                                                                                                             "CN1", "CN2", "CN3", "CNU", "COL", "DET", "DTN", "ELI", "FLO",
                                                                                                                                             "FW1", "HAR", "HOU", "HR1", "IN1", "IN2", "IN3", "IND", "KC1",
                                                                                                                                             "KC2", "KCA", "KCF", "KCN", "KCU", "KEO", "LAA", "LAN", "LS1",
                                                                                                                                             "LS2", "LS3", "MIA", "MID", "MIL", "MIN", "ML1", "ML2", "ML3",
                                                                                                                                             "ML4", "MLA", "MLU", "MON", "NEW", "NH1", "NY1", "NY2", "NY3",
                                                                                                                                             "NY4", "NYA", "NYN", "NYP", "OAK", "PH1", "PH2", "PH3", "PH4",
                                                                                                                                             "PHA", "PHI", "PHN", "PHP", "PHU", "PIT", "PRO", "PT1", "PTF",
                                                                                                                                             "PTP", "RC1", "RC2", "RIC", "SDN", "SE1", "SEA", "SFN", "SL1",
                                                                                                                                             "SL2", "SL3", "SL4", "SL5", "SLA", "SLF", "SLN", "SLU", "SPU",
                                                                                                                                             "SR1", "SR2", "TBA", "TEX", "TL1", "TL2", "TOR", "TRN", "TRO",
                                                                                                                                             "WAS", "WIL", "WOR", "WS1", "WS2", "WS3", "WS4", "WS5", "WS6",
                                                                                                                                             "WS7", "WS8", "WS9", "WSU"), class = "factor"), LgWin = c("Y",
                                                                                                                                                                                                       "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y"), WSWin = c("N",
                                                                                                                                                                                                                                                               "N", "N", "N", "N", "N", "N", "N", "N", "N"), W = c(91L, 92L,
                                                                                                                                                                                                                                                                                                                   116L, 92L, 90L, 98L, 104L, 99L, 103L, 101L), L = c(49L, 56L,
                                                                                                                                                                                                                                                                                                                                                                      36L, 58L, 63L, 54L, 50L, 54L, 48L, 51L), Rank = c(1L, 1L, 1L,
                                                                                                                                                                                                                                                                                                                                                                                                                        1L, 1L, 1L, 1L, 1L, 1L, 1L), next_year = c(1904, 1906, 1907,
                                                                                                                                                                                                                                                                                                                                                                                                                                                                   1908, 1909, 1910, 1911, 1912, 1913, 1914)), row.names = c(NA,
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             10L), class = "data.frame")

Won_Lost_Next <- structure(list(yearID = c(1901L, 1901L, 1901L, 1901L, 1901L,
                          1901L, 1901L, 1901L, 1901L, 1901L, 1901L, 1901L, 1901L, 1901L,
                          1901L, 1901L, 1902L, 1902L, 1902L, 1902L, 1902L, 1902L, 1902L,
                          1902L, 1902L, 1902L, 1902L, 1902L, 1902L, 1902L, 1902L, 1902L,
                          1903L, 1903L, 1903L, 1903L, 1903L, 1903L, 1903L, 1903L), teamID = structure(c(12L,
                                                                                                        16L, 33L, 45L, 52L, 84L, 101L, 140L, 22L, 26L, 35L, 38L, 89L,
                                                                                                        102L, 106L, 125L, 12L, 16L, 33L, 45L, 52L, 101L, 123L, 140L,
                                                                                                        22L, 26L, 35L, 38L, 89L, 102L, 106L, 125L, 16L, 33L, 45L, 52L,
                                                                                                        93L, 101L, 123L, 140L), levels = c("ALT", "ANA", "ARI", "ATL",
                                                                                                                                           "BAL", "BFN", "BFP", "BL1", "BL2", "BL3", "BL4", "BLA", "BLF",
                                                                                                                                           "BLN", "BLU", "BOS", "BR1", "BR2", "BR3", "BR4", "BRF", "BRO",
                                                                                                                                           "BRP", "BS1", "BS2", "BSN", "BSP", "BSU", "BUF", "CAL", "CH1",
                                                                                                                                           "CH2", "CHA", "CHF", "CHN", "CHP", "CHU", "CIN", "CL1", "CL2",
                                                                                                                                           "CL3", "CL4", "CL5", "CL6", "CLE", "CLP", "CN1", "CN2", "CN3",
                                                                                                                                           "CNU", "COL", "DET", "DTN", "ELI", "FLO", "FW1", "HAR", "HOU",
                                                                                                                                           "HR1", "IN1", "IN2", "IN3", "IND", "KC1", "KC2", "KCA", "KCF",
                                                                                                                                           "KCN", "KCU", "KEO", "LAA", "LAN", "LS1", "LS2", "LS3", "MIA",
                                                                                                                                           "MID", "MIL", "MIN", "ML1", "ML2", "ML3", "ML4", "MLA", "MLU",
                                                                                                                                           "MON", "NEW", "NH1", "NY1", "NY2", "NY3", "NY4", "NYA", "NYN",
                                                                                                                                           "NYP", "OAK", "PH1", "PH2", "PH3", "PH4", "PHA", "PHI", "PHN",
                                                                                                                                           "PHP", "PHU", "PIT", "PRO", "PT1", "PTF", "PTP", "RC1", "RC2",
                                                                                                                                           "RIC", "SDN", "SE1", "SEA", "SFN", "SL1", "SL2", "SL3", "SL4",
                                                                                                                                           "SL5", "SLA", "SLF", "SLN", "SLU", "SPU", "SR1", "SR2", "TBA",
                                                                                                                                           "TEX", "TL1", "TL2", "TOR", "TRN", "TRO", "WAS", "WIL", "WOR",
                                                                                                                                           "WS1", "WS2", "WS3", "WS4", "WS5", "WS6", "WS7", "WS8", "WS9",
                                                                                                                                           "WSU"), class = "factor"), W = c(68L, 79L, 83L, 54L, 74L, 48L,
                                                                                                                                                                            74L, 61L, 79L, 69L, 53L, 52L, 52L, 83L, 90L, 76L, 50L, 77L, 74L,
                                                                                                                                                                            69L, 52L, 83L, 78L, 61L, 75L, 73L, 68L, 70L, 48L, 56L, 103L,
                                                                                                                                                                            56L, 91L, 60L, 77L, 65L, 72L, 75L, 65L, 43L), L = c(65L, 57L,
                                                                                                                                                                                                                                53L, 82L, 61L, 89L, 62L, 72L, 57L, 69L, 86L, 87L, 85L, 57L, 49L,
                                                                                                                                                                                                                                64L, 88L, 60L, 60L, 67L, 83L, 53L, 58L, 75L, 63L, 64L, 69L, 70L,
                                                                                                                                                                                                                                88L, 81L, 36L, 78L, 47L, 77L, 63L, 71L, 62L, 60L, 74L, 94L),
               W_Lpct = c(0.511, 0.581, 0.61, 0.397, 0.548, 0.35, 0.544,
                          0.459, 0.581, 0.5, 0.381, 0.374, 0.38, 0.593, 0.647, 0.543,
                          0.362, 0.562, 0.552, 0.507, 0.385, 0.61, 0.574, 0.449, 0.543,
                          0.533, 0.496, 0.5, 0.353, 0.409, 0.741, 0.418, 0.659, 0.438,
                          0.55, 0.478, 0.537, 0.556, 0.468, 0.314), Rank = c(5L, 2L,
                                                                             1L, 7L, 3L, 8L, 4L, 6L, 3L, 5L, 6L, 8L, 7L, 2L, 1L, 4L, 8L,
                                                                             3L, 4L, 5L, 7L, 1L, 2L, 6L, 2L, 3L, 5L, 4L, 8L, 7L, 1L, 6L,
                                                                             1L, 7L, 3L, 5L, 4L, 2L, 6L, 8L), LgWin = c("N", "N", "Y",
                                                                                                                        "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "Y",
                                                                                                                        "N", "N", "N", "N", "N", "N", "Y", "N", "N", "N", "N", "N",
                                                                                                                        "N", "N", "N", "Y", "N", "Y", "N", "N", "N", "N", "N", "N",
                                                                                                                        "N"), WSWin = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
                                                                                                                                        NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
                                                                                                                                        NA, NA, NA, NA, NA, NA, "Y", "N", "N", "N", "N", "N", "N",
                                                                                                                                        "N"), next_year = c(1902, 1902, 1902, 1902, 1902, 1902, 1902,
                                                                                                                                                            1902, 1902, 1902, 1902, 1902, 1902, 1902, 1902, 1902, 1903,
                                                                                                                                                            1903, 1903, 1903, 1903, 1903, 1903, 1903, 1903, 1903, 1903,
                                                                                                                                                            1903, 1903, 1903, 1903, 1903, 1904, 1904, 1904, 1904, 1904,
                                                                                                                                                            1904, 1904, 1904)), row.names = c(NA, 40L), class = "data.frame")

library(tidyverse)
WS_Losers <- WS_Losers |> mutate(next_year = next_year - 3)
merged_df <- inner_join(WS_Losers, Won_Lost_Next, by = c("next_year" = "yearID", "teamID"))
nrow(merged_df)
#> [1] 2

Created on 2024-10-14 with reprex v2.1.1

I get all the rows in the merged data with the full data sets. But I have a problem. Why is the last column next_year.y appearing in the merged_df output? It is not necessary.

You have a next_year column in the Won_Lost_Next data frame. When you join that to WS_Losers, which also has a next_year column, R has to distinguish the to columns, so it appends a .y to the one from Won_Lost_Next. You can drop the column with the select() function from dplyr or you can just avoid calculating it in the first place.

I added the next_year column in the Won_Lost_Next data frame so that I could add the columns in that data frame to those in WS_Losers. If I didn’t put a next_year column in the Won_Lost_Next data frame, is there another way I could have added its columns to WS_Losers without appending the next_year.y column?

If you are doing a join, there is no need to have identical columns in the two data frames.

If there is no need to have identical columns in the two data frames, how would you change the code?

Well, I don't know your overall goal, so I can't say. What specific problem are you facing with the merged data frame as it is? Is the next_year.y column causing a problem? Are there other changes needed?
You can drop the next_year.y column with

library(dplyr)
merged_df <- merged_df |> select(-next_year.y)

My questions have been answered.

You may want to consider creating a new variable using the lag() function to store the yearID value for record 1 in record 2, ex. yearID_next = lag(yearID) and then you can merge the two data frames by yearID variables. This method helps to assure matching key values between the two data frames. Below is an example. Hope this helps.

mtcars2 <- mutate(subset(mtcars, cyl== 6, select=c(mpg, cyl, disp)), prev_mpg = lag( mpg , 1 ))
mtcars2

merged_df <- inner_join(mtcars, mtcars2, by = c("mpg" = "prev_mpg"))
merged_df

Could you please insert your changes into the complete code. Thanks

This topic was automatically closed 90 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.