Data Wrangling in R

Good Afternoon All, I have two dataframe named df1 (432 observation and 6 variable) and df2 (432 observation and 16 variable) , I have two columns in the two dataframe called Plot. I want to retrieve all the rows from df2 based on the plot order in df1 . I tried using inner_join but I am not getting the 432 observation back instead I get 2592 observation. I have attached the sampled dataset below, here I only present the first 20 observation in the two dataframe. Thank you very much

library(tidyverse)


df1 <- structure(
  list(
    Plot = c(1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1,
             16, 2, 17, 2, 17, 2, 17, 2, 17),
    Varieties = c(
      "TME419",
      "TMS581",
      "TME419",
      "TMS581",
      "TME419",
      "TMS581",
      "TME419",
      "TMS581",
      "TME419",
      "TMS581",
      "TME419",
      "TMS581",
      "TME419",
      "TMS581",
      "TME419",
      "TMS581",
      "TME419",
      "TMS581",
      "TME419",
      "TMS581"
    ),
    Position = c(
      "Ridge",
      "Ridge",
      "Ridge",
      "Ridge",
      "Ridge",
      "Ridge",
      "Flat",
      "Flat",
      "Flat",
      "Flat",
      "Flat",
      "Flat",
      "Ridge",
      "Ridge",
      "Ridge",
      "Ridge",
      "Ridge",
      "Ridge",
      "Flat",
      "Flat"
    ),
    Fertilizer = c(
      "0 kg P2O5",
      "0 kg P2O5",
      "0 kg P2O5",
      "0 kg P2O5",
      "0 kg P2O5",
      "0 kg P2O5",
      "0 kg P2O5",
      "0 kg P2O5",
      "0 kg P2O5",
      "0 kg P2O5",
      "0 kg P2O5",
      "0 kg P2O5",
      "37.5 kg P2O5",
      "37.5 kg P2O5",
      "37.5 kg P2O5",
      "37.5 kg P2O5",
      "37.5 kg P2O5",
      "37.5 kg P2O5",
      "37.5 kg P2O5",
      "37.5 kg P2O5"
    ),
    Innoculation = c(
      "No",
      "No",
      "No",
      "No",
      "No",
      "No",
      "No",
      "No",
      "No",
      "No",
      "No",
      "No",
      "No",
      "No",
      "No",
      "No",
      "No",
      "No",
      "No",
      "No"
    ),
    Replication = c(1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)
  ),
  row.names = c(NA,-20L),
  class = c("tbl_df", "tbl", "data.frame")
)


# Second dataframe

df2 <- structure(
  list(
    Time = structure(
      c(
        1689752853,
        1689752865,
        1689752879,
        1689752890,
        1689752899,
        1689752909,
        1689752947,
        1689752960,
        1689752999,
        1689753014,
        1689753027,
        1689753046,
        1689753083,
        1689753094,
        1689753106,
        1689753116,
        1689753129,
        1689753142,
        1689753213,
        1689753227
      ),
      tzone = "UTC",
      class = c("POSIXct",
                "POSIXt")
    ),
    `VWC%` = c(
      9.2,
      16.7,
      9.5,
      17.4,
      13.2,
      17,
      10.5,
      12.4,
      12.1,
      15.4,
      10.7,
      12.7,
      9.2,
      18.3,
      10.6,
      19.6,
      12.2,
      19.4,
      15.8,
      18
    ),
    Period = c(
      2508,
      2778,
      2520,
      2803,
      2653,
      2792,
      2557,
      2625,
      2613,
      2731,
      2563,
      2636,
      2508,
      2838,
      2561,
      2884,
      2619,
      2876,
      2748,
      2826
    ),
    EC = c(
      0,
      0,
      0,
      0.02,
      0.02,
      0,
      0,
      0.02,
      0,
      0,
      0,
      0,
      0,
      0.03,
      0,
      0.02,
      0,
      0.03,
      0.03,
      0.04
    ),
    Temp_Soil = c(
      26.7,
      26.6,
      26.5,
      26.4,
      26.5,
      26.4,
      26.5,
      26.5,
      26.6,
      26.6,
      26.6,
      26.6,
      26.7,
      26.8,
      26.8,
      26.8,
      27,
      27,
      27.3,
      27.4
    ),
    `Temp_Soil(F)` = c(
      80,
      79.9,
      79.7,
      79.6,
      79.7,
      79.6,
      79.7,
      79.7,
      79.9,
      79.8,
      79.8,
      79.9,
      80.1,
      80.2,
      80.3,
      80.3,
      80.5,
      80.6,
      81.1,
      81.3
    ),
    Temp_IR = c(
      "--.-",
      "--.-",
      "--.-",
      "--.-",
      "--.-",
      "--.-",
      "--.-",
      "--.-",
      "--.-",
      "--.-",
      "--.-",
      "--.-",
      "--.-",
      "--.-",
      "--.-",
      "--.-",
      "--.-",
      "--.-",
      "--.-",
      "--.-"
    ),
    `Temp_IR(F)` = c(
      "--.-",
      "--.-",
      "--.-",
      "--.-",
      "--.-",
      "--.-",
      "--.-",
      "--.-",
      "--.-",
      "--.-",
      "--.-",
      "--.-",
      "--.-",
      "--.-",
      "--.-",
      "--.-",
      "--.-",
      "--.-",
      "--.-",
      "--.-"
    ),
    Latitude = c(
      7.487863,
      7.487998,
      7.487807,
      7.487947,
      7.487862,
      7.487838,
      7.487883,
      7.48792,
      7.48798,
      7.487973,
      7.487955,
      7.48785,
      7.488163,
      7.487747,
      7.487967,
      7.488022,
      7.488123,
      7.488077,
      7.487815,
      7.487868
    ),
    Longitude = c(
      3.883433,
      3.88335,
      3.883468,
      3.88375,
      3.883648,
      3.883797,
      3.883428,
      3.883348,
      3.88348,
      3.883722,
      3.883125,
      3.883592,
      3.88315,
      3.883782,
      3.883427,
      3.883475,
      3.883328,
      3.883518,
      3.883197,
      3.88359
    ),
    Satellites = c(6, 5, 6, 5, 4, 4,
                   5, 5, 6, 5, 5, 4, 4, 4, 4, 5, 4, 4, 5, 5),
    Fix = c(1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
    `Rod Length` = c(
      "L",
      "L",
      "L",
      "L",
      "L",
      "L",
      "L",
      "L",
      "L",
      "L",
      "L",
      "L",
      "L",
      "L",
      "L",
      "L",
      "L",
      "L",
      "L",
      "L"
    ),
    `Soil Type` = c(
      "S",
      "S",
      "S",
      "S",
      "S",
      "S",
      "S",
      "S",
      "S",
      "S",
      "S",
      "S",
      "S",
      "S",
      "S",
      "S",
      "S",
      "S",
      "S",
      "S"
    ),
    `VWC Mode` = c(
      "V",
      "V",
      "V",
      "V",
      "V",
      "V",
      "V",
      "V",
      "V",
      "V",
      "V",
      "V",
      "V",
      "V",
      "V",
      "V",
      "V",
      "V",
      "V",
      "V"
    ),
    Plot = c(1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3,
             3, 3, 3, 3, 4, 4)
  ),
  row.names = c(NA,-20L),
  class = c("tbl_df",
            "tbl", "data.frame")
)

It seems the data have six occurrences of each value of plot. You get 2592 rows after the inner join because each row of df1 is joined to all of the rows in df2 that have the same plot value. 2592 = 432 * 6. If you want each row of df1 to match with one row of df2, there must be a way to determine which rows are matched. Is there another column that can be used to match the rows of the data frames?

1 Like

Thank you for your reply @FJCC , there is no other rows that can be used for the matching, in that case what do you suggest I should do in this case. I am expecting to get 432 rows as the final result. If I do this manually it will be very tedious. Please is there a way around this problem ?

what would you do manually that wouldnt result in 2592 rows ?
if you have a principle you can apply manually, then you could instruct the computer to do it programmatically. If this knowledge of what to do is lacking, it wont be possible to do it manually any more so than it could be done by machine ...

I was thinking if I can manually extract the rows out and form a new dataframe. Its seems R is duplicating the plot

how would you know which rows from df2 to not bring into df1 ?

the first row of df1, has Plot value 1
there are 6 such rows with Plot value of 1 in df2, they are the first 6 rows in fact.
if only one of these should be retrieved, which one ? by what rule would one be better than another ? if there is no such rule, its simply not possible, or doing it would be arbitrary and may bias your analysis (whatever that may be)

Thank you very much for your suggestion, I will use inner-join and leave the results that way

@FJCC @nirgrahamuk this is what I was hoping to achieve when I join the two dataframes with inner_join. I have to do this manually. Thank you for your time. Here I only show the output for the first 20 observation. I now have 432 observation in which I was expecting!!!

library(tidyverse)

df_join <- structure(
list(
Time = structure(
c(
1689752853,
1689752865,
1689752879,
1689752890,
1689752899,
1689752909,
1689752947,
1689752960,
1689752999,
1689753014,
1689753027,
1689753046,
1689753083,
1689753094,
1689753106,
1689753116,
1689753129,
1689753142,
1689753213,
1689753227
),
tzone = "UTC",
class = c("POSIXct",
"POSIXt")
),
Position = c(
"Ridge",
"Flat",
"Ridge",
"Flat",
"Ridge",
"Flat",
"Ridge",
"Flat",
"Ridge",
"Flat",
"Ridge",
"Flat",
"Ridge",
"Flat",
"Ridge",
"Flat",
"Ridge",
"Flat",
"Ridge",
"Flat"
),
Plot = c(1,
1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4),
Varieties = c(
"TME419",
"TME419",
"TME419",
"TME419",
"TME419",
"TME419",
"TME419",
"TME419",
"TME419",
"TME419",
"TME419",
"TME419",
"TME419",
"TME419",
"TME419",
"TME419",
"TME419",
"TME419",
"TME419",
"TME419"
),
Fertilizer = c(
"0 kg P2O5",
"0 kg P2O5",
"0 kg P2O5",
"0 kg P2O5",
"0 kg P2O5",
"0 kg P2O5",
"37.5 kg P2O5",
"37.5 kg P2O5",
"37.5 kg P2O5",
"37.5 kg P2O5",
"37.5 kg P2O5",
"37.5 kg P2O5",
"75 kg P2O5",
"75 kg P2O5",
"75 kg P2O5",
"75 kg P2O5",
"75 kg P2O5",
"75 kg P2O5",
"0 kg P2O5",
"0 kg P2O5"
),
Innoculation = c(
"No",
"No",
"No",
"No",
"No",
"No",
"No",
"No",
"No",
"No",
"No",
"No",
"No",
"No",
"No",
"No",
"No",
"No",
"Single",
"Single"
),
Replication = c(1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
VWC% = c(
9.2,
16.7,
9.5,
17.4,
13.2,
17,
10.5,
12.4,
12.1,
15.4,
10.7,
12.7,
9.2,
18.3,
10.6,
19.6,
12.2,
19.4,
15.8,
18
),
Period = c(
2508,
2778,
2520,
2803,
2653,
2792,
2557,
2625,
2613,
2731,
2563,
2636,
2508,
2838,
2561,
2884,
2619,
2876,
2748,
2826
),
EC = c(
0,
0,
0,
0.02,
0.02,
0,
0,
0.02,
0,
0,
0,
0,
0,
0.03,
0,
0.02,
0,
0.03,
0.03,
0.04
),
Temp_Soil = c(
26.7,
26.6,
26.5,
26.4,
26.5,
26.4,
26.5,
26.5,
26.6,
26.6,
26.6,
26.6,
26.7,
26.8,
26.8,
26.8,
27,
27,
27.3,
27.4
),
Temp_Soil(F) = c(
80,
79.9,
79.7,
79.6,
79.7,
79.6,
79.7,
79.7,
79.9,
79.8,
79.8,
79.9,
80.1,
80.2,
80.3,
80.3,
80.5,
80.6,
81.1,
81.3
)
),
row.names = c(NA,-20L),
class = c("tbl_df", "tbl",
"data.frame")
)

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.