Just spotted the primary issue: the dates are treated as strings rather than dates, which is why 7/19/2023 precedes 8/27/2021! The fill()
function was also missing the argument that says which columns to fill.
Here is a corrected solution that uses the lubridate
package to treat dates correctly, and I'll replace the long column names by removing the lower-case letters so it's easier to compare rows, and only use the numeric portion of the unique IDs:
click here, and run hidden code first
library(tidyverse)
original_table <-
structure(list(EvaluationID = c("CY-TR-10058_2023-07-21", "CY-TR-20319_2023-07-22",
"CY-TR-78315_2023-07-21", "CY-TR-28952_2023-07-20", "CY-TR-15126_2023-07-18",
"CY-TR-32532_2023-07-21", "CY-TR-38511_2023-07-19", "CY-TR-22299_2023-07-21",
"CY-TR-1467_2023-07-21", "CY-TR-34500_2023-07-20", "CY-TR-12968_2023-07-19",
"CY-TR-69008_2023-07-18", "CY-TR-17347_2023-07-20", "CY-TR-3831_2023-07-19",
"CY-TR-34930_2023-07-20", "CY-TR-22299_2022-09-02", "CY-TR-3831_2021-08-27",
"CY-TR-28952_2021-08-28", "CY-TR-20319_2021-08-28", "CY-TR-1467_2021-08-29"
), PointID = c("CY-TR-10058", "CY-TR-20319", "CY-TR-78315", "CY-TR-28952",
"CY-TR-15126", "CY-TR-32532", "CY-TR-38511", "CY-TR-22299", "CY-TR-1467",
"CY-TR-34500", "CY-TR-12968", "CY-TR-69008", "CY-TR-17347", "CY-TR-3831",
"CY-TR-34930", "CY-TR-22299", "CY-TR-3831", "CY-TR-28952", "CY-TR-20319",
"CY-TR-1467"), StreamName = c("Gold Creek", "Gold Creek", NA,
"Gold Creek", "Gold Creek", "Gold Creek", "18 pup", "Gold Creek",
"Gold Creek", "Gold Creek", NA, NA, "Gold Creek", "Gold Creek",
"Gold Creek", "Gold Creek", "Gold Creek", "Gold Creek", "Gold Creek",
"Gold Creek"), FieldEvalDate = c("7/21/2023", "8/22/2023", "7/21/2023",
"7/20/2023", "7/18/2023", "7/21/2023", "7/19/2023", "7/21/2023",
"7/21/2023", "7/20/2023", "7/19/2023", "7/18/2023", "7/20/2023",
"7/19/2023", "7/20/2023", "9/2/2022", "8/27/2021", "8/28/2021",
"8/28/2021", "8/29/2021"), PointSelectionType = c("Targeted",
"Targeted", "Targeted", "Targeted", "Targeted", "Targeted", "Targeted",
"Targeted", "Targeted", "Targeted", "Targeted", "Targeted", "Targeted",
"Targeted", "Targeted", "Targeted", "Targeted", "Targeted", "Targeted",
"Targeted"), PctOverheadCover = c(31.8, 2.4, 66.4, 12.6, 9.7,
3.6, 39.8, 3.7, 1.7, 2.5, 66.7, 55.9, 0.1, 0, 1.5, 2.1, 1.3,
10.6, 0, 2.3), PctBankOverheadCover = c(81.8, 14.3, 77, 63.9,
54.5, 32.9, 75.9, 25.4, 26.5, 23, 77.5, 67.9, 2.1, 12.8, 25,
32.6, 19, 41.4, 4.1, 25.4), VegComplexity = c(1, NA, 1.53, NA,
1.14, 0.68, 1.34, NA, NA, 0.91, 1.31, 1.32, 0.24, NA, 1.19, 0.48,
0.96, 0.75, 0.45, 0.68), VegComplexityWoody = c(0.76, NA, 0.71,
NA, 0.96, 0.45, 0.81, NA, NA, 0.68, 0.67, 0.66, 0.14, NA, 0.69,
0.36, 0.44, 0.32, 0.27, 0.3), VegComplexityUnderstoryGround = c(0.59,
NA, 0.43, NA, 0.86, 0.43, 0.79, NA, NA, 0.58, 0.67, 0.63, 0.14,
NA, 0.68, 0.34, 0.43, 0.3, 0.27, 0.29), SpecificConductance = c(723,
NA, 1522, NA, 663, 861, 299.1, NA, NA, 829, 402, 547, 670, NA,
755, 664.6, 670.1, 805.2, 774.9, 832.4), pH = c(8.29, NA, 8.32,
NA, 7.72, 8.16, 7.6, NA, NA, 7.99, 8.3, 8.25, 8.22, NA, 7.94,
8.24, 8.05, 8.11, 8.28, 8.37), InstantTemp = c(9, NA, 8.7, NA,
6.33, 7.46, 5.8, NA, NA, 11.8, 2.5, 6.6, 7.8, NA, 5.86, 6.9,
2.4, 0.4, 3.5, 3.9), TurbidityAvg = c(6.54, NA, 3.46, NA, 37.2,
18.53, 22.6, NA, NA, 19.82, 22.9, 579, 43.47, NA, 9.65, 28.64,
5.75, 15.06, 25.94, 13.77), PoolCount = c(3, NA, 15, NA, 2, 11,
12, NA, NA, 8, 33, 9, 1, NA, 2, 1, 1, 2, 1, 0), PctPools = c(5.57,
NA, 11.6, NA, 5.87, 11.62, 26.67, NA, NA, 18.24, 26.25, 9.37,
1.73, NA, 5.21, 3.42, 3, 2.67, 2.68, 0)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -20L))
Streamline and arrange original table:
original_table |>
# remove all lower-case letters from column names
rename_with(\(x) str_remove_all(x, '[a-z]')) |>
# remove first column (obtained by pasting ID and date)
select(!EID) |>
# remove column PST since only value is "Targeted"
select(!PST) |>
# extract numeric unique ID
mutate(PID = str_extract(PID, '[0-9]+')) |>
# change datatype of date column from chr to Date
mutate(FED = parse_date(FED, format = '%m/%d/%Y')) |>
# arrange rows in reverse-chrnological order
arrange(PID, desc(FED)) |>
print() -> arranged_streamlined_table
#> # A tibble: 20 Ć 14
#> PID SN FED POC PBOC VC VCW VCUG SC H IT TA
#> <chr> <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 10058 Goldā¦ 2023-07-21 31.8 81.8 1 0.76 0.59 723 8.29 9 6.54
#> 2 12968 <NA> 2023-07-19 66.7 77.5 1.31 0.67 0.67 402 8.3 2.5 22.9
#> 3 1467 Goldā¦ 2023-07-21 1.7 26.5 NA NA NA NA NA NA NA
#> 4 1467 Goldā¦ 2021-08-29 2.3 25.4 0.68 0.3 0.29 832. 8.37 3.9 13.8
#> 5 15126 Goldā¦ 2023-07-18 9.7 54.5 1.14 0.96 0.86 663 7.72 6.33 37.2
#> 6 17347 Goldā¦ 2023-07-20 0.1 2.1 0.24 0.14 0.14 670 8.22 7.8 43.5
#> 7 20319 Goldā¦ 2023-08-22 2.4 14.3 NA NA NA NA NA NA NA
#> 8 20319 Goldā¦ 2021-08-28 0 4.1 0.45 0.27 0.27 775. 8.28 3.5 25.9
#> 9 22299 Goldā¦ 2023-07-21 3.7 25.4 NA NA NA NA NA NA NA
#> 10 22299 Goldā¦ 2022-09-02 2.1 32.6 0.48 0.36 0.34 665. 8.24 6.9 28.6
#> 11 28952 Goldā¦ 2023-07-20 12.6 63.9 NA NA NA NA NA NA NA
#> 12 28952 Goldā¦ 2021-08-28 10.6 41.4 0.75 0.32 0.3 805. 8.11 0.4 15.1
#> 13 32532 Goldā¦ 2023-07-21 3.6 32.9 0.68 0.45 0.43 861 8.16 7.46 18.5
#> 14 34500 Goldā¦ 2023-07-20 2.5 23 0.91 0.68 0.58 829 7.99 11.8 19.8
#> 15 34930 Goldā¦ 2023-07-20 1.5 25 1.19 0.69 0.68 755 7.94 5.86 9.65
#> 16 3831 Goldā¦ 2023-07-19 0 12.8 NA NA NA NA NA NA NA
#> 17 3831 Goldā¦ 2021-08-27 1.3 19 0.96 0.44 0.43 670. 8.05 2.4 5.75
#> 18 38511 18 pā¦ 2023-07-19 39.8 75.9 1.34 0.81 0.79 299. 7.6 5.8 22.6
#> 19 69008 <NA> 2023-07-18 55.9 67.9 1.32 0.66 0.63 547 8.25 6.6 579
#> 20 78315 <NA> 2023-07-21 66.4 77 1.53 0.71 0.43 1522 8.32 8.7 3.46
#> # ā¹ 2 more variables: PC <dbl>, PP <dbl>
Break table into subtables, one per unique ID, and fill in NAs will most recent non-NA value:
arranged_streamlined_table |>
# group table by unique ID
group_by(PID) |>
# fill "up", i.e., change NA to immediately preceding non-NA value
fill(everything(), .direction = "up") |>
print() -> filled_grouped_table
#> # A tibble: 20 Ć 14
#> # Groups: PID [15]
#> PID SN FED POC PBOC VC VCW VCUG SC H IT TA
#> <chr> <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 10058 Goldā¦ 2023-07-21 31.8 81.8 1 0.76 0.59 723 8.29 9 6.54
#> 2 12968 <NA> 2023-07-19 66.7 77.5 1.31 0.67 0.67 402 8.3 2.5 22.9
#> 3 1467 Goldā¦ 2023-07-21 1.7 26.5 0.68 0.3 0.29 832. 8.37 3.9 13.8
#> 4 1467 Goldā¦ 2021-08-29 2.3 25.4 0.68 0.3 0.29 832. 8.37 3.9 13.8
#> 5 15126 Goldā¦ 2023-07-18 9.7 54.5 1.14 0.96 0.86 663 7.72 6.33 37.2
#> 6 17347 Goldā¦ 2023-07-20 0.1 2.1 0.24 0.14 0.14 670 8.22 7.8 43.5
#> 7 20319 Goldā¦ 2023-08-22 2.4 14.3 0.45 0.27 0.27 775. 8.28 3.5 25.9
#> 8 20319 Goldā¦ 2021-08-28 0 4.1 0.45 0.27 0.27 775. 8.28 3.5 25.9
#> 9 22299 Goldā¦ 2023-07-21 3.7 25.4 0.48 0.36 0.34 665. 8.24 6.9 28.6
#> 10 22299 Goldā¦ 2022-09-02 2.1 32.6 0.48 0.36 0.34 665. 8.24 6.9 28.6
#> 11 28952 Goldā¦ 2023-07-20 12.6 63.9 0.75 0.32 0.3 805. 8.11 0.4 15.1
#> 12 28952 Goldā¦ 2021-08-28 10.6 41.4 0.75 0.32 0.3 805. 8.11 0.4 15.1
#> 13 32532 Goldā¦ 2023-07-21 3.6 32.9 0.68 0.45 0.43 861 8.16 7.46 18.5
#> 14 34500 Goldā¦ 2023-07-20 2.5 23 0.91 0.68 0.58 829 7.99 11.8 19.8
#> 15 34930 Goldā¦ 2023-07-20 1.5 25 1.19 0.69 0.68 755 7.94 5.86 9.65
#> 16 3831 Goldā¦ 2023-07-19 0 12.8 0.96 0.44 0.43 670. 8.05 2.4 5.75
#> 17 3831 Goldā¦ 2021-08-27 1.3 19 0.96 0.44 0.43 670. 8.05 2.4 5.75
#> 18 38511 18 pā¦ 2023-07-19 39.8 75.9 1.34 0.81 0.79 299. 7.6 5.8 22.6
#> 19 69008 <NA> 2023-07-18 55.9 67.9 1.32 0.66 0.63 547 8.25 6.6 579
#> 20 78315 <NA> 2023-07-21 66.4 77 1.53 0.71 0.43 1522 8.32 8.7 3.46
#> # ā¹ 2 more variables: PC <dbl>, PP <dbl>
Extract most recent updated row for each unique ID:
filled_grouped_table |>
# extract the first row of each subtable (one per unique ID)
slice(1) |>
# undo grouping by unique ID
ungroup() |>
print() -> final_table
#> # A tibble: 15 Ć 14
#> PID SN FED POC PBOC VC VCW VCUG SC H IT TA
#> <chr> <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 10058 Goldā¦ 2023-07-21 31.8 81.8 1 0.76 0.59 723 8.29 9 6.54
#> 2 12968 <NA> 2023-07-19 66.7 77.5 1.31 0.67 0.67 402 8.3 2.5 22.9
#> 3 1467 Goldā¦ 2023-07-21 1.7 26.5 0.68 0.3 0.29 832. 8.37 3.9 13.8
#> 4 15126 Goldā¦ 2023-07-18 9.7 54.5 1.14 0.96 0.86 663 7.72 6.33 37.2
#> 5 17347 Goldā¦ 2023-07-20 0.1 2.1 0.24 0.14 0.14 670 8.22 7.8 43.5
#> 6 20319 Goldā¦ 2023-08-22 2.4 14.3 0.45 0.27 0.27 775. 8.28 3.5 25.9
#> 7 22299 Goldā¦ 2023-07-21 3.7 25.4 0.48 0.36 0.34 665. 8.24 6.9 28.6
#> 8 28952 Goldā¦ 2023-07-20 12.6 63.9 0.75 0.32 0.3 805. 8.11 0.4 15.1
#> 9 32532 Goldā¦ 2023-07-21 3.6 32.9 0.68 0.45 0.43 861 8.16 7.46 18.5
#> 10 34500 Goldā¦ 2023-07-20 2.5 23 0.91 0.68 0.58 829 7.99 11.8 19.8
#> 11 34930 Goldā¦ 2023-07-20 1.5 25 1.19 0.69 0.68 755 7.94 5.86 9.65
#> 12 3831 Goldā¦ 2023-07-19 0 12.8 0.96 0.44 0.43 670. 8.05 2.4 5.75
#> 13 38511 18 pā¦ 2023-07-19 39.8 75.9 1.34 0.81 0.79 299. 7.6 5.8 22.6
#> 14 69008 <NA> 2023-07-18 55.9 67.9 1.32 0.66 0.63 547 8.25 6.6 579
#> 15 78315 <NA> 2023-07-21 66.4 77 1.53 0.71 0.43 1522 8.32 8.7 3.46
#> # ā¹ 2 more variables: PC <dbl>, PP <dbl>
Created on 2024-02-06 with reprex v2.0.2