Just spotted the primary issue: the dates are treated as strings rather than dates, which is why 7/19/2023 precedes 8/27/2021! The fill()
function was also missing the argument that says which columns to fill.
Here is a corrected solution that uses the lubridate
package to treat dates correctly, and I'll replace the long column names by removing the lower-case letters so it's easier to compare rows, and only use the numeric portion of the unique IDs:
click here, and run hidden code first
library(tidyverse)
original_table <-
structure(list(EvaluationID = c("CY-TR-10058_2023-07-21", "CY-TR-20319_2023-07-22",
"CY-TR-78315_2023-07-21", "CY-TR-28952_2023-07-20", "CY-TR-15126_2023-07-18",
"CY-TR-32532_2023-07-21", "CY-TR-38511_2023-07-19", "CY-TR-22299_2023-07-21",
"CY-TR-1467_2023-07-21", "CY-TR-34500_2023-07-20", "CY-TR-12968_2023-07-19",
"CY-TR-69008_2023-07-18", "CY-TR-17347_2023-07-20", "CY-TR-3831_2023-07-19",
"CY-TR-34930_2023-07-20", "CY-TR-22299_2022-09-02", "CY-TR-3831_2021-08-27",
"CY-TR-28952_2021-08-28", "CY-TR-20319_2021-08-28", "CY-TR-1467_2021-08-29"
), PointID = c("CY-TR-10058", "CY-TR-20319", "CY-TR-78315", "CY-TR-28952",
"CY-TR-15126", "CY-TR-32532", "CY-TR-38511", "CY-TR-22299", "CY-TR-1467",
"CY-TR-34500", "CY-TR-12968", "CY-TR-69008", "CY-TR-17347", "CY-TR-3831",
"CY-TR-34930", "CY-TR-22299", "CY-TR-3831", "CY-TR-28952", "CY-TR-20319",
"CY-TR-1467"), StreamName = c("Gold Creek", "Gold Creek", NA,
"Gold Creek", "Gold Creek", "Gold Creek", "18 pup", "Gold Creek",
"Gold Creek", "Gold Creek", NA, NA, "Gold Creek", "Gold Creek",
"Gold Creek", "Gold Creek", "Gold Creek", "Gold Creek", "Gold Creek",
"Gold Creek"), FieldEvalDate = c("7/21/2023", "8/22/2023", "7/21/2023",
"7/20/2023", "7/18/2023", "7/21/2023", "7/19/2023", "7/21/2023",
"7/21/2023", "7/20/2023", "7/19/2023", "7/18/2023", "7/20/2023",
"7/19/2023", "7/20/2023", "9/2/2022", "8/27/2021", "8/28/2021",
"8/28/2021", "8/29/2021"), PointSelectionType = c("Targeted",
"Targeted", "Targeted", "Targeted", "Targeted", "Targeted", "Targeted",
"Targeted", "Targeted", "Targeted", "Targeted", "Targeted", "Targeted",
"Targeted", "Targeted", "Targeted", "Targeted", "Targeted", "Targeted",
"Targeted"), PctOverheadCover = c(31.8, 2.4, 66.4, 12.6, 9.7,
3.6, 39.8, 3.7, 1.7, 2.5, 66.7, 55.9, 0.1, 0, 1.5, 2.1, 1.3,
10.6, 0, 2.3), PctBankOverheadCover = c(81.8, 14.3, 77, 63.9,
54.5, 32.9, 75.9, 25.4, 26.5, 23, 77.5, 67.9, 2.1, 12.8, 25,
32.6, 19, 41.4, 4.1, 25.4), VegComplexity = c(1, NA, 1.53, NA,
1.14, 0.68, 1.34, NA, NA, 0.91, 1.31, 1.32, 0.24, NA, 1.19, 0.48,
0.96, 0.75, 0.45, 0.68), VegComplexityWoody = c(0.76, NA, 0.71,
NA, 0.96, 0.45, 0.81, NA, NA, 0.68, 0.67, 0.66, 0.14, NA, 0.69,
0.36, 0.44, 0.32, 0.27, 0.3), VegComplexityUnderstoryGround = c(0.59,
NA, 0.43, NA, 0.86, 0.43, 0.79, NA, NA, 0.58, 0.67, 0.63, 0.14,
NA, 0.68, 0.34, 0.43, 0.3, 0.27, 0.29), SpecificConductance = c(723,
NA, 1522, NA, 663, 861, 299.1, NA, NA, 829, 402, 547, 670, NA,
755, 664.6, 670.1, 805.2, 774.9, 832.4), pH = c(8.29, NA, 8.32,
NA, 7.72, 8.16, 7.6, NA, NA, 7.99, 8.3, 8.25, 8.22, NA, 7.94,
8.24, 8.05, 8.11, 8.28, 8.37), InstantTemp = c(9, NA, 8.7, NA,
6.33, 7.46, 5.8, NA, NA, 11.8, 2.5, 6.6, 7.8, NA, 5.86, 6.9,
2.4, 0.4, 3.5, 3.9), TurbidityAvg = c(6.54, NA, 3.46, NA, 37.2,
18.53, 22.6, NA, NA, 19.82, 22.9, 579, 43.47, NA, 9.65, 28.64,
5.75, 15.06, 25.94, 13.77), PoolCount = c(3, NA, 15, NA, 2, 11,
12, NA, NA, 8, 33, 9, 1, NA, 2, 1, 1, 2, 1, 0), PctPools = c(5.57,
NA, 11.6, NA, 5.87, 11.62, 26.67, NA, NA, 18.24, 26.25, 9.37,
1.73, NA, 5.21, 3.42, 3, 2.67, 2.68, 0)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -20L))
Streamline and arrange original table:
original_table |>
# remove all lower-case letters from column names
rename_with(\(x) str_remove_all(x, '[a-z]')) |>
# remove first column (obtained by pasting ID and date)
select(!EID) |>
# remove column PST since only value is "Targeted"
select(!PST) |>
# extract numeric unique ID
mutate(PID = str_extract(PID, '[0-9]+')) |>
# change datatype of date column from chr to Date
mutate(FED = parse_date(FED, format = '%m/%d/%Y')) |>
# arrange rows in reverse-chrnological order
arrange(PID, desc(FED)) |>
print() -> arranged_streamlined_table
#> # A tibble: 20 Ć 14
#> PID SN FED POC PBOC VC VCW VCUG SC H IT TA
#> <chr> <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 10058 Gold⦠2023-07-21 31.8 81.8 1 0.76 0.59 723 8.29 9 6.54
#> 2 12968 <NA> 2023-07-19 66.7 77.5 1.31 0.67 0.67 402 8.3 2.5 22.9
#> 3 1467 Gold⦠2023-07-21 1.7 26.5 NA NA NA NA NA NA NA
#> 4 1467 Gold⦠2021-08-29 2.3 25.4 0.68 0.3 0.29 832. 8.37 3.9 13.8
#> 5 15126 Gold⦠2023-07-18 9.7 54.5 1.14 0.96 0.86 663 7.72 6.33 37.2
#> 6 17347 Gold⦠2023-07-20 0.1 2.1 0.24 0.14 0.14 670 8.22 7.8 43.5
#> 7 20319 Gold⦠2023-08-22 2.4 14.3 NA NA NA NA NA NA NA
#> 8 20319 Gold⦠2021-08-28 0 4.1 0.45 0.27 0.27 775. 8.28 3.5 25.9
#> 9 22299 Gold⦠2023-07-21 3.7 25.4 NA NA NA NA NA NA NA
#> 10 22299 Gold⦠2022-09-02 2.1 32.6 0.48 0.36 0.34 665. 8.24 6.9 28.6
#> 11 28952 Gold⦠2023-07-20 12.6 63.9 NA NA NA NA NA NA NA
#> 12 28952 Gold⦠2021-08-28 10.6 41.4 0.75 0.32 0.3 805. 8.11 0.4 15.1
#> 13 32532 Gold⦠2023-07-21 3.6 32.9 0.68 0.45 0.43 861 8.16 7.46 18.5
#> 14 34500 Gold⦠2023-07-20 2.5 23 0.91 0.68 0.58 829 7.99 11.8 19.8
#> 15 34930 Gold⦠2023-07-20 1.5 25 1.19 0.69 0.68 755 7.94 5.86 9.65
#> 16 3831 Gold⦠2023-07-19 0 12.8 NA NA NA NA NA NA NA
#> 17 3831 Gold⦠2021-08-27 1.3 19 0.96 0.44 0.43 670. 8.05 2.4 5.75
#> 18 38511 18 p⦠2023-07-19 39.8 75.9 1.34 0.81 0.79 299. 7.6 5.8 22.6
#> 19 69008 <NA> 2023-07-18 55.9 67.9 1.32 0.66 0.63 547 8.25 6.6 579
#> 20 78315 <NA> 2023-07-21 66.4 77 1.53 0.71 0.43 1522 8.32 8.7 3.46
#> # ā¹ 2 more variables: PC <dbl>, PP <dbl>
Break table into subtables, one per unique ID, and fill in NAs will most recent non-NA value:
arranged_streamlined_table |>
# group table by unique ID
group_by(PID) |>
# fill "up", i.e., change NA to immediately preceding non-NA value
fill(everything(), .direction = "up") |>
print() -> filled_grouped_table
#> # A tibble: 20 Ć 14
#> # Groups: PID [15]
#> PID SN FED POC PBOC VC VCW VCUG SC H IT TA
#> <chr> <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 10058 Gold⦠2023-07-21 31.8 81.8 1 0.76 0.59 723 8.29 9 6.54
#> 2 12968 <NA> 2023-07-19 66.7 77.5 1.31 0.67 0.67 402 8.3 2.5 22.9
#> 3 1467 Gold⦠2023-07-21 1.7 26.5 0.68 0.3 0.29 832. 8.37 3.9 13.8
#> 4 1467 Gold⦠2021-08-29 2.3 25.4 0.68 0.3 0.29 832. 8.37 3.9 13.8
#> 5 15126 Gold⦠2023-07-18 9.7 54.5 1.14 0.96 0.86 663 7.72 6.33 37.2
#> 6 17347 Gold⦠2023-07-20 0.1 2.1 0.24 0.14 0.14 670 8.22 7.8 43.5
#> 7 20319 Gold⦠2023-08-22 2.4 14.3 0.45 0.27 0.27 775. 8.28 3.5 25.9
#> 8 20319 Gold⦠2021-08-28 0 4.1 0.45 0.27 0.27 775. 8.28 3.5 25.9
#> 9 22299 Gold⦠2023-07-21 3.7 25.4 0.48 0.36 0.34 665. 8.24 6.9 28.6
#> 10 22299 Gold⦠2022-09-02 2.1 32.6 0.48 0.36 0.34 665. 8.24 6.9 28.6
#> 11 28952 Gold⦠2023-07-20 12.6 63.9 0.75 0.32 0.3 805. 8.11 0.4 15.1
#> 12 28952 Gold⦠2021-08-28 10.6 41.4 0.75 0.32 0.3 805. 8.11 0.4 15.1
#> 13 32532 Gold⦠2023-07-21 3.6 32.9 0.68 0.45 0.43 861 8.16 7.46 18.5
#> 14 34500 Gold⦠2023-07-20 2.5 23 0.91 0.68 0.58 829 7.99 11.8 19.8
#> 15 34930 Gold⦠2023-07-20 1.5 25 1.19 0.69 0.68 755 7.94 5.86 9.65
#> 16 3831 Gold⦠2023-07-19 0 12.8 0.96 0.44 0.43 670. 8.05 2.4 5.75
#> 17 3831 Gold⦠2021-08-27 1.3 19 0.96 0.44 0.43 670. 8.05 2.4 5.75
#> 18 38511 18 p⦠2023-07-19 39.8 75.9 1.34 0.81 0.79 299. 7.6 5.8 22.6
#> 19 69008 <NA> 2023-07-18 55.9 67.9 1.32 0.66 0.63 547 8.25 6.6 579
#> 20 78315 <NA> 2023-07-21 66.4 77 1.53 0.71 0.43 1522 8.32 8.7 3.46
#> # ā¹ 2 more variables: PC <dbl>, PP <dbl>
Extract most recent updated row for each unique ID:
filled_grouped_table |>
# extract the first row of each subtable (one per unique ID)
slice(1) |>
# undo grouping by unique ID
ungroup() |>
print() -> final_table
#> # A tibble: 15 Ć 14
#> PID SN FED POC PBOC VC VCW VCUG SC H IT TA
#> <chr> <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 10058 Gold⦠2023-07-21 31.8 81.8 1 0.76 0.59 723 8.29 9 6.54
#> 2 12968 <NA> 2023-07-19 66.7 77.5 1.31 0.67 0.67 402 8.3 2.5 22.9
#> 3 1467 Gold⦠2023-07-21 1.7 26.5 0.68 0.3 0.29 832. 8.37 3.9 13.8
#> 4 15126 Gold⦠2023-07-18 9.7 54.5 1.14 0.96 0.86 663 7.72 6.33 37.2
#> 5 17347 Gold⦠2023-07-20 0.1 2.1 0.24 0.14 0.14 670 8.22 7.8 43.5
#> 6 20319 Gold⦠2023-08-22 2.4 14.3 0.45 0.27 0.27 775. 8.28 3.5 25.9
#> 7 22299 Gold⦠2023-07-21 3.7 25.4 0.48 0.36 0.34 665. 8.24 6.9 28.6
#> 8 28952 Gold⦠2023-07-20 12.6 63.9 0.75 0.32 0.3 805. 8.11 0.4 15.1
#> 9 32532 Gold⦠2023-07-21 3.6 32.9 0.68 0.45 0.43 861 8.16 7.46 18.5
#> 10 34500 Gold⦠2023-07-20 2.5 23 0.91 0.68 0.58 829 7.99 11.8 19.8
#> 11 34930 Gold⦠2023-07-20 1.5 25 1.19 0.69 0.68 755 7.94 5.86 9.65
#> 12 3831 Gold⦠2023-07-19 0 12.8 0.96 0.44 0.43 670. 8.05 2.4 5.75
#> 13 38511 18 p⦠2023-07-19 39.8 75.9 1.34 0.81 0.79 299. 7.6 5.8 22.6
#> 14 69008 <NA> 2023-07-18 55.9 67.9 1.32 0.66 0.63 547 8.25 6.6 579
#> 15 78315 <NA> 2023-07-21 66.4 77 1.53 0.71 0.43 1522 8.32 8.7 3.46
#> # ā¹ 2 more variables: PC <dbl>, PP <dbl>
Created on 2024-02-06 with reprex v2.0.2