Applying summary across multiple column

dromano · July 24, 2024, 12:00pm

Are you interested in per district or per school values of the chronic rate? I ask because your second table includes SchoolName and SchoolCode columns even though there is a single rate for StudentswithDisabilities, which suggests students from the elementary and middle schools are being aggregated to calculate the rate.

For example, here is code that calculates (incorrect) per school values of AbsentRate2 (the only column that allows interesting calculations from the sample table):

original data, saved as `demo`

structure(list(SSID = c(100123123123, 100123123124, 100123123125, 100123123126, 
100123123127, 100123123128), StudentName = c("Last, First Middle", 
"Last, First Middle", "Last, First Middle", "Last, First Middle", 
"Last, First Middle", "Last, First Middle"), SchoolName = c("School Elementary", 
"School Elementary", "School Elementary", "School Middle", 
"School Middle", "School Elementary"), SchoolCode = c(6021111, 
6021111, 6021111, 6021112, 6021112, 6021111), Grade = c("01", 
"02", "KN", "06", "06", "01"), DaysExpectedA = c(180, 180, 180, 
180, 180, 180), ADAGeneratingIndependentStudyDays = c(0, 0, 0, 
0, 0, 0), DaysAttendedB = c(180, 171, 178, 177, 177, 174), DaysAbsentduetoOutOfSchoolSuspensionsC = c(0, 
0, 0, 0, 0, 0), InSchoolSuspensionDaysD = c(0, 0, 0, 0, 0, 0), 
    DaysExcusedAbsencesE = c(0, 9, 0, 3, 3, 6), DaysUnexcusedAbsencesF = c(0, 
    0, 2, 0, 0, 0), NonADAGeneratingIndependentStudyDaysG = c(0, 
    0, 0, 0, 0, 0), DaysAbsentCEFG = c(0, 9, 2, 3, 3, 6), AbsenceRate2 = c(0, 
    5, 1.11111111111111, 1.66666666666667, 1.66666666666667, 
    3.33333333333333), chronic = c(FALSE, FALSE, FALSE, FALSE, 
    FALSE, FALSE), dupes = c(FALSE, FALSE, FALSE, FALSE, FALSE, 
    FALSE), Homeless = c("N", "N", "N", "N", "N", "N"), StudentswithDisabilities = c("Yes", 
    "N", "N", "N", "Yes", "N"), EnglishLearner = c("N", "N", 
    "N", "N", "N", "N"), SocioEconomicallyDisadvantaged = c("Yes", 
    "N", "N", "N", "N", "N"), Hispanic = c("Yes", NA, NA, NA, 
    NA, NA), White = c(NA, "Yes", "Yes", "Yes", NA, "Yes"), Multiple = c(NA, 
    NA, NA, NA, "Yes", NA), Filipino = c(NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_
    ), Asian = c(NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_), Missing = c(NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_), `Am Indian/Alskn Nat` = c(NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_), All = c("Yes", "Yes", "Yes", "Yes", "Yes", 
    "Yes")), class = c("grouped_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame")) -> demo

library(tidyverse)
demo |> 
  pivot_longer(
    # pick out demographics columns based on values; 
    # create new columns with their names and values
    where(\(x) all(x %in% c('Yes', 'N', NA))) 
    ) |> 
  # group by combination of demographic column name, value, and school
  group_by(name, value, SchoolName, SchoolCode) |> 
  # calculate average value of `AbsenceRate2` and total student per group
  summarise(mean = mean(AbsenceRate2), total = n()) |> 
  # retain only values corresponding to actual demographic populations
  filter(value == 'Yes') |> 
  # move calculated columns to front
  relocate(mean, total) |> 
  # undo grouping used to calculate values
  ungroup()
#> # A tibble: 9 × 6
#>    mean total name                           value SchoolName        SchoolCode
#>   <dbl> <int> <chr>                          <chr> <chr>                  <dbl>
#> 1  2.36     4 All                            Yes   School Elementary    6021111
#> 2  1.67     2 All                            Yes   School Middle        6021112
#> 3  0        1 Hispanic                       Yes   School Elementary    6021111
#> 4  1.67     1 Multiple                       Yes   School Middle        6021112
#> 5  0        1 SocioEconomicallyDisadvantaged Yes   School Elementary    6021111
#> 6  0        1 StudentswithDisabilities       Yes   School Elementary    6021111
#> 7  1.67     1 StudentswithDisabilities       Yes   School Middle        6021112
#> 8  3.15     3 White                          Yes   School Elementary    6021111
#> 9  1.67     1 White                          Yes   School Middle        6021112

^{Created on 2024-07-24 with reprex v2.0.2}