Automate variable selection

sgva · July 30, 2024, 10:40pm

I am trying to automate a line of code that calls variables. It is in my "summarise" line of code below


library(tidyverse)


# I have three data frames that I want to use later

mro <- c('19100', '19740')
aff <- c('50','80')
yrs <- c(21,22)

# create example data for this post below
yr_pma <- rep(seq(from=2019, to=2021), 5)

set.seed(123)
p50_y22_19100 <- sample(0:12, 15, replace=TRUE) 
p80_y22_19100 <- sample(0:12, 15, replace=TRUE) 
p50_y21_19100 <- sample(0:12, 15, replace=TRUE) 
p80_y21_19100 <- sample(0:12, 15, replace=TRUE) 

p50_y21_19740 <- sample(0:12, 15, replace=TRUE) 
p80_y21_19740 <- sample(0:12, 15, replace=TRUE) 
p50_y22_19740 <- sample(0:12, 15, replace=TRUE) 
p80_y22_19740 <- sample(0:12, 15, replace=TRUE) 

comb_met <- data.frame(yr_pma, p50_y21_19100, p80_y21_19100, p50_y21_19740, p80_y21_19740, 
                               p50_y22_19100, p80_y22_19100, p50_y22_19740, p80_y22_19740)
comb_met

# create example data for this post above


# This statement is what I want, but I manually type all of the variable names

aff_result <- comb_met %>%
  group_by(yr_pma) %>%
  summarise(across(c(p50_y21_19100, p80_y21_19100, p50_y21_19740, p80_y21_19740,
                     p50_y22_19100, p80_y22_19100, p50_y22_19740, p80_y22_19740), sum, na.rm=TRUE))

aff_result

# ?? How can I create the "summarise()" line of code in an automated way from "mro", "aff", and "yrs" so I
# don't have to manually type them


> aff_result
# A tibble: 3 × 9
  yr_pma p50_y21_19100 p80_y21_19100 p50_y21_19740
   <int>         <int>         <int>         <int>
1   2019            33            29            35
2   2020            40            36            25
3   2021            41            37            19
# ℹ 5 more variables: p80_y21_19740 <int>,
#   p50_y22_19100 <int>, p80_y22_19100 <int>,
#   p50_y22_19740 <int>, p80_y22_19740 <int>
>

startz · July 30, 2024, 10:52pm

Something like

summarise(across(p50_y21_19100: p80_y22_19740, sum, na.rm=TRUE))

sgva · July 30, 2024, 11:46pm

I don't think I was clear. I'm looking to automatically produce a the summarise line of code. I do have this (non-automated) line of code already in the post.

summarise(across(c(p50_y21_19100, p80_y21_19100, p50_y21_19740, p80_y21_19740,
                     p50_y22_19100, p80_y22_19100, p50_y22_19740, p80_y22_19740), sum, na.rm=TRUE))

If you test the code in the post, you will see it produces this result

> aff_result
# A tibble: 3 × 9
  yr_pma p50_y21_19100 p80_y21_19100 p50_y21_19740
   <int>         <int>         <int>         <int>
1   2019            33            29            35
2   2020            40            36            25
3   2021            41            37            19
# ℹ 5 more variables: p80_y21_19740 <int>,
#   p50_y22_19100 <int>, p80_y22_19100 <int>,
#   p50_y22_19740 <int>, p80_y22_19740 <int>
>

The solution I'm shooting for would produce the same result (with dummy data I created in the post) and automate the creation of the line of code, so I can avoid the brute force method of typing the 50/80, 21/22, and '19100' / '19740' (and my actual data has many more allowable values) I'd like to pull them from the variables mro, aff, and yrs, that I defined in the post as this:

I've considered "matches" but haven't solved it. I'm not creating any new variables, just looking to summarise existing ones.

mro <- c('19100', '19740')
aff <- c('50','80')
yrs <- c(21,22)

Thanks
Steve

startz · July 31, 2024, 12:03am

Ah, I thought you wanted all the variables in the data frame. I take it you want variable names put together from the three vectors you named? (That's harder!)

dromano · July 31, 2024, 3:41am

Not sure if this helps, but here's an approach that uses list-columns to create your original data frame, comb_met (just for the purpose of illustrating some tools that help reduce repetition), and then shows how you can combine tidyselect functions with across():

library(tidyverse)
set.seed(123) 
# create table of column-name parts
expand_grid(
  beginning = str_c('p', c(50, 80), sep = ""),
  middle = str_c('y', 21:22, sep = ""),
  end = c(19100, 19740)
)  -> name_parts

name_parts
#> # A tibble: 8 × 3
#>   beginning middle   end
#>   <chr>     <chr>  <dbl>
#> 1 p50       y21    19100
#> 2 p50       y21    19740
#> 3 p50       y22    19100
#> 4 p50       y22    19740
#> 5 p80       y21    19100
#> 6 p80       y21    19740
#> 7 p80       y22    19100
#> 8 p80       y22    19740

name_parts |> 
  # join column-name parts
  unite(name, everything()) -> whole_names

whole_names
#> # A tibble: 8 × 1
#>   name         
#>   <chr>        
#> 1 p50_y21_19100
#> 2 p50_y21_19740
#> 3 p50_y22_19100
#> 4 p50_y22_19740
#> 5 p80_y21_19100
#> 6 p80_y21_19740
#> 7 p80_y22_19100
#> 8 p80_y22_19740

whole_names |> 
  # add random data vectors as list-column elements
  rowwise() |> 
  mutate(value = sample(0:12, 15, replace=TRUE) |> list()) |> 
  # add row corresponding to `yr_pma` column 
  add_row(name = "yr_pma", value  = rep(seq(from=2019, to=2021), 5) |> list()) |> 
  # pivot to move `name` values to column names
  pivot_wider() |> 
  # move `yr_pma` column to front
  relocate(yr_pma) -> comb_met_list_columns

comb_met_list_columns
#> # A tibble: 1 × 9
#>   yr_pma p50_y21_19100 p50_y21_19740 p50_y22_19100 p50_y22_19740 p80_y21_19100
#>   <list> <list>        <list>        <list>        <list>        <list>       
#> 1 <int>  <int [15]>    <int [15]>    <int [15]>    <int [15]>    <int [15]>   
#> # ℹ 3 more variables: p80_y21_19740 <list>, p80_y22_19100 <list>,
#> #   p80_y22_19740 <list>

comb_met_list_columns |> 
  # expand list elements to obtain original df from post (random seed aside)
  unnest(everything()) -> comb_met

comb_met
#> # A tibble: 15 × 9
#>    yr_pma p50_y21_19100 p50_y21_19740 p50_y22_19100 p50_y22_19740 p80_y21_19100
#>     <int>         <int>         <int>         <int>         <int>         <int>
#>  1   2019             2             8             6            10             4
#>  2   2020             2            11             4             5             8
#>  3   2021             9             8            11             8             9
#>  4   2019             1             8             9             1            11
#>  5   2020             5            12            12            12             3
#>  6   2021            10             2             6             4             5
#>  7   2019             4             7             8             7            10
#>  8   2020             3             9             8            11             7
#>  9   2021             5             6             9            12             5
#> 10   2019             8             9             6             1             5
#> 11   2020             9             8            10             0             6
#> 12   2021            10             2            11             8             0
#> 13   2019             4             3             4            10             5
#> 14   2020             2             0             6             8             1
#> 15   2021            10            10             4             5             0
#> # ℹ 3 more variables: p80_y21_19740 <int>, p80_y22_19100 <int>,
#> #   p80_y22_19740 <int>

comb_met |> 
  # group by `yrpma` years
  group_by(yr_pma) |> 
  # apply `sum()` to each column except grouping column
  summarise(across(everything(), \(col) col |> sum(na.rm = T)))
#> # A tibble: 3 × 9
#>   yr_pma p50_y21_19100 p50_y21_19740 p50_y22_19100 p50_y22_19740 p80_y21_19100
#>    <int>         <int>         <int>         <int>         <int>         <int>
#> 1   2019            19            35            33            29            35
#> 2   2020            21            40            40            36            25
#> 3   2021            44            28            41            37            19
#> # ℹ 3 more variables: p80_y21_19740 <int>, p80_y22_19100 <int>,
#> #   p80_y22_19740 <int>

^{Created on 2024-07-30 with reprex v2.0.2}

nirgrahamuk · July 31, 2024, 9:09am

# programmatically set the list of variables 
(to_do <- expand_grid(aff,yrs,mro) |> 
    arrange(yrs,mro) |> 
    mutate(to_do = glue::glue("p{aff}_y{yrs}_{mro}")) |> pull(to_do))

# same as aff_result but we use to_do instead of manually typed entries
aff_result2 <- comb_met |> 
  group_by(yr_pma) |> 
  summarise(across(to_do,sum,na.rm=TRUE))

identical(aff_result,aff_result2)
# TRUE

mduvekot · July 31, 2024, 1:17pm

tidyselect is pretty flexible:

# select all that start with "p"
summarise(across(starts_with("p"), sum, na.rm=TRUE))
# select all except the first column
summarise(across(-1, sum, na.rm=TRUE))
# select all that don't match "yr_pma"
summarise(across(!matches("yr_pma"), sum, na.rm=TRUE))

system · October 29, 2024, 1:17pm

This topic was automatically closed 90 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.