Weighted tables in dplyr?

Slavek · March 22, 2024, 10:13pm

Hi,
I have this simple df

source <- data.frame(
           serial = c(100001,100002,100008,100010,
                      100012,100013,100015,100017,100018,100019,100021,
                      100024,100026,100029,100033,100035,100036,100037,
                      100038),
             educ = c(3, 2, 3, 3, 3, 1, 2, 3, 2, 3, 3, 3, 1, 1, 3, 3, 3, 3, 3),
            grads = c(1, 2, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1),
           ethnic = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
              wts = c(0.80453,1.61199,1.07317,
                      0.37436,0.50043,1.40756,1.86962,0.84807,0.94143,0.32848,
                      1.58918,0.30688,3.03342,0.973,0.42845,0.35284,1.07317,
                      0.94018,1.04452),
  q5_1_Positivity = c(1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0),
  q6_1_Positivity = c(1, 1, 0, 0, 1, 0, 1, 1, 1, NA, 1, 0, 1, 1, 1, NA, 1, 1, 0),
  q6_2_Positivity = c(1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0),
  q7_1_Positivity = c(1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0),
  q7_2_Positivity = c(1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0)
)

I would like to run following tables:

library(dplyr)
Rec.Pos <- source %>%
  filter(!is.na(q5_1_Positivity)) %>%
  mutate(q5_1=as.character(q5_1_Positivity)) %>%
  group_by(q5_1_Positivity) %>%
  summarise(cnt = n()) %>%
  mutate(freq = round(cnt / sum(cnt), 3))
Rec.Pos

Posivity.educ <- source %>%
  mutate(educ=as.character(educ)) %>%
  group_by(educ) %>%
  summarise_at(.vars = vars(ends_with(match = "Positivity")),.funs = list(Proportion = ~mean(.,na.rm=TRUE),Count = ~sum(!is.na(.))))
Posivity.educ

but with the weight (wts) applied. Can we do that in R?

Also, two additional questions:

Is it possible to amend the second table to group not only by one variable (educ) but by educ, grads and ethic on a side (so 3 tables in one, one on top of another)
Is it possible to see only one column with a count instead of 5 if they have identical values?

dromano · March 23, 2024, 9:07pm

Could you say a little more? It's not clear to me what you mean. And would you mind clarifying your two additional questions, too, maybe with toy before and after tables that illustrate what you mean?

Slavek · March 24, 2024, 12:04am

Well, you can use weighted data in SPSS, DisplayR or any other statistical software to adjust sample proportions to proportions in a population. Can we do that in R?

The secibd part of my question is improving my output so making the second table similar to tables you can do in other software si fir example having mire than oe variable on the left or displaying a smakker table with a total count instead if counts of each variable .

Slavek · March 24, 2024, 12:05am

dromano · March 24, 2024, 12:13pm

Does the stackoverflow link you posted answer your question, or would you still like help?

dromano · March 24, 2024, 3:43pm

As to your two example tables, here's how I would achieve what it seems you'd like to achieve:

# Reconstruction of Rec.Pos using count()
source |> 
  count(educ, q5_1_Positivity) 
#>   educ q5_1_Positivity n
#> 1    1               0 1
#> 2    1               1 2
#> 3    2               1 3
#> 4    3               0 6
#> 5    3               1 7

source %>%
  count(educ, q5_1_Positivity) |> 
  group_by(educ) |> 
  summarise(Count = sum(n), Proportion = sum(q5_1_Positivity * n) / Count) |> 
  mutate(across(where(is.numeric), ~ round(., 3)))
#> # A tibble: 3 × 3
#>    educ Count Proportion
#>   <dbl> <dbl>      <dbl>
#> 1     1     3      0.667
#> 2     2     3      1    
#> 3     3    13      0.538

# Repeating, but with weights
source |> 
  count(educ, q5_1_Positivity, wt = wts)
#>   educ q5_1_Positivity       n
#> 1    1               0 1.40756
#> 2    1               1 4.00642
#> 3    2               1 4.42304
#> 4    3               0 3.48025
#> 5    3               1 6.18401

source |> 
  count(educ, q5_1_Positivity, wt = wts) |> 
  group_by(educ) |> 
  summarise(Count = sum(n), Proportion = sum(q5_1_Positivity * n) / Count) |> 
  mutate(across(where(is.numeric), ~ round(., 3)))
#> # A tibble: 3 × 3
#>    educ Count Proportion
#>   <dbl> <dbl>      <dbl>
#> 1     1  5.41       0.74
#> 2     2  4.42       1   
#> 3     3  9.66       0.64

# Reconstruction of Posivity.educ
source |> 
  select(educ, q5_1_Positivity, q6_1_Positivity, wts) |> 
  mutate(ones = 1) |> 
  group_by(educ) |> 
  summarise(
    across(
      contains("Pos"), 
      list(
        Count = ~ sum((!is.na(.)) * ones),
        Proportion = ~ sum(. * ones, na.rm = T) / sum((!is.na(.)) * ones)
      )
    )
  ) |> 
  mutate(across(where(is.numeric), ~ round(., 3)))
#> # A tibble: 3 × 5
#>    educ q5_1_Positivity_Count q5_1_Positivity_Proportion q6_1_Positivity_Count
#>   <dbl>                 <dbl>                      <dbl>                 <dbl>
#> 1     1                     3                      0.667                     3
#> 2     2                     3                      1                         3
#> 3     3                    13                      0.538                    11
#> # ℹ 1 more variable: q6_1_Positivity_Proportion <dbl>

# Repeating, but with weights
source |> 
  select(educ, q5_1_Positivity, q6_1_Positivity, wts) |>
  group_by(educ) |> 
  summarise(
    across(
      contains("Pos"), 
      list(
        Count = ~ sum((!is.na(.)) * wts),
        Proportion = ~ sum(. * wts, na.rm = T) / sum((!is.na(.)) * wts)
      )
    )
  ) |> 
  mutate(across(where(is.numeric), ~ round(., 3)))
#> # A tibble: 3 × 5
#>    educ q5_1_Positivity_Count q5_1_Positivity_Proportion q6_1_Positivity_Count
#>   <dbl>                 <dbl>                      <dbl>                 <dbl>
#> 1     1                  5.41                       0.74                  5.41
#> 2     2                  4.42                       1                     4.42
#> 3     3                  9.66                       0.64                  8.98
#> # ℹ 1 more variable: q6_1_Positivity_Proportion <dbl>

^{Created on 2024-03-24 with reprex v2.0.2}

Slavek · March 25, 2024, 1:11pm

Thank you, is this way around the only way of applying weights?
Is it possible to add them to my codes? I am asking as:

proportions from the first table are missing
I don't know how to apply weights to my second table with means

nirgrahamuk · March 25, 2024, 2:14pm


  library(dplyr)
Rec.Pos <- source %>%
  filter(!is.na(q5_1_Positivity)) %>%
  mutate(q5_1=as.character(q5_1_Positivity)) %>%
  group_by(q5_1_Positivity) %>%
  summarise(cnt = sum(wts)) %>% # n() -> sum(wts)
  mutate(freq = round(cnt / sum(cnt), 3))
Rec.Pos

Posivity.educ <- source %>%
  mutate(educ=as.character(educ)) %>%
  group_by(educ) %>%
  summarise_at(.vars = vars(ends_with(match = "Positivity")),
                # mean -> weighted.mean 
               .funs = list(Proportion = ~weighted.mean(x=.,w=wts,na.rm=TRUE),
                            # sum(x) -> sum(wts*x)
                            Count = ~sum(wts*!is.na(.))))
Posivity.educ

I added comments at the 3 locations edited to account for weights

dromano · March 25, 2024, 3:16pm

I'm not familiar with the use of weighting as you described, so my code applies weights from scratch, but in retrospect comment-free, which is not very helpful. @nirgrahamuk's solution preserves your code by using the weighted.mean() function, which I wasn't aware of, and is likely closer to what you were hoping for.

I'm not sure what you mean, since I reproduced your first table (although, I used the column name Proportion instead of freq).

Since I think it might be useful for folks who find this topic, I might include a quick treatment weights — as I understand them — in a separate reply below.

dromano · March 25, 2024, 7:21pm

Here is the quick treatment of weights I mentioned earlier, to make clear the relationships with frequencies, means, and weighted means.

library(tidyverse)

# Create vector of weights
set.seed(1)
weights <- runif(10)
# weight should sum to 1
weights <- weights / sum(weights)

# Create toy table of binary and integer values, along with weights
tibble(
  binary = sample(0:1, 10, replace = T),
  integer = sample(0:9, 10, replace = T),
  wt = weights
) -> survey

survey
#> # A tibble: 10 × 3
#>    binary integer     wt
#>     <int>   <int>  <dbl>
#>  1      0       8 0.0481
#>  2      0       4 0.0675
#>  3      0       4 0.104 
#>  4      0       8 0.165 
#>  5      0       8 0.0366
#>  6      1       4 0.163 
#>  7      1       4 0.171 
#>  8      1       1 0.120 
#>  9      1       9 0.114 
#> 10      0       8 0.0112

# Calculate frequency/proportions by hand
survey |> 
  summarise(
    across(
      binary:integer, 
      list(freq = \(x) sum(x) / n())
    )
  )
#> # A tibble: 1 × 2
#>   binary_freq integer_freq
#>         <dbl>        <dbl>
#> 1         0.4          5.8

# Calculate frequency/proportions with `mean()`
survey |> 
  summarise(across(binary:integer, list(mean = mean)))
#> # A tibble: 1 × 2
#>   binary_mean integer_mean
#>         <dbl>        <dbl>
#> 1         0.4          5.8

# Calculate weighted frequency/proportions by hand
survey |> 
  summarise(
    across(
      binary:integer, 
      list(scaled_mean = \(x) sum(x * wt))
    )
  )
#> # A tibble: 1 × 2
#>   binary_scaled_mean integer_scaled_mean
#>                <dbl>               <dbl>
#> 1              0.568                5.25

# Calculate weighted frequency/proportions with `weighted.mean()`
survey |> 
  summarise(
    across(
      binary:integer, 
      list(wt_mean = \(x) weighted.mean(x, w = wt))
    )
  )
#> # A tibble: 1 × 2
#>   binary_wt_mean integer_wt_mean
#>            <dbl>           <dbl>
#> 1          0.568            5.25

^{Created on 2024-03-25 with reprex v2.0.2}

system · April 17, 2024, 10:51am

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.