Summarise a column

 hstate     Comhpsu  hpsu  hhno  gg08 gg114
   <dbl+lbl>    <dbl> <dbl> <dbl> <dbl> <chr>
 1 10 [Bihar]     151     1    70     4 "S"  
 2 10 [Bihar]     151     1    83     9 "S"  
 3 10 [Bihar]     151     1   221     3 "S"  
 4 10 [Bihar]     151     1   344     4 "FS" 
 5 10 [Bihar]     152     2    43     5 "S"  
 6 10 [Bihar]     152     2    53     3 "C"  
 7 10 [Bihar]     152     2    55     7 "Y"  
 8 10 [Bihar]     152     2   136     3 "HN" 
 9 10 [Bihar]     152     2   386     4 "S"  
10 10 [Bihar]     152     2   404     3 "N"  
11 10 [Bihar]     152     2   494     4 "N"  
12 10 [Bihar]     153     3     8     4 "LS" 
13 10 [Bihar]     153     3     9     3 "N"  
14 10 [Bihar]     153     3    12     4 "T"  
15 10 [Bihar]     153     3    41     3 "S"  
16 10 [Bihar]     153     3    95     6 ""   
17 10 [Bihar]     153     3   153     3 "S"  
18 10 [Bihar]     153     3   202     2 "RS" 
19 10 [Bihar]     153     3   219     3 "AB" 
20 10 [Bihar]     153     3   402     3 "S"  

gg114 contains keys for the reasons for dropping out of school. I want to summarise these reasons; specifically, I want to show how much each reason contributes to dropping out. However, there are cases with more than one reason for dropping out, so I am also not sure what the correct mathematical approach to the summary would be (whether a sort of linear approach, where you get the percentage for a specific reason, or something else, such as allotting weights where there is more than one reason). Note that each letter is a distinct reason; a string of letters like 'LS' means reason L and reason S. Also, if you can suggest ways to show this graphically, it would be much appreciated. Thank you.

Could I ask you to supply your data in dput() format? It makes things easier.

To supply some sample data, use the dput() function. In the case of a large dataset, something like dput(head(mydata, 100)) should supply the data we need. Otherwise, just do dput(mydata), where mydata is your data. Copy the output and paste it here between the code fences:
```

```

Here's what I would do:

library(ggplot2)
library(dplyr)
library(tidyr)
library(forcats)

# Sample data mirroring the 20 rows printed in the question.
# gg114 keeps the combined reason codes exactly as posted (e.g. "FS" means
# reason F and reason S) as well as the blank answer, so the
# strsplit()/unnest() step that follows actually has multi-letter strings
# to split. Splitting these values gives the same 24 single letters as
# before, so the resulting counts do not change.
df <- data.frame(
  hstate = rep(10, 20),
  Comhpsu = rep(c(151, 152, 153), times = c(4, 7, 9)),
  hpsu = rep(c(1, 2, 3), times = c(4, 7, 9)),
  hhno = c(
    70, 83, 221, 344,
    43, 53, 55, 136, 386, 404, 494,
    8, 9, 12, 41, 95, 153, 202, 219, 402
  ),
  gg08 = c(
    4, 9, 3, 4,
    5, 3, 7, 3, 4, 3, 4,
    4, 3, 4, 3, 6, 3, 2, 3, 3
  ),
  gg114 = c(
    "S", "S", "S", "FS",
    "S", "C", "Y", "HN", "S", "N", "N",
    "LS", "N", "T", "S", "", "S", "RS", "AB", "S"
  )
)

# Break every gg114 string into its single-letter reason codes and give
# each code a row of its own.
reasons_long <- tidyr::unnest(
  dplyr::mutate(df, gg114 = strsplit(gg114, "")),
  gg114
)

# Tally the codes, most frequent first, and freeze that order as the
# factor level order so the bars are drawn in the same sequence.
reason_counts <- reasons_long |>
  dplyr::summarise(n = dplyr::n(), .by = gg114) |>
  dplyr::arrange(dplyr::desc(n)) |>
  dplyr::mutate(gg114 = forcats::fct_inorder(gg114))

# One bar per reason code, bar height = number of mentions.
ggplot(reason_counts, aes(x = gg114, y = n)) +
  geom_col()

Forgive me for the late response; here is what you asked for. Let me know if you need anything else.

structure(list(hstate = structure(c(10, 10, 10, 10, 10, 10, 10, 
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10), labels = c(`Uttar Pradesh` = 9, 
Bihar = 10), label = "state number", class = c("haven_labelled", 
"vctrs_vctr", "double")), hpsu = structure(c(1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 
3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
4, 5), label = "psu number", format.stata = "%8.0g"), hhno = structure(c(1, 
6, 19, 36, 61, 83, 90, 115, 246, 268, 415, 103, 151, 154, 280, 
287, 359, 424, 449, 494, 496, 503, 514, 6, 8, 18, 24, 37, 53, 
54, 94, 149, 203, 222, 227, 7, 23, 75, 77, 225, 228, 244, 276, 
297, 307, 313, 331, 342, 443, 122), label = "household number", format.stata = "%8.0g"), 
    gg08 = structure(c(4, 3, 4, 6, 3, 12, 4, 5, 3, 3, 3, 4, 3, 
    7, 4, 3, 7, 4, 4, 6, 8, 5, 3, 7, 8, 3, 4, 3, 2, 3, 2, 3, 
    4, 3, 3, 4, 3, 4, 3, 10, 2, 6, 2, 4, 6, 3, 3, 3, 4, 3), label = "line number of respondent", format.stata = "%8.0g"), 
    gg114 = structure(c("", "", "", "", "", "", "", "", "", "", 
    "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
    "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
    "", "", "", "", "", "", "", "", "", ""), label = "reasons for discontinuing school/never attended school", format.stata = "%23s")), row.names = c(NA, 
-50L), class = c("tbl_df", "tbl", "data.frame"))

Fabulous!!!!
Thank you @mduvekot for looking into my problem. Could you please briefly explain what each line of code does? Also, how can I add a legend for each of those codes/keys?

library(ggplot2)
library(dplyr)
library(tidyr)
library(forcats)

# The posted data (dput() output), written with rep() where values repeat
# and one attribute per line.
df <- structure(
  list(
    hstate = structure(
      rep(10, 50),
      labels = c(`Uttar Pradesh` = 9, Bihar = 10),
      label = "state number",
      class = c("haven_labelled", "vctrs_vctr", "double")
    ),
    hpsu = structure(
      rep(c(1, 2, 3, 4, 5), times = c(11, 12, 12, 14, 1)),
      label = "psu number",
      format.stata = "%8.0g"
    ),
    hhno = structure(
      c(
        1, 6, 19, 36, 61, 83, 90, 115, 246, 268, 415,
        103, 151, 154, 280, 287, 359, 424, 449, 494, 496, 503, 514,
        6, 8, 18, 24, 37, 53, 54, 94, 149, 203, 222, 227,
        7, 23, 75, 77, 225, 228, 244, 276, 297, 307, 313, 331, 342, 443,
        122
      ),
      label = "household number",
      format.stata = "%8.0g"
    ),
    gg08 = structure(
      c(
        4, 3, 4, 6, 3, 12, 4, 5, 3, 3, 3, 4, 3,
        7, 4, 3, 7, 4, 4, 6, 8, 5, 3, 7, 8, 3, 4, 3, 2, 3, 2, 3,
        4, 3, 3, 4, 3, 4, 3, 10, 2, 6, 2, 4, 6, 3, 3, 3, 4, 3
      ),
      label = "line number of respondent",
      format.stata = "%8.0g"
    ),
    gg114 = structure(
      rep("", 50),
      label = "reasons for discontinuing school/never attended school",
      format.stata = "%23s"
    )
  ),
  row.names = c(NA, -50L),
  class = c("tbl_df", "tbl", "data.frame")
)

# Every gg114 value in the posted sample is an empty string, so replace the
# column with made-up reason codes to have something to summarise.
df$gg114 <- c(
  "S", "S", "S", "F", "S", "S", "C", "Y", "H", "N",
  "S", "N", "N", "L", "S", "N", "T", "S", "S", "R",
  "S", "A", "B", "S", "S", "S", "S", "S", "F", "S",
  "S", "C", "Y", "H", "N", "S", "N", "N", "L", "S",
  "N", "T", "S", "S", "R", "S", "A", "B", "S", "S"
)

# Turn each gg114 string into a vector of single characters, then expand
# that list column so every character gets a row of its own.
reasons_long <- tidyr::unnest(
  dplyr::mutate(df, gg114 = strsplit(gg114, "")),
  gg114
)

reason_counts <- reasons_long |>
  # number of rows (mentions) per reason code
  dplyr::summarise(n = dplyr::n(), .by = gg114) |>
  # most frequent code first
  dplyr::arrange(dplyr::desc(n)) |>
  # ggplot would order a character column alphabetically; a factor whose
  # levels follow the current row order keeps the sorted sequence
  dplyr::mutate(gg114 = forcats::fct_inorder(gg114))

# Mapping fill to gg114 is what makes ggplot draw a legend. The legend
# repeats what the x axis already shows, so it is redundant (and not good
# practice), but it is what was asked for.
ggplot(reason_counts, aes(x = gg114, y = n, fill = gg114)) +
  geom_col()

This topic was automatically closed 90 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.