# Summarise a column

`````` hstate     Comhpsu  hpsu  hhno  gg08 gg114
<dbl+lbl>    <dbl> <dbl> <dbl> <dbl> <chr>
1 10 [Bihar]     151     1    70     4 "S"
2 10 [Bihar]     151     1    83     9 "S"
3 10 [Bihar]     151     1   221     3 "S"
4 10 [Bihar]     151     1   344     4 "FS"
5 10 [Bihar]     152     2    43     5 "S"
6 10 [Bihar]     152     2    53     3 "C"
7 10 [Bihar]     152     2    55     7 "Y"
8 10 [Bihar]     152     2   136     3 "HN"
9 10 [Bihar]     152     2   386     4 "S"
10 10 [Bihar]     152     2   404     3 "N"
11 10 [Bihar]     152     2   494     4 "N"
12 10 [Bihar]     153     3     8     4 "LS"
13 10 [Bihar]     153     3     9     3 "N"
14 10 [Bihar]     153     3    12     4 "T"
15 10 [Bihar]     153     3    41     3 "S"
16 10 [Bihar]     153     3    95     6 ""
17 10 [Bihar]     153     3   153     3 "S"
18 10 [Bihar]     153     3   202     2 "RS"
19 10 [Bihar]     153     3   219     3 "AB"
20 10 [Bihar]     153     3   402     3 "S"
``````

gg114 contains keys for the reasons for dropping out of school. I want to summarise these reasons; specifically, I want to show how much each reason contributes to dropping out. However, some cases have more than one reason, so I am also not sure what the correct mathematical approach to the summary would be: a simple linear approach, where you compute the percentage of cases that mention a specific reason, or something else, such as allotting weights when there is more than one reason. Note that each letter is a distinct reason, so a string of letters like 'LS' means reason L and reason S. Also, if you can suggest ways to show this graphically, it would be much appreciated. Thank you.

Could I ask you to supply your data in dput() format? It makes things easier.

To supply some sample data, use the dput() function: just run dput(mydata), where mydata is your data. For a large dataset, something like dput(head(mydata, 100)) should supply the data we need. Copy the output and paste it here between
```

```

Here's what I would do:

``````library(ggplot2)
library(dplyr)
library(tidyr)
library(forcats)

# Toy stand-in for the survey extract: 24 rows, one reason letter per row.
# A household number (hhno) that repeats on consecutive rows carries more than
# one reason letter. Values are laid out by Comhpsu group (5, 8 and 11 rows).
df <- data.frame(
  hstate  = rep(10, times = 24),
  Comhpsu = rep(c(151, 152, 153), times = c(5, 8, 11)),
  hpsu    = rep(c(1, 2, 3), times = c(5, 8, 11)),
  hhno    = c(
    70, 83, 221, 344, 344,
    43, 53, 55, 136, 136, 386, 404, 494,
    8, 8, 9, 12, 41, 153, 202, 202, 219, 219, 402
  ),
  gg08    = c(
    4, 9, 3, 4, 4,
    5, 3, 7, 3, 3, 4, 3, 4,
    4, 4, 3, 4, 3, 3, 2, 2, 3, 3, 3
  ),
  gg114   = c(
    "S", "S", "S", "F", "S",
    "S", "C", "Y", "H", "N", "S", "N", "N",
    "L", "S", "N", "T", "S", "S", "R", "S", "A", "B", "S"
  )
)

# Tally how often each reason letter occurs and chart the tallies, most
# frequent reason first.
df |>
  # break every gg114 string into a vector of its single characters
  dplyr::mutate(gg114 = strsplit(gg114, split = "")) |>
  # lengthen the table: one row per character
  tidyr::unnest(cols = gg114) |>
  dplyr::summarise(n = dplyr::n(), .by = gg114) |>
  dplyr::arrange(dplyr::desc(n)) |>
  # fix the sorted order as the factor levels so the bars keep that order
  dplyr::mutate(gg114 = factor(gg114, levels = unique(gg114))) |>
  ggplot2::ggplot(mapping = ggplot2::aes(gg114, n)) +
  ggplot2::geom_col()
``````

Forgive me for the late response; here is what you asked for. Let me know if you need anything else.

``````structure(list(hstate = structure(c(10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10), labels = c(`Uttar Pradesh` = 9,
Bihar = 10), label = "state number", class = c("haven_labelled",
"vctrs_vctr", "double")), hpsu = structure(c(1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 5), label = "psu number", format.stata = "%8.0g"), hhno = structure(c(1,
6, 19, 36, 61, 83, 90, 115, 246, 268, 415, 103, 151, 154, 280,
287, 359, 424, 449, 494, 496, 503, 514, 6, 8, 18, 24, 37, 53,
54, 94, 149, 203, 222, 227, 7, 23, 75, 77, 225, 228, 244, 276,
297, 307, 313, 331, 342, 443, 122), label = "household number", format.stata = "%8.0g"),
gg08 = structure(c(4, 3, 4, 6, 3, 12, 4, 5, 3, 3, 3, 4, 3,
7, 4, 3, 7, 4, 4, 6, 8, 5, 3, 7, 8, 3, 4, 3, 2, 3, 2, 3,
4, 3, 3, 4, 3, 4, 3, 10, 2, 6, 2, 4, 6, 3, 3, 3, 4, 3), label = "line number of respondent", format.stata = "%8.0g"),
gg114 = structure(c("", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", ""), label = "reasons for discontinuing school/never attended school", format.stata = "%23s")), row.names = c(NA,
-50L), class = c("tbl_df", "tbl", "data.frame"))
``````

Fabulous!!!!
Thank you @mduvekot for looking into my problem. Could you please briefly explain what each line of code does? Also, how do I add a legend for each of those codes/keys?

``````library(ggplot2)
library(dplyr)
library(tidyr)
library(forcats)

# Rebuild the poster's 50-row sample exactly as dput() printed it.
# hstate carries value labels (9 = Uttar Pradesh, 10 = Bihar) through the
# "haven_labelled" class; hpsu, hhno, gg08 and gg114 each carry a descriptive
# "label" attribute and a "format.stata" attribute.
# Every gg114 entry in this sample is the empty string "".
df <- structure(list(hstate = structure(c(10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10), labels = c(`Uttar Pradesh` = 9,
Bihar = 10), label = "state number", class = c("haven_labelled",
"vctrs_vctr", "double")), hpsu = structure(c(1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 5), label = "psu number", format.stata = "%8.0g"), hhno = structure(c(1,
6, 19, 36, 61, 83, 90, 115, 246, 268, 415, 103, 151, 154, 280,
287, 359, 424, 449, 494, 496, 503, 514, 6, 8, 18, 24, 37, 53,
54, 94, 149, 203, 222, 227, 7, 23, 75, 77, 225, 228, 244, 276,
297, 307, 313, 331, 342, 443, 122), label = "household number", format.stata = "%8.0g"),
gg08 = structure(c(4, 3, 4, 6, 3, 12, 4, 5, 3, 3, 3, 4, 3,
7, 4, 3, 7, 4, 4, 6, 8, 5, 3, 7, 8, 3, 4, 3, 2, 3, 2, 3,
4, 3, 3, 4, 3, 4, 3, 10, 2, 6, 2, 4, 6, 3, 3, 3, 4, 3), label = "line number of respondent", format.stata = "%8.0g"),
gg114 = structure(c("", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", ""), label = "reasons for discontinuing school/never attended school", format.stata = "%23s")), row.names = c(NA,
-50L), class = c("tbl_df", "tbl", "data.frame"))

# gg114 in the sample holds only empty strings, so overwrite it with 50
# made-up single-letter reason codes (one per row) to have something to plot.
# Note: the accessor must be a plain `$`; a backslash before it (a markdown
# escape) is not valid R and stops the script from parsing.
df$gg114 <- c(
  "S", "S", "S", "F", "S", "S", "C", "Y", "H", "N",
  "S", "N", "N", "L", "S", "N", "T", "S", "S", "R",
  "S", "A", "B", "S", "S", "S", "S", "S", "F", "S",
  "S", "C", "Y", "H", "N", "S", "N", "N", "L", "S",
  "N", "T", "S", "S", "R", "S", "A", "B", "S", "S"
)

# Tally each reason letter and draw a bar chart with one fill colour (and
# therefore one legend entry) per letter.
df |>
  # turn every gg114 string into a vector of its single characters
  dplyr::mutate(gg114 = strsplit(gg114, split = "")) |>
  # expand the list column so that each character gets a row of its own
  tidyr::unnest(cols = gg114) |>
  # number of rows per character
  dplyr::summarise(n = dplyr::n(), .by = gg114) |>
  # largest count first
  dplyr::arrange(dplyr::desc(n)) |>
  # a plain character axis would be drawn alphabetically; a factor whose
  # levels follow the sorted rows keeps the bars in count order
  dplyr::mutate(gg114 = factor(gg114, levels = unique(gg114))) |>
  # mapping fill to gg114 is what makes ggplot draw a legend
  # (unnecessary, and bad practice, but it was requested)
  ggplot2::ggplot(mapping = ggplot2::aes(gg114, n, fill = gg114)) +
  ggplot2::geom_col()
``````