Hello R community users,
I'm working on calculating the count and percentage of values of three columns grouped by Groupsize. However, I'm encountering difficulties in assigning the correct denominators for percentage calculation. Below is the data and code I'm using. I would greatly appreciate any assistance in resolving this issue. Thank you in advance for your help.
# Load libraries
library(dplyr)
library(kableExtra)
# Define the data
data <- data.frame(
Q2I_Num1 = c(1L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, NA, 1L, 1L),
Q2I_Num2 = c(2L, 0L, NA, 2L, 0L, 2L, 2L, 0L, 2L, NA, 2L),
Q2I_Num3 = c(3L, 3L, 3L, NA, 3L, 0L, 3L, 0L, 3L, NA, 3L),
Q2I_Other = c("Yes, I like it", "Yes", "No", "No, no worry", "No.", " ", "Yes, happy.", " ", "No, not at all.", " ", "No, not at all."),
SPSS = c("No", "No", "Yes", "Yes", "Yes", "No", "No", "Yes", "Yes", " ", "Yes"),
foodStorage = c(1L, 3L, 6L, 10L, 20L, 12L, 100L, NA, 32L, 67L, NA),
Unit = c("sq.ft", "sq.m", "sq.m", "cubic ft", "cubic m", "cubic ft", "", "sq.ft", "", "cubic m", "sq.ft"),
Q4Ea1 = c(1L, 3L, 6L, 10L, 20L, 12L, 100L, NA, 32L, 67L, NA),
Q4Ea2 = c(1L, 2L, 2L, 3L, 4L, 3L, NA, 3L, NA, 4L, 1L),
Q4Ea3 = c(1L, 3L, 6L, 10L, 20L, 12L, 100L, NA, 32L, 67L, NA),
Q4Ea4 = c(1L, 2L, 2L, 3L, 4L, 3L, NA, 3L, NA, 4L, 1L),
Groupsize = c("Low", "Med", "High", "Low", "Med", "High", "", "Med", NA, "Low", "Med")
)
# Select columns for analysis
var_labels <- c(
"1 Number of clients served",
"2 Number of previous clients",
"3 Number of expected clients"
)
columns_to_analyze <- c("Q2I_Num1", "Q2I_Num2", "Q2I_Num3")
# Create a function to calculate frequencies and percentages within each Groupsize group
calculate_freq_perc <- function(data, col, label) {
data %>%
filter(!is.na(Groupsize) & Groupsize != "") %>%
group_by(Groupsize) %>%
summarise(
Frequency = sum((.data[[col]] %in% 1:3) & !is.na(.data[[col]])),
Total_Selected = sum(!is.na(.data[[col]])),
Percentage = round((Frequency / Total_Selected) * 100, digits = 1)
) %>%
mutate(Count_Percent = paste0(Frequency, " (", Percentage, "%)"),
Variable_Label = label,
Groupsize = as.character(Groupsize)) %>%
select(Variable_Label, Groupsize, Count_Percent) %>%
pivot_wider(names_from = Groupsize, values_from = Count_Percent)
}
# Apply the function to each column
result_list <- lapply(columns_to_analyze, function(col) {
calculate_freq_perc(data, col, var_labels[columns_to_analyze == col])
})
# Combine the results into a data frame
result_table <- bind_rows(result_list)
# However, my output table should look like similar to this. # A tibble: 3 × 4
Variable_Label High Low Med
<chr> <chr> <chr> <chr>
1 1 Number of clients served 2 (29%) 3 (42%) 2 (29%)
2 2 Number of previous clients 1 (250%) 2 (50%) 1 (25%)
3 3 Number of expected clients 1 (20%) 1 (20%) 3 (60%)