geom_col() both stacked and dodged by different variables have wrong bar totals

I want to use geom_col() with position_dodge() to create a plot that shows both:

  1. Stacked bars for Var1
  2. Dodged bars for Var2

Unfortunately, the resulting dodged bars do not match the total sum of frequencies by Var2.
For example, the following is a simulated dataset with some counts divided by country (Var1), protocol (Var2) and year (X-axis), and I want to have different dodged bars by protocol and stacked blocks by country.

In the plot (reprex below) is clear that the bar totals do not reach the actual totals (horizontal lines with dots)
Also the boxes in the bars are less than they should be (L:2 and M:4 instead of 4 and 5).

What am I missing?

Minimal Reproducible Example

library(ggplot2)
library(dplyr)

# Create sample data
df <- bind_rows(
        data.frame(
            year = rep(2016, 5),
            protocol = rep("M", 5),
            country = c("A", "B", "C", "D", "E"),
            freq = c(100, 50, 30, 40, 11) # sum is 231
        ),
        data.frame(
            year = rep(2016, 4),
            protocol = rep("L", 4),
            country = c("A", "B", "C", "D"),
            freq = c(23, 60, 200, 100) # sum is 383
        )
    )
    
    # Create summary data
    df_sum <- df |>
        summarise(
            label = paste(country, collapse = "\n"),
            freq = sum(freq),
            .by = c(year, protocol)
        )
    
    # Plot showing the issue
    ggplot() +
        geom_col(
            data = df,
            aes(x = factor(year), y = freq,
                fill = protocol),
            position = position_dodge(0.7),
            color = "black",
            width = 0.4
        ) +
        geom_point(
            data = df_sum,
            aes(x = factor(year), y = freq,
                group = protocol),
            position = position_dodge(0.7)
        ) +
        geom_hline(yintercept = c(231, 383))

Environment

R version 4.4.2
ggplot2 3.5.1

Hi,

The issue is that you need to set position to 'stack' to have the columns stacked on top of each other... which prevents you from using position_dodge. You could set the y aesthethics to the interaction of year and protocol, add the stat argument, and remove the position argument... But this may not be what you are trying to achieve aesthetically.

ggplot() +
  geom_col(
    data = df,
    aes(x = interaction(factor(year), protocol), y = freq,
        fill = protocol),
    stat = 'identity',
    color = "black",
    width = 0.4
  ) +
  geom_point(
    data = df_sum,
    aes(x = factor(year), y = freq,
        group = protocol),
    position = position_dodge(0.7)
  ) +
  geom_hline(yintercept = c(231, 383))

Another possibility for you to keep your code, but modify your data. That is you would have to calculate all cumulative frequencies in the right order so it displays correctly in your graphs.

ggplot() +
  geom_col(
    data = df |>
      arrange( year, protocol, freq ) |>
      group_by( year, protocol) |>
      mutate( cumul_freq = cumsum(freq) ) |>
      arrange( year, protocol, desc(cumul_freq) ),
    aes(x = factor(year), y = cumul_freq,
        fill = protocol),
    position = position_dodge(width = 0.7),
    color = "black",
    width = 0.4
  ) +
  geom_point(
    data = df_sum,
    aes(x = factor(year), y = freq,
        group = protocol),
    position = position_dodge(0.7)
  ) +
  geom_hline(yintercept = c(231, 383))

Thank you!

By exploring around I discovered that ggplot simply does not allow this and one needs to use various workarounds if one wants it.

So (with great help of various AI) I developed an ad hoc geom:

GeomStackDodgeCol <- ggproto(
    "GeomStackDodgeCol", GeomRect,
    required_aes = c("x", "y", "fill", "group"),
    default_aes = aes(
        colour = "black",
        linewidth = 0.5,
        linetype = 1,
        alpha = NA
    ),
    
    setup_data = function(data, params) {
        # Reset stacking for each x value and fill group
        data <- data |>
            group_by(x, fill) |>
            mutate(
                ymin = c(0, head(cumsum(y), -1)),
                ymax = cumsum(y)
            ) |>
            ungroup()
        
        # Compute dodging offsets with width and padding
        fill_groups <- unique(data$fill)
        n_groups <- length(fill_groups)
        width <- params$width %||% 0.9     # width of the bars
        padding <- params$padding %||% 0.1  # padding between bars
        
        # Calculate total width needed for the group
        total_width <- n_groups * width + (n_groups - 1) * padding * width
        
        # Calculate positions with proper spacing
        positions <- seq(-total_width/2, total_width/2, length.out = n_groups)
        
        # Create rectangle coordinates
        data$xmin <- data$x + positions[match(data$fill, fill_groups)] - width/2
        data$xmax <- data$x + positions[match(data$fill, fill_groups)] + width/2
        
        data
    },
    
    draw_panel = function(data, panel_params, coord, width = 0.9, ...) {
        coords <- coord$transform(data, panel_params)
        
        grid::rectGrob(
            x = (coords$xmin + coords$xmax)/2,
            y = (coords$ymin + coords$ymax)/2,
            width = coords$xmax - coords$xmin,
            height = coords$ymax - coords$ymin,
            default.units = "native",
            just = c("center", "center"),
            gp = grid::gpar(
                col = coords$colour,
                fill = alpha(coords$fill, coords$alpha),
                lwd = coords$linewidth * .pt,
                lty = coords$linetype
            )
        )
    },
    
    parameters = function(complete = FALSE) {
        c("na.rm", "width", "padding")
    }
)

geom_stackdodge <- function(mapping = NULL, data = NULL,
                            position = "identity", 
                            width = 0.9,
                            padding = 0.1,
                            na.rm = FALSE,
                            show.legend = NA,
                            inherit.aes = TRUE, ...) {
    layer(
        geom = GeomStackDodgeCol,
        mapping = mapping,
        data = data,
        stat = "identity",
        position = position,
        show.legend = show.legend,
        inherit.aes = inherit.aes,
        params = list(
            na.rm = na.rm,
            width = width,
            padding = padding
        )
    )
}

of course testing is mandated.

Here's some testing code:

local({
    df <- bind_rows(
        data.frame(
            year = rep(2016, 5),
            protocol = rep("M", 5),
            country = c("A", "B", "C", "D", "E"),
            freq = c(100, 50, 30, 40, 11) # sum is 231
        ),
        data.frame(
            year = rep(2016, 4),
            protocol = rep("L", 4),
            country = c("A", "B", "C", "D"),
            freq = c(23, 60, 200, 100) # sum is 383
        )
    )
    
    df <- bind_rows(
        df,
        df |> mutate(year = 2017, freq = sample(freq)),
    )
    
    # Create summary data
    df_sum <- df |>
        summarise(
            label = paste(country, collapse = "\n"),
            freq = sum(freq),
            .by = c(year, protocol)
        )
    ggplot() +
        geom_stackdodge(
            data = df,
            aes(x = factor(year), y = freq, group = country,
                fill = protocol),
            width = 0.1, padding = 0.5
        ) +
        geom_hline(yintercept = c(sum(c(100, 50, 30, 40, 11), sum(c(23, 60, 200, 100) )) # To show that the bars sum up to the expected values
})