I'm trying to build a boxplot from survey data (n ≈ 1000). The problem is that R seems to render the box inaccurately: the median line and most of the quartile (IQR) lines appear to snap to the nearest whole integer. In the example below you can see how far they sit from the group means, which I've added as points using stat_summary() on the same data. I've never run into this behaviour before, so I'm wondering whether something is wrong with the underlying data. Ordinarily I'd share a small, tidy slice of data, but in this case I'm sharing a wider sample in case there's something in it that is obvious to someone else looking at it. Apologies that it is a bit messy.
Here's the plot as it is rendering for me:
The code below should be reproducible:
library(tidyverse)
library(haven)
# Sample survey data, pasted as a dput()-style structure() literal.
# 342 rows (row.names = c(NA, -342L)) and three columns:
#   Q58_bin - factor with levels "high (n=154)", "low (n=46)", "medium (n=142)".
#             The levels are stored in that (alphabetical) order, not
#             low < medium < high.
#   Q4      - haven_labelled double holding whole-number responses; its value
#             labels are 1 = "Not at all" ... 5 = "A great deal", plus
#             99 = "Don't know".
#   n       - integer count stored on every row (46, 142 or 154).
# The object carries class "grouped_df" with a `groups` attribute keyed on
# Q58_bin, i.e. it is already grouped by Q58_bin.
values <-
structure(list(Q58_bin = structure(c(3L, 1L, 1L, 3L, 3L, 1L,
1L, 2L, 1L, 3L, 2L, 2L, 1L, 2L, 1L, 3L, 2L, 3L, 1L, 1L, 2L, 1L,
2L, 3L, 1L, 3L, 1L, 1L, 1L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 3L,
1L, 3L, 3L, 2L, 3L, 3L, 2L, 1L, 3L, 2L, 2L, 3L, 3L, 3L, 3L, 3L,
2L, 2L, 1L, 1L, 2L, 1L, 2L, 3L, 1L, 3L, 3L, 1L, 3L, 3L, 3L, 1L,
1L, 1L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 3L, 3L, 3L, 1L, 3L, 2L,
3L, 3L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 2L, 1L, 1L, 3L, 1L, 3L, 1L,
3L, 2L, 2L, 1L, 3L, 3L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 3L, 3L,
1L, 1L, 1L, 1L, 3L, 2L, 1L, 1L, 3L, 2L, 1L, 1L, 3L, 3L, 1L, 3L,
1L, 1L, 3L, 3L, 3L, 3L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 2L, 3L, 3L, 1L, 2L, 1L, 1L, 1L,
1L, 3L, 2L, 1L, 1L, 3L, 2L, 3L, 1L, 3L, 1L, 2L, 1L, 1L, 1L, 1L,
3L, 3L, 2L, 2L, 1L, 3L, 3L, 1L, 1L, 1L, 3L, 1L, 3L, 3L, 1L, 3L,
3L, 2L, 3L, 3L, 1L, 1L, 3L, 3L, 1L, 3L, 1L, 2L, 3L, 3L, 3L, 3L,
3L, 1L, 3L, 1L, 3L, 3L, 3L, 2L, 1L, 3L, 1L, 1L, 3L, 1L, 1L, 1L,
1L, 1L, 3L, 1L, 3L, 1L, 3L, 3L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L,
3L, 2L, 1L, 3L, 1L, 1L, 3L, 1L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 1L,
2L, 3L, 3L, 2L, 3L, 3L, 1L, 3L, 3L, 3L, 2L, 1L, 3L, 3L, 1L, 1L,
1L, 1L, 1L, 1L, 3L, 3L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 1L, 3L, 1L, 1L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 1L,
1L, 3L, 3L, 2L, 1L, 3L, 2L, 3L, 3L, 1L, 3L, 1L, 1L, 1L, 1L, 2L
), levels = c("high (n=154)", "low (n=46)", "medium (n=142)"), class = "factor"),
Q4 = structure(c(3, 4, 4, 5, 5, 3, 5, 1, 1, 3, 5, 1, 1, 5,
5, 5, 4, 5, 3, 5, 4, 4, 5, 2, 5, 4, 4, 1, 5, 4, 4, 4, 3,
4, 3, 4, 5, 5, 4, 4, 2, 3, 3, 4, 3, 5, 5, 4, 4, 3, 5, 1,
5, 3, 3, 3, 3, 3, 4, 4, 5, 5, 5, 4, 2, 4, 2, 3, 2, 3, 3,
4, 3, 2, 4, 4, 3, 3, 2, 5, 4, 4, 5, 2, 5, 3, 2, 2, 5, 3,
3, 2, 2, 4, 4, 3, 2, 4, 4, 4, 4, 5, 3, 4, 4, 1, 2, 5, 4,
3, 1, 5, 1, 4, 5, 3, 4, 2, 3, 4, 4, 4, 4, 5, 3, 2, 4, 2,
4, 5, 3, 3, 3, 4, 5, 5, 3, 5, 4, 4, 1, 1, 3, 3, 3, 1, 3,
1, 3, 5, 4, 3, 2, 1, 5, 1, 3, 2, 2, 3, 3, 1, 3, 2, 5, 4,
3, 2, 3, 4, 4, 5, 2, 4, 5, 2, 3, 4, 5, 3, 5, 4, 4, 2, 4,
4, 4, 1, 5, 2, 2, 4, 3, 5, 4, 3, 5, 5, 4, 5, 5, 2, 4, 2,
5, 5, 3, 3, 5, 3, 3, 3, 3, 4, 5, 1, 4, 4, 3, 4, 5, 2, 5,
5, 2, 2, 3, 4, 3, 3, 5, 4, 3, 4, 2, 4, 5, 3, 2, 3, 2, 5,
4, 2, 3, 1, 5, 4, 3, 5, 3, 3, 3, 4, 3, 3, 3, 3, 3, 4, 2,
4, 4, 4, 2, 4, 2, 3, 4, 4, 4, 3, 3, 4, 5, 4, 5, 2, 5, 5,
2, 3, 5, 3, 4, 4, 3, 2, 5, 3, 3, 2, 5, 3, 4, 3, 5, 3, 3,
5, 3, 1, 4, 4, 2, 5, 3, 3, 2, 4, 5, 3, 4, 4, 5, 1, 5, 5,
2, 5, 4, 5, 3, 4, 4, 3, 3, 3, 1, 3, 5, 5, 4, 2, 3, 3, 4,
3, 3, 3, 4, 2), label = "How much have you thought about climate change before today?", format.spss = "F40.0", display_width = 5L, labels = c("Not at all" = 1,
"A little" = 2, Some = 3, "A lot" = 4, "A great deal" = 5,
"Don't know" = 99), class = c("haven_labelled", "vctrs_vctr",
"double")), n = c(142L, 154L, 154L, 142L, 142L, 154L, 154L,
46L, 154L, 142L, 46L, 46L, 154L, 46L, 154L, 142L, 46L, 142L,
154L, 154L, 46L, 154L, 46L, 142L, 154L, 142L, 154L, 154L,
154L, 142L, 142L, 142L, 154L, 154L, 154L, 154L, 154L, 142L,
154L, 142L, 142L, 46L, 142L, 142L, 46L, 154L, 142L, 46L,
46L, 142L, 142L, 142L, 142L, 142L, 46L, 46L, 154L, 154L,
46L, 154L, 46L, 142L, 154L, 142L, 142L, 154L, 142L, 142L,
142L, 154L, 154L, 154L, 46L, 142L, 142L, 142L, 142L, 154L,
154L, 154L, 142L, 142L, 142L, 154L, 142L, 46L, 142L, 142L,
154L, 154L, 142L, 142L, 142L, 142L, 142L, 46L, 154L, 154L,
142L, 154L, 142L, 154L, 142L, 46L, 46L, 154L, 142L, 142L,
46L, 154L, 46L, 154L, 46L, 154L, 154L, 46L, 142L, 142L, 154L,
154L, 154L, 154L, 142L, 46L, 154L, 154L, 142L, 46L, 154L,
154L, 142L, 142L, 154L, 142L, 154L, 154L, 142L, 142L, 142L,
142L, 46L, 46L, 154L, 154L, 154L, 154L, 154L, 142L, 142L,
154L, 154L, 154L, 154L, 154L, 154L, 154L, 142L, 142L, 46L,
142L, 142L, 154L, 46L, 154L, 154L, 154L, 154L, 142L, 46L,
154L, 154L, 142L, 46L, 142L, 154L, 142L, 154L, 46L, 154L,
154L, 154L, 154L, 142L, 142L, 46L, 46L, 154L, 142L, 142L,
154L, 154L, 154L, 142L, 154L, 142L, 142L, 154L, 142L, 142L,
46L, 142L, 142L, 154L, 154L, 142L, 142L, 154L, 142L, 154L,
46L, 142L, 142L, 142L, 142L, 142L, 154L, 142L, 154L, 142L,
142L, 142L, 46L, 154L, 142L, 154L, 154L, 142L, 154L, 154L,
154L, 154L, 154L, 142L, 154L, 142L, 154L, 142L, 142L, 154L,
154L, 154L, 154L, 142L, 142L, 142L, 142L, 142L, 46L, 154L,
142L, 154L, 154L, 142L, 154L, 142L, 154L, 142L, 142L, 142L,
142L, 142L, 154L, 46L, 142L, 142L, 46L, 142L, 142L, 154L,
142L, 142L, 142L, 46L, 154L, 142L, 142L, 154L, 154L, 154L,
154L, 154L, 154L, 142L, 142L, 154L, 154L, 142L, 154L, 154L,
154L, 154L, 154L, 154L, 154L, 154L, 154L, 154L, 154L, 154L,
154L, 142L, 142L, 142L, 154L, 142L, 154L, 154L, 142L, 142L,
142L, 142L, 142L, 142L, 142L, 142L, 46L, 154L, 154L, 154L,
154L, 154L, 154L, 154L, 154L, 142L, 154L, 154L, 142L, 142L,
46L, 154L, 142L, 46L, 142L, 142L, 154L, 142L, 154L, 154L,
154L, 154L, 46L)), class = c("grouped_df", "tbl_df", "tbl",
"data.frame"), row.names = c(NA, -342L), groups = structure(list(
Q58_bin = structure(1:3, levels = c("high (n=154)", "low (n=46)",
"medium (n=142)"), class = "factor"), .rows = structure(list(
c(2L, 3L, 6L, 7L, 9L, 13L, 15L, 19L, 20L, 22L, 25L, 27L,
28L, 29L, 33L, 34L, 35L, 36L, 37L, 39L, 46L, 57L, 58L,
60L, 63L, 66L, 70L, 71L, 72L, 78L, 79L, 80L, 84L, 89L,
90L, 97L, 98L, 100L, 102L, 106L, 110L, 112L, 114L, 115L,
119L, 120L, 121L, 122L, 125L, 126L, 129L, 130L, 133L,
135L, 136L, 143L, 144L, 145L, 146L, 147L, 150L, 151L,
152L, 153L, 154L, 155L, 156L, 162L, 164L, 165L, 166L,
167L, 170L, 171L, 175L, 177L, 179L, 180L, 181L, 182L,
187L, 190L, 191L, 192L, 194L, 197L, 203L, 204L, 207L,
209L, 216L, 218L, 223L, 225L, 226L, 228L, 229L, 230L,
231L, 232L, 234L, 236L, 239L, 240L, 241L, 242L, 249L,
251L, 252L, 254L, 256L, 262L, 269L, 274L, 277L, 278L,
279L, 280L, 281L, 282L, 285L, 286L, 288L, 289L, 290L,
291L, 292L, 293L, 294L, 295L, 296L, 297L, 298L, 299L,
300L, 304L, 306L, 307L, 317L, 318L, 319L, 320L, 321L,
322L, 323L, 324L, 326L, 327L, 331L, 336L, 338L, 339L,
340L, 341L), c(8L, 11L, 12L, 14L, 17L, 21L, 23L, 42L,
45L, 48L, 49L, 55L, 56L, 59L, 61L, 73L, 86L, 96L, 104L,
105L, 109L, 111L, 113L, 116L, 124L, 128L, 141L, 142L,
159L, 163L, 169L, 173L, 178L, 185L, 186L, 200L, 210L,
222L, 248L, 263L, 266L, 273L, 316L, 330L, 333L, 342L),
c(1L, 4L, 5L, 10L, 16L, 18L, 24L, 26L, 30L, 31L, 32L,
38L, 40L, 41L, 43L, 44L, 47L, 50L, 51L, 52L, 53L, 54L,
62L, 64L, 65L, 67L, 68L, 69L, 74L, 75L, 76L, 77L, 81L,
82L, 83L, 85L, 87L, 88L, 91L, 92L, 93L, 94L, 95L, 99L,
101L, 103L, 107L, 108L, 117L, 118L, 123L, 127L, 131L,
132L, 134L, 137L, 138L, 139L, 140L, 148L, 149L, 157L,
158L, 160L, 161L, 168L, 172L, 174L, 176L, 183L, 184L,
188L, 189L, 193L, 195L, 196L, 198L, 199L, 201L, 202L,
205L, 206L, 208L, 211L, 212L, 213L, 214L, 215L, 217L,
219L, 220L, 221L, 224L, 227L, 233L, 235L, 237L, 238L,
243L, 244L, 245L, 246L, 247L, 250L, 253L, 255L, 257L,
258L, 259L, 260L, 261L, 264L, 265L, 267L, 268L, 270L,
271L, 272L, 275L, 276L, 283L, 284L, 287L, 301L, 302L,
303L, 305L, 308L, 309L, 310L, 311L, 312L, 313L, 314L,
315L, 325L, 328L, 329L, 332L, 334L, 335L, 337L)), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -3L), .drop = TRUE))
# Per-bin mean and IQR of Q4, used for the text annotations on the plot.
#
# Q4 is a haven_labelled response scale whose value labels run 1-5 and also
# declare 99 = "Don't know". That code is not a point on the scale, and the
# plot's y limits of c(0, 5) censor it anyway, so it is converted to NA here
# to keep the printed mean/IQR in step with what the boxplot and the
# stat_summary() mean points are computed from. zap_labels() first strips the
# labelled class so the summaries run on a plain numeric vector.
#
# Responses are whole numbers, so quartiles (and therefore the IQR) will
# usually land on whole numbers too; that reflects the data, not rounding.
summary_stats <- values %>%
  mutate(Q4 = na_if(zap_labels(Q4), 99)) %>%
  group_by(Q58_bin) %>%
  summarise(
    mean = mean(Q4, na.rm = TRUE),
    IQR = IQR(Q4, na.rm = TRUE)
  )
# Boxplot of Q4 (1-5 response scale) for each Q58_bin group, overlaid with the
# raw responses, the group means, and text labels for the mean and IQR taken
# from summary_stats.
#
# Q4 holds whole-number responses, so the median and (usually) the hinges of
# each box land on whole numbers. The mean is not constrained that way, which
# is why the mean points sit away from the median lines.
ggplot(values, aes(x = Q58_bin, y = Q4, group = Q58_bin)) +
  # outlier.shape takes a point shape or NA, not a logical. NA hides the
  # boxplot's own outlier markers, which would otherwise be drawn a second
  # time on top of the jittered raw points added below.
  geom_boxplot(color = "blue", outlier.shape = NA) +
  # NOTE(review): no `height` is given, so geom_jitter() also adds vertical
  # noise by default; the purple points therefore do not sit exactly on the
  # whole-number responses. Set height = 0 to show the true values.
  geom_jitter(color = "purple", width = 0.2, alpha = 0.2) + # Add scatterplot points
  stat_summary(fun = mean, geom = "point", shape = 20, size = 3, color = "blue") + # Add mean points
  geom_text(
    data = summary_stats,
    aes(label = paste0("Mean: ", round(mean, 2)), y = mean + 0.2),
    color = "black", size = 3, alpha = 0.7
  ) + # Add mean text
  geom_text(
    data = summary_stats,
    aes(label = paste0("IQR: ", round(IQR, 2)), y = mean - 0.2),
    color = "black", size = 3, alpha = 0.7
  ) + # Add IQR text
  labs(
    title = "",
    x = "",
    y = str_wrap("How much have you thought about climate change before today?", width = 25) # Wrap y-axis label
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  scale_y_continuous(limits = c(0, 5), breaks = seq(0, 5, by = 0.5)) # Adjust y-axis breaks for precision