Binning and geom_dotplot, unexpected bins and filling/stacking behaviour

I am clearly misunderstanding something about how geom_dotplot works, or how the binwidth parameter should be understood, so I'd really welcome a clarification.

In brief: if I use ggplot2's geom_dotplot() (or similar functions from other packages, here ggdist for reference), and set binwidth to 2, I'd expect to get some dots in every bin, unless there's a "hole" in my data, with no cases in a given bin of width 2.

If you look at the graphs below, you will see that there is a visible "hole"/empty column just before value 50. However, in my data there is no bin of width 2 that is expected to be empty, as there are values for both 48 and 50.

I must obviously be misunderstanding something very basic about how binning (or one of the functions involved) works, but I couldn't figure out from the documentation what the issue is. I'd really welcome:

  • any pointer to an explanation of how binning works in these cases, and why a bin with binwidth 2 should have value 0 if cutting data with width 2 does not lead to any bin with value zero
  • a way to use geom_dotplot that reflects bins as they result from cut

Thanks!

library("dplyr", warn.conflicts = FALSE)
library("ggplot2")
library("ggdist")


df <- data.frame(ab =c(rep("a", 308),
                       rep("b", 91)), 
                 value = 
                   c(56L, 14L, 11L, 12L, 37L, 17L, 61L, 19L, 79L, 81L, 67L, 13L, 
                     15L, 28L, 62L, 54L, 45L, 29L, 15L, 54L, 27L, 17L, 13L, 62L, 44L, 
                     88L, 31L, 96L, 42L, 21L, 28L, 52L, 94L, 52L, 34L, 66L, 32L, 99L, 
                     54L, 37L, 57L, 20L, 15L, 39L, 65L, 27L, 52L, 58L, 18L, 33L, 52L, 
                     69L, 64L, 24L, 42L, 23L, 54L, 38L, 71L, 69L, 43L, 56L, 47L, 16L, 
                     62L, 55L, 48L, 41L, 12L, 76L, 18L, 45L, 51L, 38L, 29L, 63L, 58L, 
                     15L, 18L, 12L, 42L, 11L, 74L, 98L, 99L, 66L, 39L, 21L, 34L, 42L, 
                     55L, 38L, 25L, 46L, 23L, 28L, 78L, 53L, 19L, 21L, 41L, 24L, 12L, 
                     52L, 68L, 43L, 50L, 39L, 84L, 32L, 23L, 24L, 32L, 21L, 55L, 62L, 
                     23L, 47L, 34L, 9L, 48L, 41L, 44L, 27L, 15L, 74L, 20L, 26L, 68L, 
                     44L, 34L, 47L, 43L, 41L, 67L, 64L, 97L, 26L, 51L, 31L, 66L, 43L, 
                     61L, 96L, 17L, 37L, 71L, 46L, 53L, 26L, 18L, 31L, 35L, 17L, 14L, 
                     79L, 48L, 35L, 66L, 63L, 52L, 40L, 33L, 20L, 28L, 39L, 44L, 53L, 
                     16L, 15L, 38L, 50L, 14L, 47L, 41L, 57L, 61L, 38L, 30L, 84L, 60L, 
                     44L, 75L, 22L, 67L, 31L, 29L, 90L, 54L, 23L, 13L, 51L, 46L, 50L, 
                     26L, 68L, 51L, 40L, 43L, 85L, 83L, 86L, 30L, 34L, 38L, 96L, 42L, 
                     36L, 74L, 77L, 44L, 62L, 39L, 38L, 43L, 25L, 36L, 31L, 71L, 53L, 
                     86L, 65L, 33L, 97L, 39L, 30L, 25L, 60L, 80L, 40L, 81L, 85L, 57L, 
                     30L, 26L, 73L, 12L, 51L, 57L, 25L, 36L, 13L, 48L, 79L, 31L, 58L, 
                     15L, 30L, 27L, 57L, 52L, 34L, 13L, 14L, 16L, 33L, 69L, 76L, 42L, 
                     88L, 41L, 53L, 42L, 60L, 70L, 72L, 52L, 38L, 11L, 93L, 54L, 39L, 
                     36L, 76L, 12L, 91L, 36L, 30L, 37L, 53L, 26L, 43L, 32L, 70L, 55L, 
                     94L, 69L, 63L, 67L, 39L, 18L, 27L, 43L, 53L, 78L, 44L, 38L, 30L, 
                     7L, 29L, 26L, 28L, 51L, 10L, 52L, 34L, 90L, 58L, 60L, 84L, 24L, 
                     44L, 68L, 28L, 14L, 55L, 30L, 26L, 12L, 21L, 16L, 22L, 22L, 11L, 
                     16L, 14L, 69L, 33L, 25L, 6L, 55L, 34L, 30L, 59L, 15L, 13L, 16L, 
                     22L, 38L, 20L, 20L, 8L, 18L, 34L, 16L, 33L, 25L, 21L, 21L, 25L, 
                     30L, 21L, 5L, 37L, 33L, 63L, 81L, 76L, 76L, 37L, 30L, 48L, 33L, 
                     11L, 94L, 45L, 52L, 21L, 42L, 50L, 19L, 40L, 58L, 55L, 33L, 24L, 
                     23L, 12L, 45L, 46L, 20L, 68L, 77L, 21L, 48L, 75L, 47L, 61L, 51L, 
                     56L, 34L, 71L, 90L, 45L, 67L, 3L, 76L, 46L, 81L))

# Dot plot of every observation, filled by group, with a bin width of 2 and
# bin positions determined from all groups together.
ggplot(df, aes(x = value, fill = ab)) +
  geom_dotplot(binwidth = 2, binpositions = "all")

NuL4rAY


# Same dot plot, restricted to the rows of group "a".
ggplot(dplyr::filter(df, ab == "a"), aes(x = value)) +
  geom_dotplot(binwidth = 2, binpositions = "all")

zF2Xwmk


# ggdist version: one density slab per group, with dots (bin width 2) drawn
# on the bottom side; the dots layer sets `group = NA` and is dodged.
df |>
  ggplot(aes(y = ab, x = value, fill = ab)) +
  stat_slab() +
  stat_dotsinterval(
    aes(group = NA),
    side = "bottom",
    slab_linewidth = NA,
    binwidth = 2,
    position = position_dodge()
  ) +
  scale_x_continuous(n.breaks = 10) +
  scale_fill_brewer(palette = "Set2")

L3R3Ffb



# Number of group "a" observations for each distinct value above 45.
# The result stays grouped by `value`.
df |>
  dplyr::filter(ab == "a", value > 45) |>
  dplyr::group_by(value) |>
  dplyr::count()
#> # A tibble: 47 × 2
#> # Groups:   value [47]
#>    value     n
#>    <int> <int>
#>  1    46     3
#>  2    47     4
#>  3    48     4
#>  4    50     3
#>  5    51     6
#>  6    52     9
#>  7    53     7
#>  8    54     6
#>  9    55     4
#> 10    56     2
#> # ℹ 37 more rows


# Tabulate the group "a" values in intervals of width 2 whose edges fall on
# even numbers (boundary = 0).
df$value[df$ab == "a"] |>
  cut_width(width = 2, boundary = 0) |>
  table()
#> 
#>    [6,8]   (8,10]  (10,12]  (12,14]  (14,16]  (16,18]  (18,20]  (20,22] 
#>        1        2        9        9       10        9        5        5 
#>  (22,24]  (24,26]  (26,28]  (28,30]  (30,32]  (32,34]  (34,36]  (36,38] 
#>        8       11       10       11       10       11        7       13 
#>  (38,40]  (40,42]  (42,44]  (44,46]  (46,48]  (48,50]  (50,52]  (52,54] 
#>       11       13       15        5        8        3       15       13 
#>  (54,56]  (56,58]  (58,60]  (60,62]  (62,64]  (64,66]  (66,68]  (68,70] 
#>        6        9        3        8        5        6        7        6 
#>  (70,72]  (72,74]  (74,76]  (76,78]  (78,80]  (80,82]  (82,84]  (84,86] 
#>        4        4        4        3        4        2        3        4 
#>  (86,88]  (88,90]  (90,92]  (92,94]  (94,96]  (96,98] (98,100] 
#>        2        2        1        3        3        3        2

Created on 2023-08-01 with reprex v2.0.2

Try adding method = "histodot"

library("ggplot2")
#library("ggdist")


d <- data.frame(ab =c(rep("a", 308),
                       rep("b", 91)), 
                 value = 
                   c(56, 14, 11, 12, 37, 17, 61, 19, 79, 81, 67, 13, 
                     15, 28, 62, 54, 45, 29, 15, 54, 27, 17, 13, 62, 44, 
                     88, 31, 96, 42, 21, 28, 52, 94, 52, 34, 66, 32, 99, 
                     54, 37, 57, 20, 15, 39, 65, 27, 52, 58, 18, 33, 52, 
                     69, 64, 24, 42, 23, 54, 38, 71, 69, 43, 56, 47, 16, 
                     62, 55, 48, 41, 12, 76, 18, 45, 51, 38, 29, 63, 58, 
                     15, 18, 12, 42, 11, 74, 98, 99, 66, 39, 21, 34, 42, 
                     55, 38, 25, 46, 23, 28, 78, 53, 19, 21, 41, 24, 12, 
                     52, 68, 43, 50, 39, 84, 32, 23, 24, 32, 21, 55, 62, 
                     23, 47, 34, 9, 48, 41, 44, 27, 15, 74, 20, 26, 68, 
                     44, 34, 47, 43, 41, 67, 64, 97, 26, 51, 31, 66, 43, 
                     61, 96, 17, 37, 71, 46, 53, 26, 18, 31, 35, 17, 14, 
                     79, 48, 35, 66, 63, 52, 40, 33, 20, 28, 39, 44, 53, 
                     16, 15, 38, 50, 14, 47, 41, 57, 61, 38, 30, 84, 60, 
                     44, 75, 22, 67, 31, 29, 90, 54, 23, 13, 51, 46, 50, 
                     26, 68, 51, 40, 43, 85, 83, 86, 30, 34, 38, 96, 42, 
                     36, 74, 77, 44, 62, 39, 38, 43, 25, 36, 31, 71, 53, 
                     86, 65, 33, 97, 39, 30, 25, 60, 80, 40, 81, 85, 57, 
                     30, 26, 73, 12, 51, 57, 25, 36, 13, 48, 79, 31, 58, 
                     15, 30, 27, 57, 52, 34, 13, 14, 16, 33, 69, 76, 42, 
                     88, 41, 53, 42, 60, 70, 72, 52, 38, 11, 93, 54, 39, 
                     36, 76, 12, 91, 36, 30, 37, 53, 26, 43, 32, 70, 55, 
                     94, 69, 63, 67, 39, 18, 27, 43, 53, 78, 44, 38, 30, 
                     7, 29, 26, 28, 51, 10, 52, 34, 90, 58, 60, 84, 24, 
                     44, 68, 28, 14, 55, 30, 26, 12, 21, 16, 22, 22, 11, 
                     16, 14, 69, 33, 25, 6, 55, 34, 30, 59, 15, 13, 16, 
                     22, 38, 20, 20, 8, 18, 34, 16, 33, 25, 21, 21, 25, 
                     30, 21, 5, 37, 33, 63, 81, 76, 76, 37, 30, 48, 33, 
                     11, 94, 45, 52, 21, 42, 50, 19, 40, 58, 55, 33, 24, 
                     23, 12, 45, 46, 20, 68, 77, 21, 48, 75, 47, 61, 51, 
                     56, 34, 71, 90, 45, 67, 3, 76, 46, 81))

# Dot plot using fixed-width, histogram-style bins (method = "histodot")
# of width 2, filled by group.
ggplot(d, aes(x = value, fill = ab)) +
  geom_dotplot(
    method = "histodot",
    binwidth = 2,
    binpositions = "all"
  ) +
  theme_minimal()

Created on 2023-08-02 with reprex v2.0.2

1 Like

Thanks to Technocrat for providing solution - further reading on the matter can be found here :

1 Like

Thanks @technocrat .

I've tried this, but the number of dots still does not match.

# Histogram-style dot plot (bin width 2) of both groups on one panel.
dot_layer <- geom_dotplot(binwidth = 2, method = "histodot", binpositions = "all")
ggplot(d, aes(x = value, fill = ab)) + dot_layer + theme_minimal()

If I try to look at the table, and look at the "column" immediately before 50, I should expect 4 "a" dots, and 2 "b" dots. However, I see two and two.

If I table the results, or I count them manually, there is no way I can bin them and get 2 "a" and 2 "b".

library("dplyr", warn.conflicts = FALSE)
library("ggplot2")

# Counts of group "a" values per width-2 interval, with interval edges on
# odd numbers (boundary = 1).
d$value[d$ab == "a"] |>
  cut_width(width = 2, boundary = 1) |>
  table()
#> 
#>   [7,9]  (9,11] (11,13] (13,15] (15,17] (17,19] (19,21] (21,23] (23,25] (25,27] 
#>       2       4      11      11       7       7       7       6       7      12 
#> (27,29] (29,31] (31,33] (33,35] (35,37] (37,39] (39,41] (41,43] (43,45] (45,47] 
#>       9      13       8       9       9      17       9      15       9       7 
#> (47,49] (49,51] (51,53] (53,55] (55,57] (57,59] (59,61] (61,63] (63,65] (65,67] 
#>       4       9      16      10       7       4       6       8       4       8 
#> (67,69] (69,71] (71,73] (73,75] (75,77] (77,79] (79,81] (81,83] (83,85] (85,87] 
#>       7       5       2       4       4       5       3       1       4       2 
#> (87,89] (89,91] (91,93] (93,95] (95,97] (97,99] 
#>       2       3       1       2       5       3

# Counts of group "b" values per width-2 interval, with interval edges on
# odd numbers (boundary = 1).
d$value[d$ab == "b"] |>
  cut_width(width = 2, boundary = 1) |>
  table()
#> 
#>   [3,5]   (5,7]   (7,9]  (9,11] (11,13] (13,15] (15,17] (17,19] (19,21] (21,23] 
#>       2       1       1       2       3       3       4       2       9       4 
#> (23,25] (25,27] (27,29] (29,31] (31,33] (33,35] (35,37] (37,39] (39,41] (41,43] 
#>       5       1       1       4       5       3       2       1       1       1 
#> (43,45] (45,47] (47,49] (49,51] (51,53] (53,55] (55,57] (57,59] (59,61] (61,63] 
#>       4       3       2       2       1       3       1       2       2       1 
#> (63,65] (65,67] (67,69] (69,71] (71,73] (73,75] (75,77] (77,79] (79,81] (81,83] 
#>       0       1       3       1       0       1       4       0       2       0 
#> (83,85] (85,87] (87,89] (89,91] (91,93] (93,95] 
#>       1       0       0       1       0       1

Indeed, if I plot them separately, I do get 4 "a" and 2 "b", as expected.


# Histogram-style dot plot (bin width 2) of group "a" on its own.
ggplot(dplyr::filter(d, ab == "a"), aes(x = value, fill = ab)) +
  geom_dotplot(
    binwidth = 2,
    method = "histodot",
    binpositions = "all"
  ) +
  theme_minimal()

# Histogram-style dot plot (bin width 2) of group "b" on its own.
ggplot(dplyr::filter(d, ab == "b"), aes(x = value, fill = ab)) +
  geom_dotplot(
    binwidth = 2,
    method = "histodot",
    binpositions = "all"
  ) +
  theme_minimal()

What am I missing here? Is ggplot just drawing the dots on top of each other? If so, surely this is a bug?

@technocrat

Yes, here is a minimal example.

library("ggplot2")

# Minimal example: groups "a" and "b" hold exactly the same values (1, 2, 5).
d <- data.frame(
  ab = rep(c("a", "b"), each = 3),
  value = rep(c(1, 2, 5), times = 2)
)

# Histogram-style dot plot with bin width 2, filled by group.
ggplot(d, aes(x = value, fill = ab)) +
  geom_dotplot(
    binwidth = 2,
    method = "histodot",
    binpositions = "all",
    dotsize = 0.5
  )

There is no "a"!

Am I misunderstanding something, or is this a bug?

I see this has been addressed as an issue - Geom_dotplot dots overlap when 'fill' in 'aes' · Issue #3620 · tidyverse/ggplot2 · GitHub

Adding stackgroups = TRUE seems to fix this.

Still puzzled by the default behaviour, but this seems to be discussed in the issue.

I struggle to see how this is proper default behaviour and not a bug...

library("ggplot2")

# Second minimal example: the two groups share 1, 2 and 5 but differ in one
# value (3 for "a", 4 for "b").
d <- data.frame(
  ab = rep(c("a", "b"), each = 4),
  value = c(
    1, 2, 3, 5,
    1, 2, 4, 5
  )
)

# Dot plot with geom_dotplot()'s default method and a bin width of 2.
ggplot(d, aes(x = value, fill = ab)) +
  geom_dotplot(binwidth = 2, dotsize = 0.3)

No 3 in the data in the first MWE. Just noticed, however, that the y-axis isn’t by integer (number of observations) but by proportion, which implies that the number of dots doesn’t necessarily correspond to the number of values. It’s late here (PDT). I’ll take a look later today.

1 Like

I think the Wilkinson dot plot, with its density representation, is the primary use case that this geom was intended to support. I'm just guessing, but the secondary use case (a conventional histogram-type chart where each bar is separated out into dots) is the victim of the Wilkinson dot plot bias in how the function is set up, with its defaults etc.

You are looking for this sort of thing I think ?

library("ggplot2")

# Toy data: the two groups share 1, 2 and 5 but differ in one value
# (3 for "a", 4 for "b").
d <- data.frame(
  ab = rep(c("a", "b"), each = 4),
  value = c(
    1, 2, 3, 5,
    1, 2, 4, 5
  )
)

# Histogram-style dot plot with bins of width 2 starting at 0;
# stackgroups = TRUE stacks the dots across the fill groups.
ggplot(d, aes(x = value, fill = ab)) +
  geom_dotplot(
    method = "histodot",
    binwidth = 2,
    origin = 0,
    stackgroups = TRUE,
    dotsize = .3
  )

# Alternative with explicit cuts: bin the values into (0,2], (2,4], (4,6]
# first, then plot the resulting factor with one stack of dots per interval.
d[["cut"]] <- cut(d[["value"]], breaks = seq(0, 6, by = 2))
ggplot(d, aes(x = cut, fill = ab)) +
  geom_dotplot(
    binwidth = 1,
    method = "histodot",
    stackgroups = TRUE,
    dotsize = .3
  )

first :
image
second :
image

1 Like

Thanks! Yes, but even if it is by proportion, I still think it shouldn't completely hide one of the fill groups by default without even throwing a warning. Either way, I'm obviously trying to apply this function not quite in the way it was intended... I guess I'll have to read the paper to get more context about its expected use cases, and find something else for the kind of things I had in mind.

But still, if you look at ?geom_dotplot, the documentation says in its very first sentence: "dots are stacked, with each dot representing one observation". So no matter what the y-axis does, I'd still expect this to be true by default.

I'll check it out more thoroughly, but this does seem closer to what I had in mind. Either way, as you suggest, this is all meant to be used in a very different way... I will check out the paper. But I still feel the documentation could do a better job in explaining the intended purpose of the function, and especially the kind of things that this is clearly not meant for...

The documentation of ?geom_dotplot starts with:

In a dot plot, the width of a dot corresponds to the bin width (or maximum width, depending on the binning algorithm), and dots are stacked, with each dot representing one observation.

Which seems to describe the kind of use cases I had in mind... Thanks again!

Overall, there is really no consistently good solution. Depending on the use case, even reverting to basic geom_point and some manual tweaking may not be too bad.

library("ggplot2")
library("dplyr", warn.conflicts = FALSE)

# Toy data: the two groups share 1, 2 and 5 but differ in one value
# (3 for "a", 4 for "b").
d <- data.frame(
  ab = rep(c("a", "b"), each = 4),
  value = c(
    1, 2, 3, 5,
    1, 2, 4, 5
  )
)

# Manual dot plot: bin the values with cut_width(), number the observations
# within each bin, and draw one point per observation at that height
# (shifted down by 0.5).
d |>
  mutate(cut_value = ggplot2::cut_width(value, width = 2, boundary = 0)) |>
  group_by(cut_value) |>
  mutate(y_position = seq_len(n())) |>
  ungroup() |>
  ggplot(aes(x = cut_value, y = y_position - 0.5, fill = ab)) +
  geom_point(shape = 21, size = 9) +
  scale_y_continuous(limits = c(0, 10))


df <- data.frame(ab =c(rep("a", 308),
                       rep("b", 91)), 
                 value = 
                   c(56L, 14L, 11L, 12L, 37L, 17L, 61L, 19L, 79L, 81L, 67L, 13L, 
                     15L, 28L, 62L, 54L, 45L, 29L, 15L, 54L, 27L, 17L, 13L, 62L, 44L, 
                     88L, 31L, 96L, 42L, 21L, 28L, 52L, 94L, 52L, 34L, 66L, 32L, 99L, 
                     54L, 37L, 57L, 20L, 15L, 39L, 65L, 27L, 52L, 58L, 18L, 33L, 52L, 
                     69L, 64L, 24L, 42L, 23L, 54L, 38L, 71L, 69L, 43L, 56L, 47L, 16L, 
                     62L, 55L, 48L, 41L, 12L, 76L, 18L, 45L, 51L, 38L, 29L, 63L, 58L, 
                     15L, 18L, 12L, 42L, 11L, 74L, 98L, 99L, 66L, 39L, 21L, 34L, 42L, 
                     55L, 38L, 25L, 46L, 23L, 28L, 78L, 53L, 19L, 21L, 41L, 24L, 12L, 
                     52L, 68L, 43L, 50L, 39L, 84L, 32L, 23L, 24L, 32L, 21L, 55L, 62L, 
                     23L, 47L, 34L, 9L, 48L, 41L, 44L, 27L, 15L, 74L, 20L, 26L, 68L, 
                     44L, 34L, 47L, 43L, 41L, 67L, 64L, 97L, 26L, 51L, 31L, 66L, 43L, 
                     61L, 96L, 17L, 37L, 71L, 46L, 53L, 26L, 18L, 31L, 35L, 17L, 14L, 
                     79L, 48L, 35L, 66L, 63L, 52L, 40L, 33L, 20L, 28L, 39L, 44L, 53L, 
                     16L, 15L, 38L, 50L, 14L, 47L, 41L, 57L, 61L, 38L, 30L, 84L, 60L, 
                     44L, 75L, 22L, 67L, 31L, 29L, 90L, 54L, 23L, 13L, 51L, 46L, 50L, 
                     26L, 68L, 51L, 40L, 43L, 85L, 83L, 86L, 30L, 34L, 38L, 96L, 42L, 
                     36L, 74L, 77L, 44L, 62L, 39L, 38L, 43L, 25L, 36L, 31L, 71L, 53L, 
                     86L, 65L, 33L, 97L, 39L, 30L, 25L, 60L, 80L, 40L, 81L, 85L, 57L, 
                     30L, 26L, 73L, 12L, 51L, 57L, 25L, 36L, 13L, 48L, 79L, 31L, 58L, 
                     15L, 30L, 27L, 57L, 52L, 34L, 13L, 14L, 16L, 33L, 69L, 76L, 42L, 
                     88L, 41L, 53L, 42L, 60L, 70L, 72L, 52L, 38L, 11L, 93L, 54L, 39L, 
                     36L, 76L, 12L, 91L, 36L, 30L, 37L, 53L, 26L, 43L, 32L, 70L, 55L, 
                     94L, 69L, 63L, 67L, 39L, 18L, 27L, 43L, 53L, 78L, 44L, 38L, 30L, 
                     7L, 29L, 26L, 28L, 51L, 10L, 52L, 34L, 90L, 58L, 60L, 84L, 24L, 
                     44L, 68L, 28L, 14L, 55L, 30L, 26L, 12L, 21L, 16L, 22L, 22L, 11L, 
                     16L, 14L, 69L, 33L, 25L, 6L, 55L, 34L, 30L, 59L, 15L, 13L, 16L, 
                     22L, 38L, 20L, 20L, 8L, 18L, 34L, 16L, 33L, 25L, 21L, 21L, 25L, 
                     30L, 21L, 5L, 37L, 33L, 63L, 81L, 76L, 76L, 37L, 30L, 48L, 33L, 
                     11L, 94L, 45L, 52L, 21L, 42L, 50L, 19L, 40L, 58L, 55L, 33L, 24L, 
                     23L, 12L, 45L, 46L, 20L, 68L, 77L, 21L, 48L, 75L, 47L, 61L, 51L, 
                     56L, 34L, 71L, 90L, 45L, 67L, 3L, 76L, 46L, 81L))

# Manual dot plot of the full data set: bin with cut_width(), number the
# observations within each bin, and draw one point per observation.
df |>
  mutate(cut_value = ggplot2::cut_width(value, width = 2, boundary = 0)) |>
  group_by(cut_value) |>
  mutate(y_position = seq_len(n())) |>
  ungroup() |>
  ggplot(aes(x = cut_value, y = y_position - 0.5, fill = ab)) +
  geom_point(shape = 21, size = 4) +
  scale_y_continuous(name = "number", limits = c(0, 20)) +
  scale_x_discrete() +
  # Rotate the interval labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 90))

Created on 2023-08-06 with reprex v2.0.2

1 Like

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.