How to use tidyverse to group columns

blackish952 · June 24, 2018, 1:36pm

I have done some group_by in tidyverse from my data set. So I have a few different tasks that need your assistance.

Here is the dput() output:

Summary

> dput(final_df[1:50,])
structure(list(ACCTG_YEAR_KEY = c(2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L), CUSTOMER_NUMBER = c(220200080L, 240548132L, 301360595L, 
328452940L, 341200277L, 230428210L, 240202617L, 344714103L, 345200417L, 
371200194L, 373315788L, 374419508L, 374677593L, 402200310L, 220722498L, 
230287664L, 230553268L, 230652420L, 240202617L, 240202621L, 240206733L, 
240825287L, 209198530L, 304143540L, 328200348L, 344200453L, 344431705L, 
344714103L, 345200446L, 304639741L, 344200453L, 344608751L, 370776672L, 
373200163L, 373200603L, 374216239L, 341702398L, 373201131L, 373201151L, 
373201195L, 373201195L, 374419508L, 371522452L, 375724769L, 408200760L, 
438126238L, 438496803L, 448200151L, 374745489L, 410200452L), 
    Sales = c(96.4, 74.96, 89, 94.85, 81.94, 75, 87.72, 94.85, 
    94.85, 79.27, 87.99, 71.31, 77.78, 94.85, 94.85, 80.62, 85, 
    85.37, 80.65, 80.62, 94.85, 85.36, 89.95, 75.88, 85.36, 70.28, 
    94.85, 94.85, 75, 85, 70.28, 75, 94.85, 81.94, 70.99, 91.58, 
    85.36, 72.3, 87.5, 69.99, 71.99, 71.31, 94.85, 86.76, 73.27, 
    91.58, 82.95, 86.76, 71.31, 86.76), QtySold = c(1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L), MFGCOST = c(36.05, 36.05, 36.05, 36.05, 36.05, 36.05, 
    36.05, 36.05, 36.05, 36.05, 36.05, 36.05, 36.05, 36.05, 36.05, 
    36.05, 36.05, 36.05, 36.05, 36.05, 36.05, 36.05, 36.05, 36.05, 
    36.05, 36.05, 36.05, 36.05, 36.05, 36.05, 36.05, 36.05, 36.05, 
    36.05, 36.05, 36.05, 36.05, 36.05, 36.05, 36.05, 36.05, 36.05, 
    36.05, 36.05, 36.05, 36.05, 36.05, 36.05, 36.05, 36.05), 
    MarginDollars = c(48.98, 27.54, 41.58, 47.43, 34.52, 27.58, 
    40.3, 47.43, 47.43, 31.85, 40.57, 23.89, 30.36, 47.43, 47.43, 
    33.2, 37.58, 37.95, 33.23, 33.2, 47.43, 37.94, 42.53, 28.46, 
    37.94, 22.86, 47.43, 47.43, 27.58, 37.58, 22.86, 27.58, 47.43, 
    34.52, 23.57, 44.16, 37.94, 24.88, 40.08, 22.57, 24.57, 23.89, 
    47.43, 39.34, 25.85, 44.16, 35.53, 39.34, 23.89, 39.34), 
    PRODUCT_SUB_LINE_DESCR = structure(c(1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
    ), .Label = c("PRIVATE LABEL", "SUNDRY"), class = "factor"), 
    Document_Key = c(109728177L, 105927185L, 109524825L, 107340834L, 
    109326548L, 105217321L, 109036278L, 106887077L, 106829163L, 
    108487569L, 105511403L, 109328000L, 106756016L, 107475976L, 
    106799857L, 105507436L, 107338996L, 107550135L, 106423749L, 
    106942047L, 105543940L, 107866146L, 108541653L, 107119996L, 
    107037597L, 106467534L, 107096390L, 108129873L, 107154419L, 
    109656355L, 109710707L, 105752057L, 107609854L, 109083555L, 
    106787295L, 109700398L, 107096083L, 109658183L, 105420733L, 
    105855786L, 108145877L, 108946083L, 105164588L, 108929791L, 
    105374455L, 109952072L, 108067902L, 110043209L, 106803795L, 
    109345189L), Gross_MarginDollars = c(58.422, 37.4108, 51.17, 
    56.903, 44.2512, 37.45, 49.9156, 56.903, 56.903, 41.6346, 
    50.1802, 33.8338, 40.1744, 56.903, 56.903, 42.9576, 47.25, 
    47.6126, 42.987, 42.9576, 56.903, 47.6028, 52.101, 38.3124, 
    47.6028, 32.8244, 56.903, 56.903, 37.45, 47.25, 32.8244, 
    37.45, 56.903, 44.2512, 33.5202, 53.6984, 47.6028, 34.804, 
    49.7, 32.5402, 34.5002, 33.8338, 56.903, 48.9748, 35.7546, 
    53.6984, 45.241, 48.9748, 33.8338, 48.9748), Gross_MarginDollars_Percentage = c(60.603734439834, 
    49.9076840981857, 57.4943820224719, 59.9926199261993, 54.0043934586283, 
    49.9333333333333, 56.9033287733698, 59.9926199261993, 59.9926199261993, 
    52.5225179765359, 57.0294351630867, 47.4460804936194, 51.6513242478786, 
    59.9926199261993, 59.9926199261993, 53.2840486231704, 55.5882352941177, 
    55.7720510718051, 53.3006819590825, 53.2840486231704, 59.9926199261993, 
    55.7671040299906, 57.9221789883269, 50.4907749077491, 55.7671040299906, 
    46.7051792828685, 59.9926199261993, 59.9926199261993, 49.9333333333333, 
    55.5882352941177, 46.7051792828685, 49.9333333333333, 59.9926199261993, 
    54.0043934586283, 47.2181997464432, 58.6355099366674, 55.7671040299906, 
    48.1383125864454, 56.8, 46.4926418059723, 47.9236005000695, 
    47.4460804936194, 59.9926199261993, 56.4485938220378, 48.7984168145216, 
    58.6355099366674, 54.5400843881857, 56.4485938220378, 47.4460804936194, 
    56.4485938220378)), row.names = c(NA, -50L), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"), vars = c("Document_Key", "CUSTOMER_NUMBER"
), drop = TRUE, indices = list(42L, 5L, 44L, 38L, 15L, 10L, 20L, 
    31L, 39L, 1L, 18L, 25L, 12L, 34L, 14L, 48L, 8L, 7L, 19L, 
    24L, 36L, 26L, 23L, 28L, 16L, 3L, 13L, 17L, 32L, 21L, 46L, 
    27L, 40L, 9L, 22L, 43L, 41L, 6L, 33L, 4L, 11L, 49L, 2L, 29L, 
    37L, 35L, 30L, 0L, 45L, 47L), group_sizes = c(1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), biggest_group_size = 1L, labels = structure(list(
    Document_Key = c(105164588L, 105217321L, 105374455L, 105420733L, 
    105507436L, 105511403L, 105543940L, 105752057L, 105855786L, 
    105927185L, 106423749L, 106467534L, 106756016L, 106787295L, 
    106799857L, 106803795L, 106829163L, 106887077L, 106942047L, 
    107037597L, 107096083L, 107096390L, 107119996L, 107154419L, 
    107338996L, 107340834L, 107475976L, 107550135L, 107609854L, 
    107866146L, 108067902L, 108129873L, 108145877L, 108487569L, 
    108541653L, 108929791L, 108946083L, 109036278L, 109083555L, 
    109326548L, 109328000L, 109345189L, 109524825L, 109656355L, 
    109658183L, 109700398L, 109710707L, 109728177L, 109952072L, 
    110043209L), CUSTOMER_NUMBER = c(371522452L, 230428210L, 
    408200760L, 373201151L, 230287664L, 373315788L, 240206733L, 
    344608751L, 373201195L, 240548132L, 240202617L, 344200453L, 
    374677593L, 373200603L, 220722498L, 374745489L, 345200417L, 
    344714103L, 240202621L, 328200348L, 341702398L, 344431705L, 
    304143540L, 345200446L, 230553268L, 328452940L, 402200310L, 
    230652420L, 370776672L, 240825287L, 438496803L, 344714103L, 
    373201195L, 371200194L, 209198530L, 375724769L, 374419508L, 
    240202617L, 373200163L, 341200277L, 374419508L, 410200452L, 
    301360595L, 304639741L, 373201131L, 374216239L, 344200453L, 
    220200080L, 438126238L, 448200151L)), row.names = c(NA, -50L
), class = "data.frame", vars = c("Document_Key", "CUSTOMER_NUMBER"
), drop = TRUE))

Task 1 When you put my sample data set in Rstudio and when you do View(dataframe), you will notice there are two columns that I need you to pay attention to: "Customer_Number" and "Document Key". They are basically customer account and the invoice associated with that account. So it makes sense that one customer number can have multiples invoices/document_keys since they can order as many items they want and for all the items that are shipped in one time, they will be put in one invoice. Just like you shop at Target. I do not want to waste your time reading more about this task description.

So what I want to do is to group or create a new data frame that shows me the number of invoices belong to one Unique customer number. For instance,
Customer A has invoices A1, A2
Customer B has invoices B1, B3
The other columns that are attached to these entities will be changed as well but I hope there is a way for me to accomplish this.

My code (which does not seem to work T_T):

final_df = one_time_buyer_df %>% 
           filter ((one_time_buyer_df$QtySold == '1') & 
                    (one_time_buyer_df$PRODUCT_SUB_LINE_DESCR == "PRIVATE LABEL") & (one_time_buyer_df$Sales > 0)) %>%
           group_by(Document_Key, CUSTOMER_NUMBER) %>%
           mutate(Gross_MarginDollars = Sales - MFGCOST - 0.02*Sales,
                  Gross_MarginDollars_Percentage = Gross_MarginDollars / Sales * 100)

I will follow with other tasks soon. Let's focus on this one first.
Thank you!

cderv · June 24, 2018, 2:59pm

What does not work for you here ?

How would you like them to change ?

This is how I would do that. You then need to summarise the other attributes I believe by a sum of Gross_MarginDollars by customer ?

tab of customers

tab <- tibble::tribble(
  ~ACCTG_YEAR_KEY, ~CUSTOMER_NUMBER, ~Sales, ~QtySold, ~MFGCOST, ~MarginDollars, ~PRODUCT_SUB_LINE_DESCR, ~Document_Key, ~Gross_MarginDollars, ~Gross_MarginDollars_Percentage,
            2018L,       220200080L,   96.4,       1L,    36.05,          48.98,         "PRIVATE LABEL",    109728177L,               58.422,                 60.603734439834,
            2018L,       240548132L,  74.96,       1L,    36.05,          27.54,         "PRIVATE LABEL",    105927185L,              37.4108,                49.9076840981857,
            2018L,       301360595L,     89,       1L,    36.05,          41.58,         "PRIVATE LABEL",    109524825L,                51.17,                57.4943820224719,
            2018L,       328452940L,  94.85,       1L,    36.05,          47.43,         "PRIVATE LABEL",    107340834L,               56.903,                59.9926199261993,
            2018L,       341200277L,  81.94,       1L,    36.05,          34.52,         "PRIVATE LABEL",    109326548L,              44.2512,                54.0043934586283,
            2018L,       230428210L,     75,       1L,    36.05,          27.58,         "PRIVATE LABEL",    105217321L,                37.45,                49.9333333333333,
            2018L,       240202617L,  87.72,       1L,    36.05,           40.3,         "PRIVATE LABEL",    109036278L,              49.9156,                56.9033287733698,
            2018L,       344714103L,  94.85,       1L,    36.05,          47.43,         "PRIVATE LABEL",    106887077L,               56.903,                59.9926199261993,
            2018L,       345200417L,  94.85,       1L,    36.05,          47.43,         "PRIVATE LABEL",    106829163L,               56.903,                59.9926199261993,
            2018L,       371200194L,  79.27,       1L,    36.05,          31.85,         "PRIVATE LABEL",    108487569L,              41.6346,                52.5225179765359,
            2018L,       373315788L,  87.99,       1L,    36.05,          40.57,         "PRIVATE LABEL",    105511403L,              50.1802,                57.0294351630867,
            2018L,       374419508L,  71.31,       1L,    36.05,          23.89,         "PRIVATE LABEL",    109328000L,              33.8338,                47.4460804936194,
            2018L,       374677593L,  77.78,       1L,    36.05,          30.36,         "PRIVATE LABEL",    106756016L,              40.1744,                51.6513242478786,
            2018L,       402200310L,  94.85,       1L,    36.05,          47.43,         "PRIVATE LABEL",    107475976L,               56.903,                59.9926199261993,
            2018L,       220722498L,  94.85,       1L,    36.05,          47.43,         "PRIVATE LABEL",    106799857L,               56.903,                59.9926199261993,
            2018L,       230287664L,  80.62,       1L,    36.05,           33.2,         "PRIVATE LABEL",    105507436L,              42.9576,                53.2840486231704,
            2018L,       230553268L,     85,       1L,    36.05,          37.58,         "PRIVATE LABEL",    107338996L,                47.25,                55.5882352941177,
            2018L,       230652420L,  85.37,       1L,    36.05,          37.95,         "PRIVATE LABEL",    107550135L,              47.6126,                55.7720510718051,
            2018L,       240202617L,  80.65,       1L,    36.05,          33.23,         "PRIVATE LABEL",    106423749L,               42.987,                53.3006819590825,
            2018L,       240202621L,  80.62,       1L,    36.05,           33.2,         "PRIVATE LABEL",    106942047L,              42.9576,                53.2840486231704,
            2018L,       240206733L,  94.85,       1L,    36.05,          47.43,         "PRIVATE LABEL",    105543940L,               56.903,                59.9926199261993,
            2018L,       240825287L,  85.36,       1L,    36.05,          37.94,         "PRIVATE LABEL",    107866146L,              47.6028,                55.7671040299906,
            2018L,       209198530L,  89.95,       1L,    36.05,          42.53,         "PRIVATE LABEL",    108541653L,               52.101,                57.9221789883269,
            2018L,       304143540L,  75.88,       1L,    36.05,          28.46,         "PRIVATE LABEL",    107119996L,              38.3124,                50.4907749077491,
            2018L,       328200348L,  85.36,       1L,    36.05,          37.94,         "PRIVATE LABEL",    107037597L,              47.6028,                55.7671040299906,
            2018L,       344200453L,  70.28,       1L,    36.05,          22.86,         "PRIVATE LABEL",    106467534L,              32.8244,                46.7051792828685,
            2018L,       344431705L,  94.85,       1L,    36.05,          47.43,         "PRIVATE LABEL",    107096390L,               56.903,                59.9926199261993,
            2018L,       344714103L,  94.85,       1L,    36.05,          47.43,         "PRIVATE LABEL",    108129873L,               56.903,                59.9926199261993,
            2018L,       345200446L,     75,       1L,    36.05,          27.58,         "PRIVATE LABEL",    107154419L,                37.45,                49.9333333333333,
            2018L,       304639741L,     85,       1L,    36.05,          37.58,         "PRIVATE LABEL",    109656355L,                47.25,                55.5882352941177,
            2018L,       344200453L,  70.28,       1L,    36.05,          22.86,         "PRIVATE LABEL",    109710707L,              32.8244,                46.7051792828685,
            2018L,       344608751L,     75,       1L,    36.05,          27.58,         "PRIVATE LABEL",    105752057L,                37.45,                49.9333333333333,
            2018L,       370776672L,  94.85,       1L,    36.05,          47.43,         "PRIVATE LABEL",    107609854L,               56.903,                59.9926199261993,
            2018L,       373200163L,  81.94,       1L,    36.05,          34.52,         "PRIVATE LABEL",    109083555L,              44.2512,                54.0043934586283,
            2018L,       373200603L,  70.99,       1L,    36.05,          23.57,         "PRIVATE LABEL",    106787295L,              33.5202,                47.2181997464432,
            2018L,       374216239L,  91.58,       1L,    36.05,          44.16,         "PRIVATE LABEL",    109700398L,              53.6984,                58.6355099366674,
            2018L,       341702398L,  85.36,       1L,    36.05,          37.94,         "PRIVATE LABEL",    107096083L,              47.6028,                55.7671040299906,
            2018L,       373201131L,   72.3,       1L,    36.05,          24.88,         "PRIVATE LABEL",    109658183L,               34.804,                48.1383125864454,
            2018L,       373201151L,   87.5,       1L,    36.05,          40.08,         "PRIVATE LABEL",    105420733L,                 49.7,                            56.8,
            2018L,       373201195L,  69.99,       1L,    36.05,          22.57,         "PRIVATE LABEL",    105855786L,              32.5402,                46.4926418059723,
            2018L,       373201195L,  71.99,       1L,    36.05,          24.57,         "PRIVATE LABEL",    108145877L,              34.5002,                47.9236005000695,
            2018L,       374419508L,  71.31,       1L,    36.05,          23.89,         "PRIVATE LABEL",    108946083L,              33.8338,                47.4460804936194,
            2018L,       371522452L,  94.85,       1L,    36.05,          47.43,         "PRIVATE LABEL",    105164588L,               56.903,                59.9926199261993,
            2018L,       375724769L,  86.76,       1L,    36.05,          39.34,         "PRIVATE LABEL",    108929791L,              48.9748,                56.4485938220378,
            2018L,       408200760L,  73.27,       1L,    36.05,          25.85,         "PRIVATE LABEL",    105374455L,              35.7546,                48.7984168145216,
            2018L,       438126238L,  91.58,       1L,    36.05,          44.16,         "PRIVATE LABEL",    109952072L,              53.6984,                58.6355099366674,
            2018L,       438496803L,  82.95,       1L,    36.05,          35.53,         "PRIVATE LABEL",    108067902L,               45.241,                54.5400843881857,
            2018L,       448200151L,  86.76,       1L,    36.05,          39.34,         "PRIVATE LABEL",    110043209L,              48.9748,                56.4485938220378,
            2018L,       374745489L,  71.31,       1L,    36.05,          23.89,         "PRIVATE LABEL",    106803795L,              33.8338,                47.4460804936194,
            2018L,       410200452L,  86.76,       1L,    36.05,          39.34,         "PRIVATE LABEL",    109345189L,              48.9748,                56.4485938220378
  )


library(tidyverse)
#> Warning: le package 'tibble' a été compilé avec la version R 3.4.4
#> Warning: le package 'tidyr' a été compilé avec la version R 3.4.4
#> Warning: le package 'purrr' a été compilé avec la version R 3.4.4
#> Warning: le package 'dplyr' a été compilé avec la version R 3.4.4
#> Warning: le package 'stringr' a été compilé avec la version R 3.4.4
tab %>%
  filter(QtySold == 1L,
         PRODUCT_SUB_LINE_DESCR == "PRIVATE LABEL",
         Sales > 0) %>%
  group_by(CUSTOMER_NUMBER) %>%
  summarise(nb_invoice = n_distinct(Document_Key))
#> Warning: le package 'bindrcpp' a été compilé avec la version R 3.4.4
#> # A tibble: 45 x 2
#>    CUSTOMER_NUMBER nb_invoice
#>              <int>      <int>
#>  1       209198530          1
#>  2       220200080          1
#>  3       220722498          1
#>  4       230287664          1
#>  5       230428210          1
#>  6       230553268          1
#>  7       230652420          1
#>  8       240202617          2
#>  9       240202621          1
#> 10       240206733          1
#> # ... with 35 more rows

Created on 2018-06-24 by the reprex package (v0.2.0).

blackish952 · June 25, 2018, 12:58pm

Christophe:
Thank you for the response.
What I meant earlier was unclear. Let me rephrase my question.
What I want to do is to group or create a new data frame that shows me the number of invoices belong to one Unique customer number. For instance,
Customer A has invoices A1 has items: Ai1 - $5, Ai2 - 10 so this invoice total is 15 and each invoice has some MFCOST attached to it.
Customer B has invoices B1 has items Bi4 - 2 , Bi6 - 7 so this invoice total is 9$ and each invoice has some MFCOST attached to it.

Your code above was great. It shows the number of invoices that each unique customer has, but I want to have some more columns that show total MFGCOST, Total Sales, and Total Margins.

Can I do this?

  summarise(nb_invoice = n_distinct(Document_Key),
            total_sales = sum((Sales)))

Also,
the output from your code does not seem right to me.
Here is why.

Even if we just look at 2018, there are 6 rows for customer number 0 but your code output gave me 3 distinct invoice number, which is great.
But then I cannot calculate the total invoice per each distinct invoice.

Can we achieve this goal?

mara · June 25, 2018, 4:51pm

You can do multiple groupings to calculate summaries and totals. See, for example, the Grouping section in ModernDive (link below)
http://moderndive.com/5-wrangling.html#groupby

Also, please try to use the reprex package rather than code + screenshots! It's much easier to go back and forth that what.
For pointers specific to the community site, check out the reprex FAQ.