I have done some group_by in tidyverse from my data set. So I have a few different tasks that need your assistance.
Here is the dput() output:
Summary
> dput(final_df[1:50,])
structure(list(ACCTG_YEAR_KEY = c(2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L), CUSTOMER_NUMBER = c(220200080L, 240548132L, 301360595L,
328452940L, 341200277L, 230428210L, 240202617L, 344714103L, 345200417L,
371200194L, 373315788L, 374419508L, 374677593L, 402200310L, 220722498L,
230287664L, 230553268L, 230652420L, 240202617L, 240202621L, 240206733L,
240825287L, 209198530L, 304143540L, 328200348L, 344200453L, 344431705L,
344714103L, 345200446L, 304639741L, 344200453L, 344608751L, 370776672L,
373200163L, 373200603L, 374216239L, 341702398L, 373201131L, 373201151L,
373201195L, 373201195L, 374419508L, 371522452L, 375724769L, 408200760L,
438126238L, 438496803L, 448200151L, 374745489L, 410200452L),
Sales = c(96.4, 74.96, 89, 94.85, 81.94, 75, 87.72, 94.85,
94.85, 79.27, 87.99, 71.31, 77.78, 94.85, 94.85, 80.62, 85,
85.37, 80.65, 80.62, 94.85, 85.36, 89.95, 75.88, 85.36, 70.28,
94.85, 94.85, 75, 85, 70.28, 75, 94.85, 81.94, 70.99, 91.58,
85.36, 72.3, 87.5, 69.99, 71.99, 71.31, 94.85, 86.76, 73.27,
91.58, 82.95, 86.76, 71.31, 86.76), QtySold = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), MFGCOST = c(36.05, 36.05, 36.05, 36.05, 36.05, 36.05,
36.05, 36.05, 36.05, 36.05, 36.05, 36.05, 36.05, 36.05, 36.05,
36.05, 36.05, 36.05, 36.05, 36.05, 36.05, 36.05, 36.05, 36.05,
36.05, 36.05, 36.05, 36.05, 36.05, 36.05, 36.05, 36.05, 36.05,
36.05, 36.05, 36.05, 36.05, 36.05, 36.05, 36.05, 36.05, 36.05,
36.05, 36.05, 36.05, 36.05, 36.05, 36.05, 36.05, 36.05),
MarginDollars = c(48.98, 27.54, 41.58, 47.43, 34.52, 27.58,
40.3, 47.43, 47.43, 31.85, 40.57, 23.89, 30.36, 47.43, 47.43,
33.2, 37.58, 37.95, 33.23, 33.2, 47.43, 37.94, 42.53, 28.46,
37.94, 22.86, 47.43, 47.43, 27.58, 37.58, 22.86, 27.58, 47.43,
34.52, 23.57, 44.16, 37.94, 24.88, 40.08, 22.57, 24.57, 23.89,
47.43, 39.34, 25.85, 44.16, 35.53, 39.34, 23.89, 39.34),
PRODUCT_SUB_LINE_DESCR = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("PRIVATE LABEL", "SUNDRY"), class = "factor"),
Document_Key = c(109728177L, 105927185L, 109524825L, 107340834L,
109326548L, 105217321L, 109036278L, 106887077L, 106829163L,
108487569L, 105511403L, 109328000L, 106756016L, 107475976L,
106799857L, 105507436L, 107338996L, 107550135L, 106423749L,
106942047L, 105543940L, 107866146L, 108541653L, 107119996L,
107037597L, 106467534L, 107096390L, 108129873L, 107154419L,
109656355L, 109710707L, 105752057L, 107609854L, 109083555L,
106787295L, 109700398L, 107096083L, 109658183L, 105420733L,
105855786L, 108145877L, 108946083L, 105164588L, 108929791L,
105374455L, 109952072L, 108067902L, 110043209L, 106803795L,
109345189L), Gross_MarginDollars = c(58.422, 37.4108, 51.17,
56.903, 44.2512, 37.45, 49.9156, 56.903, 56.903, 41.6346,
50.1802, 33.8338, 40.1744, 56.903, 56.903, 42.9576, 47.25,
47.6126, 42.987, 42.9576, 56.903, 47.6028, 52.101, 38.3124,
47.6028, 32.8244, 56.903, 56.903, 37.45, 47.25, 32.8244,
37.45, 56.903, 44.2512, 33.5202, 53.6984, 47.6028, 34.804,
49.7, 32.5402, 34.5002, 33.8338, 56.903, 48.9748, 35.7546,
53.6984, 45.241, 48.9748, 33.8338, 48.9748), Gross_MarginDollars_Percentage = c(60.603734439834,
49.9076840981857, 57.4943820224719, 59.9926199261993, 54.0043934586283,
49.9333333333333, 56.9033287733698, 59.9926199261993, 59.9926199261993,
52.5225179765359, 57.0294351630867, 47.4460804936194, 51.6513242478786,
59.9926199261993, 59.9926199261993, 53.2840486231704, 55.5882352941177,
55.7720510718051, 53.3006819590825, 53.2840486231704, 59.9926199261993,
55.7671040299906, 57.9221789883269, 50.4907749077491, 55.7671040299906,
46.7051792828685, 59.9926199261993, 59.9926199261993, 49.9333333333333,
55.5882352941177, 46.7051792828685, 49.9333333333333, 59.9926199261993,
54.0043934586283, 47.2181997464432, 58.6355099366674, 55.7671040299906,
48.1383125864454, 56.8, 46.4926418059723, 47.9236005000695,
47.4460804936194, 59.9926199261993, 56.4485938220378, 48.7984168145216,
58.6355099366674, 54.5400843881857, 56.4485938220378, 47.4460804936194,
56.4485938220378)), row.names = c(NA, -50L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), vars = c("Document_Key", "CUSTOMER_NUMBER"
), drop = TRUE, indices = list(42L, 5L, 44L, 38L, 15L, 10L, 20L,
31L, 39L, 1L, 18L, 25L, 12L, 34L, 14L, 48L, 8L, 7L, 19L,
24L, 36L, 26L, 23L, 28L, 16L, 3L, 13L, 17L, 32L, 21L, 46L,
27L, 40L, 9L, 22L, 43L, 41L, 6L, 33L, 4L, 11L, 49L, 2L, 29L,
37L, 35L, 30L, 0L, 45L, 47L), group_sizes = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), biggest_group_size = 1L, labels = structure(list(
Document_Key = c(105164588L, 105217321L, 105374455L, 105420733L,
105507436L, 105511403L, 105543940L, 105752057L, 105855786L,
105927185L, 106423749L, 106467534L, 106756016L, 106787295L,
106799857L, 106803795L, 106829163L, 106887077L, 106942047L,
107037597L, 107096083L, 107096390L, 107119996L, 107154419L,
107338996L, 107340834L, 107475976L, 107550135L, 107609854L,
107866146L, 108067902L, 108129873L, 108145877L, 108487569L,
108541653L, 108929791L, 108946083L, 109036278L, 109083555L,
109326548L, 109328000L, 109345189L, 109524825L, 109656355L,
109658183L, 109700398L, 109710707L, 109728177L, 109952072L,
110043209L), CUSTOMER_NUMBER = c(371522452L, 230428210L,
408200760L, 373201151L, 230287664L, 373315788L, 240206733L,
344608751L, 373201195L, 240548132L, 240202617L, 344200453L,
374677593L, 373200603L, 220722498L, 374745489L, 345200417L,
344714103L, 240202621L, 328200348L, 341702398L, 344431705L,
304143540L, 345200446L, 230553268L, 328452940L, 402200310L,
230652420L, 370776672L, 240825287L, 438496803L, 344714103L,
373201195L, 371200194L, 209198530L, 375724769L, 374419508L,
240202617L, 373200163L, 341200277L, 374419508L, 410200452L,
301360595L, 304639741L, 373201131L, 374216239L, 344200453L,
220200080L, 438126238L, 448200151L)), row.names = c(NA, -50L
), class = "data.frame", vars = c("Document_Key", "CUSTOMER_NUMBER"
), drop = TRUE))
Task 1 When you put my sample data set in Rstudio and when you do View(dataframe), you will notice there are two columns that I need you to pay attention to: "Customer_Number" and "Document Key". They are basically customer account and the invoice associated with that account. So it makes sense that one customer number can have multiples invoices/document_keys since they can order as many items they want and for all the items that are shipped in one time, they will be put in one invoice. Just like you shop at Target. I do not want to waste your time reading more about this task description.
So what I want to do is to group or create a new data frame that shows me the number of invoices belong to one Unique customer number. For instance,
Customer A has invoices A1, A2
Customer B has invoices B1, B3
The other columns that are attached to these entities will be changed as well but I hope there is a way for me to accomplish this.
My code (which does not seem to work T_T):
final_df = one_time_buyer_df %>%
filter ((one_time_buyer_df$QtySold == '1') &
(one_time_buyer_df$PRODUCT_SUB_LINE_DESCR == "PRIVATE LABEL") & (one_time_buyer_df$Sales > 0)) %>%
group_by(Document_Key, CUSTOMER_NUMBER) %>%
mutate(Gross_MarginDollars = Sales - MFGCOST - 0.02*Sales,
Gross_MarginDollars_Percentage = Gross_MarginDollars / Sales * 100)
I will follow with other tasks soon. Let's focus on this one first.
Thank you!