Split values from one category into two others by proportion, round down, remainder to one

juandmaz · June 19, 2025, 9:00pm

I have this table, and I want to reassign the case counts when the cause is C55. I want to redistribute it mathematically according to the proportion between C53 and C54 (that is, if both have 1, assign 50% of C55 to each). Always round down, and if there is any remaining whole number, assign it to C53. This should all be done separately for each age group.

# A tibble: 26 × 4
    GENRE CAUSA GRUPEDAD CUENTA
   <dbl> <chr> <chr>     <dbl>
 1     2 C55   55 a 59       1
 2     2 C54   70 a 74       1
 3     2 C54   80 y mas      1
 4     2 C53   45 a 49       5
 5     2 C54   60 a 64       1
 6     2 C53   50 a 54       1
 7     2 C53   80 y mas      2
 8     2 C54   55 a 59       1
 9     2 C53   65 a 69       3
10     2 C55   75 a 79       3
# ℹ 16 more rows

Here is the entire data set

dput(df)

structure(list(GENRE = c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
                        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2), CAUSA = c("C55", "C54", 
                                                                          "C54", "C53", "C54", "C53", "C53", "C54", "C53", "C55", "C53", 
                                                                          "C55", "C53", "C53", "C53", "C55", "C55", "C53", "C53", "C54", 
                                                                          "C55", "C55", "C55", "C53", "C55", "C55"), GRUPEDAD = c("55 a 59", 
                                                                                                                                  "70 a 74", "80 y mas", "45 a 49", "60 a 64", "50 a 54", "80 y mas", 
                                                                                                                                  "55 a 59", "65 a 69", "75 a 79", "55 a 59", "35 a 39", "60 a 64", 
                                                                                                                                  "40 a 44", "30 a 34", "70 a 74", "50 a 54", "70 a 74", "75 a 79", 
                                                                                                                                  "75 a 79", "40 a 44", "60 a 64", "45 a 49", "35 a 39", "65 a 69", 
                                                                                                                                  "80 y mas"), CUENTA = c(1, 1, 1, 5, 1, 1, 2, 1, 3, 3, 3, 2, 3, 
                                                                                                                                                          3, 2, 3, 1, 4, 1, 1, 2, 1, 3, 3, 2, 5)), row.names = c(NA, -26L
                                                                                                                                                          ), class = c("tbl_df", "tbl", "data.frame"))

FJCC · June 20, 2025, 12:55am

Please post the entire data set in an easily copied format since it only has 26 rows. If your data frame is named DF, run

dput(DF)

and post the output.

juandmaz · June 26, 2025, 6:56pm

Hi, sorry. I just edited the post with the entire data set

FJCC · June 27, 2025, 2:42am

Does this work for you?

library(tidyverse)
DF <- structure(list(GENRE = c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
                         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2), 
               CAUSA = c("C55", "C54", 
                         "C54", "C53", "C54", "C53", "C53", "C54", "C53", "C55", "C53", 
                         "C55", "C53", "C53", "C53", "C55", "C55", "C53", "C53", "C54", 
                         "C55", "C55", "C55", "C53", "C55", "C55"), 
               GRUPEDAD = c("55 a 59", 
                            "70 a 74", "80 y mas", "45 a 49", "60 a 64", "50 a 54", "80 y mas", 
                            "55 a 59", "65 a 69", "75 a 79", "55 a 59", "35 a 39", "60 a 64", 
                            "40 a 44", "30 a 34", "70 a 74", "50 a 54", "70 a 74", "75 a 79", 
                            "75 a 79", "40 a 44", "60 a 64", "45 a 49", "35 a 39", "65 a 69", 
                            "80 y mas"), 
               CUENTA = c(1, 1, 1, 5, 1, 1, 2, 1, 3, 3, 3, 2, 3, 
                          3, 2, 3, 1, 4, 1, 1, 2, 1, 3, 3, 2, 5)), 
          row.names = c(NA, -26L), class = c("tbl_df", "tbl", "data.frame"))
DF <- DF |> group_by(GRUPEDAD) |> 
  mutate(Total = sum(CUENTA)) |> 
  pivot_wider(names_from = CAUSA, values_from = CUENTA, values_fill = 0) |> 
  mutate(new_C54 = C54 + floor(C55*C54/(C53+C54)), new_C53 = Total-new_C54)
DF
#> # A tibble: 11 × 8
#> # Groups:   GRUPEDAD [11]
#>    GENRE GRUPEDAD Total   C55   C54   C53 new_C54 new_C53
#>    <dbl> <chr>    <dbl> <dbl> <dbl> <dbl>   <dbl>   <dbl>
#>  1     2 55 a 59      5     1     1     3       1       4
#>  2     2 70 a 74      8     3     1     4       1       7
#>  3     2 80 y mas     8     5     1     2       2       6
#>  4     2 45 a 49      8     3     0     5       0       8
#>  5     2 60 a 64      5     1     1     3       1       4
#>  6     2 50 a 54      2     1     0     1       0       2
#>  7     2 65 a 69      5     2     0     3       0       5
#>  8     2 75 a 79      5     3     1     1       2       3
#>  9     2 35 a 39      5     2     0     3       0       5
#> 10     2 40 a 44      5     2     0     3       0       5
#> 11     2 30 a 34      2     0     0     2       0       2

^{Created on 2025-06-26 with reprex v2.1.1}

juandmaz · July 3, 2025, 6:17pm

Yeah but I have a problem and is that I have an extensive df.
How will you do with this df?

structure(list(anio = c("2010", "2010", "2010", "2010", "2010", 
"2010", "2010", "2010", "2010", "2010", "2010", "2010", "2010", 
"2010"), PROVRES = c("Jujuy", "Jujuy", "Jujuy", "Jujuy", "Jujuy", 
"Jujuy", "Jujuy", "Jujuy", "Jujuy", "Jujuy", "Jujuy", "Jujuy", 
"Jujuy", "Jujuy"), GENRE = c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2), CAUSA = c("C53", "C53", "C53", "C54", "C53", "C53", 
"C53", "C53", "C54", "C54", "C53", "C53", "C54", "C53"), MAT = c(NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_), GRUPEDAD = c("20 a 24", 
"30 a 34", "35 a 39", "40 a 44", "45 a 49", "50 a 54", "55 a 59", 
"60 a 64", "60 a 64", "70 a 74", "70 a 74", "75 a 79", "80 y mas", 
"80 y mas"), CUENTA = c(1, 4, 1, 1, 3, 3, 2, 2, 1, 1, 2, 3, 1, 
2)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-14L))

FJCC · July 4, 2025, 12:17am

Your latest data set does not have any CAUSA values of C55, so I'm not sure what the goal is.