Sum cells with similar names.

Good morning, everyone.

I am currently working with the following data frame. Here is the "dput" format:

structure(list(`Current Patient County` = c("ALACHUA", "ALACHUA", 
"ALACHUA", "ALACHUA", "ALACHUA", "ALACHUA", "ALACHUA", "ALLEGHENY", 
"BAKER", "BAKER", "BAKER", "BAKER", "BAKER", "BAKER", "BAKER", 
"BALTIMORE CITY", "BAY", "BAY", "BAY", "BAY", "BAY", "BAY", "BAY", 
"BELKNAP", "BRADFORD", "BRADFORD", "BRADFORD", "BRADFORD", "BRADFORD", 
"BRADFORD", "BRADFORD", "BREVARD", "BREVARD", "BREVARD", "BREVARD", 
"BREVARD", "BREVARD", "BREVARD", "BROWARD", "BROWARD", "BROWARD", 
"BROWARD", "BROWARD", "BROWARD", "BROWARD", "CALHOUN", "CALHOUN", 
"CALHOUN", "CALHOUN", "CALHOUN", "CALHOUN", "CALHOUN", "CAMDEN", 
"CHARLOTTE", "CHARLOTTE", "CHARLOTTE", "CHARLOTTE", "CHARLOTTE", 
"CHARLOTTE", "CHARLOTTE", "CITRUS", "CITRUS", "CITRUS", "CITRUS", 
"CITRUS", "CITRUS", "CITRUS", "CLAY", "CLAY", "CLAY", "CLAY", 
"CLAY", "CLAY", "CLAY", "COLLIER", "COLLIER", "COLLIER", "COLLIER", 
"COLLIER", "COLLIER", "COLLIER", "COLUMBIA", "COLUMBIA", "COLUMBIA", 
"COLUMBIA", "COLUMBIA", "COLUMBIA", "COLUMBIA", "DAVIDSON", "DE SOTO", 
"DE SOTO", "DE SOTO", "DE SOTO", "DE SOTO", "DE SOTO", "DESOTO", 
"DESOTO", "DESOTO", "DESOTO", "DESOTO", "DESOTO", "DESOTO", "DIXIE", 
"DIXIE", "DIXIE", "DIXIE", "DIXIE", "DIXIE", "DIXIE", "DUVAL", 
"DUVAL", "DUVAL", "DUVAL", "DUVAL", "DUVAL", "DUVAL", "EAST BATON ROUGE", 
"ESCAMBIA", "ESCAMBIA", "ESCAMBIA", "ESCAMBIA", "ESCAMBIA", "ESCAMBIA", 
"ESCAMBIA", "ETOWAH", "FLAGLER", "FLAGLER", "FLAGLER", "FLAGLER", 
"FLAGLER", "FLAGLER", "FLAGLER", "FRANKLIN", "FRANKLIN", "FRANKLIN", 
"FRANKLIN", "FRANKLIN", "FRANKLIN", "FRANKLIN", "GADSDEN", "GADSDEN", 
"GADSDEN", "GADSDEN", "GADSDEN", "GADSDEN", "GADSDEN", "GARLAND", 
"GILCHRIST", "GILCHRIST", "GILCHRIST", "GILCHRIST", "GILCHRIST", 
"GILCHRIST", "GILCHRIST", "GLADES", "GLADES", "GLADES", "GLADES", 
"GLADES", "GLADES", "GLADES", "GULF", "GULF", "GULF", "GULF", 
"GULF", "GULF", "GULF", "HAMILTON", "HAMILTON", "HAMILTON", "HAMILTON", 
"HAMILTON", "HAMILTON", "HAMILTON", "HAMPDEN", "HARDEE", "HARDEE", 
"HARDEE", "HARDEE", "HARDEE", "HARDEE", "HARDEE", "HARFORD", 
"HENDRY", "HENDRY", "HENDRY", "HENDRY", "HENDRY", "HENDRY", "HENDRY", 
"HERNANDO", "HERNANDO", "HERNANDO", "HERNANDO", "HERNANDO", "HERNANDO", 
"HERNANDO", "HICKMAN", "HIGHLANDS", "HIGHLANDS", "HIGHLANDS", 
"HIGHLANDS", "HIGHLANDS", "HIGHLANDS", "HIGHLANDS", "HILLSBOROUGH", 
"HILLSBOROUGH", "HILLSBOROUGH", "HILLSBOROUGH", "HILLSBOROUGH", 
"HILLSBOROUGH", "HILLSBOROUGH", "HINDS", "HOLMES", "HOLMES", 
"HOLMES", "HOLMES", "HOLMES", "HOLMES", "HOLMES", "HOUSTON", 
"INDIAN RIVER", "INDIAN RIVER", "INDIAN RIVER", "INDIAN RIVER", 
"INDIAN RIVER", "INDIAN RIVER", "INDIAN RIVER", "JACKSON", "JACKSON", 
"JACKSON", "JACKSON", "JACKSON", "JACKSON", "JACKSON", "JEFFERSON", 
"JEFFERSON", "JEFFERSON", "JEFFERSON", "JEFFERSON", "JEFFERSON", 
"JEFFERSON", "LAFAYETTE", "LAFAYETTE", "LAFAYETTE", "LAFAYETTE", 
"LAFAYETTE", "LAFAYETTE", "LAFAYETTE", "LAKE", "LAKE", "LAKE", 
"LAKE", "LAKE", "LAKE", "LAKE", "LEE", "LEE", "LEE", "LEE", "LEE", 
"LEE", "LEE", "LEON", "LEON", "LEON", "LEON", "LEON", "LEON", 
"LEON", "LEVY", "LEVY", "LEVY", "LEVY", "LEVY", "LEVY", "LEVY", 
"LIBERTY", "LIBERTY", "LIBERTY", "LIBERTY", "LIBERTY", "LIBERTY", 
"LIBERTY", "MADISON", "MADISON", "MADISON", "MADISON", "MADISON", 
"MADISON", "MADISON", "MANATEE", "MANATEE", "MANATEE", "MANATEE", 
"MANATEE", "MANATEE", "MANATEE", "MARION", "MARION", "MARION", 
"MARION", "MARION", "MARION", "MARION", "MARTIN", "MARTIN", "MARTIN", 
"MARTIN", "MARTIN", "MARTIN", "MARTIN", "MECKLENBURG", "MERRIMACK", 
"MIAMI-DADE", "MIAMI-DADE", "MIAMI-DADE", "MIAMI-DADE", "MIAMI-DADE", 
"MIAMI-DADE", "MIAMI-DADE", "MONROE", "MONROE", "MONROE", "MONROE", 
"MONROE", "MONROE", "MONROE", "MONTGOMERY", "NASSAU", "NASSAU", 
"NASSAU", "NASSAU", "NASSAU", "NASSAU", "NASSAU", "NEW YORK", 
"NEW YORK", "NORTHAMPTON", "OCEAN", "OKALOOSA", "OKALOOSA", "OKALOOSA", 
"OKALOOSA", "OKALOOSA", "OKALOOSA", "OKALOOSA", "OKEECHOBEE", 
"OKEECHOBEE", "OKEECHOBEE", "OKEECHOBEE", "OKEECHOBEE", "OKEECHOBEE", 
"OKEECHOBEE", "ORANGE", "ORANGE", "ORANGE", "ORANGE", "ORANGE", 
"ORANGE", "ORANGE", "OSCEOLA", "OSCEOLA", "OSCEOLA", "OSCEOLA", 
"OSCEOLA", "OSCEOLA", "OSCEOLA", "PALM BEACH", "PALM BEACH", 
"PALM BEACH", "PALM BEACH", "PALM BEACH", "PALM BEACH", "PALM BEACH", 
"PASCO", "PASCO", "PASCO", "PASCO", "PASCO", "PASCO", "PASCO", 
"PHILADELPHIA", "PIERCE", "PINELLAS", "PINELLAS", "PINELLAS", 
"PINELLAS", "PINELLAS", "PINELLAS", "PINELLAS", "POLK", "POLK", 
"POLK", "POLK", "POLK", "POLK", "POLK", "PUTNAM", "PUTNAM", "PUTNAM", 
"PUTNAM", "PUTNAM", "PUTNAM", "PUTNAM", "SACRAMENTO", "SAGADAHOC", 
"SANTA ROSA", "SANTA ROSA", "SANTA ROSA", "SANTA ROSA", "SANTA ROSA", 
"SANTA ROSA", "SANTA ROSA", "SARASOTA", "SARASOTA", "SARASOTA", 
"SARASOTA", "SARASOTA", "SARASOTA", "SARASOTA", "SCHENECTADY", 
"SCHENECTADY", "SEMINOLE", "SEMINOLE", "SEMINOLE", "SEMINOLE", 
"SEMINOLE", "SEMINOLE", "SEMINOLE", "SHELBY", "ST. JOHNS", "ST. JOHNS", 
"ST. JOHNS", "ST. JOHNS", "ST. JOHNS", "ST. JOHNS", "ST. JOHNS", 
"ST. LUCIE", "ST. LUCIE", "ST. LUCIE", "ST. LUCIE", "ST. LUCIE", 
"ST. LUCIE", "ST. LUCIE", "SUMTER", "SUMTER", "SUMTER", "SUMTER", 
"SUMTER", "SUMTER", "SUMTER", "SUWANNEE", "SUWANNEE", "SUWANNEE", 
"SUWANNEE", "SUWANNEE", "SUWANNEE", "SUWANNEE", "TAYLOR", "TAYLOR", 
"TAYLOR", "TAYLOR", "TAYLOR", "TAYLOR", "TAYLOR", "UNION", "UNION", 
"UNION", "UNION", "UNION", "UNION", "UNION", "Unspecified", "Unspecified", 
"Unspecified", "Unspecified", "Unspecified", "Unspecified", "Unspecified", 
"VOLUSIA", "VOLUSIA", "VOLUSIA", "VOLUSIA", "VOLUSIA", "VOLUSIA", 
"VOLUSIA", "WAKULLA", "WAKULLA", "WAKULLA", "WAKULLA", "WAKULLA", 
"WAKULLA", "WAKULLA", "WALTON", "WALTON", "WALTON", "WALTON", 
"WALTON", "WALTON", "WALTON", "WASHINGTON", "WASHINGTON", "WASHINGTON", 
"WASHINGTON", "WASHINGTON", "WASHINGTON", "WASHINGTON", "WESTCHESTER", 
"YAVAPAI", "YAVAPAI", "YAVAPAI"), `Age Group` = c("18-24", "25-34", 
"35-44", "45-54", "55-64", "65-74", "75 +", "65-74", "18-24", 
"25-34", "35-44", "45-54", "55-64", "65-74", "75 +", "35-44", 
"18-24", "25-34", "35-44", "45-54", "55-64", "65-74", "75 +", 
"75 +", "18-24", "25-34", "35-44", "45-54", "55-64", "65-74", 
"75 +", "18-24", "25-34", "35-44", "45-54", "55-64", "65-74", 
"75 +", "18-24", "25-34", "35-44", "45-54", "55-64", "65-74", 
"75 +", "18-24", "25-34", "35-44", "45-54", "55-64", "65-74", 
"75 +", "45-54", "18-24", "25-34", "35-44", "45-54", "55-64", 
"65-74", "75 +", "18-24", "25-34", "35-44", "45-54", "55-64", 
"65-74", "75 +", "18-24", "25-34", "35-44", "45-54", "55-64", 
"65-74", "75 +", "18-24", "25-34", "35-44", "45-54", "55-64", 
"65-74", "75 +", "18-24", "25-34", "35-44", "45-54", "55-64", 
"65-74", "75 +", "35-44", "18-24", "35-44", "45-54", "55-64", 
"65-74", "75 +", "18-24", "25-34", "35-44", "45-54", "55-64", 
"65-74", "75 +", "18-24", "25-34", "35-44", "45-54", "55-64", 
"65-74", "75 +", "18-24", "25-34", "35-44", "45-54", "55-64", 
"65-74", "75 +", "18-24", "18-24", "25-34", "35-44", "45-54", 
"55-64", "65-74", "75 +", "55-64", "18-24", "25-34", "35-44", 
"45-54", "55-64", "65-74", "75 +", "18-24", "25-34", "35-44", 
"45-54", "55-64", "65-74", "75 +", "18-24", "25-34", "35-44", 
"45-54", "55-64", "65-74", "75 +", "18-24", "18-24", "25-34", 
"35-44", "45-54", "55-64", "65-74", "75 +", "18-24", "25-34", 
"35-44", "45-54", "55-64", "65-74", "75 +", "18-24", "25-34", 
"35-44", "45-54", "55-64", "65-74", "75 +", "18-24", "25-34", 
"35-44", "45-54", "55-64", "65-74", "75 +", "45-54", "18-24", 
"25-34", "35-44", "45-54", "55-64", "65-74", "75 +", "65-74", 
"18-24", "25-34", "35-44", "45-54", "55-64", "65-74", "75 +", 
"18-24", "25-34", "35-44", "45-54", "55-64", "65-74", "75 +", 
"75 +", "18-24", "25-34", "35-44", "45-54", "55-64", "65-74", 
"75 +", "18-24", "25-34", "35-44", "45-54", "55-64", "65-74", 
"75 +", "55-64", "18-24", "25-34", "35-44", "45-54", "55-64", 
"65-74", "75 +", "35-44", "18-24", "25-34", "35-44", "45-54", 
"55-64", "65-74", "75 +", "18-24", "25-34", "35-44", "45-54", 
"55-64", "65-74", "75 +", "18-24", "25-34", "35-44", "45-54", 
"55-64", "65-74", "75 +", "18-24", "25-34", "35-44", "45-54", 
"55-64", "65-74", "75 +", "18-24", "25-34", "35-44", "45-54", 
"55-64", "65-74", "75 +", "18-24", "25-34", "35-44", "45-54", 
"55-64", "65-74", "75 +", "18-24", "25-34", "35-44", "45-54", 
"55-64", "65-74", "75 +", "18-24", "25-34", "35-44", "45-54", 
"55-64", "65-74", "75 +", "18-24", "25-34", "35-44", "45-54", 
"55-64", "65-74", "75 +", "18-24", "25-34", "35-44", "45-54", 
"55-64", "65-74", "75 +", "18-24", "25-34", "35-44", "45-54", 
"55-64", "65-74", "75 +", "18-24", "25-34", "35-44", "45-54", 
"55-64", "65-74", "75 +", "18-24", "25-34", "35-44", "45-54", 
"55-64", "65-74", "75 +", "25-34", "65-74", "18-24", "25-34", 
"35-44", "45-54", "55-64", "65-74", "75 +", "18-24", "25-34", 
"35-44", "45-54", "55-64", "65-74", "75 +", "25-34", "18-24", 
"25-34", "35-44", "45-54", "55-64", "65-74", "75 +", "25-34", 
"55-64", "55-64", "75 +", "18-24", "25-34", "35-44", "45-54", 
"55-64", "65-74", "75 +", "18-24", "25-34", "35-44", "45-54", 
"55-64", "65-74", "75 +", "18-24", "25-34", "35-44", "45-54", 
"55-64", "65-74", "75 +", "18-24", "25-34", "35-44", "45-54", 
"55-64", "65-74", "75 +", "18-24", "25-34", "35-44", "45-54", 
"55-64", "65-74", "75 +", "18-24", "25-34", "35-44", "45-54", 
"55-64", "65-74", "75 +", "25-34", "25-34", "18-24", "25-34", 
"35-44", "45-54", "55-64", "65-74", "75 +", "18-24", "25-34", 
"35-44", "45-54", "55-64", "65-74", "75 +", "18-24", "25-34", 
"35-44", "45-54", "55-64", "65-74", "75 +", "35-44", "65-74", 
"18-24", "25-34", "35-44", "45-54", "55-64", "65-74", "75 +", 
"18-24", "25-34", "35-44", "45-54", "55-64", "65-74", "75 +", 
"65-74", "75 +", "18-24", "25-34", "35-44", "45-54", "55-64", 
"65-74", "75 +", "65-74", "18-24", "25-34", "35-44", "45-54", 
"55-64", "65-74", "75 +", "18-24", "25-34", "35-44", "45-54", 
"55-64", "65-74", "75 +", "18-24", "25-34", "35-44", "45-54", 
"55-64", "65-74", "75 +", "18-24", "25-34", "35-44", "45-54", 
"55-64", "65-74", "75 +", "18-24", "25-34", "35-44", "45-54", 
"55-64", "65-74", "75 +", "18-24", "25-34", "35-44", "45-54", 
"55-64", "65-74", "75 +", "18-24", "25-34", "35-44", "45-54", 
"55-64", "65-74", "75 +", "18-24", "25-34", "35-44", "45-54", 
"55-64", "65-74", "75 +", "18-24", "25-34", "35-44", "45-54", 
"55-64", "65-74", "75 +", "18-24", "25-34", "35-44", "45-54", 
"55-64", "65-74", "75 +", "18-24", "25-34", "35-44", "45-54", 
"55-64", "65-74", "75 +", "45-54", "45-54", "55-64", "65-74"), 
    Rx = c(214, 724, 1407, 1510, 2468, 2765, 1948, 1, 30, 126, 
    308, 318, 539, 499, 313, 1, 147, 746, 1490, 1807, 3252, 3002, 
    1960, 3, 22, 119, 333, 367, 572, 507, 398, 438, 2131, 4576, 
    5789, 10812, 10821, 8090, 1147, 4049, 6903, 8708, 14657, 
    13818, 10236, 12, 39, 79, 181, 260, 264, 225, 1, 107, 449, 
    907, 1412, 3291, 3877, 3091, 84, 430, 902, 1613, 3427, 3869, 
    3052, 199, 790, 1799, 2459, 3519, 3096, 2252, 206, 685, 1268, 
    1776, 3088, 3802, 4190, 57, 289, 735, 884, 1430, 1442, 1088, 
    1, 1, 2, 8, 8, 14, 5, 24, 101, 170, 303, 439, 493, 305, 12, 
    81, 164, 189, 361, 407, 247, 826, 3731, 7279, 9185, 14445, 
    12995, 8020, 1, 381, 1333, 2366, 3152, 5489, 5456, 4047, 
    1, 76, 310, 800, 1004, 1817, 1970, 1522, 8, 47, 74, 116, 
    244, 238, 145, 48, 112, 232, 362, 579, 618, 410, 1, 7, 88, 
    171, 188, 374, 369, 295, 2, 11, 38, 36, 81, 107, 41, 16, 
    56, 107, 204, 340, 331, 249, 3, 35, 98, 124, 213, 191, 160, 
    1, 20, 66, 92, 178, 254, 214, 151, 1, 33, 156, 250, 295, 
    414, 361, 251, 144, 582, 1275, 2029, 3562, 3575, 2813, 1, 
    65, 242, 433, 646, 1440, 1674, 1464, 1243, 3923, 7708, 9837, 
    14396, 13876, 9624, 2, 12, 55, 164, 185, 300, 268, 248, 1, 
    79, 370, 911, 1195, 2261, 2696, 2366, 35, 144, 388, 512, 
    856, 928, 700, 2, 30, 53, 129, 214, 207, 121, 3, 17, 43, 
    51, 109, 116, 76, 236, 1081, 2023, 2716, 4751, 5193, 5183, 
    483, 1816, 3729, 5217, 9091, 10249, 8647, 271, 833, 1413, 
    1890, 2622, 2697, 1852, 36, 107, 298, 389, 866, 871, 486, 
    10, 37, 61, 90, 126, 116, 52, 16, 55, 86, 158, 291, 297, 
    225, 259, 1175, 2381, 3164, 5269, 5887, 4513, 231, 1044, 
    2123, 3035, 5524, 6433, 5478, 107, 388, 820, 1032, 2201, 
    2434, 2287, 1, 1, 1346, 3952, 5828, 7395, 11682, 12440, 12413, 
    36, 171, 423, 530, 1064, 1076, 580, 1, 89, 363, 813, 1030, 
    1780, 1727, 1199, 1, 1, 1, 1, 216, 698, 1367, 1606, 2742, 
    2503, 2030, 22, 152, 310, 439, 810, 770, 502, 1040, 3394, 
    5707, 7250, 10005, 9239, 6526, 280, 918, 1791, 2302, 2977, 
    2646, 1829, 939, 4318, 7862, 9024, 14618, 15493, 14434, 485, 
    1695, 3727, 5529, 9656, 9274, 7085, 1, 1, 667, 2598, 5709, 
    8054, 14968, 15663, 11331, 568, 2175, 3765, 5382, 8219, 8672, 
    6938, 84, 420, 805, 1020, 1828, 1795, 1120, 1, 1, 213, 737, 
    1615, 2014, 3133, 2779, 1961, 215, 1006, 2207, 2973, 5702, 
    7041, 6807, 1, 2, 328, 1112, 2118, 2502, 3644, 3900, 3175, 
    1, 231, 579, 1583, 1974, 2932, 3292, 2506, 251, 1105, 2307, 
    2905, 4885, 4781, 3335, 32, 161, 362, 544, 1350, 2660, 3135, 
    24, 157, 344, 495, 880, 815, 656, 13, 97, 202, 311, 450, 
    455, 282, 19, 61, 166, 166, 245, 205, 125, 11, 43, 48, 48, 
    54, 34, 26, 364, 1957, 4070, 4823, 8967, 9200, 7137, 37, 
    96, 284, 368, 586, 516, 285, 75, 289, 599, 742, 1346, 1177, 
    847, 25, 98, 176, 323, 493, 432, 309, 1, 1, 3, 1)), row.names = c(NA, 
-513L), class = c("tbl_df", "tbl", "data.frame"))

And here is a picture to add clarity to the issue:

desoto

What I want to do is to add the number of 'RX' for 'DE SOTO' to the number of 'RX for 'DESOTO' within a given age row. Then, I want to get rid of all rows that say 'DE SOTO'. Is there an easy way to do this? My guess is there are multiple solutions, so I am open to suggestions.

Thanks!

Hi @aiyer1217 ,

I assumed from the title of your question that we shouldn't limit the code to DE SOTO only but any possible similar names.

My second assumption is that similarity consists in the simple case of having spaces or no spaces in the name (so the code doesn't cover other types of typos).

I called your dataframe df in my solution:

library(tidyverse)
#1st step, create a county_names reference dataframe with names with spaces and names without spaces
#this will be used in the left_join in the second step to revert to county names including spaces to make it more readable
county_names <- df %>% 
  rename("county_space" = "Current Patient County") %>% 
  select(county_space) %>% 
  arrange() %>% 
  distinct() %>% 
  mutate(county_no_space = str_remove_all(county_space, " ")) %>% 
  group_by(county_no_space) %>%
  filter(row_number() == 1) %>% 
  ungroup()

Then you can do the actual transformation

#2nd step: 
df %>% 
#remove spaces from the county names
  mutate(`Current Patient County` = str_remove_all(`Current Patient County`, " ")) %>% 
#group by county and age group to perform the summarize function
  group_by(`Current Patient County`, `Age Group`) %>% 
#compute the sum for each similar names and age groups
  summarise(Rx = sum(Rx)) %>% 
#use the reference dataframe from step 1 to add the county names with spaces (remember we just removed the spaces above)
  right_join(county_names, by = c("Current Patient County" = "county_no_space")) %>% 
  ungroup() %>% 
#keep the columns you need and rename the county variable to match your expected result
  select(county_space, `Age Group`, Rx) %>% 
  rename("Current Patient County" = "county_space")

The only thing that differs from your requirements is that we keep the name DE SOTO instead of DESOTO.
Hope it helps.

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.