Help with better way of applying calculation on columns starting with same name per group in a list

Hi All,

Looking to get results per group in a better way where some of the columns have same starting names, end with sequential numbers & depend on previous column. The number of such columns can vary per group.

In below example, columns starting with Code & ending in sequential manner depend on previous Code number. Currently, working solution is manual per group in a list. Looking for a better way where it applies to either each list of dataframe OR can be applied in original long data without splitting into groups

Thanks for your help!!

# Sample Data
library(tidyverse)
df <- data.frame(
  stringsAsFactors = FALSE,
  check.names = FALSE,
  ACCTDT = c(202309L,202309L,
             202309L,202309L,202310L,202310L,202310L,202310L,
             202311L,202311L,202311L,202311L,202309L,
             202309L,202309L,202310L,202310L,202310L,202311L,
             202311L,202311L),
  `ACT/FC` = c("ACT","ACT","ACT",
               "ACT","FC","FC","FC","FC","FC","FC","FC",
               "FC","ACT","ACT","ACT","FC","FC","FC","FC",
               "FC","FC"),
  Country = c("CA","CA","CA",
              "CA","CA","CA","CA","CA","CA","CA","CA","CA",
              "CA","CA","CA","CA","CA","CA","CA","CA",
              "CA"),
  Type = c("A","A","A","A",
           "A","A","A","A","A","A","A","A","B","B",
           "B","B","B","B","B","B","B"),
  Code = c("Code_1","Code_2","Code_3",
           "Code_4","Code_1","Code_2","Code_3","Code_4","Code_1","Code_2",
           "Code_3","Code_4","Code_1","Code_2","Code_3","Code_1","Code_2",
           "Code_3","Code_1","Code_2","Code_3"),
  Sales = c(100L,90L,80L,
            120L,110L,NA,NA,NA,200L,NA,NA,NA,89L,95L,100L,
            125L,NA,NA,225L,NA,NA),
  Rates = c(0.1, 0.12,0.13,
            0.14, 0.15, 0.16,0.17,0.18,0.19,0.2,0.21,0.22,0.23,0.24,0.25,
            0.26,0.27,0.28,0.29,0.3,0.31)
)

# Need values for all Codes based on previous values of Codes & Rates
# Code_2 uses previous values from Code_1, Code_3 from Code_2, Code_4 from Code_3 and Code_New from last category Code_4
# Tried doing it in long grouped data structure with no success. Thus, making data wide instead and splitting data into groups as below

# Creating List by Type
df_list <- df %>%
  group_by(Type) %>%
  group_split()

df_list <- map(df_list, function(df){
  df <- df %>%
    pivot_wider(names_from = Code, values_from = Sales) %>%
    mutate("Code_New" = NA)
  
  return(df)
})

# Working solution - Manual and to be repeated per list of dataframes
# Getting values for Columns starting with Code and each of these column depend on previous column or category of Code if data was still in long format
for(i in which(df_list[[1]]$`ACT/FC` == "FC")){
  df_list[[1]]$Code_2[i] = round(df_list[[1]]$Code_1[i-1] * (1 - df_list[[1]]$Rates[i-1]), 0)
  df_list[[1]]$Code_3[i] = round(df_list[[1]]$Code_2[i-1] * (1 - df_list[[1]]$Rates[i-1]), 0)
  df_list[[1]]$Code_4[i] = round(df_list[[1]]$Code_3[i-1] * (1 - df_list[[1]]$Rates[i-1]), 0)
  df_list[[1]]$Code_New[i] = round(df_list[[1]]$Code_4[i-1] * (1 - df_list[[1]]$Rates[i-1]), 0)
  
}

# Looking to get the results such that calculations can be applied to all list of dataframes.

you can reproduce the result of your loop, with less manual construction as showin in my example; but naturally this will reduce the readability of the code as its another layer of abstraction.

library(tidyverse)
df <- data.frame(
  stringsAsFactors = FALSE,
  check.names = FALSE,
  ACCTDT = c(202309L,202309L,
             202309L,202309L,202310L,202310L,202310L,202310L,
             202311L,202311L,202311L,202311L,202309L,
             202309L,202309L,202310L,202310L,202310L,202311L,
             202311L,202311L),
  `ACT/FC` = c("ACT","ACT","ACT",
               "ACT","FC","FC","FC","FC","FC","FC","FC",
               "FC","ACT","ACT","ACT","FC","FC","FC","FC",
               "FC","FC"),
  Country = c("CA","CA","CA",
              "CA","CA","CA","CA","CA","CA","CA","CA","CA",
              "CA","CA","CA","CA","CA","CA","CA","CA",
              "CA"),
  Type = c("A","A","A","A",
           "A","A","A","A","A","A","A","A","B","B",
           "B","B","B","B","B","B","B"),
  Code = c("Code_1","Code_2","Code_3",
           "Code_4","Code_1","Code_2","Code_3","Code_4","Code_1","Code_2",
           "Code_3","Code_4","Code_1","Code_2","Code_3","Code_1","Code_2",
           "Code_3","Code_1","Code_2","Code_3"),
  Sales = c(100L,90L,80L,
            120L,110L,NA,NA,NA,200L,NA,NA,NA,89L,95L,100L,
            125L,NA,NA,225L,NA,NA),
  Rates = c(0.1, 0.12,0.13,
            0.14, 0.15, 0.16,0.17,0.18,0.19,0.2,0.21,0.22,0.23,0.24,0.25,
            0.26,0.27,0.28,0.29,0.3,0.31)
)

# Need values for all Codes based on previous values of Codes & Rates
# Code_2 uses previous values from Code_1, Code_3 from Code_2, Code_4 from Code_3 and Code_New from last category Code_4
# Tried doing it in long grouped data structure with no success. Thus, making data wide instead and splitting data into groups as below

# Creating List by Type
df_list <- df %>%
  group_by(Type) %>%
  group_split()

df_list_common <- df_list <- map(df_list, function(df){
  df <- df %>%
    pivot_wider(names_from = Code, values_from = Sales) %>%
    mutate("Code_New" = NA)
  
  return(df)
})

# Working solution - Manual and to be repeated per list of dataframes
# Getting values for Columns starting with Code and each of these column depend on previous column or category of Code if data was still in long format
for(i in which(df_list[[1]]$`ACT/FC` == "FC")){
  df_list[[1]]$Code_2[i] = round(df_list[[1]]$Code_1[i-1] * (1 - df_list[[1]]$Rates[i-1]), 0)
  df_list[[1]]$Code_3[i] = round(df_list[[1]]$Code_2[i-1] * (1 - df_list[[1]]$Rates[i-1]), 0)
  df_list[[1]]$Code_4[i] = round(df_list[[1]]$Code_3[i-1] * (1 - df_list[[1]]$Rates[i-1]), 0)
  df_list[[1]]$Code_New[i] = round(df_list[[1]]$Code_4[i-1] * (1 - df_list[[1]]$Rates[i-1]), 0)
  
}
df_list_target <- df_list

#reset the input
df_list <- df_list_common
(cds <- c("Code_1","Code_2","Code_3","Code_4","Code_New"))

for(i in which(df_list[[1]]$`ACT/FC` == "FC")){
  for(j in seq_along(cds[-1])){
  df_list[[1]][[cds[j+1]]][i] = round(df_list[[1]][[cds[j]]][i-1] * (1 - df_list[[1]]$Rates[i-1]), 0)
}
}

df_list_2 <- df_list

identical(df_list_target,df_list_2)
# TRUE

Thanks @nirgrahamuk !
I am trying to use your recommended steps and am getting some errors. Where am I going wrong here?

# Creating List by Type
df_list <- df %>%
  group_by(Type) %>%
  group_split()

# Wider Data to deal with Code Categories of Interest
df_list <- map(df_list, function(df){
  df <- df %>%
    pivot_wider(names_from = Code, values_from = Sales) %>%
    mutate("Code_New" = NA)
  
  return(df)
})

# Getting Column Names
cds <- c("Code_1","Code_2","Code_3","Code_4","Code_New")

# Function to apply calculation on list of dataframes (Groups by Type)
add_codes <- function(df){
  for(i in which(df$`ACT/FC` == "FC")){
    for(j in seq_along(cds[-1])){
      df[[cds[j+1]]][i] = round(df[[cds[j]]][i-1] * (1 - df$Rates[i-1]), 0)
  }
  }
}

df_target <- map(df_list, ~add_codes(.))

Also, does cds[-1] means we are sequencing along in the reverse order of cds?
Can you please explain how j element is representing the desired calculation.

If I apply this on my list of datframes, I am getting the following error:

Error in `map()`:
ℹ In index: 2.
Caused by error in `[[<-`:
! Assigned data `*vtmp*` must be compatible with existing data.
✖ Existing data has 9 rows.
✖ Assigned data has 4 rows.
ℹ Only vectors of size 1 are recycled.
Caused by error in `vectbl_recycle_rhs_rows()`:
! Can't recycle input of size 4 to size 9.

your code is insufficiently general in that not every data.frame in your list has all of the Codes


# Function to apply calculation on list of dataframes (Groups by Type)
add_codes <- function(df,cds){
  lcds <- intersect(cds,names(df))
  for(i in which(df$`ACT/FC` == "FC")){
    for(j in seq_along(lcds[-1])){
      df[[lcds[j+1]]][i] = round(df[[lcds[j]]][i-1] * (1 - df$Rates[i-1]), 0)
    }
  }
  df
}

result <- map(df_list_common, ~add_codes(.,cds=cds))

cds[-1] is cds` without the first entry

Thank you so much @nirgrahamuk! and thanks for clarifying cds[-1]
Would it be possible to get same result without widening the data on Codes column? ie directly after splitting into groups.

df_list <- df %>%
  group_by(Type) %>%
  group_split()

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.