What is the best way to do calculation (like subtract, multiply, divide) between two data frames having identical column names?

I need to take the ratio between columns starting with letter "A", subtract columns starting with letter "B" and add columns starting with letter "C". What would be the best way to do colwise calculation between two data frames like these without doing it manually?

Thank you!


set.seed(1)
N = 6
dataframe1 <- data.frame(Date = seq(as.Date("2019-01-01"), length.out = N, by = 'day'),
                         Product = rep("A", N),
                         A1 = runif(N), A2 = runif(N),
                         B1 = runif(N), B2 = runif(N), B3 = runif(N),
                         C1 = runif(N), C2 = runif(N))
dataframe1
#>         Date Product        A1         A2        B1        B2         B3
#> 1 2019-01-01       A 0.2655087 0.94467527 0.6870228 0.3800352 0.26722067
#> 2 2019-01-02       A 0.3721239 0.66079779 0.3841037 0.7774452 0.38611409
#> 3 2019-01-03       A 0.5728534 0.62911404 0.7698414 0.9347052 0.01339033
#> 4 2019-01-04       A 0.9082078 0.06178627 0.4976992 0.2121425 0.38238796
#> 5 2019-01-05       A 0.2016819 0.20597457 0.7176185 0.6516738 0.86969085
#> 6 2019-01-06       A 0.8983897 0.17655675 0.9919061 0.1255551 0.34034900
#>          C1        C2
#> 1 0.4820801 0.7942399
#> 2 0.5995658 0.1079436
#> 3 0.4935413 0.7237109
#> 4 0.1862176 0.4112744
#> 5 0.8273733 0.8209463
#> 6 0.6684667 0.6470602


dataframe2 <- data.frame(Date = seq(as.Date("2019-01-01"), length.out = N, by = 'day'),
                         Product = rep("B", N),
                         A1 = runif(N), A2 = runif(N),
                         B1 = runif(N), B2 = runif(N), B3 = runif(N),
                         C1 = runif(N), C2 = runif(N))
dataframe2
#>         Date Product        A1        A2         B1        B2         B3
#> 1 2019-01-01       B 0.7829328 0.7323137 0.07067905 0.9128759 0.47854525
#> 2 2019-01-02       B 0.5530363 0.6927316 0.09946616 0.2936034 0.76631067
#> 3 2019-01-03       B 0.5297196 0.4776196 0.31627171 0.4590657 0.08424691
#> 4 2019-01-04       B 0.7893562 0.8612095 0.51863426 0.3323947 0.87532133
#> 5 2019-01-05       B 0.0233312 0.4380971 0.66200508 0.6508705 0.33907294
#> 6 2019-01-06       B 0.4772301 0.2447973 0.40683019 0.2580168 0.83944035
#>          C1        C2
#> 1 0.3466835 0.7773207
#> 2 0.3337749 0.9606180
#> 3 0.4763512 0.4346595
#> 4 0.8921983 0.7125147
#> 5 0.8643395 0.3999944
#> 6 0.3899895 0.3253522

Doing calculation manually

dataframe3 <- dataframe2
dataframe3$A <- dataframe1$A1 / dataframe2$A1
dataframe3$B <- dataframe1$B1 - dataframe2$B1
dataframe3$C <- dataframe1$C1 + dataframe2$C1
dataframe3
#>         Date Product        A1        A2         B1        B2         B3
#> 1 2019-01-01       B 0.7829328 0.7323137 0.07067905 0.9128759 0.47854525
#> 2 2019-01-02       B 0.5530363 0.6927316 0.09946616 0.2936034 0.76631067
#> 3 2019-01-03       B 0.5297196 0.4776196 0.31627171 0.4590657 0.08424691
#> 4 2019-01-04       B 0.7893562 0.8612095 0.51863426 0.3323947 0.87532133
#> 5 2019-01-05       B 0.0233312 0.4380971 0.66200508 0.6508705 0.33907294
#> 6 2019-01-06       B 0.4772301 0.2447973 0.40683019 0.2580168 0.83944035
#>          C1        C2         A           B         C
#> 1 0.3466835 0.7773207 0.3391206  0.61634380 0.8287636
#> 2 0.3337749 0.9606180 0.6728743  0.28463756 0.9333408
#> 3 0.4763512 0.4346595 1.0814276  0.45356971 0.9698926
#> 4 0.8921983 0.7125147 1.1505677 -0.02093502 1.0784159
#> 5 0.8643395 0.3999944 8.6443008  0.05561343 1.6917128
#> 6 0.3899895 0.3253522 1.8825086  0.58507591 1.0584563
1 Like

Hello,

I am not loving this solution really, but it works (it does assume that the column names of the type LETTERdigit match in both df's - otherwise you would need to put in some checks):

set.seed(1)
N = 6
dataframe1 <- data.frame(Date = seq(as.Date("2019-01-01"), length.out = N, by = 'day'),
                                                 Product = rep("A", N),
                                                 A1 = runif(N), A2 = runif(N),
                                                 B1 = runif(N), B2 = runif(N), B3 = runif(N),
                                                 C1 = runif(N), C2 = runif(N))

dataframe2 <- data.frame(Date = seq(as.Date("2019-01-01"), length.out = N, by = 'day'),
                                                 Product = rep("B", N),
                                                 A1 = runif(N), A2 = runif(N),
                                                 B1 = runif(N), B2 = runif(N), B3 = runif(N),
                                                 C1 = runif(N), C2 = runif(N))

dataframe3 <- data.frame(Date = seq(as.Date("2019-01-01"), length.out = N, by = 'day'))

for (col in colnames(dataframe1)) {
    
    category <- stringr::str_extract(col,"[ABC]")
    
    if (!is.na(category)) {
        if (category == "A") {
            new_column = data.frame(dataframe1[, col]/dataframe2[, col])
        } else if (category == "B") {
            new_column = data.frame(dataframe1[, col] - dataframe2[, col])
        } else if (category == "C") {
            new_column = data.frame(dataframe1[, col] + dataframe2[, col])
        }
        names(new_column) <- col
        dataframe3 <- dplyr::bind_cols(dataframe3, new_column)
    }
}
dataframe3
#>         Date        A1        A2          B1           B2          B3
#> 1 2019-01-01 0.3391206 1.2899871  0.61634380 -0.532840745 -0.21132458
#> 2 2019-01-02 0.6728743 0.9539017  0.28463756  0.483841849 -0.38019658
#> 3 2019-01-03 1.0814276 1.3171863  0.45356971  0.475639505 -0.07085658
#> 4 2019-01-04 1.1505677 0.0717436 -0.02093502 -0.120252153 -0.49293337
#> 5 2019-01-05 8.6443008 0.4701573  0.05561343  0.000803299  0.53061791
#> 6 2019-01-06 1.8825086 0.7212366  0.58507591 -0.132461685 -0.49909135
#>          C1        C2
#> 1 0.8287636 1.5715606
#> 2 0.9333408 1.0685616
#> 3 0.9698926 1.1583704
#> 4 1.0784159 1.1237891
#> 5 1.6917128 1.2209407
#> 6 1.0584563 0.9724123

Created on 2019-09-24 by the reprex package (v0.3.0)

A bit better perhaps...

make_operation <- function(x, y, category) {
    if (category == "A") {
        return(x / y)
    } else if (category == "B") {
        return(x - y)
    } else if (category == "C") {
        return(x + y)
    }
}

set.seed(1)
N = 6
dataframe1 <- data.frame(Date = seq(as.Date("2019-01-01"), length.out = N, by = 'day'),
                                                 Product = rep("A", N),
                                                 A1 = runif(N), A2 = runif(N),
                                                 B1 = runif(N), B2 = runif(N), B3 = runif(N),
                                                 C1 = runif(N), C2 = runif(N))

dataframe2 <- data.frame(Date = seq(as.Date("2019-01-01"), length.out = N, by = 'day'),
                                                 Product = rep("B", N),
                                                 A1 = runif(N), A2 = runif(N),
                                                 B1 = runif(N), B2 = runif(N), B3 = runif(N),
                                                 C1 = runif(N), C2 = runif(N))

dataframe3 <- data.frame(Date = seq(as.Date("2019-01-01"), length.out = N, by = 'day'))

for (col in colnames(dataframe1)) {
    
    category <- stringr::str_extract(col,"[ABC]")
    
    if (!is.na(category)) {
        new_column = data.frame(make_operation(dataframe1[, col], dataframe2[, col], category))
        names(new_column) <- col
        dataframe3 <- dplyr::bind_cols(dataframe3, new_column)
    }
}
dataframe3
#>         Date        A1        A2          B1           B2          B3
#> 1 2019-01-01 0.3391206 1.2899871  0.61634380 -0.532840745 -0.21132458
#> 2 2019-01-02 0.6728743 0.9539017  0.28463756  0.483841849 -0.38019658
#> 3 2019-01-03 1.0814276 1.3171863  0.45356971  0.475639505 -0.07085658
#> 4 2019-01-04 1.1505677 0.0717436 -0.02093502 -0.120252153 -0.49293337
#> 5 2019-01-05 8.6443008 0.4701573  0.05561343  0.000803299  0.53061791
#> 6 2019-01-06 1.8825086 0.7212366  0.58507591 -0.132461685 -0.49909135
#>          C1        C2
#> 1 0.8287636 1.5715606
#> 2 0.9333408 1.0685616
#> 3 0.9698926 1.1583704
#> 4 1.0784159 1.1237891
#> 5 1.6917128 1.2209407
#> 6 1.0584563 0.9724123

Created on 2019-09-24 by the reprex package (v0.3.0)

1 Like

I go by the rule that column names should not contain information beyond being a consistent symbol with an understandable meaning in plain language. In this case, they contain information in their letter which tells us which operation to use. This kind of information would make more sense as part of the table's values.

So we'll convert the datasets into long form by converting the column names into a new column, group them by their first letters, and apply the right operation to each group.

library(dplyr)
library(tidyr)

column_ops <- list(
  A = `/`,
  B = `-`,
  C = `+`
)

long_1 <- dataframe1 %>%
  select(-Product) %>%
  gather(key = "column", value = "value_1", -Date)
long_2 <- dataframe2 %>%
  select(-Product) %>%
  gather(key = "column", value = "value_2", -Date)

long_1[1:10, ]
#          Date column    value_1
# 1  2019-01-01     A1 0.26550866
# 2  2019-01-02     A1 0.37212390
# 3  2019-01-03     A1 0.57285336
# 4  2019-01-04     A1 0.90820779
# 5  2019-01-05     A1 0.20168193
# 6  2019-01-06     A1 0.89838968
# 7  2019-01-01     A2 0.94467527
# 8  2019-01-02     A2 0.66079779
# 9  2019-01-03     A2 0.62911404
# 10 2019-01-04     A2 0.06178627

dataframe3 <- long_1 %>%
  inner_join(long_2, by = c("Date", "column")) %>%
  mutate(column_letter = substr(column, 1, 1)) %>%
  group_by(column_letter) %>%
  mutate(value = column_ops[[column_letter[1]]](value_1, value_2)) %>%
  ungroup() %>%
  select(Date, column, value) %>%
  spread(key = "column", value = "value")

dataframe3
# # A tibble: 6 x 8
#   Date          A1     A2      B1        B2      B3    C1    C2
#   <date>     <dbl>  <dbl>   <dbl>     <dbl>   <dbl> <dbl> <dbl>
# 1 2019-01-01 0.339 1.29    0.616  -0.533    -0.211  0.829 1.57 
# 2 2019-01-02 0.673 0.954   0.285   0.484    -0.380  0.933 1.07 
# 3 2019-01-03 1.08  1.32    0.454   0.476    -0.0709 0.970 1.16 
# 4 2019-01-04 1.15  0.0717 -0.0209 -0.120    -0.493  1.08  1.12 
# 5 2019-01-05 8.64  0.470   0.0556  0.000803  0.531  1.69  1.22 
# 6 2019-01-06 1.88  0.721   0.585  -0.132    -0.499  1.06  0.972

If you're not familiar with the backtick notation (`/`), it lets you indicate that everything between the ticks is an object name. In this case, it lets us store the function for division in a list without R trying to do the division and raising an error. Also in case you're not familiar with the idea, even the basic operators are function objects. Try this:

print(`/`)
# function (e1, e2)  .Primitive("/")
`/`(3, 2)
# [1] 1.5
1 Like

Not that it's any better, but still wanted to share it because I was interested in the problem and ws trying for sometime to incorporate something like identity for non-numeric columns and then use Reduce, but failed.

Here's what I did:

set.seed(seed = 1)

N <- 6

df_1 <- data.frame(Dates = seq(from = as.Date(x = "2019-01-01",
                                              format = "%Y-%m-%d"),
                               length.out = N,
                               by = 'day'),
                   Product = rep(x = "A",
                                 each = N),
                   A1 = runif(n = N),
                   A2 = runif(n = N),
                   B1 = runif(n = N),
                   B2 = runif(n = N),
                   B3 = runif(n = N),
                   C1 = runif(n = N),
                   C2 = runif(n = N))

df_2 <- data.frame(Dates = seq(from = as.Date(x = "2019-01-01",
                                              format = "%Y-%m-%d"),
                               length.out = N,
                               by = 'day'),
                   Product = rep(x = "B",
                                 each = N),
                   A1 = runif(n = N),
                   A2 = runif(n = N),
                   B1 = runif(n = N),
                   B2 = runif(n = N),
                   B3 = runif(n = N),
                   C1 = runif(n = N),
                   C2 = runif(n = N))

numeric_operation <- function(numeric_column_name)
{
    if (startsWith(x = numeric_column_name,
                   prefix = "A"))
    {
        return(`/`)
    } else if (startsWith(x = numeric_column_name,
                          prefix = "B"))
    {
        return(`-`)
    } else if (startsWith(x = numeric_column_name,
                          prefix = "C"))
    {
        return(`+`)
    } else
    {
        stop("Unknown operation - STOP!!!")
    }
}

operation <- function(df_1_column_name)
{
    df_1_column <- df_1[[df_1_column_name]]
    df_2_column <- df_2[[df_1_column_name]]
    
    if (is.null(x = df_2_column))
        stop("Column name mismatch - STOP!!!")
    
    if (!is.numeric(x = df_2_column))
    {
        if (!identical(x = df_1_column,
                       y = df_2_column))
        {
            warning("Non-numeric non-identical column - SKIPPED!!!")
            return(NULL)
        }
        
        return(df_2_column)
    }
    
    return(numeric_operation(df_1_column_name)(df_1_column, df_2_column))
}

as.data.frame(x = Filter(f = Negate(f = is.null),
                         x = sapply(X = names(x = df_1),
                                    FUN = operation,
                                    simplify = FALSE,
                                    USE.NAMES = TRUE)))
#> Warning in FUN(X[[i]], ...): Non-numeric non-identical column - SKIPPED!!!
#>        Dates        A1        A2          B1           B2          B3
#> 1 2019-01-01 0.3391206 1.2899871  0.61634380 -0.532840745 -0.21132458
#> 2 2019-01-02 0.6728743 0.9539017  0.28463756  0.483841849 -0.38019658
#> 3 2019-01-03 1.0814276 1.3171863  0.45356971  0.475639505 -0.07085658
#> 4 2019-01-04 1.1505677 0.0717436 -0.02093502 -0.120252153 -0.49293337
#> 5 2019-01-05 8.6443008 0.4701573  0.05561343  0.000803299  0.53061791
#> 6 2019-01-06 1.8825086 0.7212366  0.58507591 -0.132461685 -0.49909135
#>          C1        C2
#> 1 0.8287636 1.5715606
#> 2 0.9333408 1.0685616
#> 3 0.9698926 1.1583704
#> 4 1.0784159 1.1237891
#> 5 1.6917128 1.2209407
#> 6 1.0584563 0.9724123
1 Like

Thanks everyone for the help!

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.