What is the best way to do calculation (like subtract, multiply, divide) between two data frames having identical column names?

RuReady · September 24, 2019, 2:27am

I need to take the ratio between columns starting with letter "A", subtract columns starting with letter "B" and add columns starting with letter "C". What would be the best way to do colwise calculation between two data frames like these without doing it manually?

Thank you!


set.seed(1)
N = 6
dataframe1 <- data.frame(Date = seq(as.Date("2019-01-01"), length.out = N, by = 'day'),
                         Product = rep("A", N),
                         A1 = runif(N), A2 = runif(N),
                         B1 = runif(N), B2 = runif(N), B3 = runif(N),
                         C1 = runif(N), C2 = runif(N))
dataframe1
#>         Date Product        A1         A2        B1        B2         B3
#> 1 2019-01-01       A 0.2655087 0.94467527 0.6870228 0.3800352 0.26722067
#> 2 2019-01-02       A 0.3721239 0.66079779 0.3841037 0.7774452 0.38611409
#> 3 2019-01-03       A 0.5728534 0.62911404 0.7698414 0.9347052 0.01339033
#> 4 2019-01-04       A 0.9082078 0.06178627 0.4976992 0.2121425 0.38238796
#> 5 2019-01-05       A 0.2016819 0.20597457 0.7176185 0.6516738 0.86969085
#> 6 2019-01-06       A 0.8983897 0.17655675 0.9919061 0.1255551 0.34034900
#>          C1        C2
#> 1 0.4820801 0.7942399
#> 2 0.5995658 0.1079436
#> 3 0.4935413 0.7237109
#> 4 0.1862176 0.4112744
#> 5 0.8273733 0.8209463
#> 6 0.6684667 0.6470602


dataframe2 <- data.frame(Date = seq(as.Date("2019-01-01"), length.out = N, by = 'day'),
                         Product = rep("B", N),
                         A1 = runif(N), A2 = runif(N),
                         B1 = runif(N), B2 = runif(N), B3 = runif(N),
                         C1 = runif(N), C2 = runif(N))
dataframe2
#>         Date Product        A1        A2         B1        B2         B3
#> 1 2019-01-01       B 0.7829328 0.7323137 0.07067905 0.9128759 0.47854525
#> 2 2019-01-02       B 0.5530363 0.6927316 0.09946616 0.2936034 0.76631067
#> 3 2019-01-03       B 0.5297196 0.4776196 0.31627171 0.4590657 0.08424691
#> 4 2019-01-04       B 0.7893562 0.8612095 0.51863426 0.3323947 0.87532133
#> 5 2019-01-05       B 0.0233312 0.4380971 0.66200508 0.6508705 0.33907294
#> 6 2019-01-06       B 0.4772301 0.2447973 0.40683019 0.2580168 0.83944035
#>          C1        C2
#> 1 0.3466835 0.7773207
#> 2 0.3337749 0.9606180
#> 3 0.4763512 0.4346595
#> 4 0.8921983 0.7125147
#> 5 0.8643395 0.3999944
#> 6 0.3899895 0.3253522

Doing calculation manually

dataframe3 <- dataframe2
dataframe3$A <- dataframe1$A1 / dataframe2$A1
dataframe3$B <- dataframe1$B1 - dataframe2$B1
dataframe3$C <- dataframe1$C1 + dataframe2$C1
dataframe3
#>         Date Product        A1        A2         B1        B2         B3
#> 1 2019-01-01       B 0.7829328 0.7323137 0.07067905 0.9128759 0.47854525
#> 2 2019-01-02       B 0.5530363 0.6927316 0.09946616 0.2936034 0.76631067
#> 3 2019-01-03       B 0.5297196 0.4776196 0.31627171 0.4590657 0.08424691
#> 4 2019-01-04       B 0.7893562 0.8612095 0.51863426 0.3323947 0.87532133
#> 5 2019-01-05       B 0.0233312 0.4380971 0.66200508 0.6508705 0.33907294
#> 6 2019-01-06       B 0.4772301 0.2447973 0.40683019 0.2580168 0.83944035
#>          C1        C2         A           B         C
#> 1 0.3466835 0.7773207 0.3391206  0.61634380 0.8287636
#> 2 0.3337749 0.9606180 0.6728743  0.28463756 0.9333408
#> 3 0.4763512 0.4346595 1.0814276  0.45356971 0.9698926
#> 4 0.8921983 0.7125147 1.1505677 -0.02093502 1.0784159
#> 5 0.8643395 0.3999944 8.6443008  0.05561343 1.6917128
#> 6 0.3899895 0.3253522 1.8825086  0.58507591 1.0584563

valeri · September 24, 2019, 7:39am

Hello,

I am not loving this solution really, but it works (it does assume that the column names of the type LETTERdigit match in both df's - otherwise you would need to put in some checks):

set.seed(1)
N = 6
dataframe1 <- data.frame(Date = seq(as.Date("2019-01-01"), length.out = N, by = 'day'),
                                                 Product = rep("A", N),
                                                 A1 = runif(N), A2 = runif(N),
                                                 B1 = runif(N), B2 = runif(N), B3 = runif(N),
                                                 C1 = runif(N), C2 = runif(N))

dataframe2 <- data.frame(Date = seq(as.Date("2019-01-01"), length.out = N, by = 'day'),
                                                 Product = rep("B", N),
                                                 A1 = runif(N), A2 = runif(N),
                                                 B1 = runif(N), B2 = runif(N), B3 = runif(N),
                                                 C1 = runif(N), C2 = runif(N))

dataframe3 <- data.frame(Date = seq(as.Date("2019-01-01"), length.out = N, by = 'day'))

for (col in colnames(dataframe1)) {
    
    category <- stringr::str_extract(col,"[ABC]")
    
    if (!is.na(category)) {
        if (category == "A") {
            new_column = data.frame(dataframe1[, col]/dataframe2[, col])
        } else if (category == "B") {
            new_column = data.frame(dataframe1[, col] - dataframe2[, col])
        } else if (category == "C") {
            new_column = data.frame(dataframe1[, col] + dataframe2[, col])
        }
        names(new_column) <- col
        dataframe3 <- dplyr::bind_cols(dataframe3, new_column)
    }
}
dataframe3
#>         Date        A1        A2          B1           B2          B3
#> 1 2019-01-01 0.3391206 1.2899871  0.61634380 -0.532840745 -0.21132458
#> 2 2019-01-02 0.6728743 0.9539017  0.28463756  0.483841849 -0.38019658
#> 3 2019-01-03 1.0814276 1.3171863  0.45356971  0.475639505 -0.07085658
#> 4 2019-01-04 1.1505677 0.0717436 -0.02093502 -0.120252153 -0.49293337
#> 5 2019-01-05 8.6443008 0.4701573  0.05561343  0.000803299  0.53061791
#> 6 2019-01-06 1.8825086 0.7212366  0.58507591 -0.132461685 -0.49909135
#>          C1        C2
#> 1 0.8287636 1.5715606
#> 2 0.9333408 1.0685616
#> 3 0.9698926 1.1583704
#> 4 1.0784159 1.1237891
#> 5 1.6917128 1.2209407
#> 6 1.0584563 0.9724123

Created on 2019-09-24 by the reprex package (v0.3.0)

valeri · September 24, 2019, 8:07am

A bit better perhaps...

make_operation <- function(x, y, category) {
    if (category == "A") {
        return(x / y)
    } else if (category == "B") {
        return(x - y)
    } else if (category == "C") {
        return(x + y)
    }
}

set.seed(1)
N = 6
dataframe1 <- data.frame(Date = seq(as.Date("2019-01-01"), length.out = N, by = 'day'),
                                                 Product = rep("A", N),
                                                 A1 = runif(N), A2 = runif(N),
                                                 B1 = runif(N), B2 = runif(N), B3 = runif(N),
                                                 C1 = runif(N), C2 = runif(N))

dataframe2 <- data.frame(Date = seq(as.Date("2019-01-01"), length.out = N, by = 'day'),
                                                 Product = rep("B", N),
                                                 A1 = runif(N), A2 = runif(N),
                                                 B1 = runif(N), B2 = runif(N), B3 = runif(N),
                                                 C1 = runif(N), C2 = runif(N))

dataframe3 <- data.frame(Date = seq(as.Date("2019-01-01"), length.out = N, by = 'day'))

for (col in colnames(dataframe1)) {
    
    category <- stringr::str_extract(col,"[ABC]")
    
    if (!is.na(category)) {
        new_column = data.frame(make_operation(dataframe1[, col], dataframe2[, col], category))
        names(new_column) <- col
        dataframe3 <- dplyr::bind_cols(dataframe3, new_column)
    }
}
dataframe3
#>         Date        A1        A2          B1           B2          B3
#> 1 2019-01-01 0.3391206 1.2899871  0.61634380 -0.532840745 -0.21132458
#> 2 2019-01-02 0.6728743 0.9539017  0.28463756  0.483841849 -0.38019658
#> 3 2019-01-03 1.0814276 1.3171863  0.45356971  0.475639505 -0.07085658
#> 4 2019-01-04 1.1505677 0.0717436 -0.02093502 -0.120252153 -0.49293337
#> 5 2019-01-05 8.6443008 0.4701573  0.05561343  0.000803299  0.53061791
#> 6 2019-01-06 1.8825086 0.7212366  0.58507591 -0.132461685 -0.49909135
#>          C1        C2
#> 1 0.8287636 1.5715606
#> 2 0.9333408 1.0685616
#> 3 0.9698926 1.1583704
#> 4 1.0784159 1.1237891
#> 5 1.6917128 1.2209407
#> 6 1.0584563 0.9724123

Created on 2019-09-24 by the reprex package (v0.3.0)

nwerth · September 24, 2019, 1:41pm

I go by the rule that column names should not contain information beyond being a consistent symbol with an understandable meaning in plain language. In this case, they contain information in their letter which tells us which operation to use. This kind of information would make more sense as part of the table's values.

So we'll convert the datasets into long form by converting the column names into a new column, group them by their first letters, and apply the right operation to each group.

library(dplyr)
library(tidyr)

column_ops <- list(
  A = `/`,
  B = `-`,
  C = `+`
)

long_1 <- dataframe1 %>%
  select(-Product) %>%
  gather(key = "column", value = "value_1", -Date)
long_2 <- dataframe2 %>%
  select(-Product) %>%
  gather(key = "column", value = "value_2", -Date)

long_1[1:10, ]
#          Date column    value_1
# 1  2019-01-01     A1 0.26550866
# 2  2019-01-02     A1 0.37212390
# 3  2019-01-03     A1 0.57285336
# 4  2019-01-04     A1 0.90820779
# 5  2019-01-05     A1 0.20168193
# 6  2019-01-06     A1 0.89838968
# 7  2019-01-01     A2 0.94467527
# 8  2019-01-02     A2 0.66079779
# 9  2019-01-03     A2 0.62911404
# 10 2019-01-04     A2 0.06178627

dataframe3 <- long_1 %>%
  inner_join(long_2, by = c("Date", "column")) %>%
  mutate(column_letter = substr(column, 1, 1)) %>%
  group_by(column_letter) %>%
  mutate(value = column_ops[[column_letter[1]]](value_1, value_2)) %>%
  ungroup() %>%
  select(Date, column, value) %>%
  spread(key = "column", value = "value")

dataframe3
# # A tibble: 6 x 8
#   Date          A1     A2      B1        B2      B3    C1    C2
#   <date>     <dbl>  <dbl>   <dbl>     <dbl>   <dbl> <dbl> <dbl>
# 1 2019-01-01 0.339 1.29    0.616  -0.533    -0.211  0.829 1.57 
# 2 2019-01-02 0.673 0.954   0.285   0.484    -0.380  0.933 1.07 
# 3 2019-01-03 1.08  1.32    0.454   0.476    -0.0709 0.970 1.16 
# 4 2019-01-04 1.15  0.0717 -0.0209 -0.120    -0.493  1.08  1.12 
# 5 2019-01-05 8.64  0.470   0.0556  0.000803  0.531  1.69  1.22 
# 6 2019-01-06 1.88  0.721   0.585  -0.132    -0.499  1.06  0.972

If you're not familiar with the backtick notation (`/`), it lets you indicate that everything between the ticks is an object name. In this case, it lets us store the function for division in a list without R trying to do the division and raising an error. Also in case you're not familiar with the idea, even the basic operators are function objects. Try this:

print(`/`)
# function (e1, e2)  .Primitive("/")
`/`(3, 2)
# [1] 1.5

Yarnabrina · September 24, 2019, 4:38pm

Not that it's any better, but still wanted to share it because I was interested in the problem and ws trying for sometime to incorporate something like identity for non-numeric columns and then use Reduce, but failed.

Here's what I did:

set.seed(seed = 1)

N <- 6

df_1 <- data.frame(Dates = seq(from = as.Date(x = "2019-01-01",
                                              format = "%Y-%m-%d"),
                               length.out = N,
                               by = 'day'),
                   Product = rep(x = "A",
                                 each = N),
                   A1 = runif(n = N),
                   A2 = runif(n = N),
                   B1 = runif(n = N),
                   B2 = runif(n = N),
                   B3 = runif(n = N),
                   C1 = runif(n = N),
                   C2 = runif(n = N))

df_2 <- data.frame(Dates = seq(from = as.Date(x = "2019-01-01",
                                              format = "%Y-%m-%d"),
                               length.out = N,
                               by = 'day'),
                   Product = rep(x = "B",
                                 each = N),
                   A1 = runif(n = N),
                   A2 = runif(n = N),
                   B1 = runif(n = N),
                   B2 = runif(n = N),
                   B3 = runif(n = N),
                   C1 = runif(n = N),
                   C2 = runif(n = N))

numeric_operation <- function(numeric_column_name)
{
    if (startsWith(x = numeric_column_name,
                   prefix = "A"))
    {
        return(`/`)
    } else if (startsWith(x = numeric_column_name,
                          prefix = "B"))
    {
        return(`-`)
    } else if (startsWith(x = numeric_column_name,
                          prefix = "C"))
    {
        return(`+`)
    } else
    {
        stop("Unknown operation - STOP!!!")
    }
}

operation <- function(df_1_column_name)
{
    df_1_column <- df_1[[df_1_column_name]]
    df_2_column <- df_2[[df_1_column_name]]
    
    if (is.null(x = df_2_column))
        stop("Column name mismatch - STOP!!!")
    
    if (!is.numeric(x = df_2_column))
    {
        if (!identical(x = df_1_column,
                       y = df_2_column))
        {
            warning("Non-numeric non-identical column - SKIPPED!!!")
            return(NULL)
        }
        
        return(df_2_column)
    }
    
    return(numeric_operation(df_1_column_name)(df_1_column, df_2_column))
}

as.data.frame(x = Filter(f = Negate(f = is.null),
                         x = sapply(X = names(x = df_1),
                                    FUN = operation,
                                    simplify = FALSE,
                                    USE.NAMES = TRUE)))
#> Warning in FUN(X[[i]], ...): Non-numeric non-identical column - SKIPPED!!!
#>        Dates        A1        A2          B1           B2          B3
#> 1 2019-01-01 0.3391206 1.2899871  0.61634380 -0.532840745 -0.21132458
#> 2 2019-01-02 0.6728743 0.9539017  0.28463756  0.483841849 -0.38019658
#> 3 2019-01-03 1.0814276 1.3171863  0.45356971  0.475639505 -0.07085658
#> 4 2019-01-04 1.1505677 0.0717436 -0.02093502 -0.120252153 -0.49293337
#> 5 2019-01-05 8.6443008 0.4701573  0.05561343  0.000803299  0.53061791
#> 6 2019-01-06 1.8825086 0.7212366  0.58507591 -0.132461685 -0.49909135
#>          C1        C2
#> 1 0.8287636 1.5715606
#> 2 0.9333408 1.0685616
#> 3 0.9698926 1.1583704
#> 4 1.0784159 1.1237891
#> 5 1.6917128 1.2209407
#> 6 1.0584563 0.9724123

RuReady · September 24, 2019, 11:25pm

Thanks everyone for the help!

system · October 1, 2019, 11:25pm

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.