RuReady
September 24, 2019, 2:27am
1
I need to take the ratio between columns starting with letter "A", subtract columns starting with letter "B" and add columns starting with letter "C". What would be the best way to do colwise calculation between two data frames like these without doing it manually?
Thank you!
set.seed(1)
N = 6
dataframe1 <- data.frame(Date = seq(as.Date("2019-01-01"), length.out = N, by = 'day'),
Product = rep("A", N),
A1 = runif(N), A2 = runif(N),
B1 = runif(N), B2 = runif(N), B3 = runif(N),
C1 = runif(N), C2 = runif(N))
dataframe1
#> Date Product A1 A2 B1 B2 B3
#> 1 2019-01-01 A 0.2655087 0.94467527 0.6870228 0.3800352 0.26722067
#> 2 2019-01-02 A 0.3721239 0.66079779 0.3841037 0.7774452 0.38611409
#> 3 2019-01-03 A 0.5728534 0.62911404 0.7698414 0.9347052 0.01339033
#> 4 2019-01-04 A 0.9082078 0.06178627 0.4976992 0.2121425 0.38238796
#> 5 2019-01-05 A 0.2016819 0.20597457 0.7176185 0.6516738 0.86969085
#> 6 2019-01-06 A 0.8983897 0.17655675 0.9919061 0.1255551 0.34034900
#> C1 C2
#> 1 0.4820801 0.7942399
#> 2 0.5995658 0.1079436
#> 3 0.4935413 0.7237109
#> 4 0.1862176 0.4112744
#> 5 0.8273733 0.8209463
#> 6 0.6684667 0.6470602
dataframe2 <- data.frame(Date = seq(as.Date("2019-01-01"), length.out = N, by = 'day'),
Product = rep("B", N),
A1 = runif(N), A2 = runif(N),
B1 = runif(N), B2 = runif(N), B3 = runif(N),
C1 = runif(N), C2 = runif(N))
dataframe2
#> Date Product A1 A2 B1 B2 B3
#> 1 2019-01-01 B 0.7829328 0.7323137 0.07067905 0.9128759 0.47854525
#> 2 2019-01-02 B 0.5530363 0.6927316 0.09946616 0.2936034 0.76631067
#> 3 2019-01-03 B 0.5297196 0.4776196 0.31627171 0.4590657 0.08424691
#> 4 2019-01-04 B 0.7893562 0.8612095 0.51863426 0.3323947 0.87532133
#> 5 2019-01-05 B 0.0233312 0.4380971 0.66200508 0.6508705 0.33907294
#> 6 2019-01-06 B 0.4772301 0.2447973 0.40683019 0.2580168 0.83944035
#> C1 C2
#> 1 0.3466835 0.7773207
#> 2 0.3337749 0.9606180
#> 3 0.4763512 0.4346595
#> 4 0.8921983 0.7125147
#> 5 0.8643395 0.3999944
#> 6 0.3899895 0.3253522
Doing calculation manually
dataframe3 <- dataframe2
dataframe3$A <- dataframe1$A1 / dataframe2$A1
dataframe3$B <- dataframe1$B1 - dataframe2$B1
dataframe3$C <- dataframe1$C1 + dataframe2$C1
dataframe3
#> Date Product A1 A2 B1 B2 B3
#> 1 2019-01-01 B 0.7829328 0.7323137 0.07067905 0.9128759 0.47854525
#> 2 2019-01-02 B 0.5530363 0.6927316 0.09946616 0.2936034 0.76631067
#> 3 2019-01-03 B 0.5297196 0.4776196 0.31627171 0.4590657 0.08424691
#> 4 2019-01-04 B 0.7893562 0.8612095 0.51863426 0.3323947 0.87532133
#> 5 2019-01-05 B 0.0233312 0.4380971 0.66200508 0.6508705 0.33907294
#> 6 2019-01-06 B 0.4772301 0.2447973 0.40683019 0.2580168 0.83944035
#> C1 C2 A B C
#> 1 0.3466835 0.7773207 0.3391206 0.61634380 0.8287636
#> 2 0.3337749 0.9606180 0.6728743 0.28463756 0.9333408
#> 3 0.4763512 0.4346595 1.0814276 0.45356971 0.9698926
#> 4 0.8921983 0.7125147 1.1505677 -0.02093502 1.0784159
#> 5 0.8643395 0.3999944 8.6443008 0.05561343 1.6917128
#> 6 0.3899895 0.3253522 1.8825086 0.58507591 1.0584563
1 Like
valeri
September 24, 2019, 7:39am
2
Hello,
I am not loving this solution really, but it works (it does assume that the column names of the type LETTERdigit
match in both df's - otherwise you would need to put in some checks):
set.seed(1)
N = 6
dataframe1 <- data.frame(Date = seq(as.Date("2019-01-01"), length.out = N, by = 'day'),
Product = rep("A", N),
A1 = runif(N), A2 = runif(N),
B1 = runif(N), B2 = runif(N), B3 = runif(N),
C1 = runif(N), C2 = runif(N))
dataframe2 <- data.frame(Date = seq(as.Date("2019-01-01"), length.out = N, by = 'day'),
Product = rep("B", N),
A1 = runif(N), A2 = runif(N),
B1 = runif(N), B2 = runif(N), B3 = runif(N),
C1 = runif(N), C2 = runif(N))
dataframe3 <- data.frame(Date = seq(as.Date("2019-01-01"), length.out = N, by = 'day'))
for (col in colnames(dataframe1)) {
category <- stringr::str_extract(col,"[ABC]")
if (!is.na(category)) {
if (category == "A") {
new_column = data.frame(dataframe1[, col]/dataframe2[, col])
} else if (category == "B") {
new_column = data.frame(dataframe1[, col] - dataframe2[, col])
} else if (category == "C") {
new_column = data.frame(dataframe1[, col] + dataframe2[, col])
}
names(new_column) <- col
dataframe3 <- dplyr::bind_cols(dataframe3, new_column)
}
}
dataframe3
#> Date A1 A2 B1 B2 B3
#> 1 2019-01-01 0.3391206 1.2899871 0.61634380 -0.532840745 -0.21132458
#> 2 2019-01-02 0.6728743 0.9539017 0.28463756 0.483841849 -0.38019658
#> 3 2019-01-03 1.0814276 1.3171863 0.45356971 0.475639505 -0.07085658
#> 4 2019-01-04 1.1505677 0.0717436 -0.02093502 -0.120252153 -0.49293337
#> 5 2019-01-05 8.6443008 0.4701573 0.05561343 0.000803299 0.53061791
#> 6 2019-01-06 1.8825086 0.7212366 0.58507591 -0.132461685 -0.49909135
#> C1 C2
#> 1 0.8287636 1.5715606
#> 2 0.9333408 1.0685616
#> 3 0.9698926 1.1583704
#> 4 1.0784159 1.1237891
#> 5 1.6917128 1.2209407
#> 6 1.0584563 0.9724123
Created on 2019-09-24 by the reprex package (v0.3.0)
valeri
September 24, 2019, 8:07am
3
A bit better perhaps...
make_operation <- function(x, y, category) {
if (category == "A") {
return(x / y)
} else if (category == "B") {
return(x - y)
} else if (category == "C") {
return(x + y)
}
}
set.seed(1)
N = 6
dataframe1 <- data.frame(Date = seq(as.Date("2019-01-01"), length.out = N, by = 'day'),
Product = rep("A", N),
A1 = runif(N), A2 = runif(N),
B1 = runif(N), B2 = runif(N), B3 = runif(N),
C1 = runif(N), C2 = runif(N))
dataframe2 <- data.frame(Date = seq(as.Date("2019-01-01"), length.out = N, by = 'day'),
Product = rep("B", N),
A1 = runif(N), A2 = runif(N),
B1 = runif(N), B2 = runif(N), B3 = runif(N),
C1 = runif(N), C2 = runif(N))
dataframe3 <- data.frame(Date = seq(as.Date("2019-01-01"), length.out = N, by = 'day'))
for (col in colnames(dataframe1)) {
category <- stringr::str_extract(col,"[ABC]")
if (!is.na(category)) {
new_column = data.frame(make_operation(dataframe1[, col], dataframe2[, col], category))
names(new_column) <- col
dataframe3 <- dplyr::bind_cols(dataframe3, new_column)
}
}
dataframe3
#> Date A1 A2 B1 B2 B3
#> 1 2019-01-01 0.3391206 1.2899871 0.61634380 -0.532840745 -0.21132458
#> 2 2019-01-02 0.6728743 0.9539017 0.28463756 0.483841849 -0.38019658
#> 3 2019-01-03 1.0814276 1.3171863 0.45356971 0.475639505 -0.07085658
#> 4 2019-01-04 1.1505677 0.0717436 -0.02093502 -0.120252153 -0.49293337
#> 5 2019-01-05 8.6443008 0.4701573 0.05561343 0.000803299 0.53061791
#> 6 2019-01-06 1.8825086 0.7212366 0.58507591 -0.132461685 -0.49909135
#> C1 C2
#> 1 0.8287636 1.5715606
#> 2 0.9333408 1.0685616
#> 3 0.9698926 1.1583704
#> 4 1.0784159 1.1237891
#> 5 1.6917128 1.2209407
#> 6 1.0584563 0.9724123
Created on 2019-09-24 by the reprex package (v0.3.0)
1 Like
nwerth
September 24, 2019, 1:41pm
4
I go by the rule that column names should not contain information beyond being a consistent symbol with an understandable meaning in plain language. In this case, they contain information in their letter which tells us which operation to use. This kind of information would make more sense as part of the table's values.
So we'll convert the datasets into long form by converting the column names into a new column, group them by their first letters, and apply the right operation to each group.
library(dplyr)
library(tidyr)
column_ops <- list(
A = `/`,
B = `-`,
C = `+`
)
long_1 <- dataframe1 %>%
select(-Product) %>%
gather(key = "column", value = "value_1", -Date)
long_2 <- dataframe2 %>%
select(-Product) %>%
gather(key = "column", value = "value_2", -Date)
long_1[1:10, ]
# Date column value_1
# 1 2019-01-01 A1 0.26550866
# 2 2019-01-02 A1 0.37212390
# 3 2019-01-03 A1 0.57285336
# 4 2019-01-04 A1 0.90820779
# 5 2019-01-05 A1 0.20168193
# 6 2019-01-06 A1 0.89838968
# 7 2019-01-01 A2 0.94467527
# 8 2019-01-02 A2 0.66079779
# 9 2019-01-03 A2 0.62911404
# 10 2019-01-04 A2 0.06178627
dataframe3 <- long_1 %>%
inner_join(long_2, by = c("Date", "column")) %>%
mutate(column_letter = substr(column, 1, 1)) %>%
group_by(column_letter) %>%
mutate(value = column_ops[[column_letter[1]]](value_1, value_2)) %>%
ungroup() %>%
select(Date, column, value) %>%
spread(key = "column", value = "value")
dataframe3
# # A tibble: 6 x 8
# Date A1 A2 B1 B2 B3 C1 C2
# <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2019-01-01 0.339 1.29 0.616 -0.533 -0.211 0.829 1.57
# 2 2019-01-02 0.673 0.954 0.285 0.484 -0.380 0.933 1.07
# 3 2019-01-03 1.08 1.32 0.454 0.476 -0.0709 0.970 1.16
# 4 2019-01-04 1.15 0.0717 -0.0209 -0.120 -0.493 1.08 1.12
# 5 2019-01-05 8.64 0.470 0.0556 0.000803 0.531 1.69 1.22
# 6 2019-01-06 1.88 0.721 0.585 -0.132 -0.499 1.06 0.972
If you're not familiar with the backtick notation (`/`
), it lets you indicate that everything between the ticks is an object name. In this case, it lets us store the function for division in a list without R trying to do the division and raising an error. Also in case you're not familiar with the idea, even the basic operators are function objects. Try this:
print(`/`)
# function (e1, e2) .Primitive("/")
`/`(3, 2)
# [1] 1.5
1 Like
Not that it's any better, but still wanted to share it because I was interested in the problem and ws trying for sometime to incorporate something like identity
for non-numeric columns and then use Reduce
, but failed.
Here's what I did:
set.seed(seed = 1)
N <- 6
df_1 <- data.frame(Dates = seq(from = as.Date(x = "2019-01-01",
format = "%Y-%m-%d"),
length.out = N,
by = 'day'),
Product = rep(x = "A",
each = N),
A1 = runif(n = N),
A2 = runif(n = N),
B1 = runif(n = N),
B2 = runif(n = N),
B3 = runif(n = N),
C1 = runif(n = N),
C2 = runif(n = N))
df_2 <- data.frame(Dates = seq(from = as.Date(x = "2019-01-01",
format = "%Y-%m-%d"),
length.out = N,
by = 'day'),
Product = rep(x = "B",
each = N),
A1 = runif(n = N),
A2 = runif(n = N),
B1 = runif(n = N),
B2 = runif(n = N),
B3 = runif(n = N),
C1 = runif(n = N),
C2 = runif(n = N))
numeric_operation <- function(numeric_column_name)
{
if (startsWith(x = numeric_column_name,
prefix = "A"))
{
return(`/`)
} else if (startsWith(x = numeric_column_name,
prefix = "B"))
{
return(`-`)
} else if (startsWith(x = numeric_column_name,
prefix = "C"))
{
return(`+`)
} else
{
stop("Unknown operation - STOP!!!")
}
}
operation <- function(df_1_column_name)
{
df_1_column <- df_1[[df_1_column_name]]
df_2_column <- df_2[[df_1_column_name]]
if (is.null(x = df_2_column))
stop("Column name mismatch - STOP!!!")
if (!is.numeric(x = df_2_column))
{
if (!identical(x = df_1_column,
y = df_2_column))
{
warning("Non-numeric non-identical column - SKIPPED!!!")
return(NULL)
}
return(df_2_column)
}
return(numeric_operation(df_1_column_name)(df_1_column, df_2_column))
}
as.data.frame(x = Filter(f = Negate(f = is.null),
x = sapply(X = names(x = df_1),
FUN = operation,
simplify = FALSE,
USE.NAMES = TRUE)))
#> Warning in FUN(X[[i]], ...): Non-numeric non-identical column - SKIPPED!!!
#> Dates A1 A2 B1 B2 B3
#> 1 2019-01-01 0.3391206 1.2899871 0.61634380 -0.532840745 -0.21132458
#> 2 2019-01-02 0.6728743 0.9539017 0.28463756 0.483841849 -0.38019658
#> 3 2019-01-03 1.0814276 1.3171863 0.45356971 0.475639505 -0.07085658
#> 4 2019-01-04 1.1505677 0.0717436 -0.02093502 -0.120252153 -0.49293337
#> 5 2019-01-05 8.6443008 0.4701573 0.05561343 0.000803299 0.53061791
#> 6 2019-01-06 1.8825086 0.7212366 0.58507591 -0.132461685 -0.49909135
#> C1 C2
#> 1 0.8287636 1.5715606
#> 2 0.9333408 1.0685616
#> 3 0.9698926 1.1583704
#> 4 1.0784159 1.1237891
#> 5 1.6917128 1.2209407
#> 6 1.0584563 0.9724123
1 Like
RuReady
September 24, 2019, 11:25pm
6
Thanks everyone for the help!
system
Closed
October 1, 2019, 11:25pm
7
This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.