Making a Loop More Efficient

So you don't really need a loop here; you can get everything you want with normal vectorized operations. Pair that with the speed of data.table and you are golden. I see you wanted to use data.table, but the constant switching between classes renders it mostly useless, except for that last dcast part. Here is a mostly pure data.table solution: one full cartesian join of the training data against an iteration index, then every calculation as a grouped, vectorized assignment. Note that I am no data.table expert, so I am positive this could be optimized even further.

I used a starting point of 1000 iterations, which completes in about 2 seconds.
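
To make the pattern concrete before the full solution: the trick is to cross join the rows against an iteration index via a dummy key, and then let by = "iteration" hand out one value per group. Here is a minimal sketch of just that idea; the toy table and column names below are made up for illustration and are not part of the original data.

library(data.table)

# toy data with a dummy key, and an iteration index carrying the same key
toy   = data.table(x = 1:3, dum = 1)
iters = data.table(iteration = 1:2, dum = 1)

# cartesian join: every row of toy is repeated once per iteration
toy_all = toy[iters, on = "dum", allow.cartesian = TRUE]

# runif(1) with by = draws a single value per iteration, recycled to its rows
toy_all[, draw := runif(1), by = "iteration"]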



library(data.table)
library(tictoc)

tic() # start timer

# Change this to set the number of iterations
num_iteration = 1000

set.seed(123)

# create some data for this example
a1 = rnorm(1000,100,10)
b1 = rnorm(1000,100,5)
c1 = sample.int(1000, 1000, replace = TRUE)

# create train dt
train_data = data.table(a1,b1,c1, dum = 1)

# create template table with iterations
i = data.table(iteration= 1:num_iteration, dum = 1)

# full cartesian join to get all combinations of iterations and train data
df = train_data[i, on = "dum", allow.cartesian = TRUE]

# set key on iteration for the grouped operations that follow
setkey(df, iteration)

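# draw one set of random thresholds per iteration (runif(1) with by = gives a single value per group)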
df[, random_1 := runif(1, 80, 120), by = "iteration"]
df[, random_2 := runif(1, random_1, 120), by = "iteration"]
df[, random_3 := runif(1, 85, 120), by = "iteration"]
df[, random_4 := runif(1, random_3, 120), by = "iteration"]

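# draw one split probability per iteration for each category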
df[, split_1 :=  runif(1, 0, 1), by = "iteration"]
df[, split_2 :=  runif(1, 0, 1), by = "iteration"]
df[, split_3 :=  runif(1, 0, 1), by = "iteration"]

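# assign each row to a category using that iteration's thresholds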
df[, cat := ifelse(a1 <= random_1 & b1 <= random_3, "a",
            ifelse(a1 <= random_2 & b1 <= random_4, "b", "c"))]

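# for each category: take the quantile of its c1 values at the iteration's
# split and flag the rows that fall below it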
df[cat == "a", quant := quantile(c1, prob = split_1)]
df[cat == "a", diff := quant > c1]

df[cat == "b", quant := quantile(c1, prob = split_2)]
df[cat == "b", diff := quant > c1]
  
df[cat == "c", quant := quantile(c1, prob = split_3)]
df[cat == "c", diff := quant > c1]

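# share of rows below the quantile, per iteration and category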
df_agg = df[, .(mean = mean(diff, na.rm = TRUE)), by = .(iteration, cat)]

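# unweighted mean of the category shares within each iteration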
df_agg[, total := mean(mean, na.rm = TRUE), by = .(iteration)]

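# reshape so each category's share becomes its own column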
df_cast = dcast(df_agg, iteration + total ~ cat, value.var = 'mean')

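# keep one row per iteration with its random draws and split values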
df_unique = unique(df, by = "iteration")[
  , c("iteration", "random_1", "random_2", "random_3", "split_1", "split_2", "split_3")]

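# attach the per-iteration parameters to the reshaped results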
final_results = df_cast[df_unique, on="iteration", nomatch=NULL] 

print(final_results)
#>       iteration     total          a         b          c  random_1
#>    1:         1 0.3080857 0.00000000 0.8219178 0.10233918  95.67371
#>    2:         2 0.4748137 0.73825503 0.6111111 0.07507508 106.53849
#>    3:         3 0.1973123 0.01744186 0.4356061 0.13888889  90.28929
#>    4:         4 0.5142243 0.55625000 0.4576271 0.52879581  92.39194
#>    5:         5 0.3877325 0.07665904 0.4615385 0.62500000 113.61826
#>   ---                                                              
#>  996:       996 0.4428956 0.35434783 0.8036072 0.17073171  99.15899
#>  997:       997 0.5613058 0.66666667 0.2962963 0.72095436 118.62009
#>  998:       998 0.4462901 0.26744186 0.3333333 0.73809524 116.54816
#>  999:       999 0.5215304 0.41428571 0.8586387 0.29166667  89.12317
#> 1000:      1000 0.4498091 0.90384615 0.2115385 0.23404255  92.29038
#>       random_2  random_3    split_1   split_2    split_3
#>    1: 104.1073  86.14198 0.22345413 0.8230599 0.09045242
#>    2: 107.1270  98.86750 0.72568845 0.7255776 0.07787107
#>    3: 118.9194 118.56232 0.01756418 0.4398307 0.19976866
#>    4: 114.9078 103.28158 0.54330321 0.4567068 0.53717210
#>    5: 117.9242 108.58274 0.08132953 0.4804348 0.62339557
#>   ---                                                   
#>  996: 118.6605 115.49544 0.36560142 0.7991917 0.12332451
#>  997: 119.9159  88.58999 0.87070697 0.2470474 0.70984699
#>  998: 117.9547 118.84771 0.27297641 0.4942125 0.76537063
#>  999: 112.9656 118.21472 0.39332239 0.8632909 0.33156372
#> 1000: 103.0477 117.78547 0.89226660 0.2224833 0.21636146

toc() # End timer
#> 2.14 sec elapsed

Created on 2021-07-07 by the reprex package (v0.3.0)
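
One optional readability tweak: recent versions of data.table ship fcase, which can replace the nested ifelse in the categorization step with a flatter call. This is an equivalent rewrite of that single step (assuming your installed data.table has fcase) and does not change the timing story above.

# same categorization as the nested ifelse above
df[, cat := fcase(a1 <= random_1 & b1 <= random_3, "a",
                  a1 <= random_2 & b1 <= random_4, "b",
                  default = "c")]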
