How to pass multiple objects out of foreach() %dopar%{} loop?

BrentHutto · January 17, 2019, 7:29pm

After many (many, many, many) trials and errors I have finally figured out a way to get all my simulation results out of the foreach() %dopar% loop. Early on I reckoned that they had to be combined into a list object, it was teasing them apart afterward that was challenging.

Here's a reprex() of my simulation, modeling and results-extraction code. I'm pretty sure the way I'm doing it is less than optimal but hey, it works!

library(tidyverse)
library(lme4)
#> Loading required package: Matrix
#> 
#> Attaching package: 'Matrix'
#> The following object is masked from 'package:tidyr':
#> 
#>     expand
library(parallel)
library(foreach)
#> 
#> Attaching package: 'foreach'
#> The following objects are masked from 'package:purrr':
#> 
#>     accumulate, when
library(doParallel)
#> Loading required package: iterators

# Set simulation parameters

nsubj  <- 500 # Total number of subjects, randomly split into two roughly equal groups
nreps  <- 20 # Number of simulation repetitions

par0   <- -5.0 # Mean of random intercepts
par0sd <- 2.0 # SD of random intercepts
par1   <- 0.4 # Time effect (increase in logit per unit time)
par1sd <- 0.01 # Very small SD to make time a not quite perfectly fixed effect

par2   <- 1.0 # Difference in logit when grp=1

par3   <- 0.04 # Effect of continuous time-varying covariate
Zvar0 <- 45 # Mean of covariate in grp0
Zvar1 <- 60 # Mean of covariate in grp0
ZvarSD <- 15 # SD of covariate

SimulateMissing <- FALSE # If true this deletes about 10% of the non-t0 observations

kink <- function(t) ifelse(t==2,t+0.05,t) # Function to allow simulation of non-linear Time effect

# Define functions to estimate models and to extract results from returned list object

est_model_2ways <- function(dat) {
  model <- glmer(y~time + grp + zvar + (1|subj),
                 data = dat, 
                 family = binomial, 
                 control = glmerControl(optimizer = "bobyqa"), 
                 nAGQ = 10)
  
  cf_grp0 <- predict(model,newdat=mutate(dat,grp=0),type='response')
  cf_grp1 <- predict(model,newdat=mutate(dat,grp=1),type='response')
  
  pop <- predict(model,newdat=dat,type='response')

  model_params <- c(summary(model)$coefficients[1:4,1], as.data.frame(VarCorr(model))[1,"sdcor"] )
  
  res <- cbind(select(dat, time, grp, simindex, y), 
               cf_grp0, cf_grp1, pop)
  names(res) <- c("time", "grp", "simindex", "y", 
                  "cf_grp0", "cf_grp1", "pop")
  return(list(res,model_params))
  }

extract_plotdat <- function(result_list,simresponse) {

  moddata <- matrix()
  moddata <- foreach (m=1:nreps,.combine='rbind') %do% return(result_list[[m]][[1]])
  
  plotcf <- moddata %>%
    group_by(simindex,time) %>%
    summarize(cf_grp0=mean(cf_grp0),
              cf_grp1=mean(cf_grp1))
  
  plotpop <- moddata %>%
    group_by(simindex, time, grp) %>%
    summarize(pop_mean=mean(pop)) %>%
    spread(grp,pop_mean,sep='') %>%
    transmute(pop_grp0=grp0, pop_grp1=grp1)
  
  plotdat <- merge(plotcf, plotpop) %>% merge(simresponse)
  
  return(plotdat)
  }
  
extract_modparams <- function(result_list) {
  modparams <- matrix()
  modparams <- foreach (k=1:nreps,.combine='rbind') %do% return(result_list[[k]][[2]])
  colnames(modparams) <- c("Intercept", "Time", "Grp", "Zvar", "SD_Intercept")

  return(modparams)
  }

 # Simulate some data
 
set.seed(20)

simdata1 <- foreach(i=1:nreps,.combine='rbind') %do% {

  ds1 <- data.frame(time=rep(0:2,nsubj),
                    subj=rep(1:nsubj,each=3),
                    grp=rep(rbinom(nsubj,1,0.5),each=3))
  unique <- rnorm(nsubj,par0,par0sd)
  ds2 <- mutate(ds1,
                zvar=ifelse(grp,rnorm(n(),Zvar1,ZvarSD),rnorm(n(),Zvar0,ZvarSD)),
                logit=(unique[subj]+(rnorm(n(),par1,par1sd)*kink(time))+(grp*par2))+(par3*zvar),
                prob=exp(logit)/(1+exp(logit)),
                y=rbinom(n(),1,prob))
    
  missvec <- ifelse(rep(SimulateMissing,(3*nsubj)),
                    as.logical(1-((ds2$subj %in% 1:(nsubj%/%8) & ds2$time==2) | 
                                  (ds2$subj %in% 1:(nsubj%/%10) & ds2$time==1))), 
                    as.logical(rep(1,(3*nsubj))))
  
  ds3 <- ds2 %>% filter(missvec) %>% cbind(rep(i,sum(missvec)))
  names(ds3) <- c("time", "subj", "grp", "zvar", "logit", "prob", "y", "simindex")
  ds3
  }

simresponse1 <- simdata1 %>%
  group_by(simindex, time, grp) %>%
  summarize(raw_grp=mean(y)) %>%
  spread(grp,raw_grp,sep='') %>%
  transmute(raw_grp0=grp0, raw_grp1=grp1)
 
# Run a few simulation reps
 
cl<-makeCluster(4)
clusterExport(cl, c("nreps", "simdata1"))
clusterEvalQ(cl, require(dplyr))
#> [[1]]
#> [1] TRUE
#> 
#> [[2]]
#> [1] TRUE
#> 
#> [[3]]
#> [1] TRUE
#> 
#> [[4]]
#> [1] TRUE
clusterEvalQ(cl, require(lme4))
#> [[1]]
#> [1] TRUE
#> 
#> [[2]]
#> [1] TRUE
#> 
#> [[3]]
#> [1] TRUE
#> 
#> [[4]]
#> [1] TRUE
registerDoParallel(cl)
mods_scenario1 <- foreach(j=1:nreps) %dopar% est_model_2ways(filter(simdata1,simindex==j))
stopCluster(cl)

# Extract model result objects and do some graphs

modparams1 <- extract_modparams(mods_scenario1)
modparams1
#>           Intercept      Time       Grp       Zvar SD_Intercept
#> result.1  -5.289170 0.3643823 0.8327985 0.04166571     2.146418
#> result.2  -4.846403 0.3482698 1.1639233 0.03742744     2.013279
#> result.3  -5.686627 0.4883996 1.1610890 0.04873084     2.282471
#> result.4  -5.314589 0.5304110 0.4748585 0.05028079     1.749130
#> result.5  -4.929320 0.4025891 0.7537746 0.04008618     2.125053
#> result.6  -5.437674 0.3032833 1.2010048 0.04944755     2.106158
#> result.7  -4.849634 0.4087906 0.7465538 0.04108251     1.968190
#> result.8  -4.679774 0.3294246 0.8865826 0.03964922     1.941434
#> result.9  -4.916852 0.5016426 1.3680115 0.03248808     1.903482
#> result.10 -5.738815 0.5238406 1.3055919 0.04979450     2.026464
#> result.11 -4.741067 0.5376915 1.3995445 0.03157780     2.005948
#> result.12 -5.170877 0.4661977 0.8423246 0.04074828     1.907134
#> result.13 -5.764675 0.3938994 1.2536595 0.04590645     2.312255
#> result.14 -4.950653 0.3856612 1.0987770 0.04235045     1.748887
#> result.15 -5.565607 0.4498325 0.7832543 0.04278784     2.072407
#> result.16 -4.662915 0.4997881 0.8660844 0.03859824     1.848504
#> result.17 -4.489500 0.2233654 0.7963400 0.03878703     1.848505
#> result.18 -5.143580 0.2945519 0.7443318 0.04577832     2.072113
#> result.19 -4.848514 0.4045460 1.3158423 0.03454787     2.079658
#> result.20 -5.426354 0.4683027 1.4361239 0.03905594     2.174090

plotdat1 <- extract_plotdat(mods_scenario1,simresponse1)

ggplot(plotdat1,aes(y=pop_grp0,x=time,group=simindex)) +
  geom_line(color='red',linetype=1) +
  geom_line(aes(y=pop_grp1,x=time,group=simindex),color='red',linetype=5) +
  geom_line(aes(y=cf_grp0,x=time,group=simindex),color='green',linetype=1) +
  geom_line(aes(y=cf_grp1,x=time,group=simindex),color='green',linetype=5) +
  scale_y_continuous(trans="log")

^{Created on 2019-01-17 by the reprex package (v0.2.1)}