adding more and more variables in a regression in a tidy way

Consider this example


tibble(y = c(1,2,3,4,5),
       x = c(12,13,25,24,44),
       z = c(23,54,23,35,42)) %>% 
  mutate(xlag1 = lag(x,1),
         xlag2 = lag(x,2),
         zlag1 = lag(z,1),
         zlag2 = lag(z,2))
# A tibble: 5 x 7
      y     x     z xlag1 xlag2 zlag1 zlag2
  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1     1    12    23    NA    NA    NA    NA
2     2    13    54    12    NA    23    NA
3     3    25    23    13    12    54    23
4     4    24    35    25    13    23    54
5     5    44    42    24    25    35    23

I am trying to find the shortest way to run a regression with an increasing number of lagged values for all variables.

That this, the first regression should be y ~ x + z, the second one should be y ~ x + xlag1 + z + zlag1, the third one should be y ~ x + xlag1 + xlag2 + z + zlag1 + zlag2 and so on.

Is there a purr-esque way to do so cleanly? The names of the lagged variables can be changed if needed.

Thanks!

I'm guessing there are better ways to build model formulas, but here's one option: We create a function that builds the formula as a string and then turns it into a formula object at the end:

library(tidyverse)

# Fake data
set.seed(2)
dat = tibble(y = 1:20,
       x = cumsum(rnorm(20)),
       z = cumsum(rnorm(20)),
       w = rnorm(20),
       r = rnorm(20)) %>% 
  mutate(xlag1 = lag(x,1),
         xlag2 = lag(x,2),
         zlag1 = lag(z,1),
         zlag2 = lag(z,2))

my_lag = function(DV, lag.vars, max.lag, other.vars=NULL) {
  form = paste(paste0("lag(", rep(lag.vars, each=max.lag + 1), ", ", 0:max.lag, ")"), collapse=" + ")
  form = paste(DV, " ~ ", form)
  as.formula(paste(form, paste(other.vars, collapse=" + "), sep=" + "))
}

Here's an example of the function output:

my_lag("y", c("x","z"), 2, c("w","r"))
y ~ lag(x, 0) + lag(x, 1) + lag(x, 2) + lag(z, 0) + lag(z, 1) + 
    lag(z, 2) + w + r

An example of running the model for multiple maximum lags.

models = map(0:3, 
             ~ my_lag("y", c("x","z"), .x, c("w","r")) %>% lm(data=dat))

models
List of model results for each lag
$`max lag=0`

Call:
lm(formula = ., data = dat)

Coefficients:
(Intercept)    lag(x, 0)    lag(z, 0)            w            r  
     2.1695      -0.4682       1.9575      -0.1563       0.7344  


$`max lag=1`

Call:
lm(formula = ., data = dat)

Coefficients:
(Intercept)    lag(x, 0)    lag(x, 1)    lag(z, 0)    lag(z, 1)            w  
   3.015369     0.016751    -0.309248     1.158572     0.867487     0.004935  
          r  
   0.506720  


$`max lag=2`

Call:
lm(formula = ., data = dat)

Coefficients:
(Intercept)    lag(x, 0)    lag(x, 1)    lag(x, 2)    lag(z, 0)    lag(z, 1)  
    3.26021     -0.63063      0.93202     -0.34591      2.08166     -1.71304  
  lag(z, 2)            w            r  
    1.99496     -0.05725      0.58461  


$`max lag=3`

Call:
lm(formula = ., data = dat)

Coefficients:
(Intercept)    lag(x, 0)    lag(x, 1)    lag(x, 2)    lag(x, 3)    lag(z, 0)  
    3.54903     -0.47732      0.40362      0.24174      0.09897      1.79321  
  lag(z, 1)    lag(z, 2)    lag(z, 3)            w            r  
   -0.54183     -0.25646      1.76956      0.19338     -0.19972 

Check that we get the same coefficients as in your example.

model.check = lm(y ~ ., data=dat)

models[[3]]
model.check
> models[[3]]

Call:
lm(formula = ., data = dat)

Coefficients:
(Intercept)    lag(x, 0)    lag(x, 1)    lag(x, 2)    lag(z, 0)    lag(z, 1)  
    3.26021     -0.63063      0.93202     -0.34591      2.08166     -1.71304  
  lag(z, 2)            w            r  
    1.99496     -0.05725      0.58461  

> model.check

Call:
lm(formula = y ~ ., data = dat)

Coefficients:
(Intercept)            x            z            w            r        xlag1  
    3.26021     -0.63063      2.08166     -0.05725      0.58461      0.93202  
      xlag2        zlag1        zlag2  
   -0.34591     -1.71304      1.99496  
1 Like

Similar approach, slightly different implementation:

library(purrr)

set.seed(seed = 32783)

dataset <- tibble::tibble(y = sample(x = 1:20),
                          p = rcauchy(n = 20),
                          q = rcauchy(n = 20),
                          u = rnorm(n = 20),
                          v = rnorm(n = 20))

max_lag <- sample(x = 1:19,
                  size = 1)
lag_vars <- c("p", "q")
non_lag_vars <- c("u", "v")

map(.x = paste("y",
               paste(map(.x = 0:max_lag,
                         .f = ~ paste0(outer(X = lag_vars,
                                             Y = 0:.x,
                                             FUN = function(i, j) paste0("dplyr::lag(x = ",
                                                                         i,
                                                                         ", n = ",
                                                                         j,
                                                                         ")")),
                                       collapse = " + ")),
                     paste(non_lag_vars,
                           collapse = " + "),
                     sep = " + "),
               sep = " ~ "),
    .f = ~ lm(formula = as.formula(object = .x),
              data = dataset))
#> [[1]]
#> 
#> Call:
#> lm(formula = as.formula(object = .x), data = dataset)
#> 
#> Coefficients:
#>              (Intercept)  dplyr::lag(x = p, n = 0)  
#>                 9.986524                  0.069404  
#> dplyr::lag(x = q, n = 0)                         u  
#>                -0.003836                 -3.169934  
#>                        v  
#>                -1.360815  
#> 
#> 
#> [[2]]
#> 
#> Call:
#> lm(formula = as.formula(object = .x), data = dataset)
#> 
#> Coefficients:
#>              (Intercept)  dplyr::lag(x = p, n = 0)  
#>                  9.93855                   0.07035  
#> dplyr::lag(x = q, n = 0)  dplyr::lag(x = p, n = 1)  
#>                  0.01451                   0.13824  
#> dplyr::lag(x = q, n = 1)                         u  
#>                 -0.06687                  -3.31488  
#>                        v  
#>                 -1.32692  
#> 
#> 
#> [[3]]
#> 
#> Call:
#> lm(formula = as.formula(object = .x), data = dataset)
#> 
#> Coefficients:
#>              (Intercept)  dplyr::lag(x = p, n = 0)  
#>                 11.08814                   0.12673  
#> dplyr::lag(x = q, n = 0)  dplyr::lag(x = p, n = 1)  
#>                 -0.35991                   0.13806  
#> dplyr::lag(x = q, n = 1)  dplyr::lag(x = p, n = 2)  
#>                  0.01417                   0.66470  
#> dplyr::lag(x = q, n = 2)                         u  
#>                  0.21158                  -2.98489  
#>                        v  
#>                 -2.26152  
#> 
#> 
#> [[4]]
#> 
#> Call:
#> lm(formula = as.formula(object = .x), data = dataset)
#> 
#> Coefficients:
#>              (Intercept)  dplyr::lag(x = p, n = 0)  
#>                 11.75523                   0.11512  
#> dplyr::lag(x = q, n = 0)  dplyr::lag(x = p, n = 1)  
#>                 -0.24138                  -0.05713  
#> dplyr::lag(x = q, n = 1)  dplyr::lag(x = p, n = 2)  
#>                  0.01253                   0.76071  
#> dplyr::lag(x = q, n = 2)  dplyr::lag(x = p, n = 3)  
#>                  0.65056                  -0.05549  
#> dplyr::lag(x = q, n = 3)                         u  
#>                  0.07269                   1.16025  
#>                        v  
#>                 -6.45510

Created on 2019-06-11 by the reprex package (v0.3.0)

1 Like

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.