Run Forecasting model with multiple Dependent and Independent variables in R

FactOREO · August 18, 2022, 9:10pm

Well, I am afraid it might not be pretty nor fast, but with the code below you might have a starting point. The result is a list, containing the auto_ardl, forecast and error outputs. The names are according to the formula used.

library(tidyverse)
library(ARDL)
library(dLagM)

# ARDL MODELING AND FORECASTING
in_sampleARDL <- data %>% 
  filter(Date < '2020-03-01')

out_sampleARDL <-data %>% 
  filter(Date >= '2020-03-01')
# create the formulas
indep_vars <- expression(Industrialproduction,Householdconsumption,Investmentgrowth,ConsumerPriceIndex)
dep_vars   <- expression(NORTH,YORKSANDTHEHUMBER)
# formulae with diff()
formulae <- unlist(lapply(dep_vars, \(x) lapply(indep_vars, \(y) bquote(.(x)~diff(.(y))))))
# without diff()
formulae2 <- unlist(lapply(dep_vars, \(x) lapply(indep_vars, \(y) bquote(.(x)~.(y)))))

result <- vector('list', length = length(formulae))
names(result) <- formulae2
for (i in seq_along(formulae)){
  # auto_ardl
  result[[i]][[1]] <- auto_ardl(formula(formulae2[[i]]),
                                data = in_sampleARDL, max_order = 4, selection = 'BIC')
  # prediction
  result[[i]][[2]] <- forecast(ardlDlm(formula = formula(formulae[[i]]), data = in_sampleARDL, p = 3),
                               x = out_sampleARDL |> select(sub("\\s~.*", "", formula(formulae[[i]]))) |> pull(), h = 4)
  # error
  result[[i]][[3]] <- mean((out_sampleARDL |> select(sub("\\s~.*", "", formula(formulae[[i]]))) |> pull() |> (\(x) x[1:4])() - result[[i]][[2]][["forecasts"]])^2)
  
  # set names
  names(result[[i]]) <- c('auto_ardl','forecast','error')
}
#> Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if `.name_repair` is omitted as of tibble 2.0.0.
#> Using compatibility `.name_repair`.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

result[[1]]
#> $auto_ardl
#> $auto_ardl$best_model
#> 
#> Time series regression with "ts" data:
#> Start = 4, End = 164
#> 
#> Call:
#> dynlm::dynlm(formula = full_formula, data = data, start = start, 
#>     end = end)
#> 
#> Coefficients:
#>          (Intercept)           L(NORTH, 1)           L(NORTH, 2)  
#>              0.49194               0.11662               0.19177  
#>          L(NORTH, 3)  Industrialproduction  
#>              0.23848               0.07019  
#> 
#> 
#> $auto_ardl$best_order
#> [1] 3 0
#> 
#> $auto_ardl$top_orders
#>    NORTH Industrialproduction      BIC
#> 1      3                    0 860.0598
#> 2      3                    1 861.8509
#> 3      3                    2 866.6733
#> 4      3                    3 869.1412
#> 5      2                    1 870.4895
#> 6      2                    0 870.6690
#> 7      4                    4 873.3016
#> 8      2                    2 875.5567
#> 9      1                    0 880.4229
#> 10     1                    1 881.2820
#> 
#> 
#> $forecast
#> $forecasts
#> [1] 0.5361618 1.1387373 0.8677577 0.9860464
#> 
#> $call
#> forecast.ardlDlm(model = ardlDlm(formula = formula(formulae[[i]]), 
#>     data = in_sampleARDL, p = 3), x = pull(select(out_sampleARDL, 
#>     sub("\\s~.*", "", formula(formulae[[i]])))), h = 4)
#> 
#> attr(,"class")
#> [1] "forecast.ardlDlm" "dLagM"           
#> 
#> $error
#> [1] 2.581029

^{Created on 2022-08-18 by the reprex package (v2.0.1)}

Let me know is this is somewhat helpful and if there are improvements you cannot handle by yourself.

Kind regards