I have around 14k time series, each of which is assigned to an ID.
I'm trying to forecast from February 2020 to January 2021 with ARIMA, reusing for each series the order hyperparameters (p, d, q) that were selected when fitting on 2018 as training data. This is a sample of my data:
library(yardstick)
library(forecast)
library(tsibble)
# Sample of the monthly panel in long format: one row per (ID, Period).
#   ID     - integer identifier of the time series (4 series in this sample).
#   Period - integer month stamp encoded as YYYYMM; each ID covers
#            201801 through 202001 (25 consecutive months).
#   Value  - integer observation for that month; can be negative (see ID 4).
df<-tibble::tribble(
~ID, ~Period, ~Value,
1L, 201801L, 7713L,
1L, 201802L, 4506L,
1L, 201803L, 24475L,
1L, 201804L, 12418L,
1L, 201805L, 14545L,
1L, 201806L, 14233L,
1L, 201807L, 1271L,
1L, 201808L, 19064L,
1L, 201809L, 3018L,
1L, 201810L, 13291L,
1L, 201811L, 47111L,
1L, 201812L, 16961L,
1L, 201901L, 32442L,
1L, 201902L, 16861L,
1L, 201903L, 31819L,
1L, 201904L, 38759L,
1L, 201905L, 29220L,
1L, 201906L, 19786L,
1L, 201907L, 28620L,
1L, 201908L, 47736L,
1L, 201909L, 32586L,
1L, 201910L, 12347L,
1L, 201911L, 19758L,
1L, 201912L, 14669L,
1L, 202001L, 1499L,
2L, 201801L, 1660L,
2L, 201802L, 1857L,
2L, 201803L, 3221L,
2L, 201804L, 11009L,
2L, 201805L, 11945L,
2L, 201806L, 7152L,
2L, 201807L, 3201L,
2L, 201808L, 13226L,
2L, 201809L, 13568L,
2L, 201810L, 11952L,
2L, 201811L, 1276L,
2L, 201812L, 20049L,
2L, 201901L, 7576L,
2L, 201902L, 10370L,
2L, 201903L, 47760L,
2L, 201904L, 37809L,
2L, 201905L, 9232L,
2L, 201906L, 18635L,
2L, 201907L, 6548L,
2L, 201908L, 29065L,
2L, 201909L, 2225L,
2L, 201910L, 3613L,
2L, 201911L, 11113L,
2L, 201912L, 4626L,
2L, 202001L, 12083L,
3L, 201801L, 16850L,
3L, 201802L, 9559L,
3L, 201803L, 6727L,
3L, 201804L, 29877L,
3L, 201805L, 7453L,
3L, 201806L, 11100L,
3L, 201807L, 14289L,
3L, 201808L, 16686L,
3L, 201809L, 17925L,
3L, 201810L, 2381L,
3L, 201811L, 25015L,
3L, 201812L, 20258L,
3L, 201901L, 12875L,
3L, 201902L, 8534L,
3L, 201903L, 3880L,
3L, 201904L, 27034L,
3L, 201905L, 13624L,
3L, 201906L, 29521L,
3L, 201907L, 4933L,
3L, 201908L, 5963L,
3L, 201909L, 15193L,
3L, 201910L, 2960L,
3L, 201911L, 6150L,
3L, 201912L, 18957L,
3L, 202001L, 10326L,
4L, 201801L, 85837L,
4L, 201802L, 90903L,
4L, 201803L, 110829L,
4L, 201804L, 67992L,
4L, 201805L, 117665L,
4L, 201806L, 136909L,
4L, 201807L, -23708L,
4L, 201808L, 196362L,
4L, 201809L, -28869L,
4L, 201810L, 114243L,
4L, 201811L, 113408L,
4L, 201812L, 18932L,
4L, 201901L, 254189L,
4L, 201902L, -151225L,
4L, 201903L, 103182L,
4L, 201904L, -242319L,
4L, 201905L, 111250L,
4L, 201906L, 449959L,
4L, 201907L, 105185L,
4L, 201908L, 103575L,
4L, 201909L, 214451L,
4L, 201910L, 99015L,
4L, 201911L, 280420L,
4L, 201912L, -15325L,
4L, 202001L, 199340L
)
# Split the YYYYMM `Period` stamp into numeric calendar parts: characters 1-4
# are the year, characters 5-6 the month. `day` is a constant 1 for every row.
df[["year"]] <- as.numeric(substring(as.character(df[["Period"]]), 1L, 4L))
df[["month"]] <- as.numeric(substring(as.character(df[["Period"]]), 5L, 6L))
df[["day"]] <- 1
Some preprocessing so the data can be used with `ARIMA()` from the fable package:
# Turn the plain tibble into a tsibble ready for modelling:
#   date      - first day of the month, stored as a character string
#   YearMonth - monthly index parsed back from `date`
# Each `ID` is one series (the key); `YearMonth` is the time index.
df <- df %>%
  mutate(
    date = as.character(make_date(year, month, day)),
    YearMonth = tsibble::yearmonth(ymd(date))
  ) %>%
  as_tsibble(key = ID, index = YearMonth)
# Fit one ARIMA model per ID using only the 2018 observations (Jan-Dec).
# `PDQ(0, 0, 0)` fixes the seasonal orders at zero, so only the non-seasonal
# (p, d, q) orders are searched. `stepwise = FALSE` and
# `approximation = FALSE` are passed through to `ARIMA()` for the order search.
# Note: despite its name, `df_train` holds the result of `model()` (the fitted
# models, in column `selected_model`), not the training rows themselves.
df_train <- df %>%
  filter(
    YearMonth >= yearmonth("2018 Jan"),
    YearMonth <= yearmonth("2018 Dec")
  ) %>%
  model(
    selected_model = ARIMA(
      Value ~ PDQ(0, 0, 0),
      stepwise = FALSE,
      approximation = FALSE
    )
  )
I'm saving the selected orders of each model with a `map`:
# Collect the specification (`fit$spec`) of every fitted model and row-bind
# them into one tibble. Rows follow the order of `df_train$selected_model`,
# so row i belongs to the i-th model; the ID itself is not carried over.
PDQ <- map_dfr(
  df_train$selected_model,
  function(fitted_model) fitted_model$fit$spec
)
# A tibble: 4 x 8
# p d q P D Q constant period
# <int> <int> <int> <dbl> <dbl> <dbl> <lgl> <dbl>
#1 0 0 0 0 0 0 TRUE 12
#2 0 0 0 0 0 0 TRUE 12
#3 0 0 0 0 0 0 TRUE 12
#4 2 0 0 0 0 0 TRUE 12
And some metrics:
# Forecast 13 steps ahead from each fitted model and score the forecasts
# against the observations in `df`, keeping only RMSE and MAPE per ID.
# NOTE(review): yardstick and forecast also export `accuracy()`; confirm the
# method that handles fable forecasts is the one found on the search path.
holdout_scores <- df_train %>%
  forecast(h = 13) %>%
  accuracy(df)
holdout_scores %>%
  select(ID, RMSE, MAPE)
For example, ID nº 1 gets an ARIMA(0,0,0) and ID nº 4 gets an ARIMA(2,0,0).
Now I need to use the parameters of each ID for each time series.
I don't know how to loop through the IDs while using each one's pdq values, or whether there is a function suited to this problem. Does anyone know how to solve it? Thanks!