I have a very big dataframe with millions of rows and I need to calculate the slopes of linear regressions for each row. I figured out a method to do it but it seems extremely inneficient.
Does anyone knows a more efficient way to do this? Thanks in advance!
The independent variable is a vector that goes from c(1:years)
in this case goes from 1:6
.
The column name V1_T1
represents the variable 1 in time 1.
library(dplyr)
library(tidyr)
library(broom)
df = data.frame(replicate(12, sample(0:10, 1000, rep = TRUE)))
colnames = c("V1_T1", "V1_T2", "V1_T3", "V1_T4", "V1_T5", "V1_T6", "V2_T1", "V2_T2", "V2_T3", "V2_T4", "V2_T5", "V2_T6")
colnames(df) = colnames
head(df)
def_slope = function(df, name , first, last, years){
slope = c()
for (i in c(1:nrow(df))){ #in each row
m = df[i,] %>% #select the i row
dplyr::select({{first}}:{{last}}) %>% #select the columns
pivot_longer(c(1:years), names_to = "key", values_to = "value") %>% #transform the data for a linear regression input
mutate(ano = c(1:years)) %>%
lm(value ~ ano, data = .) %>% #calculate a linear regression
tidy() %>%
.[[2,2]] #extract the slope
slope = append(slope, m) #add the slope to the slope vector
}
slope = tibble(slope) %>% #create a column with a defined name
rename("{{name}}" := slope)
return(slope)
}
start = Sys.time()
slope = tibble( #creates a df with all the slopes
def_slope(df, trend_V1, V1_T1, V1_T6, 6),
def_slope(df, trend_V2, V2_T1, V2_T6, 6))
end = Sys.time()
end-start
Time difference of 22.93273 secs