I'm trying to use tidymodels to perform a logistic regression on a dataset with ~2.5million rows and <10 predictors. I'm running into some memory issues, and I'm not sure how to understand what's going on. Each record has an ID, which is a string. If I remove this from the dataset before creating the model it works fine (df_1
in the example). If I include it (df_2
) I get this massive memory error, even though I'm just using it as an id.
Also, if I build the formula manually, (outcome ~ pred_fct_1 + pred_fct_2 + ...
) it works, regardless of whether the id is a number or a string.
#model memory issues
library(tidyverse)
#> Warning: package 'tidyverse' was built under R version 4.0.2
#> Warning: package 'ggplot2' was built under R version 4.0.2
library(tidymodels)
#> -- Attaching packages ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- tidymodels 0.1.1 --
#> v broom 0.7.0 v recipes 0.1.13
#> v dials 0.0.8 v rsample 0.0.7
#> v infer 0.5.3 v tune 0.1.1
#> v modeldata 0.0.2 v workflows 0.1.2
#> v parsnip 0.1.2 v yardstick 0.0.7
#> Warning: package 'broom' was built under R version 4.0.2
#> Warning: package 'dials' was built under R version 4.0.2
#> Warning: package 'modeldata' was built under R version 4.0.2
#> Warning: package 'parsnip' was built under R version 4.0.2
#> Warning: package 'recipes' was built under R version 4.0.2
#> Warning: package 'tune' was built under R version 4.0.2
#> Warning: package 'workflows' was built under R version 4.0.2
#> -- Conflicts ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- tidymodels_conflicts() --
#> x scales::discard() masks purrr::discard()
#> x dplyr::filter() masks stats::filter()
#> x recipes::fixed() masks stringr::fixed()
#> x dplyr::lag() masks stats::lag()
#> x yardstick::spec() masks readr::spec()
#> x recipes::step() masks stats::step()
nr_records <- 2.5e6
df_1 <- tibble(
id = 1:nr_records,
outcome = factor(if_else(runif(nr_records) > 0.9, "Y", "N")),
pred_fct_1 = factor(if_else(runif(nr_records) > 0.9, "Y", "N")),
pred_fct_2 = factor(if_else(runif(nr_records) > 0.7, "Y", "N")),
pred_fct_3 = factor(if_else(runif(nr_records) > 0.9, "Y", "N")),
pred_fct_4 = factor(if_else(runif(nr_records) > 0.7, "Y", "N")),
pred_fct_5 = factor(if_else(runif(nr_records) > 0.9, "Y", "N")),
pred_fct_6 = factor(if_else(runif(nr_records) > 0.7, "Y", "N")),
pred_dbl_1 = runif(nr_records),
pred_dbl_2 = runif(nr_records),
pred_dbl_3 = runif(nr_records)
)
df_2 <- df_1 %>%
mutate(
id = stringi::stri_rand_strings(nr_records, 8, pattern = "[A-Z0-9]")
)
fit_data <- function(df) {
my_recipe <- recipe(outcome ~ ., data = df) %>%
update_role(id, new_role = "ID") %>%
step_dummy(all_nominal(), -all_outcomes()) %>%
step_zv(all_predictors())
lr_mod <- logistic_reg() %>%
set_engine("glm")
wflow <- workflow() %>%
add_model(lr_mod) %>%
add_recipe(my_recipe)
fit <- wflow %>%
fit(data = df)
fit
}
fit_1 <- fit_data(df_1)
fit_2 <- fit_data(df_2)
#> Error: cannot allocate vector of size 46566.1 Gb
Created on 2020-10-12 by the reprex package (v0.3.0)