I've been experimenting with the dials package for parameter tuning and, in an effort to build an accelerated test framework, I've run into some issues with multiprocess fits on parsnip models. Using parsnip and dials, I create a series of model specs with differing parameters. I then use future_map to fit the models, which succeeds when the workers are forked but fails when executed using multisession on Windows. Weirdly, the error seems to indicate that the model spec object is an invalid object type, or that no appropriate method can be found to create a fit.
# R adults data random grid reprex
library(readr)
library(tidyr)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(purrr)
library(C50)
library(furrr)
#> Loading required package: future
library(recipes)
#>
#> Attaching package: 'recipes'
#> The following object is masked from 'package:stats':
#>
#> step
library(dials)
#> Loading required package: scales
#>
#> Attaching package: 'scales'
#> The following object is masked from 'package:recipes':
#>
#> yj_trans
#> The following object is masked from 'package:purrr':
#>
#> discard
#> The following object is masked from 'package:readr':
#>
#> col_factor
#>
#> Attaching package: 'dials'
#> The following object is masked from 'package:stats':
#>
#> offset
library(parsnip)
# Record the R version, platform, and attached/loaded package versions for this run.
sessionInfo()
#> R version 3.6.1 (2019-07-05)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 17763)
#>
#> Matrix products: default
#>
#> locale:
#> [1] LC_COLLATE=English_United States.1252
#> [2] LC_CTYPE=English_United States.1252
#> [3] LC_MONETARY=English_United States.1252
#> [4] LC_NUMERIC=C
#> [5] LC_TIME=English_United States.1252
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] parsnip_0.0.4 dials_0.0.3 scales_1.1.0 recipes_0.1.7 furrr_0.1.0
#> [6] future_1.15.1 C50_0.1.2 purrr_0.3.3 dplyr_0.8.3 tidyr_1.0.0
#> [11] readr_1.3.1
#>
#> loaded via a namespace (and not attached):
#> [1] tidyselect_0.2.5 xfun_0.11 inum_1.0-1 reshape2_1.4.3
#> [5] listenv_0.7.0 splines_3.6.1 lattice_0.20-38 Cubist_0.2.2
#> [9] colorspace_1.4-1 vctrs_0.2.0 generics_0.0.2 htmltools_0.4.0
#> [13] yaml_2.2.0 survival_3.1-7 prodlim_2019.11.13 rlang_0.4.2
#> [17] pillar_1.4.2 withr_2.1.2 glue_1.3.1 lifecycle_0.1.0
#> [21] plyr_1.8.4 lava_1.6.6 stringr_1.4.0 timeDate_3043.102
#> [25] munsell_0.5.0 mvtnorm_1.0-11 codetools_0.2-16 evaluate_0.14
#> [29] knitr_1.26 parallel_3.6.1 class_7.3-15 highr_0.8
#> [33] Rcpp_1.0.3 backports_1.1.5 ipred_0.9-9 hms_0.5.2
#> [37] digest_0.6.23 stringi_1.4.3 grid_3.6.1 DiceDesign_1.8-1
#> [41] tools_3.6.1 magrittr_1.5 tibble_2.1.3 Formula_1.2-3
#> [45] crayon_1.3.4 pkgconfig_2.0.3 zeallot_0.1.0 partykit_1.2-5
#> [49] MASS_7.3-51.4 libcoin_1.0-5 Matrix_1.2-18 lubridate_1.7.4
#> [53] gower_0.2.1 assertthat_0.2.1 rmarkdown_1.18 R6_2.4.1
#> [57] globals_0.12.4 rpart_4.1-15 nnet_7.3-12 compiler_3.6.1
# Column names are supplied explicitly instead of being taken from the file.
# NOTE: "occuptation" is spelled as in the original script on purpose; the
# dummy-variable column names created later are derived from it.
col_names <- c(
  "age",
  "workclass",
  "fnlwgt",
  "education",
  "education_num",
  "marital_status",
  "occuptation",
  "relationship",
  "race",
  "sex",
  "capital_gain",
  "capital_loss",
  "hours_per_week",
  "native_country",
  "y"
)

# Download the UCI "adult" data set and parse it with the names above.
adults_df <- read_csv(
  file = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
  col_names = col_names
)
#> Parsed with column specification:
#> cols(
#> age = col_double(),
#> workclass = col_character(),
#> fnlwgt = col_double(),
#> education = col_character(),
#> education_num = col_double(),
#> marital_status = col_character(),
#> occuptation = col_character(),
#> relationship = col_character(),
#> race = col_character(),
#> sex = col_character(),
#> capital_gain = col_double(),
#> capital_loss = col_double(),
#> hours_per_week = col_double(),
#> native_country = col_character(),
#> y = col_character()
#> )
# Preprocessing recipe for the adults data: `y` is the outcome and every other
# column is a predictor. Nominal predictors are dummy-encoded first, then all
# numeric columns (including the new dummies) are centered and scaled.
adults_rec <-
  recipe(y ~ ., data = adults_df) %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_center(all_numeric()) %>%
  step_scale(all_numeric())

# Estimate the recipe on the full data set, then print the processed
# training data it retains.
adults_prepped <- adults_rec %>%
  prep(training = adults_df)
adults_prepped %>%
  juice()
#> # A tibble: 32,561 x 101
#> age fnlwgt education_num capital_gain capital_loss hours_per_week y
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
#> 1 0.0307 -1.06 1.13 0.148 -0.217 -0.0354 <=50K
#> 2 0.837 -1.01 1.13 -0.146 -0.217 -2.22 <=50K
#> 3 -0.0426 0.245 -0.420 -0.146 -0.217 -0.0354 <=50K
#> 4 1.06 0.426 -1.20 -0.146 -0.217 -0.0354 <=50K
#> 5 -0.776 1.41 1.13 -0.146 -0.217 -0.0354 <=50K
#> 6 -0.116 0.898 1.52 -0.146 -0.217 -0.0354 <=50K
#> 7 0.764 -0.280 -1.97 -0.146 -0.217 -1.98 <=50K
#> 8 0.984 0.188 -0.420 -0.146 -0.217 0.370 >50K
#> 9 -0.556 -1.36 1.52 1.76 -0.217 0.774 >50K
#> 10 0.251 -0.287 1.13 0.555 -0.217 -0.0354 >50K
#> # ... with 32,551 more rows, and 94 more variables:
#> # workclass_Federal.gov <dbl>, workclass_Local.gov <dbl>,
#> # workclass_Never.worked <dbl>, workclass_Private <dbl>,
#> # workclass_Self.emp.inc <dbl>, workclass_Self.emp.not.inc <dbl>,
#> # workclass_State.gov <dbl>, workclass_Without.pay <dbl>,
#> # education_X11th <dbl>, education_X12th <dbl>, education_X1st.4th <dbl>,
#> # education_X5th.6th <dbl>, education_X7th.8th <dbl>, education_X9th <dbl>,
#> # education_Assoc.acdm <dbl>, education_Assoc.voc <dbl>,
#> # education_Bachelors <dbl>, education_Doctorate <dbl>,
#> # education_HS.grad <dbl>, education_Masters <dbl>,
#> # education_Preschool <dbl>, education_Prof.school <dbl>,
#> # education_Some.college <dbl>, marital_status_Married.AF.spouse <dbl>,
#> # marital_status_Married.civ.spouse <dbl>,
#> # marital_status_Married.spouse.absent <dbl>,
#> # marital_status_Never.married <dbl>, marital_status_Separated <dbl>,
#> # marital_status_Widowed <dbl>, occuptation_Adm.clerical <dbl>,
#> # occuptation_Armed.Forces <dbl>, occuptation_Craft.repair <dbl>,
#> # occuptation_Exec.managerial <dbl>, occuptation_Farming.fishing <dbl>,
#> # occuptation_Handlers.cleaners <dbl>, occuptation_Machine.op.inspct <dbl>,
#> # occuptation_Other.service <dbl>, occuptation_Priv.house.serv <dbl>,
#> # occuptation_Prof.specialty <dbl>, occuptation_Protective.serv <dbl>,
#> # occuptation_Sales <dbl>, occuptation_Tech.support <dbl>,
#> # occuptation_Transport.moving <dbl>, relationship_Not.in.family <dbl>,
#> # relationship_Other.relative <dbl>, relationship_Own.child <dbl>,
#> # relationship_Unmarried <dbl>, relationship_Wife <dbl>,
#> # race_Asian.Pac.Islander <dbl>, race_Black <dbl>, race_Other <dbl>,
#> # race_White <dbl>, sex_Male <dbl>, native_country_Cambodia <dbl>,
#> # native_country_Canada <dbl>, native_country_China <dbl>,
#> # native_country_Columbia <dbl>, native_country_Cuba <dbl>,
#> # native_country_Dominican.Republic <dbl>, native_country_Ecuador <dbl>,
#> # native_country_El.Salvador <dbl>, native_country_England <dbl>,
#> # native_country_France <dbl>, native_country_Germany <dbl>,
#> # native_country_Greece <dbl>, native_country_Guatemala <dbl>,
#> # native_country_Haiti <dbl>, native_country_Holand.Netherlands <dbl>,
#> # native_country_Honduras <dbl>, native_country_Hong <dbl>,
#> # native_country_Hungary <dbl>, native_country_India <dbl>,
#> # native_country_Iran <dbl>, native_country_Ireland <dbl>,
#> # native_country_Italy <dbl>, native_country_Jamaica <dbl>,
#> # native_country_Japan <dbl>, native_country_Laos <dbl>,
#> # native_country_Mexico <dbl>, native_country_Nicaragua <dbl>,
#> # native_country_Outlying.US.Guam.USVI.etc. <dbl>, native_country_Peru <dbl>,
#> # native_country_Philippines <dbl>, native_country_Poland <dbl>,
#> # native_country_Portugal <dbl>, native_country_Puerto.Rico <dbl>,
#> # native_country_Scotland <dbl>, native_country_South <dbl>,
#> # native_country_Taiwan <dbl>, native_country_Thailand <dbl>,
#> # native_country_Trinadad.Tobago <dbl>, native_country_United.States <dbl>,
#> # native_country_Vietnam <dbl>, native_country_Yugoslavia <dbl>
# Split the processed training data into predictors (x) and outcome (y).
x <- adults_prepped %>% juice(all_predictors())
y <- adults_prepped %>% juice(all_outcomes())

# Base parsnip specification: a classification decision tree fit with C5.0.
model <- set_engine(decision_tree(mode = "classification"), "C5.0")

# Random grid of 10 candidate settings over min_n (restricted to 2-20) and
# tree_depth (default range).
hp_grid <- grid_random(
  min_n(range = c(2, 20)),
  tree_depth(),
  size = 10
)
hp_grid
#> # A tibble: 10 x 2
#> min_n tree_depth
#> <int> <int>
#> 1 17 7
#> 2 7 9
#> 3 4 14
#> 4 20 4
#> 5 5 4
#> 6 10 7
#> 7 5 4
#> 8 12 3
#> 9 20 10
#> 10 7 9
# One model spec per grid row: the base spec is updated with that row's
# parameter values. This is effectively a loop over the random grid (used
# because merge no longer works); it may not be best practice.
models <- lapply(
  seq_len(nrow(hp_grid)),
  function(i) update(model, hp_grid[i, ])
)

# Collect the specs in a tibble together with a sequential model id.
spec_df <- tibble(
  spec = models,
  model_id = seq_along(models)
)
spec_df
#> # A tibble: 10 x 2
#> spec model_id
#> <list> <int>
#> 1 <spec[+]> 1
#> 2 <spec[+]> 2
#> 3 <spec[+]> 3
#> 4 <spec[+]> 4
#> 5 <spec[+]> 5
#> 6 <spec[+]> 6
#> 7 <spec[+]> 7
#> 8 <spec[+]> 8
#> 9 <spec[+]> 9
#> 10 <spec[+]> 10
# Select the parallel backend explicitly: forked workers (multicore) where
# forking is supported (Mac and Linux), separate R sessions (multisession)
# otherwise (Windows). This is the same choice plan(multiprocess) made, but it
# avoids the `multiprocess` alias, which later releases of future deprecate.
if (supportsMulticore()) {
  plan(multicore)
} else {
  plan(multisession)
}

# multisession execution fails; multicore succeeds (yay forking)
# this behavior only seems to manifest in parsnip model specs with the generalized fit API
#
# Fit every spec in parallel. `x` and `y` are forwarded positionally to
# fit_xy() after the spec, and parsnip is attached on each worker through the
# furrr options.
full_fits <- spec_df %>%
  mutate(
    fit = future_map(
      spec,
      fit_xy,
      x,
      y,
      .options = future_options(packages = "parsnip")
    )
  )
full_fits
#> # A tibble: 10 x 3
#> spec model_id fit
#> <list> <int> <list>
#> 1 <spec[+]> 1 <fit[+]>
#> 2 <spec[+]> 2 <fit[+]>
#> 3 <spec[+]> 3 <fit[+]>
#> 4 <spec[+]> 4 <fit[+]>
#> 5 <spec[+]> 5 <fit[+]>
#> 6 <spec[+]> 6 <fit[+]>
#> 7 <spec[+]> 7 <fit[+]>
#> 8 <spec[+]> 8 <fit[+]>
#> 9 <spec[+]> 9 <fit[+]>
#> 10 <spec[+]> 10 <fit[+]>
Created on 2019-11-30 by the reprex package (v0.3.0)