Attaching reprex for reference.
# Set a large scipen penalty so numbers print in fixed rather than
# scientific notation.
options(scipen = 999)
# Libraries ----
library(tidyverse)
#> Warning: package 'tidyverse' was built under R version 3.6.3
#> Warning: replacing previous import 'vctrs::data_frame' by 'tibble::data_frame'
#> when loading 'dplyr'
#> Warning: package 'ggplot2' was built under R version 3.6.3
#> Warning: package 'tibble' was built under R version 3.6.3
#> Warning: package 'tidyr' was built under R version 3.6.3
#> Warning: package 'purrr' was built under R version 3.6.3
#> Warning: package 'dplyr' was built under R version 3.6.3
#> Warning: package 'forcats' was built under R version 3.6.3
library(rsample)
#> Warning: package 'rsample' was built under R version 3.6.3
library(recipes)
#> Warning: package 'recipes' was built under R version 3.6.3
#>
#> Attaching package: 'recipes'
#> The following object is masked from 'package:stringr':
#>
#> fixed
#> The following object is masked from 'package:stats':
#>
#> step
library(h2o)
#> Warning: package 'h2o' was built under R version 3.6.3
#>
#> ----------------------------------------------------------------------
#>
#> Your next step is to start H2O:
#> > h2o.init()
#>
#> For H2O package documentation, ask for help:
#> > ??h2o
#>
#> After starting H2O, you can use the Web UI at http://localhost:54321
#> For more information visit http://docs.h2o.ai
#>
#> ----------------------------------------------------------------------
#>
#> Attaching package: 'h2o'
#> The following objects are masked from 'package:stats':
#>
#> cor, sd, var
#> The following objects are masked from 'package:base':
#>
#> %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
#> colnames<-, ifelse, is.character, is.factor, is.numeric, log,
#> log10, log1p, log2, round, signif, trunc
library(correlationfunnel)
#> Warning: package 'correlationfunnel' was built under R version 3.6.3
#> == Using correlationfunnel? ====================================================
#> You might also be interested in applied data science training for business.
#> </> Learn more at - www.business-science.io </>
library(DataExplorer)
#> Warning: package 'DataExplorer' was built under R version 3.6.3
# Dataset - Customers over monthly threshold ----
# Check the path up front and fail with an actionable message.
# NOTE(review): in the rendered run below the read failed, yet the AutoML log
# still shows a 21-column / 5635-row training frame. That suggests
# `customer_churn_tbl` resolved to a same-named object from an attached
# package (presumably a demo dataset) -- confirm which data was modelled.
customer_churn_path <-
  "predictors_2021_01/customer_churn_over_780_liters_m_2020_12.rds"
if (!file.exists(customer_churn_path)) {
  stop(
    "Churn dataset not found: '", customer_churn_path,
    "' (working directory: ", getwd(), ")",
    call. = FALSE
  )
}
customer_churn_tbl <- read_rds(customer_churn_path)
#> Warning in gzfile(file, "rb"): cannot open compressed file 'predictors_2021_01/
#> customer_churn_over_780_liters_m_2020_12.rds', probable reason 'No such file or
#> directory'
#> Error in gzfile(file, "rb"): cannot open the connection
# Keep an untouched copy that still carries the customer identifier
customer_churn_w_ID_tbl <- customer_churn_tbl
# Modelling table: same data with the identifier column removed
customer_churn_tbl <- select(customer_churn_tbl, -CustomerID)
#> Error: Can't subset columns that don't exist.
#> x Column `CustomerID` doesn't exist.
# Recode every column whose name contains "score" as a factor
customer_churn_tbl <- mutate(
  customer_churn_tbl,
  across(contains("score"), as_factor)
)
# Correlation funnel of all binarized features against the "Churn__yes" level.
# binarize() aborts when any column contains NA, so incomplete rows are
# dropped for this exploratory plot only; `customer_churn_tbl` itself is
# left unchanged for the modelling steps.
funnel_churn_ggplot <- customer_churn_tbl %>%
  drop_na() %>%
  binarize() %>%
  correlate("Churn__yes") %>%
  plot_correlation_funnel()
#> Error: binarize(): [Missing Values Detected] The following columns contain NAs: TotalCharges
# Print explicitly so the plot is also drawn when the script is source()d,
# where top-level values are not auto-printed.
print(funnel_churn_ggplot)
#> Error in eval(expr, envir, enclos): object 'funnel_churn_ggplot' not found
# Preprocessing (for initial model training) ----
# Fix the RNG seed, then split the data 80/20.
set.seed(123)
rsample_splits <- initial_split(data = customer_churn_tbl, prop = 0.8)

# The recipe is prepped on the training partition only; it converts the
# nominal columns selected by all_nominal() from strings to factors.
rec_obj <- recipe(Churn ~ ., data = training(rsample_splits)) %>%
  step_string2factor(all_nominal()) %>%
  prep()

# Apply the prepped recipe to both partitions.
train_tbl <- bake(rec_obj, new_data = training(rsample_splits))
test_tbl <- bake(rec_obj, new_data = testing(rsample_splits))
# 01 - Modeling ----
h2o.startLogging()
#> Appending REST API transactions to log file C:\Users\VACLAV~1.CUR\AppData\Local\Temp\RtmpkfcPvI/rest.log
# memory.limit() only has an effect on Windows builds of R before 4.2.0;
# on other platforms and newer versions it merely warns, so call it only
# where it actually does something.
if (.Platform$OS.type == "windows" && getRversion() < "4.2.0") {
  memory.limit(60000)
}
#> [1] 60000
# Connect to (or start) the local H2O cluster: nthreads = -1 and a 24 GB
# memory cap are requested.
h2o.init(nthreads = -1, max_mem_size = "24g")
#> Connection successful!
#>
#> R is connected to the H2O cluster:
#> H2O cluster uptime: 2 hours 56 minutes
#> H2O cluster timezone: Europe/Prague
#> H2O data parsing timezone: UTC
#> H2O cluster version: 3.30.0.1
#> H2O cluster version age: 10 months !!!
#> H2O cluster name: H2O_started_from_R_vaclav.curik_ebi945
#> H2O cluster total nodes: 1
#> H2O cluster total memory: 23.98 GB
#> H2O cluster total cores: 8
#> H2O cluster allowed cores: 8
#> H2O cluster healthy: TRUE
#> H2O Connection ip: localhost
#> H2O Connection port: 54321
#> H2O Connection proxy: NA
#> H2O Internal Security: FALSE
#> H2O API Extensions: Amazon S3, Algos, AutoML, Core V3, TargetEncoder, Core V4
#> R Version: R version 3.6.2 (2019-12-12)
#> Warning in h2o.clusterInfo():
#> Your H2O cluster version is too old (10 months)!
#> Please download and install the latest version from http://h2o.ai/download/
# Response column and predictor set (every other column of the baked
# training table).
y <- "Churn"
x <- setdiff(names(train_tbl), y)

# AutoML run with a 360-second budget and 10-fold cross-validation; AUCPR is
# both the stopping metric and the sort metric.
automl_models_h2o <- h2o.automl(
  x = x,
  y = y,
  training_frame = as.h2o(train_tbl),
  max_runtime_secs = 360,
  stopping_metric = "AUCPR",
  exploitation_ratio = 0.1,
  nfolds = 10,
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned,
  # whereas `TRUE` is a reserved word.
  keep_cross_validation_predictions = TRUE,
  keep_cross_validation_fold_assignment = TRUE,
  keep_cross_validation_models = TRUE,
  sort_metric = "AUCPR",
  project_name = "rfm_decomposed_20210204",
  verbosity = "info"
)
#> | | | 0% | |======================================================================| 100%
#> | | | 0%
#> 13:45:34.299: Project: rfm_decomposed_20210204
#> 13:45:34.300: Setting stopping tolerance adaptively based on the training frame: 0.013321497260575911
#> 13:45:34.300: Build control seed: -1 (random)
#> 13:45:34.300: training frame: Frame key: automl_training_train_tbl_sid_acfe_1 cols: 21 rows: 5635 chunks: 1 size: 597350 checksum: -93885958457705861
#> 13:45:34.300: validation frame: NULL
#> 13:45:34.300: leaderboard frame: NULL
#> 13:45:34.300: blending frame: NULL
#> 13:45:34.300: response column: Churn
#> 13:45:34.300: fold column: null
#> 13:45:34.300: weights column: null
#> 13:45:34.311: AutoML: XGBoost is not available; skipping it.
#> 13:45:34.316: Loading execution steps: [{XGBoost : defaults}, {GLM : defaults}, {DRF : [def_1]}, {GBM : defaults}, {DeepLearning : defaults}, {DRF : [XRT]}, {XGBoost : grids}, {GBM : grids}, {DeepLearning : grids}, {GBM : [lr_annealing]}, {XGBoost : [lr_search]}, {StackedEnsemble : defaults}]
#> 13:45:34.323: Disabling Algo: XGBoost as requested by the user.
#> 13:45:34.325: AutoML job created: 2021.02.04 13:45:34.295
#> 13:45:34.326: AutoML build started: 2021.02.04 13:45:34.326
#> 13:45:34.333: AutoML: starting GLM_1_AutoML_20210204_134534 model training | |============ | 18%
#> 13:45:37.778: New leader: GLM_1_AutoML_20210204_134534, aucpr: 0.6536921568092042
#> 13:45:37.780: AutoML: starting DRF_1_AutoML_20210204_134534 model training
#> 13:45:45.854: AutoML: starting GBM_1_AutoML_20210204_134534 model training
#> 13:45:49.10: New leader: GBM_1_AutoML_20210204_134534, aucpr: 0.6552239283761728
#> 13:45:49.10: AutoML: starting GBM_2_AutoML_20210204_134534 model training
#> 13:45:52.34: AutoML: starting GBM_3_AutoML_20210204_134534 model training | |================ | 23%
#> 13:45:55.538: AutoML: starting GBM_4_AutoML_20210204_134534 model training
#> 13:46:00.749: AutoML: starting GBM_5_AutoML_20210204_134534 model training | |====================== | 31%
#> 13:46:03.887: New leader: GBM_5_AutoML_20210204_134534, aucpr: 0.6655256146343881
#> 13:46:03.890: AutoML: starting DeepLearning_1_AutoML_20210204_134534 model training | |======================== | 34%
#> 13:46:15.614: AutoML: starting XRT_1_AutoML_20210204_134534 model training | |======================== | 35% | |========================= | 36%
#> 13:46:27.624: AutoML: starting GBM_grid__1_AutoML_20210204_134534 hyperparameter search | |========================== | 37% | |=========================== | 38% | |============================ | 39% | |============================ | 40% | |============================= | 41% | |============================= | 42% | |============================== | 43% | |=============================== | 44% | |================================ | 46% | |================================= | 47% | |================================= | 48% | |================================== | 48% | |================================== | 49% | |=================================== | 50% | |==================================== | 51% | |==================================== | 52% | |============================================ | 62%
#> 13:47:49.156: New leader: GBM_grid__1_AutoML_20210204_134534_model_13, aucpr: 0.6657859960514716
#> 13:47:49.156: AutoML: starting DeepLearning_grid__1_AutoML_20210204_134534 hyperparameter search | |============================================== | 66% | |=============================================== | 66% | |=============================================== | 67% | |================================================ | 68% | |================================================ | 69% | |================================================= | 69% | |================================================= | 70% | |================================================== | 71% | |================================================== | 72% | |=================================================== | 72% | |=================================================== | 73% | |==================================================== | 74% | |==================================================== | 75% | |===================================================== | 75% | |===================================================== | 76% | |====================================================== | 77% | |====================================================== | 78% | |======================================================== | 80% | |========================================================= | 82%
#> 13:50:29.726: AutoML: starting DeepLearning_grid__2_AutoML_20210204_134534 hyperparameter search | |=============================================================== | 90% | |================================================================ | 91% | |================================================================= | 92%
#> 13:51:06.851: AutoML: starting DeepLearning_grid__3_AutoML_20210204_134534 hyperparameter search | |======================================================================| 100%
Created on 2021-02-04 by the reprex package (v0.3.0)