Hi,
I'm trying to showcase and understand tidymodels
by iterating over different models and evaluating them for a school project. This is my first attempt at using tidymodels, so please excuse me if my concepts are off.
My goal is to evaluate for a given data set the following models:
- Linear Discriminant Analysis
- Simple Tree model
- KNN model
- Bayesian GLM
- Random Forest model
- Xgboost
library(tidyverse)
library(tidymodels)
library(baguette)
library(xgboost)
library(mice)
# loading data
# The raw file has no header row and marks missing values with "?".
# Supplying the column names and the NA marker at read time keeps the first
# record as data (instead of it being consumed as a header) and parses
# `nuclei` as numeric without string replacement or a coercion warning.
bcancer_cols <- c(
  "ID", "clump_thick", "cell_size", "cell_shape", "marginal", "epithelial",
  "nuclei", "chromatin", "nucleoli", "mitoses", "class"
)
bcancer <- read_csv(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",
  col_names = bcancer_cols,
  col_types = cols(.default = col_double()),
  na = "?"
)
# Missing values that need replacing (counted as <NA> in the table):
table(bcancer$nuclei, useNA = "ifany")
# Impute NA
md.pattern(bcancer)
# A fixed seed makes the imputation reproducible; printFlag = FALSE silences
# the per-iteration log.
# NOTE(review): imputing before the train/test split lets test rows inform the
# imputed values; consider imputing inside the recipe instead.
bcancer1 <- mice(bcancer, seed = 99, printFlag = FALSE)
# Namespaced because tidyr (attached by tidyverse) also exports complete().
bcancer <- mice::complete(bcancer1)
bcancer$class <- factor(bcancer$class) # class: 2 is benign, 4 is malignant
I then create the vfolds:
# Hold out a test set, stratifying on the outcome so both partitions keep a
# similar benign/malignant balance.
set.seed(99)
bcancer_split <- initial_split(data = bcancer, strata = class)
bcancer_train <- bcancer_split %>% training()
bcancer_test <- bcancer_split %>% testing()
# Cross-validation folds are drawn from the training partition only, again
# stratified on the outcome.
set.seed(99)
bcancer_folds <- bcancer_train %>% vfold_cv(strata = class)
Then the recipe:
- I was a little unsure about this part: I don't know whether one recipe can fit all types of models or whether a specific recipe is needed for each model type.
- I took a shotgun approach; I believe everything needs normalizing, centering, scaling, etc.
- If a different recipe is needed for different models, how do I know which steps are best for each model's recipe?
# Creating a recipe:
# The recipe is specified from the training data (the same data the resampling
# folds are built from), never from the held-out test set.
bcancer_recipe <- recipe(class ~ ., data = bcancer_train) %>%
  # ID is a sample identifier, not a measurement: keep the column but stop it
  # from being used as a predictor.
  update_role(ID, new_role = "id") %>%
  # step_normalize() already centers and scales, so separate step_center() /
  # step_scale() calls would only repeat the same transformation.
  step_normalize(all_predictors()) %>%
  # Drop predictors that are highly correlated with another predictor.
  step_corr(all_predictors())
Then Model Specifications:
For this I just went with what I could find in parsnip. But I'm still not confident I'm getting the correct specification for each model. I'm not sure how, but my gut feeling is that I might need to create test grids to tune parameters - or does tidymodels do that under the hood?
- Since I needed LDA, I tried using discrim_linear(), but this function no longer seems to be loaded with tidymodels. Are there any other options?
# Models Specifications:
# The outcome `class` is a factor, so every specification must use mode
# "classification". A spec left in mode "unknown" cannot be fitted, and a
# "regression" spec cannot model a factor outcome.

# LDA: discrim_linear() is provided by the `discrim` extension package rather
# than by tidymodels itself; after library(discrim) the spec below works.
# lda_spec <- discrim_linear() %>%
#   set_mode("classification") %>%
#   set_engine("MASS")

# Bagged decision trees (from baguette).
tree_spec <- bag_tree() %>%
  set_mode("classification") %>%
  set_engine("rpart")

# K-nearest neighbours.
knn_spec <- nearest_neighbor() %>%
  set_mode("classification") %>%
  set_engine("kknn")

# NOTE(review): despite its name this is a single decision tree, not a Bayesian
# GLM, and it is not part of the workflow set built further down.
bays_spec <- decision_tree() %>%
  set_mode("classification") %>%
  set_engine("rpart")

# Polynomial-kernel support vector machine.
svm_spec <- svm_poly() %>%
  set_mode("classification") %>%
  set_engine("kernlab")

# Random forest.
rf_spec <- rand_forest() %>%
  set_mode("classification") %>%
  set_engine("ranger")

# Gradient-boosted trees.
xgboost_spec <- boost_tree() %>%
  set_mode("classification") %>%
  set_engine("xgboost")
Followed by the workflow_set():
# Set workflow:
# One shared recipe is paired with every model, which is what `cross = TRUE`
# is for; `cross = FALSE` is documented for preprocessor and model lists of
# equal length. The list names are chosen so the workflow ids stay
# "recipe_bag_tree", "recipe_nearest_neighbor", and so on.
bcancer_set <- workflow_set(
  preproc = list(recipe = bcancer_recipe),
  models = list(
    bag_tree = tree_spec,
    nearest_neighbor = knn_spec,
    svm_poly = svm_spec,
    rand_forest = rf_spec,
    boost_tree = xgboost_spec
  ),
  cross = TRUE
)
Then run the models and results:
# Fit every workflow in the set on the same cross-validation folds.
doParallel::registerDoParallel()
set.seed(99)
# Results
# RMSE is a regression metric; for this two-class outcome the resamples are
# scored with accuracy, Cohen's kappa (`kap`) and ROC AUC instead. Saving the
# out-of-fold predictions makes them available to collect_predictions() for
# plots; collect_metrics() / rank_results() give the model-comparison table.
bcancer_rs <- workflow_map(
  bcancer_set,
  "fit_resamples",
  resamples = bcancer_folds,
  metrics = metric_set(accuracy, kap, roc_auc),
  control = control_resamples(save_pred = TRUE)
)
bcancer_rs
Excluding the LDA, I thought I had done the model specifications correctly. I would like to understand what is causing the issue and would greatly appreciate a little guidance. I think I might have to write different recipes for different model types? Shouldn't I be doing something regarding tuning parameters? How do I pull predictions, for example for plots?
What I was aiming for was a table in which I compare these models' RMSE and Kappa (Kappa values, or whatever the equivalent is in tidymodels) - which I also wasn't able to figure out, hence it is missing from the reprex below.
I would greatly appreciate help with my attempt to connect the dots. The professor uses base R and I'm trying to translate the examples we have had in class to cleaner tidyverse & tidymodels code.
Thanks for the help.
REPREX:
library(tidyverse)
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#> method from
#> required_pkgs.model_spec parsnip
library(baguette)
library(xgboost)
#>
#> Attaching package: 'xgboost'
#> The following object is masked from 'package:dplyr':
#>
#> slice
library(mice)
#>
#> Attaching package: 'mice'
#> The following object is masked from 'package:stats':
#>
#> filter
#> The following objects are masked from 'package:base':
#>
#> cbind, rbind
library(doParallel)
#> Loading required package: foreach
#>
#> Attaching package: 'foreach'
#> The following objects are masked from 'package:purrr':
#>
#> accumulate, when
#> Loading required package: iterators
#> Loading required package: parallel
# loading data
# NOTE(review): read_csv() is called without `col_names`, so the first record
# of the file is used as the header - see the "New names" message and the
# column specification printed below (`1000025`, `5`, ...).
bcancer <- read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data")
#> New names:
#> * `1` -> `1...3`
#> * `1` -> `1...4`
#> * `1` -> `1...5`
#> * `2` -> `2...6`
#> * `1` -> `1...7`
#> * ...
#> Rows: 698 Columns: 11
#> -- Column specification --------------------------------------------------------
#> Delimiter: ","
#> chr (1): 1...7
#> dbl (10): 1000025, 5, 1...3, 1...4, 1...5, 2...6, 3, 1...9, 1...10, 2...11
#>
#> i Use `spec()` to retrieve the full column specification for this data.
#> i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Column names
# Replaces the names readr derived from the first record with descriptive ones.
colnames(bcancer)=c("ID","clump_thick","cell_size","cell_shape", "marginal","epithelial","nuclei",
"chromatin","nucleoli","mitoses","class")
# Missing values that need replacing:
# `nuclei` (the 7th column) was read as character; the table below shows 16
# entries coded "?".
table(bcancer$nuclei)
#>
#> ? 1 10 2 3 4 5 6 7 8 9
#> 16 401 132 30 28 19 30 4 8 21 9
# Replace "?" with the string "NA" and coerce the column to numeric;
# as.numeric() turns those strings into real NA values, which is what raises
# the coercion warning below.
bcancer$nuclei <- as.numeric(gsub("\\?","NA",bcancer$nuclei))
#> Warning: NAs introduced by coercion
# Re-tabulate: "?" no longer appears and the values now sort numerically.
table(bcancer$nuclei)
#>
#> 1 2 3 4 5 6 7 8 9 10
#> 401 30 28 19 30 4 8 21 9 132
# Impute the missing `nuclei` values with mice() using its default settings;
# the log below shows 5 iterations x 5 imputations, all for `nuclei`.
# NOTE(review): no seed is passed here, so presumably the imputed values differ
# between runs - confirm.
bcancer1 <- mice(bcancer)
#>
#> iter imp variable
#> 1 1 nuclei
#> 1 2 nuclei
#> 1 3 nuclei
#> 1 4 nuclei
#> 1 5 nuclei
#> 2 1 nuclei
#> 2 2 nuclei
#> 2 3 nuclei
#> 2 4 nuclei
#> 2 5 nuclei
#> 3 1 nuclei
#> 3 2 nuclei
#> 3 3 nuclei
#> 3 4 nuclei
#> 3 5 nuclei
#> 4 1 nuclei
#> 4 2 nuclei
#> 4 3 nuclei
#> 4 4 nuclei
#> 4 5 nuclei
#> 5 1 nuclei
#> 5 2 nuclei
#> 5 3 nuclei
#> 5 4 nuclei
#> 5 5 nuclei
# Extract a completed data set from the mice result and overwrite `bcancer`.
# NOTE(review): both mice and tidyr export complete(); mice was attached last
# in this session, so presumably mice's version is the one called - confirm.
bcancer <- complete(bcancer1)
bcancer$class <- factor(bcancer$class) # class: 2 is benign, 4 is malignant
# Create the folds:
# Train/test split stratified on the outcome `class`.
set.seed(99)
bcancer_split <- initial_split(bcancer, strata = class)
bcancer_train <- training(bcancer_split)
bcancer_test <- testing(bcancer_split)
# Stratified cross-validation folds built from the training partition only.
set.seed(99)
bcancer_folds <- vfold_cv(bcancer_train, strata = class)
# Creating a recipe:
# Preprocessing shared by every model: `class` is the outcome and all other
# columns (including ID) are treated as predictors.
# NOTE(review): the recipe is specified from bcancer_test while the resamples
# come from bcancer_train - presumably bcancer_train was intended.
bcancer_recipe <- recipe(class~., data = bcancer_test) %>%
step_normalize(all_predictors()) %>%
# Removes predictors that are highly correlated with other predictors.
step_corr(all_predictors()) %>%
# NOTE(review): centering and scaling again after step_normalize() looks
# redundant - confirm against the recipes documentation.
step_center(all_predictors(), - all_outcomes()) %>%
step_scale(all_predictors(), - all_outcomes())
# Models Specifications:
# One parsnip specification per candidate model. The outcome `class` is a
# factor, so the mode chosen for each spec matters.
#
# lda_spec <- discrim_linear() %>%
# set_mode("classification") %>%
# set_engine("MASS")
# NOTE(review): mode is "regression" although the outcome is a factor; this
# workflow is shown with a failed result (<rsmp[x]>) in the table at the end.
tree_spec <- bag_tree() %>%
set_mode("regression") %>%
set_engine("rpart")
# The only spec in the set declared as classification.
knn_spec <- nearest_neighbor() %>%
set_engine("kknn") %>%
set_mode("classification")
# NOTE(review): named "bays" but defined as a single decision tree, and it is
# not included in the workflow set created below.
bays_spec <- decision_tree() %>%
set_mode("classification") %>%
set_engine("rpart")
# NOTE(review): the next three specs are left in mode "unknown"; all three
# appear as <try-errr> in the result table at the end.
svm_spec <- svm_poly() %>%
set_mode("unknown") %>%
set_engine("kernlab")
rf_spec <- rand_forest() %>%
set_mode("unknown") %>%
set_engine("ranger")
xgboost_spec <- boost_tree() %>%
set_mode("unknown") %>%
set_engine("xgboost")
# Set workflow:
# Combines the single recipe with five of the model specs; the printed result
# below lists one workflow per model (recipe_bag_tree, ...).
# NOTE(review): one preprocessor is paired with five models under
# `cross = FALSE` - confirm this pairing is supported by the installed
# workflowsets version.
bcancer_set <- workflow_set(
list(bcancer_recipe),
list(tree_spec,knn_spec, svm_spec,rf_spec, xgboost_spec),
cross = FALSE
)
# Register a parallel backend via doParallel before resampling.
doParallel::registerDoParallel()
set.seed(99)
# Results
# Applies "fit_resamples" to every workflow in the set using the CV folds.
# No `metrics` or `control` arguments are passed here.
bcancer_rs <- workflow_map(
bcancer_set,
"fit_resamples",
resamples = bcancer_folds
)
#> Warning: All models failed. See the `.notes` column.
# Printing the set shows the per-workflow outcome: <try-errr> entries are
# workflows that raised an error, and the warning above reports that no model
# was fitted successfully.
bcancer_rs
#> # A workflow set/tibble: 5 x 4
#> wflow_id info option result
#> <chr> <list> <list> <list>
#> 1 recipe_bag_tree <tibble [1 x 4]> <opts[1]> <rsmp[x]>
#> 2 recipe_nearest_neighbor <tibble [1 x 4]> <opts[1]> <rsmp[+]>
#> 3 recipe_svm_poly <tibble [1 x 4]> <opts[1]> <try-errr [1]>
#> 4 recipe_rand_forest <tibble [1 x 4]> <opts[1]> <try-errr [1]>
#> 5 recipe_boost_tree <tibble [1 x 4]> <opts[1]> <try-errr [1]>
Created on 2021-10-23 by the reprex package (v2.0.1)