Is there any advice, rule of thumb, or guide on what should and what should not be done with step_* functions in the data-prep stage?
I often find tutorials whose data preparation mixes steps done inside recipes with steps done outside of them.
Example (from this issue):
#...
# Drop PassengerId, Name, Ticket and Cabin, relabel the 0/1 outcome as
# "No"/"Yes", and convert Survived, Sex and Embarked to factors.
titanic <- titanic %>%
  select(-PassengerId, -Name, -Ticket, -Cabin) %>%
  mutate(
    Survived = as.factor(
      case_when(
        Survived == 0 ~ "No",
        Survived == 1 ~ "Yes"
      )
    ),
    Sex = as.factor(Sex),
    Embarked = as.factor(Embarked)
  )

# Single imputation with mice, run on the full data set (i.e. before the
# train/test split); only the completed data frame is kept.
imputed_titanic <- complete(
  mice(titanic, m = 1, maxit = 30, seed = 1234, printFlag = FALSE)
)

# 90/10 train/test split of the imputed data
train_test_split <- initial_split(imputed_titanic, prop = 0.9)
titanic_train <- training(train_test_split)
titanic_test <- testing(train_test_split)

# Dummy-encode every nominal column except the outcome, Survived
rec <- recipe(Survived ~ ., data = titanic_train) %>%
  step_dummy(all_nominal(), -Survived)
#...