I want to create a custom step_decorrelate()
to remove linear dependency between predictors by estimating the slope (beta
) with some specific filtering and subtracting the increment (beta * x
). Everything seems to work correctly until the bake.step_decorrelate()
is called, which results into the following error message:
Error in rbind(deparse.level, ...) :
numbers of columns of arguments do not match
This is my step_decorrelate()
definition:
step_decorrelate <- function(
recipe, ..., role = NA, trained = FALSE,
x, beta = NULL,
id = rand_id("decorrelate")) {
add_step(
recipe,
step_decorrelate_new(
terms = enquos(...), role = role, trained = trained,
x = x, beta = beta,
id = id))
}
step_decorrelate_new <- function(terms, role, trained, x, beta, id) {
step(
subclass = "decorrelate",
terms = terms, role = role, trained = trained,
x = x, beta = beta,
id = id)
}
prep.step_decorrelate <- function(x, training, info = NULL, ...) {
col_names <- recipes_eval_select(x$terms, training, info)
check_type(training[, col_names], quant = TRUE)
beta <- purrr::map(training[, col_names], get_beta, x = training %>% pull(x$x))
step_decorrelate_new(
terms = x$terms, role = x$role, trained = TRUE,
x = x$x, beta = beta,
id = x$id)
}
get_beta <- function(y, x) {
data <- tibble(y1=y, x1=x) %>%
filter(x1 > 0) %>%
group_by(x1) %>%
summarise(y1 = median(y1))
model <- lm(y1 ~ x1, data)
beta <- model$coefficients["x1"]
}
bake.step_decorrelate <- function(object, new_data, ...) {
col_names <- names(object$beta)
check_new_data(col_names, object, new_data)
#print(object)
new_data %>%
mutate(across(all_of(col_names), ~ .x - object$beta[[cur_column()]] * new_data[[object$x]]))
}
I am using the following test code:
tmp_data <- tibble(a=1:10) %>% mutate(b=2*a+1, y=3*a+5)
tmp_rec <- recipe(y ~ a + b, data=tmp_data) %>% step_decorrelate(b, x="a")
tmp_rec %>% prep(training = tmp_data)
It is also surprising to me that bake.step_decorrelate()
is called from prep()
...
Any idea about what is the problem?
Thanks