Custom recipes step fails during bake

I want to create a custom step_decorrelate() that removes the linear dependency between predictors by estimating the slope (beta) with some specific filtering and subtracting the increment (beta * x). Everything seems to work correctly until bake.step_decorrelate() is called, which results in the following error message:

Error in rbind(deparse.level, ...) : 
  numbers of columns of arguments do not match

This is my step_decorrelate() definition:

# User-facing constructor: appends a "decorrelate" step to `recipe`.
#   recipe         recipe object the step is added to.
#   ...            selectors for the columns to adjust (captured with enquos()).
#   role, trained  stored on the step unchanged.
#   x              stored on the step; prep() and bake() use it to look up the
#                  reference column.
#   beta           slot for the estimated slopes; NULL by default.
#   id             step identifier, generated by rand_id() by default.
step_decorrelate <- function(
    recipe, ..., role = NA, trained = FALSE,
    x, beta = NULL,
    id = rand_id("decorrelate")) {
  
  add_step(
    recipe,
    step_decorrelate_new(
      terms = enquos(...), role = role, trained = trained,
      x = x, beta = beta,
      id = id))
}

# Low-level constructor: passes the given fields to step() with subclass
# "decorrelate" and returns the result.
# NOTE(review): no `skip` field is supplied to step() here.
step_decorrelate_new <- function(terms, role, trained, x, beta, id) {
  step(
    subclass = "decorrelate",
    terms = terms, role = role, trained = trained,
    x = x, beta = beta,
    id = id)
}

# prep() method: estimates one slope per selected column.
#   x         the untrained step object (note: `x$x` is the step's own `x` field).
#   training  the training data.
#   info      passed through to recipes_eval_select().
# Returns a new step object with trained = TRUE and the slopes stored in `beta`.
prep.step_decorrelate <- function(x, training, info = NULL, ...) {
  # Resolve the step's selectors against the training data
  col_names <- recipes_eval_select(x$terms, training, info)
  check_type(training[, col_names], quant = TRUE)
  
  # One get_beta() call per selected column (as `y`), each against column x$x
  beta <- purrr::map(training[, col_names], get_beta, x = training %>% pull(x$x))
  
  step_decorrelate_new(
    terms = x$terms, role = x$role, trained = TRUE,
    x = x$x, beta = beta,
    id = x$id)
}

# Estimates the slope of `y` on `x`:
#   1. keep only the pairs where x > 0,
#   2. collapse to the median of y for each distinct x,
#   3. fit lm(y1 ~ x1) on those medians.
# The result is the "x1" coefficient; because the last expression is an
# assignment, it is returned invisibly.
get_beta <- function(y, x) {
  data <- tibble(y1=y, x1=x) %>% 
    filter(x1 > 0) %>% 
    group_by(x1) %>% 
    summarise(y1 = median(y1))
  model <- lm(y1 ~ x1, data)
  beta <- model$coefficients["x1"]
}

# bake() method: subtracts beta * (column object$x) from every column that has
# a stored slope.
#   object    the step object; `object$beta` holds the slopes, named by column.
#   new_data  the data to transform.
# Returns `new_data` with the affected columns replaced.
bake.step_decorrelate <- function(object, new_data, ...) {
  # The columns to adjust are the names under which the slopes were stored
  col_names <- names(object$beta)
  check_new_data(col_names, object, new_data)
  
  #print(object)
  
  # cur_column() gives the name of the column across() is currently processing,
  # which is used to look up that column's slope
  new_data %>%
    mutate(across(all_of(col_names), ~ .x - object$beta[[cur_column()]] * new_data[[object$x]]))
}

I am using the following test code:

# Toy data: b = 2a + 1 and y = 3a + 5
tmp_data <- tibble(a=1:10) %>% mutate(b=2*a+1, y=3*a+5)
# Adjust column b using column "a" as the reference
tmp_rec <- recipe(y ~ a + b, data=tmp_data) %>% step_decorrelate(b, x="a")
# This prep() call is the one the reported error refers to
tmp_rec %>% prep(training = tmp_data)

It is also surprising to me that bake.step_decorrelate() is called from prep()...

Any idea what the problem is?
Thanks

You need a skip argument. We didn't explicitly say that in the documentation so I filed an issue. Working code is below.

It is also surprising to me that bake.step_decorrelate() is called from prep()...

Each step needs the previously processed data in order to know the state of the data at that point, so prep() bakes the training data with each step right after estimating it. If you modify, add, or remove columns, the next step can't function properly without the current state.

Working reprex:

library(tidymodels)

# Add a "decorrelate" step to a recipe.
#
# recipe   The recipe the step is appended to.
# ...      Selectors for the columns to adjust.
# role, trained, skip
#          Stored on the step unchanged.
# x        Stored on the step; prep() and bake() use it to look up the
#          reference column.
# beta     Slot for the estimated slopes; NULL by default.
# id       Step identifier, generated by rand_id() by default.
#
# Returns the value of add_step(): the recipe with the new step appended.
step_decorrelate <- function(recipe, ..., role = NA, trained = FALSE, 
                             skip = FALSE,
                             x, beta = NULL, id = rand_id("decorrelate")) {
  selectors <- enquos(...)

  new_step <- step_decorrelate_new(
    terms = selectors, role = role, trained = trained, skip = skip,
    x = x, beta = beta, id = id
  )

  add_step(recipe, new_step)
}

# Low-level constructor: hands the fields to step() under the subclass
# "decorrelate" and returns the resulting step object. Used both when the step
# is first declared and when prep() rebuilds it with the estimated slopes.
step_decorrelate_new <- function(terms, role, trained, skip, x, beta, id) {
  step(
    subclass = "decorrelate",
    terms = terms, role = role, trained = trained, skip = skip,
    x = x, beta = beta,
    id = id
  )
}

# prep() method: estimates one slope per selected column.
#
# x         The untrained step object (`x$x` is the step's own `x` field, i.e.
#           the reference column).
# training  The training data.
# info      Passed through to recipes_eval_select().
#
# Returns a new step object with trained = TRUE and the slopes stored in
# `beta` as a list named by column.
# Stops with an error if the reference column is missing or not numeric.
prep.step_decorrelate <- function(x, training, info = NULL, ...) {
  col_names <- recipes_eval_select(x$terms, training, info)
  check_type(training[, col_names], quant = TRUE)

  # Plain `[[` is used instead of pull(): pull() evaluates its argument against
  # the data's column names, so a training column called `x` would shadow the
  # step object in `x$x`.
  reference <- training[[x$x]]

  # A missing column yields NULL and a non-numeric one would produce NA slopes
  # without any warning, so fail loudly in both cases.
  if (!is.numeric(reference)) {
    stop(
      "step_decorrelate(): `x` must identify a numeric column of the ",
      "training data.",
      call. = FALSE
    )
  }

  beta <- purrr::map(training[, col_names], get_beta, x = reference)

  step_decorrelate_new(
    terms = x$terms,
    role = x$role,
    trained = TRUE,
    skip = x$skip,
    x = x$x,
    beta = beta,
    id = x$id
  )
}

# Estimate the slope of `y` on `x`.
#
# y, x  Vectors of equal length.
#
# Only pairs with x > 0 are used. They are collapsed to the median of y for
# each distinct x, and a simple linear regression is fitted on those medians.
#
# Returns the slope as an unnamed length-one numeric (NA if the coefficient
# cannot be estimated).
get_beta <- function(y, x) {
  medians <- tibble(y1 = y, x1 = x) %>%
    filter(x1 > 0) %>%
    group_by(x1) %>%
    summarise(y1 = median(y1))

  model <- lm(y1 ~ x1, data = medians)

  # Return the value visibly rather than ending on an assignment, and drop the
  # "x1" name so it cannot leak into later arithmetic on the stored slope.
  unname(coef(model)["x1"])
}

# bake() method: subtracts beta * reference from every column that has a
# stored slope.
#
# object    The trained step; `object$beta` holds the slopes named by column
#           and `object$x` identifies the reference column.
# new_data  The data to transform.
#
# Returns `new_data` with the affected columns replaced.
bake.step_decorrelate <- function(object, new_data, ...) {
  col_names <- names(object$beta)

  # The reference column is needed as well as the adjusted ones, but only when
  # there is something to adjust and the reference is given by name.
  required <- col_names
  if (length(col_names) > 0L && is.character(object$x)) {
    required <- union(col_names, object$x)
  }
  check_new_data(required, object, new_data)

  # Read the reference once, before any column is overwritten, so the result is
  # the same even if the reference column is itself among `col_names`.
  reference <- new_data[[object$x]]

  # A plain loop avoids evaluating inside a data mask, where a column named
  # `object` or `new_data` would shadow this function's arguments.
  for (col in col_names) {
    new_data[[col]] <- new_data[[col]] - object$beta[[col]] * reference
  }

  new_data
}

# Toy data: b and y are both exact linear functions of a
tmp_data <- tibble(a = 1:10, b = 2 * a + 1, y = 3 * a + 5)

# Adjust column b using column "a" as the reference, then train the recipe
tmp_rec <- step_decorrelate(recipe(y ~ a + b, data = tmp_data), b, x = "a")
prepped <- prep(tmp_rec, training = tmp_data)

Created on 2023-11-02 with reprex v2.0.2

1 Like

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.