Combine data frames from many different bootstrap runs (map within map)

clausp · March 10, 2020, 4:30pm

I am trying to combine results from many different bootstrap runs into one data frame/tibble, where each set of bootstrap results has some value that differs. A silly example that works, but is less than pretty, is below:

library(tidyverse)
#> Registered S3 method overwritten by 'rvest':
#>   method            from
#>   read_xml.response xml2
library(rsample)

# Just a silly example. My function runs a regression and returns the estimated
# elasticity
# Share of rows in `df` whose `carb` value lies strictly below `cut_off`.
#
# df:      data frame with a numeric `carb` column.
# cut_off: threshold that `carb` is compared against.
# Returns the mean of the logical comparison, i.e. a proportion.
test_fun <- function(df, cut_off) {
  below_cut <- df$carb < cut_off
  mean(below_cut)
}

# Compute the statistic for a single resample.
#
# split: one element of the `splits` column produced by bootstraps().
# ...:   forwarded unchanged to test_fun() (here: `cut_off`).
# Returns a one-row tibble with columns `term`, `estimate` and `std.error`;
# this is the per-resample result that int_pctl() is later applied to.
fit_fun <- function(split, ...) {
  resampled <- analysis(split)
  ratio <- test_fun(resampled, ...)
  tibble(term = "ratio", estimate = ratio, std.error = NA_real_)
}

# Baseline version: one bootstrap run per cut-off value (1 to 9). Each run's
# percentile interval is stored in its own variable (`ratio_bt_1` ...
# `ratio_bt_9`) and the nine results are stacked by hand afterwards.
for (z in 1:9) {
  # The seed is reset on every pass, so each cut-off is evaluated on the same
  # sequence of resamples.
  set.seed(2)
  # Pipeline: draw resamples -> compute the statistic per resample at the
  # current cut-off -> summarise as an interval -> tag with the cut-off.
  temp <- 
    bootstraps(mtcars, times = 2000, apparent = TRUE) %>% 
    mutate(ratio = map(splits, ~ fit_fun(.x, cut_off = z))) %>% 
    int_pctl(ratio) %>% 
    mutate(cut_off = z)
  # Create a variable whose name encodes the cut-off, e.g. `ratio_bt_3`.
  assign(paste0("ratio_bt_", z), temp)
}

# Combine the nine per-cut-off results into a single tibble.
bind_rows(ratio_bt_1, ratio_bt_2, ratio_bt_3, ratio_bt_4, ratio_bt_5, ratio_bt_6, ratio_bt_7, ratio_bt_8, ratio_bt_9)
#> # A tibble: 9 x 7
#>   term  .lower .estimate .upper .alpha .method    cut_off
#>   <chr>  <dbl>     <dbl>  <dbl>  <dbl> <chr>        <int>
#> 1 ratio 0          0      0       0.05 percentile       1
#> 2 ratio 0.0938     0.219  0.375   0.05 percentile       2
#> 3 ratio 0.375      0.534  0.719   0.05 percentile       3
#> 4 ratio 0.438      0.626  0.781   0.05 percentile       4
#> 5 ratio 0.844      0.937  1       0.05 percentile       5
#> 6 ratio 0.844      0.937  1       0.05 percentile       6
#> 7 ratio 0.906      0.970  1       0.05 percentile       7
#> 8 ratio 0.906      0.970  1       0.05 percentile       8
#> 9 ratio 1          1      1       0.05 percentile       9

Created on 2020-03-10 by the reprex package (v0.2.1)

I was thinking that I could use map_df() instead of the loop, but the following gives me the error shown below:

# Version that does not work
library(tidyverse)
#> Registered S3 method overwritten by 'rvest':
#>   method            from
#>   read_xml.response xml2
library(rsample)

# Share of rows in `df` whose `carb` value lies strictly below `cut_off`.
#
# df:      data frame with a numeric `carb` column.
# cut_off: threshold that `carb` is compared against.
# Returns the mean of the logical comparison, i.e. a proportion.
test_fun <- function(df, cut_off) {
  below_cut <- df$carb < cut_off
  mean(below_cut)
}

# Compute the statistic for a single resample.
#
# split: one element of the `splits` column produced by bootstraps().
# ...:   forwarded unchanged to test_fun() (here: `cut_off`).
# Returns a one-row tibble with columns `term`, `estimate` and `std.error`;
# this is the per-resample result that int_pctl() is later applied to.
fit_fun <- function(split, ...) {
  resampled <- analysis(split)
  ratio <- test_fun(resampled, ...)
  tibble(term = "ratio", estimate = ratio, std.error = NA_real_)
}

# Attempt to replace the for loop: map over the cut-off values and row-bind
# the per-cut-off interval tibbles. This version fails with the error printed
# below the call.
# NOTE: the argument is named `.y`, which is also the pronoun that purrr's `~`
# lambdas use for their own second argument. Inside the inner
# `~ fit_fun(.x, cut_off = .y)` the `.y` therefore does not refer to this
# function's argument, and the inner map() only supplies one argument.
map_df(1:9, function(.y) {
  # Same seed for every cut-off, as in the for-loop version.
  set.seed(2)
  m <- bootstraps(mtcars, times = 2000, apparent = TRUE) %>% 
    mutate(ratio = map(splits, ~ fit_fun(.x, cut_off = .y))) %>% 
    int_pctl(ratio) %>% 
    mutate(cut_off = .y)
  return(m)
})
#> Error in mean((df$carb < cut_off)): the ... list contains fewer than 2 elements

Created on 2020-03-10 by the reprex package (v0.2.1)

I suspect that the problem is in the combination of multiple maps, because the following works fine:

# This works
library(tidyverse)
#> Registered S3 method overwritten by 'rvest':
#>   method            from
#>   read_xml.response xml2
library(rsample)

# Share of rows in `df` whose `carb` value lies strictly below `cut_off`.
#
# df:      data frame with a numeric `carb` column.
# cut_off: threshold that `carb` is compared against.
# Returns the mean of the logical comparison, i.e. a proportion.
test_fun <- function(df, cut_off) {
  below_cut <- df$carb < cut_off
  mean(below_cut)
}

# Single-level map over the cut-off values: no bootstrapping and no nested
# `~` lambda, so the argument name `.y` is resolved as an ordinary variable.
map_df(1:9, function(.y) {
  ratio_df <- data.frame(ratio = test_fun(mtcars, cut_off = .y))
  mutate(ratio_df, cut_off = .y)
})
#>     ratio cut_off
#> 1 0.00000       1
#> 2 0.21875       2
#> 3 0.53125       3
#> 4 0.62500       4
#> 5 0.93750       5
#> 6 0.93750       6
#> 7 0.96875       7
#> 8 0.96875       8
#> 9 1.00000       9

Created on 2020-03-10 by the reprex package (v0.2.1)

Suggestions for how to do the looping over the cut-off values in tidyverse would be greatly appreciated.

Claus

joels · March 10, 2020, 6:45pm

I think the error is occurring due to the use of .y as the name of the variable passed into the second map. If you change this to anything else (I used a) the code will work. I'm guessing this is because .y is reserved as the name of the second argument inside a formula (~) function passed to map (see the help for the .f argument in map), so the .y inside the inner map refers to that (missing) second argument rather than to the argument of your outer function.

For example, you can reproduce the error you're getting with the following:

# Minimal reproduction of the error: the outer function's argument is called
# `.y`, and the inner map() uses a `~` lambda that also mentions `.y`.
map(c("mpg","cyl"), function(.y) {
  # Inside the `~` lambda, `.y` is the lambda's own second-argument pronoun,
  # not the outer `.y`; the inner map() passes only one argument, hence the
  # "fewer than 2 elements" error.
  map(1:3, ~mtcars %>% select_at(.y) %>% slice(.x))
})

Error in is_null(vars) : the ... list contains fewer than 2 elements

But this works:

# Same nested map, but the outer argument is named `a`, which does not clash
# with the `.x`/`.y` pronouns of the inner `~` lambda.
map(c("mpg", "cyl"), function(a) {
  map(
    1:3,
    ~ mtcars %>%
      select_at(a) %>%
      slice(.x)
  )
})

In the for loop, rather than assign, I think it would be better to store each iteration in a list:

# Pre-allocate one list slot per cut-off value instead of creating nine
# separate `ratio_bt_<z>` variables with assign().
ratio_bt <- vector("list", 9)

for (z in seq_along(ratio_bt)) {
  # Reset the seed on every pass so each cut-off is evaluated on the same
  # sequence of resamples.
  set.seed(2)
  temp <- bootstraps(mtcars, times = 2000, apparent = TRUE) %>%
    mutate(ratio = map(splits, ~ fit_fun(.x, cut_off = z))) %>%
    int_pctl(ratio) %>%
    mutate(cut_off = z)
  ratio_bt[[z]] <- temp
}

# Stack the per-cut-off results into the single tibble asked for.
bind_rows(ratio_bt)

dromano · March 10, 2020, 7:06pm

I'm not sure if this was what you might be going for, but I extracted your loop work into a function:

library(tidyverse)
library(rsample)

# Share of rows in `df` whose `carb` value lies strictly below `cut_off`.
#
# df:      data frame with a numeric `carb` column.
# cut_off: threshold that `carb` is compared against.
# Returns the mean of the logical comparison, i.e. a proportion.
test_fun <- function(df, cut_off) {
  below_cut <- df$carb < cut_off
  mean(below_cut)
}

# Compute the statistic for a single resample.
#
# split: one element of the `splits` column produced by bootstraps().
# ...:   forwarded unchanged to test_fun() (here: `cut_off`).
# Returns a one-row tibble with columns `term`, `estimate` and `std.error`;
# this is the per-resample result that int_pctl() is later applied to.
fit_fun <- function(split, ...) {
  resampled <- analysis(split)
  ratio <- test_fun(resampled, ...)
  tibble(term = "ratio", estimate = ratio, std.error = NA_real_)
}

# Turn one set of bootstrap resamples into a percentile-interval summary for a
# given cut-off.
#
# data: result of bootstraps(), i.e. a table with a `splits` column.
# z:    cut-off value forwarded to fit_fun() and recorded in the output.
# Returns the int_pctl() summary with an added `cut_off` column.
extract_results <- function(data, z) {
  with_ratio <- mutate(data, ratio = map(splits, ~ fit_fun(.x, cut_off = z)))
  intervals <- int_pctl(with_ratio, ratio)
  mutate(intervals, cut_off = z)
}

# Draw a separate set of bootstrap resamples for each of the nine cut-offs
# (no seed is set here), then pair each set with its cut-off value and
# row-bind the per-cut-off summaries.
map2_dfr(
  map(1:9, ~ bootstraps(mtcars, times = 2000, apparent = TRUE)),
  1:9,
  extract_results
)
#> # A tibble: 9 x 7
#>   term  .lower .estimate .upper .alpha .method    cut_off
#>   <chr>  <dbl>     <dbl>  <dbl>  <dbl> <chr>        <int>
#> 1 ratio 0          0      0       0.05 percentile       1
#> 2 ratio 0.0938     0.219  0.375   0.05 percentile       2
#> 3 ratio 0.375      0.533  0.688   0.05 percentile       3
#> 4 ratio 0.469      0.625  0.781   0.05 percentile       4
#> 5 ratio 0.844      0.939  1       0.05 percentile       5
#> 6 ratio 0.844      0.938  1       0.05 percentile       6
#> 7 ratio 0.906      0.969  1       0.05 percentile       7
#> 8 ratio 0.906      0.970  1       0.05 percentile       8
#> 9 ratio 1          1      1       0.05 percentile       9

Created on 2020-03-10 by the reprex package (v0.3.0)

clausp · March 10, 2020, 10:43pm

Thank you for catching that. The use of .y was, indeed, the problem. By the way, the whole motivation for using map_df was to avoid assign and the for loop, but thank you for that suggestion as well.

clausp · March 10, 2020, 10:44pm

That works as well. @joels' suggestion was closer to what I originally had, so I went with that for the solution. Thank you for the help.

system · March 17, 2020, 10:44pm

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.