How to calculate proportion bootstrapped confidence intervals

Khoajak · August 24, 2023, 5:32pm

Hi RStudio community,
I would like to calculate bootstrapped proportions with confidence intervals for all columns of this dataset, if anybody can help me. It is highly appreciate it. Thanks

data <- data.frame(X1=rep(c(0,1),times=c(28,72)),
                   X2=rep(c(0,1),times=c(37,63)),
                   X3=rep(c(0,1),times=c(34,66)),
                   X4=rep(c(0,1),times=c(28,72))      
                           )

Khoajak · August 25, 2023, 3:46pm

Hi again,
I have tried this codes, but stuck with confidence limits and combining, it only calculates confidence limits for the first column. If somebody can help.

data <- data.frame(X1=rep(c(0,1),times=c(28,72)),
                   X2=rep(c(0,1),times=c(37,63)),
                   X3=rep(c(0,1),times=c(34,66)),
                   X4=rep(c(0,1),times=c(28,72))      
                           )
                   
cols <- c("X1", "X2", "X3","X4")
str(data)                     

# Calculate proportions
prop<- data %>%
  summarise(across(all_of(cols), ~ mean(. == 1, na.rm = TRUE)))

# Bootstrapping
boot_fn <- function(data, indices) {
  proportions <- colMeans(data[indices, ], na.rm = TRUE)
  return(proportions)
}
 
boot<- data %>%
  select(all_of(cols)) %>%
  boot(., statistic = boot_fn, R = 1000)


# Calculate Confidence Intervals
conf<- boot.ci(boot.out = boot, type = "basic", level = 0.95)

results<- data.frame(
  Threshold = colnames(boot$t0),
  Proportion = prop[1, ],
  Lower_CI = conf$basic[1, ],
  Upper_CI = conf$basic[2, ]
)

technocrat · August 26, 2023, 1:58am

prop[1,] selects only the first row of prop and conf$basic is a list, which has only one dimension.

Khoajak · August 26, 2023, 2:18am

Ok, how to get the prop and CL for all the columns. Thanks

technocrat · August 26, 2023, 7:46am

# Load the boot library
library(boot)

# Create a 100x4 matrix of zeros and ones with
# specified proportions

m <- matrix(
  c(
    rep(0:1, c(28, 72)),
    rep(0:1, c(37, 63)),
    rep(0:1, c(34, 66)),
    rep(0:1, c(28, 72))
  ),
  nrow = 100, ncol = 4
)

colnames(m) <- paste0("X", 1:4)

# Define a function to calculate the column means
col_mean <- function(data, indices, col) {
  return(mean(data[indices, col]))
}

# Set the seed for reproducibility
set.seed(123)

level <- 0.95

# Perform bootstrap sampling for each column
boot_results_col1 <- boot(m, col_mean, R = 1000, col = 1)
boot_results_col2 <- boot(m, col_mean, R = 1000, col = 2)
boot_results_col3 <- boot(m, col_mean, R = 1000, col = 3)
boot_results_col4 <- boot(m, col_mean, R = 1000, col = 4)

# Calculate the 95% confidence intervals for each column
conf_intervals_col1 <- boot.ci(boot_results_col1, conf = level, type = "perc")
conf_intervals_col2 <- boot.ci(boot_results_col2, conf = level, type = "perc")
conf_intervals_col3 <- boot.ci(boot_results_col3, conf = level, type = "perc")
conf_intervals_col4 <- boot.ci(boot_results_col4, conf = level, type = "perc")

report <- data.frame(
  var = colnames(m),
  est = c(
    boot_results_col1$t0,
    boot_results_col2$t0,
    boot_results_col3$t0,
    boot_results_col4$t0
  ),
  level = rep(level, 4),
  lower = c(
    conf_intervals_col1$percent[4],
    conf_intervals_col2$percent[4],
    conf_intervals_col3$percent[4],
    conf_intervals_col4$percent[4]
  ),
  upper = c(
    conf_intervals_col1$percent[5],
    conf_intervals_col2$percent[5],
    conf_intervals_col3$percent[5],
    conf_intervals_col4$percent[5]
  )
)
report
#>   var  est level     lower upper
#> 1  X1 0.72  0.95 0.6300000  0.81
#> 2  X2 0.63  0.95 0.5302541  0.73
#> 3  X3 0.66  0.95 0.5600000  0.75
#> 4  X4 0.72  0.95 0.6302541  0.81

^{Created on 2023-08-26 with reprex v2.0.2}

Khoajak · August 26, 2023, 3:34pm

technocrat:

# Load the boot library
library(boot)

# Create a 100x4 matrix of zeros and ones with
# specified proportions

m <- matrix(
  c(
    rep(0:1, c(28, 72)),
    rep(0:1, c(37, 63)),
    rep(0:1, c(34, 66)),
    rep(0:1, c(28, 72))
  ),
  nrow = 100, ncol = 4
)

colnames(m) <- paste0("X", 1:4)

# Define a function to calculate the column means
col_mean <- function(data, indices, col) {
  return(mean(data[indices, col]))
}

# Set the seed for reproducibility
set.seed(123)

level <- 0.95

# Perform bootstrap sampling for each column
boot_results_col1 <- boot(m, col_mean, R = 1000, col = 1)
boot_results_col2 <- boot(m, col_mean, R = 1000, col = 2)
boot_results_col3 <- boot(m, col_mean, R = 1000, col = 3)
boot_results_col4 <- boot(m, col_mean, R = 1000, col = 4)

# Calculate the 95% confidence intervals for each column
conf_intervals_col1 <- boot.ci(boot_results_col1, conf = level, type = "perc")
conf_intervals_col2 <- boot.ci(boot_results_col2, conf = level, type = "perc")
conf_intervals_col3 <- boot.ci(boot_results_col3, conf = level, type = "perc")
conf_intervals_col4 <- boot.ci(boot_results_col4, conf = level, type = "perc")

report <- data.frame(
  var = colnames(m),
  est = c(
    boot_results_col1$t0,
    boot_results_col2$t0,
    boot_results_col3$t0,
    boot_results_col4$t0
  ),
  level = rep(level, 4),
  lower = c(
    conf_intervals_col1$percent[4],
    conf_intervals_col2$percent[4],
    conf_intervals_col3$percent[4],
    conf_intervals_col4$percent[4]
  ),
  upper = c(
    conf_intervals_col1$percent[5],
    conf_intervals_col2$percent[5],
    conf_intervals_col3$percent[5],
    conf_intervals_col4$percent[5]
  )
)
report
#>   var  est level     lower upper
#> 1  X1 0.72  0.95 0.6300000  0.81
#> 2  X2 0.63  0.95 0.5302541  0.73
#> 3  X3 0.66  0.95 0.5600000  0.75
#> 4  X4 0.72  0.95 0.6302541  0.81

Thank you very much for your help. It is exactly what I wanted. I need a little more help with this data. How to calculate the bootstrapped mean difference and 95% CIs for the columns (e.g. between column X1 & X2, between column X2 & X3, and column X3 & X4). Your help is highly appreciated. Thank you again.

technocrat · August 27, 2023, 12:03am

# Load the boot library
library(boot)

# Create a 100x4 matrix of zeros and ones
m <- matrix(sample(c(0, 1), 100*4, replace = TRUE), nrow = 100, ncol = 4)
colnames(m) <- paste0("X",1:4)

# Define a function to calculate the mean difference between two columns
col_mean_diff <- function(data, indices, col1, col2) {
  return(mean(m[indices, col1]) - mean(m[indices, col2]))
}

# Set the seed for reproducibility
set.seed(123)
levels <- c("X1-X2","X1-X3","X1-X4",
            "X2-X3","X2-X4","X3-X4")

# Perform bootstrap sampling for each pair of columns
boot_results_12 <- boot(m, col_mean_diff, R = 1000, col1 = 1, col2 = 2)
boot_results_13 <- boot(m, col_mean_diff, R = 1000, col1 = 1, col2 = 3)
boot_results_14 <- boot(m, col_mean_diff, R = 1000, col1 = 1, col2 = 4)
boot_results_23 <- boot(m, col_mean_diff, R = 1000, col1 = 2, col2 = 3)
boot_results_24 <- boot(m, col_mean_diff, R = 1000, col1 = 2, col2 = 4)
boot_results_34 <- boot(m, col_mean_diff, R = 1000, col1 = 3, col2 = 4)

# Calculate the 95% confidence intervals for each pair of columns
conf_intervals_12 <- boot.ci(boot_results_12, conf = 0.95, type = "perc")
conf_intervals_13 <- boot.ci(boot_results_13, conf = 0.95, type = "perc")
conf_intervals_14 <- boot.ci(boot_results_14, conf = 0.95, type = "perc")
conf_intervals_23 <- boot.ci(boot_results_23, conf = 0.95, type = "perc")
conf_intervals_24 <- boot.ci(boot_results_24, conf = 0.95, type = "perc")
conf_intervals_34 <- boot.ci(boot_results_34, conf = 0.95, type = "perc")

report <- data.frame(
  var = levels,
  est = c(
    boot_results_12$t0,
    boot_results_13$t0,
    boot_results_14$t0,
    boot_results_23$t0,
    boot_results_24$t0,
    boot_results_34$t0),
  lower = c(
    conf_intervals_12$percent[4],
    conf_intervals_13$percent[4],
    conf_intervals_14$percent[4],
    conf_intervals_23$percent[4],
    conf_intervals_24$percent[4],
    conf_intervals_34$percent[4]
  ),
  upper = c(
    conf_intervals_12$percent[5],
    conf_intervals_13$percent[5],
    conf_intervals_14$percent[5],
    conf_intervals_23$percent[5],
    conf_intervals_24$percent[5],
    conf_intervals_34$percent[5]
  )
)
report
#>     var   est lower upper
#> 1 X1-X2  0.01 -0.13  0.13
#> 2 X1-X3 -0.01 -0.14  0.11
#> 3 X1-X4  0.07 -0.07  0.21
#> 4 X2-X3 -0.02 -0.17  0.13
#> 5 X2-X4  0.06 -0.08  0.20
#> 6 X3-X4  0.08 -0.06  0.22

Khoajak · August 28, 2023, 2:33am

Thank you very much again Richard for all your time and help. That is what I wanted.

technocrat · August 28, 2023, 8:45am

If this will be used for more than this handful of variables, it should be written as a loop or vectorized to cut down on the copying and editing.

Khoajak · August 28, 2023, 1:09pm

Ok, it is appreciated. One more favor, how to address NA in those columns? When I run the code for the columns with missing, I get the below message. Thank you again.

ORDINARY NONPARAMETRIC BOOTSTRAP


Call:
boot(data = df1, statistic = col_mean, R = 1000, col = 1)


Bootstrap Statistics :
WARNING: All values of t1* are NA

I tried to add include na.rm=TRUE in the function, but got the below error.
# Define a function to calculate the column means
col_mean <- function(data, indices, col) {
  return(mean(data[indices, col]), na.rm=TRUE)
}

boot(df1, col_mean, R = 1000, col = 1)
Error in return(mean(data[indices, col]), na.rm = TRUE) : 
  multi-argument returns are not permitted

# or how to address boot with this dataset with missing values in the columns.

# Create a 100x4 matrix of zeros, ones, and missing with
# specified proportions

m <- matrix(
  c(
    rep(c(0,1, NA), c(28, 62, 10)),
    rep(c(0,1, NA),  c(37, 58, 5)),
    rep(c(0,1, NA), c(34, 55,11)),
    rep(c(0,1, NA),  c(28, 66,6 ))
  ),
  nrow = 100, ncol = 4
)

colnames(m) <- paste0("X", 1:4)

# Define a function to calculate the column means
col_mean <- function(data, indices, col) {
  return(mean(data[indices, col]))
}

# Set the seed for reproducibility
set.seed(123)

level <- 0.95

# Perform bootstrap sampling for each column
boot_results_col1 <- boot(m, col_mean, R = 1000, col = 1)
boot_results_col2 <- boot(m, col_mean, R = 1000, col = 2)
boot_results_col3 <- boot(m, col_mean, R = 1000, col = 3)
boot_results_col4 <- boot(m, col_mean, R = 1000, col = 4)

# Calculate the 95% confidence intervals for each column
conf_intervals_col1 <- boot.ci(boot_results_col1, conf = level, type = "perc")
conf_intervals_col2 <- boot.ci(boot_results_col2, conf = level, type = "perc")
conf_intervals_col3 <- boot.ci(boot_results_col3, conf = level, type = "perc")
conf_intervals_col4 <- boot.ci(boot_results_col4, conf = level, type = "perc")

report <- data.frame(
  var = colnames(m),
  est = c(
    boot_results_col1$t0,
    boot_results_col2$t0,
    boot_results_col3$t0,
    boot_results_col4$t0
  ),
  level = rep(level, 4),
  lower = c(
    conf_intervals_col1$percent[4],
    conf_intervals_col2$percent[4],
    conf_intervals_col3$percent[4],
    conf_intervals_col4$percent[4]
  ),
  upper = c(
    conf_intervals_col1$percent[5],
    conf_intervals_col2$percent[5],
    conf_intervals_col3$percent[5],
    conf_intervals_col4$percent[5]
  )
)
report

technocrat · August 29, 2023, 10:16am

mean() will choke on NA values. Try

col_mean <- function(data, indices, col) {
  return(mean(data[indices, col], na.rm = TRUE))
}

Khoajak · August 29, 2023, 7:58pm

Thank you very much. It worked. Is the mean and proportion confidence intervals for binary (0/1) variable the same?

system · September 5, 2023, 7:59pm

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.