Fast binom.test in R for multiple levels of confidence?

11:49 24 Nov 2023

I have a dataset df which includes two variables: successes (x) and trials (n). I wish to obtain for every row in that dataset the stats::binom.test() results (est, lower and upper ci) for different levels of confidence. It should be able to handle n = NA or 0.

I wrote the code below but I was hoping for a faster implementation since my true dataset is very large (30 million rows). Any suggestions that will improve the speed will be appreciated (vectorisation, parallelisation...?), preferably a "tidyapproach" but not required.

library(tidyverse)
#function: 
#uses lapply to loop over the conf.levels for the same x&n pair:
binom_test <- function(x, n, conf.levels = c(0.50, 0.75, 0.90, 0.95)) {
  if (is.na(n) | n < 1) {
    return(data.frame(
      conf_level = conf.levels,
      est = rep(NA, length(conf.levels)),
      x = rep(NA, length(conf.levels)),
      n = rep(NA, length(conf.levels)),
      lower_ci = rep(NA, length(conf.levels)),
      upper_ci = rep(NA, length(conf.levels))
    ))
  } else {
    results <- lapply(conf.levels, function(conf.level) {
      test <- binom.test(x = x,
                         n = n,
                         conf.level = conf.level)
      c(
        conf_level = conf.level,
        x = x,
        n = n,
        est = test$estimate[[1]],
        lower_ci = test$conf.int[1],
        upper_ci = test$conf.int[2]
      )
    })
    
    results_df <- as.data.frame(do.call(rbind, results))
    return(results_df)
  }
}

# data
df <- tibble(
  successes = round(runif(1000, 0, 100)),
  trials = round(runif(1000, 0, 100)) + 100
)

# requires rowwise (slower)
df %>%
  rowwise() %>%
  mutate(beta = list(binom_test(x = successes, n = trials))) %>%
  unnest(beta, names_sep = "_")
#> # A tibble: 4,000 × 8
#>    successes trials beta_conf_level beta_x beta_n beta_est beta_lower_ci
#>                                      
#>  1        78    180            0.5      78    180    0.433         0.406
#>  2        78    180            0.75     78    180    0.433         0.389
#>  3        78    180            0.9      78    180    0.433         0.371
#>  4        78    180            0.95     78    180    0.433         0.360
#>  5        87    140            0.5      87    140    0.621         0.590
#>  6        87    140            0.75     87    140    0.621         0.570
#>  7        87    140            0.9      87    140    0.621         0.549
#>  8        87    140            0.95     87    140    0.621         0.536
#>  9        89    129            0.5      89    129    0.690         0.658
#> 10        89    129            0.75     89    129    0.690         0.637
#> # ℹ 3,990 more rows
#> # ℹ 1 more variable: beta_upper_ci 

# avoids rowwise but still slow
df %>%
  mutate(beta = pmap(list(successes, trials), binom_test)) %>%
  unnest(beta, names_sep = "_")
#> # A tibble: 4,000 × 8
#>    successes trials beta_conf_level beta_x beta_n beta_est beta_lower_ci
#>                                      
#>  1        78    180            0.5      78    180    0.433         0.406
#>  2        78    180            0.75     78    180    0.433         0.389
#>  3        78    180            0.9      78    180    0.433         0.371
#>  4        78    180            0.95     78    180    0.433         0.360
#>  5        87    140            0.5      87    140    0.621         0.590
#>  6        87    140            0.75     87    140    0.621         0.570
#>  7        87    140            0.9      87    140    0.621         0.549
#>  8        87    140            0.95     87    140    0.621         0.536
#>  9        89    129            0.5      89    129    0.690         0.658
#> 10        89    129            0.75     89    129    0.690         0.637
#> # ℹ 3,990 more rows
#> # ℹ 1 more variable: beta_upper_ci 

#parallel?
# library(furrr)
# plan(multisession, workers = 5)
# df %>%
#   mutate(beta = furrr::future_pmap(list(successes, trials), binom_test)) %>%
#   unnest(beta, names_sep = "_")

^{Created on 2023-11-24 with reprex v2.0.2}

EDIT: I found that the Hmisc package has a vectorised function to compute the CI. But I cannot handle NAs as above...

library(tidyverse)
library(Hmisc)

binom_conf <- function(x, n, alpha) {
  out <- Hmisc::binconf(
    x = x,
    n = n,
    alpha = alpha,
    method = "exact",
    return.df = TRUE
  )
  conf_level <- 1 - alpha
  out$conf_level <- conf_level
  rownames(out) <- NULL
  names(out) <- c("est", "lower_ci", "upper_ci", "conf_level")
  return(out)
}

df <- tibble(
  successes = round(runif(1e5, 0, 100)),
  trials = round(runif(1e5, 0, 100)) + 100
)

alpha <- 1 - c(0.50, 0.75, 0.90, 0.95)


system.time(
  results <-
    map_dfr(.x = alpha,
            ~ cbind(df, binom_conf(
              x = df$successes,
              n = df$trials,
              alpha = .x
            )))
)


library(furrr)
plan(multisession, workers = 5)
system.time(
  results <-
    future_map_dfr(.x = alpha,
                   ~ cbind(df, binom_conf(
                     x = df$successes,
                     n = df$trials,
                     alpha = .x
                   )))
)

r statistics vectorization lapply

Your Answer

Privacy & Cookie Consent