I have a dataset df which includes two variables: successes (x) and trials (n). I wish to obtain for every row in that dataset the stats::binom.test() results (est, lower and upper ci) for different levels of confidence. It should be able to handle n = NA or 0.
I wrote the code below but I was hoping for a faster implementation since my true dataset is very large (30 million rows). Any suggestions that will improve the speed will be appreciated (vectorisation, parallelisation...?), preferably a "tidyapproach" but not required.
library(tidyverse)
#function:
#uses lapply to loop over the conf.levels for the same x&n pair:
binom_test <- function(x, n, conf.levels = c(0.50, 0.75, 0.90, 0.95)) {
if (is.na(n) | n < 1) {
return(data.frame(
conf_level = conf.levels,
est = rep(NA, length(conf.levels)),
x = rep(NA, length(conf.levels)),
n = rep(NA, length(conf.levels)),
lower_ci = rep(NA, length(conf.levels)),
upper_ci = rep(NA, length(conf.levels))
))
} else {
results <- lapply(conf.levels, function(conf.level) {
test <- binom.test(x = x,
n = n,
conf.level = conf.level)
c(
conf_level = conf.level,
x = x,
n = n,
est = test$estimate[[1]],
lower_ci = test$conf.int[1],
upper_ci = test$conf.int[2]
)
})
results_df <- as.data.frame(do.call(rbind, results))
return(results_df)
}
}
# data
df <- tibble(
successes = round(runif(1000, 0, 100)),
trials = round(runif(1000, 0, 100)) + 100
)
# requires rowwise (slower)
df %>%
rowwise() %>%
mutate(beta = list(binom_test(x = successes, n = trials))) %>%
unnest(beta, names_sep = "_")
#> # A tibble: 4,000 × 8
#> successes trials beta_conf_level beta_x beta_n beta_est beta_lower_ci
#>
#> 1 78 180 0.5 78 180 0.433 0.406
#> 2 78 180 0.75 78 180 0.433 0.389
#> 3 78 180 0.9 78 180 0.433 0.371
#> 4 78 180 0.95 78 180 0.433 0.360
#> 5 87 140 0.5 87 140 0.621 0.590
#> 6 87 140 0.75 87 140 0.621 0.570
#> 7 87 140 0.9 87 140 0.621 0.549
#> 8 87 140 0.95 87 140 0.621 0.536
#> 9 89 129 0.5 89 129 0.690 0.658
#> 10 89 129 0.75 89 129 0.690 0.637
#> # ℹ 3,990 more rows
#> # ℹ 1 more variable: beta_upper_ci
# avoids rowwise but still slow
df %>%
mutate(beta = pmap(list(successes, trials), binom_test)) %>%
unnest(beta, names_sep = "_")
#> # A tibble: 4,000 × 8
#> successes trials beta_conf_level beta_x beta_n beta_est beta_lower_ci
#>
#> 1 78 180 0.5 78 180 0.433 0.406
#> 2 78 180 0.75 78 180 0.433 0.389
#> 3 78 180 0.9 78 180 0.433 0.371
#> 4 78 180 0.95 78 180 0.433 0.360
#> 5 87 140 0.5 87 140 0.621 0.590
#> 6 87 140 0.75 87 140 0.621 0.570
#> 7 87 140 0.9 87 140 0.621 0.549
#> 8 87 140 0.95 87 140 0.621 0.536
#> 9 89 129 0.5 89 129 0.690 0.658
#> 10 89 129 0.75 89 129 0.690 0.637
#> # ℹ 3,990 more rows
#> # ℹ 1 more variable: beta_upper_ci
#parallel?
# library(furrr)
# plan(multisession, workers = 5)
# df %>%
# mutate(beta = furrr::future_pmap(list(successes, trials), binom_test)) %>%
# unnest(beta, names_sep = "_")
Created on 2023-11-24 with reprex v2.0.2
EDIT: I found that the Hmisc package has a vectorised function to compute the CI. But I cannot handle NAs as above...
library(tidyverse)
library(Hmisc)
binom_conf <- function(x, n, alpha) {
out <- Hmisc::binconf(
x = x,
n = n,
alpha = alpha,
method = "exact",
return.df = TRUE
)
conf_level <- 1 - alpha
out$conf_level <- conf_level
rownames(out) <- NULL
names(out) <- c("est", "lower_ci", "upper_ci", "conf_level")
return(out)
}
df <- tibble(
successes = round(runif(1e5, 0, 100)),
trials = round(runif(1e5, 0, 100)) + 100
)
alpha <- 1 - c(0.50, 0.75, 0.90, 0.95)
system.time(
results <-
map_dfr(.x = alpha,
~ cbind(df, binom_conf(
x = df$successes,
n = df$trials,
alpha = .x
)))
)
library(furrr)
plan(multisession, workers = 5)
system.time(
results <-
future_map_dfr(.x = alpha,
~ cbind(df, binom_conf(
x = df$successes,
n = df$trials,
alpha = .x
)))
)