├── README.md
├── DVT_example
├── Logistic regression_validation_sample_size.R
└── Logistic regression_validation_sample_size_parallel.R
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# External-validation-sample-size

Simulation-based calculation of the sample size required to externally validate a prediction model for a binary outcome, in two versions: (1) nested for loops and (2) a parallel implementation.
--------------------------------------------------------------------------------
/DVT_example:
--------------------------------------------------------------------------------
### code for the DVT example.
### This can easily be tweaked for your problem in steps 1-3.
### In step 4, you will need to manually change N until
### the precision specified in step 3 is achieved
### (I'll wrap this all up at some point to remove the manual steps;
### an optional sketch of such a wrapper is appended at the end of this file).

## requires the rms and pROC packages to be installed

# integrated calibration index (ICI): mean absolute difference between the
# loess-smoothed observed risk and the predicted risk
ici.f <- function(Y, P){
  loess.calibrate <- loess(Y ~ P)
  P.calibrate <- predict(loess.calibrate, newdata = P)
  return(mean(abs(P.calibrate - P)))
}

# step 1: mean and standard deviation of the linear predictor
mu.LP <- -1.75
sd.LP <- 1.47

# step 2
gamma <- 0
S <- 1

# step 3: target CI half widths for the calibration slope, the C-statistic and ln(O/E)
csdiff <- 0.1
cstatdiff <- 0.05
lnoediff <- 0.1

# step 4
N <- 385 ## change this until all half CI widths are lower than the pre-specified values in step 3
N.SIM <- 1000
slope <- vector(mode = 'numeric', length = N.SIM)
slope.se <- vector(mode = 'numeric', length = N.SIM)
slope.width <- vector(mode = 'numeric', length = N.SIM)

c.statistic <- vector(mode = 'numeric', length = N.SIM)
c.statistic.se <- vector(mode = 'numeric', length = N.SIM)
c.statistic.width <- vector(mode = 'numeric', length = N.SIM)

lnOE <- vector(mode = 'numeric', length = N.SIM)
lnOE.se <- vector(mode = 'numeric', length = N.SIM)
lnOE.width <- vector(mode = 'numeric', length = N.SIM)
ici <- vector(mode = 'numeric', length = N.SIM)
events <- vector(mode = 'numeric', length = N.SIM)

set.seed(12131)
for(i in 1:N.SIM){
  # generate the linear predictor and the outcome
  LP <- rnorm(N, mu.LP, sd.LP)
  pr <- 1 / (1 + exp(-LP))
  y <- rbinom(N, 1, pr)
  df <- data.frame(y = y, LP = LP)

  # fit the calibration model
  fit <- rms::lrm(y ~ LP, data = df, x = T, y = T)

  # get the calibration slope, its standard error and 95% CI width
  slope[i] <- as.numeric(coef(fit)[2])
  slope.se[i] <- as.numeric(sqrt(diag(vcov(fit)))[2])
  slope.width[i] <- 2 * qnorm(0.025, lower = F) * slope.se[i]

  # log ratio of mean predicted to mean observed risk, its standard error and 95% CI width
  lnOE[i] <- log(mean(pr) / mean(y))
  lnOE.se[i] <- sqrt((1 - mean(pr)) / sum(y))
  lnOE.width[i] <- 2 * qnorm(0.025, lower = F) * lnOE.se[i]

  # integrated calibration index
  ici[i] <- ici.f(Y = y, P = pr)

  # get the c-statistic, its standard error and 95% CI width
  c.statistic[i] <- as.numeric(fit$stats['C'])
  c.statistic.se[i] <- sqrt(pROC::var(pROC::roc(y, LP, levels = c(0, 1), quiet = T))) # based on DeLong et al. (1988)
  c.statistic.width[i] <- 2 * qnorm(0.025, lower = F) * c.statistic.se[i]

  # store the number of events simulated
  events[i] <- sum(y)
}
mean(c.statistic)
mean(slope)
mean(exp(lnOE))
mean(ici)

mean(c.statistic.width)
mean(slope.width)
mean(lnOE.width)
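
## --- optional addition (a sketch only, not part of the original script) ---
## One way to remove the manual "change N" step: re-run the step-4 simulation
## over a grid of candidate N and tabulate the mean 95% CI widths, so the
## smallest adequate N can be read off. find_widths() is a hypothetical helper
## introduced here for illustration; it reuses mu.LP and sd.LP from step 1.
find_widths <- function(N, n.sim = 200, mu = mu.LP, sd = sd.LP){
  slope.w <- cstat.w <- lnOE.w <- numeric(n.sim)
  for(i in seq_len(n.sim)){
    LP <- rnorm(N, mu, sd)
    pr <- 1 / (1 + exp(-LP))
    y <- rbinom(N, 1, pr)
    fit <- rms::lrm(y ~ LP, x = T, y = T)
    slope.w[i] <- 2 * qnorm(0.975) * as.numeric(sqrt(diag(vcov(fit)))[2])
    cstat.w[i] <- 2 * qnorm(0.975) * sqrt(pROC::var(pROC::roc(y, LP, levels = c(0, 1), quiet = T)))
    lnOE.w[i]  <- 2 * qnorm(0.975) * sqrt((1 - mean(pr)) / sum(y))
  }
  c(N = N, slope.width = mean(slope.w), cstat.width = mean(cstat.w), lnOE.width = mean(lnOE.w))
}
## e.g. compare a few candidate sample sizes against the step-3 targets:
# t(sapply(c(385, 500, 750, 1000), find_widths))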
--------------------------------------------------------------------------------
/Logistic regression_validation_sample_size.R:
--------------------------------------------------------------------------------
# simulation-based approach to calculate the sample size required for
# external validation of a particular prediction model for a binary outcome
# Gary Collins (02-December-2020)

set.seed(200520)

## needs the rms and boot packages to be installed

E <- c(50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000)  # number of events to target
base.prob <- 0.05                     # baseline outcome probability
mu.v <- qlogis(base.prob, lower = T)  # mean of the linear predictor (logit of base.prob)
sd.v <- c(0.2, 0.4, 0.6, 0.8, 1.0)    # standard deviation of the linear predictor
n <- E / boot::inv.logit(mu.v)        # sample sizes: events divided by the baseline probability
NSIM <- 500                           # number of simulations
n.E.scenarios <- length(E)
n.sd.scenarios <- length(sd.v)

hanley_se_auc <- function(x, y){
  ### standard error for the C-statistic based on Hanley & McNeil's formula
  n1 <- sum(y == 0)
  n2 <- sum(y == 1)
  Q1 <- x / (2 - x)
  Q2 <- 2 * x^2 / (1 + x)
  sqrt((x * (1 - x) + (n1 - 1) * (Q1 - x^2) + (n2 - 1) * (Q2 - x^2)) / (n1 * n2))
}

events <- array(dim = c(NSIM, n.E.scenarios, n.sd.scenarios))          # store the number of events
c.statistic <- array(dim = c(NSIM, n.E.scenarios, n.sd.scenarios))     # store the C-statistic
c.statistic.se <- array(dim = c(NSIM, n.E.scenarios, n.sd.scenarios))  # store the standard error of the C-statistic (Hanley)
c.width <- array(dim = c(NSIM, n.E.scenarios, n.sd.scenarios))         # store the CI widths of the C-statistic
slope <- array(dim = c(NSIM, n.E.scenarios, n.sd.scenarios))           # store the calibration slope
slope.se <- array(dim = c(NSIM, n.E.scenarios, n.sd.scenarios))        # store the standard error of the calibration slope
slope.width <- array(dim = c(NSIM, n.E.scenarios, n.sd.scenarios))     # store the CI widths of the calibration slope

system.time(for(k in 1:n.sd.scenarios){
  for(j in 1:n.E.scenarios){
    for(i in 1:NSIM){

      # generate the linear predictor
      LP <- rnorm(n[j], mu.v, sd.v[k])
      pr <- 1 / (1 + exp(-LP))
      # generate the outcome
      y <- rbinom(n[j], 1, pr)

      df <- data.frame(y = y, LP = LP)

      # fit the calibration model
      fit <- rms::lrm(y ~ LP, data = df, x = T, y = T)

      # get the calibration slope, its standard error and 95% CI width
      slope[i,j,k] <- as.numeric(coef(fit)[2])
      slope.se[i,j,k] <- as.numeric(sqrt(diag(vcov(fit)))[2])
      slope.width[i,j,k] <- 2 * qnorm(0.025, lower = F) * slope.se[i,j,k]

      # get the c-statistic, its standard error and 95% CI width
      c.statistic[i,j,k] <- as.numeric(fit$stats['C'])
      c.statistic.se[i,j,k] <- hanley_se_auc(c.statistic[i,j,k], y)
      c.width[i,j,k] <- 2 * qnorm(0.025, lower = F) * c.statistic.se[i,j,k]

      # store the number of events simulated
      events[i,j,k] <- sum(y)
    }
  }
})
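
## --- optional summary (a sketch, not part of the original script) ---
## average number of events and average 95% CI widths per scenario, as a table
## complementing the plots below; summary.tab is a name introduced here
summary.tab <- expand.grid(E = E, sd.LP = sd.v)
summary.tab$mean.events <- as.vector(apply(events, c(2, 3), mean))
summary.tab$c.width     <- as.vector(apply(c.width, c(2, 3), mean))
summary.tab$slope.width <- as.vector(apply(slope.width, c(2, 3), mean))
summary.tab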
# plot the results
par(mfrow = c(1, 2))
plot(colMeans(events[,,1]), colMeans(c.width[,,1]), cex = 0.5, type = 'b', xlim = c(0, max(colMeans(events[,,5]))), ylim = c(0, max(c.width)), xlab = 'Average number of events', ylab = 'Average 95% CI width for the C-statistic', pch = 4, cex.lab = 0.75, xaxt = 'n')
axis(side = 1, at = E[-1], cex.axis = 0.75)
lines(colMeans(events[,,2]), colMeans(c.width[,,2]), cex = 0.5, type = 'b', col = 2, pch = 4)
lines(colMeans(events[,,3]), colMeans(c.width[,,3]), cex = 0.5, type = 'b', col = 3, pch = 4)
lines(colMeans(events[,,4]), colMeans(c.width[,,4]), cex = 0.5, type = 'b', col = 4, pch = 4)
lines(colMeans(events[,,5]), colMeans(c.width[,,5]), cex = 0.5, type = 'b', col = 5, pch = 4)
grid()
legend("topright", col = 1:5, legend = as.character(sd.v), lty = 1, title = expression(paste('SD(LP) ', sigma)), cex = 0.75)
abline(v = 100, lty = 2)
abline(v = 200, lty = 2)

plot(colMeans(events[,,1]), colMeans(slope.width[,,1]), cex = 0.5, type = 'b', xlim = c(0, max(colMeans(events[,,5]))), ylim = c(0, max(slope.width)), xlab = 'Average number of events', ylab = 'Average 95% CI width for the calibration slope', pch = 4, cex.lab = 0.75, xaxt = 'n')
axis(side = 1, at = E[-1], cex.axis = 0.75)
lines(colMeans(events[,,2]), colMeans(slope.width[,,2]), cex = 0.5, type = 'b', col = 2, pch = 4)
lines(colMeans(events[,,3]), colMeans(slope.width[,,3]), cex = 0.5, type = 'b', col = 3, pch = 4)
lines(colMeans(events[,,4]), colMeans(slope.width[,,4]), cex = 0.5, type = 'b', col = 4, pch = 4)
lines(colMeans(events[,,5]), colMeans(slope.width[,,5]), cex = 0.5, type = 'b', col = 5, pch = 4)
grid()
legend("topright", col = 1:5, legend = as.character(sd.v), lty = 1, title = expression(paste('SD(LP) ', sigma)), cex = 0.75)
abline(v = 100, lty = 2)
abline(v = 200, lty = 2)
--------------------------------------------------------------------------------
/Logistic regression_validation_sample_size_parallel.R:
--------------------------------------------------------------------------------
# simulation-based approach to calculate the sample size required for
# external validation of a particular prediction model for a binary outcome,
# coded in parallel
# Gary Collins (02-December-2020)

library(doParallel)
library(foreach)
library(rms)
rm(list = ls())
if ("RStudioGD" %in% names(dev.list())) dev.off(dev.list()["RStudioGD"])  # close the RStudio plotting device, if one is open

set.seed(200520)

## also needs the boot package to be installed (used below via boot::inv.logit)

hanley_se_auc <- function(x, y){
  ### standard error for the C-statistic based on Hanley & McNeil's formula
  n1 <- sum(y == 0)
  n2 <- sum(y == 1)
  Q1 <- x / (2 - x)
  Q2 <- 2 * x^2 / (1 + x)
  sqrt((x * (1 - x) + (n1 - 1) * (Q1 - x^2) + (n2 - 1) * (Q2 - x^2)) / (n1 * n2))
}

my_fun <- function(){
  # generate the linear predictor
  LP <- rnorm(n[k], mu.v, sd.v[j])
  pr <- 1 / (1 + exp(-LP))

  # generate the outcome
  y <- rbinom(n[k], 1, pr)

  df <- data.frame(y = y, LP = LP)

  # fit the calibration model
  fit <- lrm(y ~ LP, data = df, x = T, y = T)

  # get the c-statistic, its standard error and 95% CI width
  c.statistic <- as.numeric(fit$stats['C'])
  c.statistic.se <- hanley_se_auc(c.statistic, y)
  c.width <- 2 * qnorm(0.025, lower = F) * c.statistic.se

  # get the calibration slope, its standard error and 95% CI width
  slope <- as.numeric(coef(fit)[2])
  slope.se <- as.numeric(sqrt(diag(vcov(fit)))[2])
  slope.width <- 2 * qnorm(0.025, lower = F) * slope.se

  # number of events simulated
  events <- sum(y)

  myList <- list(n = n[k], sd.v = sd.v[j], events = events, c.statistic, c.width, slope, slope.width)

  return(myList)
}

E <- c(50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000)  # number of events to target
base.prob <- 0.01                     # baseline outcome probability
mu.v <- qlogis(base.prob, lower = T)  # mean of the linear predictor (logit of base.prob)
sd.v <- c(0.2, 0.4, 0.6, 0.8, 1.0)    # standard deviation of the linear predictor
n <- E / boot::inv.logit(mu.v)        # sample sizes: events divided by the baseline probability
n.E.scenarios <- length(E)
n.sd.scenarios <- length(sd.v)

NSIM <- 500 # number of simulations
n_cores <- detectCores()
cl <- makeCluster(n_cores - 1)
registerDoParallel(cl)
system.time(tmp <- foreach(k = 1:length(n), .combine = 'cbind') %:%
              foreach(j = 1:length(sd.v), .combine = 'cbind') %:%
              foreach(i = 1:NSIM, .packages = 'rms', .combine = 'rbind') %dopar% {
                rslt <- my_fun()
              })
stopCluster(cl)
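
## --- reproducibility note (an aside, not in the original script) ---
## set.seed() on the master does not control the random-number streams on the
## parallel workers, so repeated runs of the %dopar% loop above can differ.
## One option, assuming the doRNG package is installed, is to register a
## parallel-safe RNG before the foreach call, e.g.
# library(doRNG)
# registerDoRNG(200520)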
# average over the NSIM replicates: tmp holds 7 quantities per run for each of
# the 55 (= 11 E values x 5 SD values) scenarios
OUT <- matrix(colMeans(matrix(unlist(tmp), ncol = 7 * n.E.scenarios * n.sd.scenarios, byrow = F)), ncol = 7, byrow = T)
OUT <- data.frame(OUT)
names(OUT) <- c("n", "sd(LP)", "events", "c.statistic", "c (width)", "slope", "slope (width)")

# rows of OUT corresponding to the first value of sd.v (0.2);
# idx + 1, idx + 2, ... give the rows for the remaining values of sd.v
idx <- seq(1, 51, by = 5)

par(mfrow = c(1, 2))
plot(OUT[idx, 3], OUT[idx, 5], type = 'b', xlab = 'Average number of events', ylab = 'Average 95% CI width for the C-statistic', ylim = c(0, max(OUT[, 5])), xlim = c(0, max(OUT[, 3])), pch = 4, cex.lab = 0.75, xaxt = 'n')
axis(side = 1, at = E[-1], cex.axis = 0.75)
lines(OUT[idx + 1, 3], OUT[idx + 1, 5], type = 'b', col = 2, cex = 0.5, pch = 4)
lines(OUT[idx + 2, 3], OUT[idx + 2, 5], type = 'b', col = 3, cex = 0.5, pch = 4)
lines(OUT[idx + 3, 3], OUT[idx + 3, 5], type = 'b', col = 4, cex = 0.5, pch = 4)
lines(OUT[idx + 4, 3], OUT[idx + 4, 5], type = 'b', col = 5, cex = 0.5, pch = 4)
grid()
legend("topright", col = 1:5, legend = as.character(sd.v), lty = 1, title = expression(paste('SD(LP) ', sigma)), cex = 0.75)
abline(v = 100, lty = 2)
abline(v = 200, lty = 2)

plot(OUT[idx, 3], OUT[idx, 7], type = 'b', xlab = 'Average number of events', ylab = 'Average 95% CI width for the calibration slope', ylim = c(0, max(OUT[, 7])), xlim = c(0, max(OUT[, 3])), pch = 4, cex.lab = 0.75, xaxt = 'n')
axis(side = 1, at = E[-1], cex.axis = 0.75)
lines(OUT[idx + 1, 3], OUT[idx + 1, 7], type = 'b', col = 2, cex = 0.5, pch = 4)
lines(OUT[idx + 2, 3], OUT[idx + 2, 7], type = 'b', col = 3, cex = 0.5, pch = 4)
lines(OUT[idx + 3, 3], OUT[idx + 3, 7], type = 'b', col = 4, cex = 0.5, pch = 4)
lines(OUT[idx + 4, 3], OUT[idx + 4, 7], type = 'b', col = 5, cex = 0.5, pch = 4)
grid()
legend("topright", col = 1:5, legend = as.character(sd.v), lty = 1, title = expression(paste('SD(LP) ', sigma)), cex = 0.75)
abline(v = 100, lty = 2)
abline(v = 200, lty = 2)
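
## --- optional helper (a sketch, not part of the original workflow) ---
## read off the smallest simulated scenario whose average CI widths meet
## pre-specified targets; the 0.10 values below are placeholders, not recommendations
ok <- subset(OUT, `c (width)` <= 0.10 & `slope (width)` <= 0.10)
if (nrow(ok) > 0) ok[which.min(ok$events), ] else message("no scenario meets these targets; extend E")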