├── README.md
├── DVT_example
├── Logistic regression_validation_sample_size.R
└── Logistic regression_validation_sample_size_parallel.R
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# External-validation-sample-size

Simulation-based calculation of the sample size required to externally validate a prediction model for a binary outcome, in two versions: (1) nested for loops and (2) a parallel implementation.
--------------------------------------------------------------------------------
/DVT_example:
--------------------------------------------------------------------------------
### code for the DVT example.
### This can easily be tweaked for your problem in steps 1-3.
### In step 4, you will need to manually change N until
### the precision specified in step 3 is achieved
### (I'll wrap this all up at some point to remove the manual steps;
### an optional sketch of such a wrapper is appended at the end of this file).

## requires the rms and pROC packages to be installed

# integrated calibration index (ICI): mean absolute difference between the
# loess-smoothed observed risk and the predicted risk
ici.f <- function(Y, P){
  loess.calibrate <- loess(Y ~ P)
  P.calibrate <- predict(loess.calibrate, newdata = P)
  return(mean(abs(P.calibrate - P)))
}

# step 1: mean and standard deviation of the linear predictor
mu.LP <- -1.75
sd.LP <- 1.47

# step 2
gamma <- 0
S <- 1

# step 3: target CI half widths for the calibration slope, the C-statistic and ln(O/E)
csdiff <- 0.1
cstatdiff <- 0.05
lnoediff <- 0.1

# step 4
N <- 385 ## change this until all half CI widths are lower than the pre-specified values in step 3
N.SIM <- 1000
slope <- vector(mode = 'numeric', length = N.SIM)
slope.se <- vector(mode = 'numeric', length = N.SIM)
slope.width <- vector(mode = 'numeric', length = N.SIM)

c.statistic <- vector(mode = 'numeric', length = N.SIM)
c.statistic.se <- vector(mode = 'numeric', length = N.SIM)
c.statistic.width <- vector(mode = 'numeric', length = N.SIM)

lnOE <- vector(mode = 'numeric', length = N.SIM)
lnOE.se <- vector(mode = 'numeric', length = N.SIM)
lnOE.width <- vector(mode = 'numeric', length = N.SIM)
ici <- vector(mode = 'numeric', length = N.SIM)
events <- vector(mode = 'numeric', length = N.SIM)

set.seed(12131)
for(i in 1:N.SIM){
  # generate the linear predictor and the outcome
  LP <- rnorm(N, mu.LP, sd.LP)
  pr <- 1 / (1 + exp(-LP))
  y <- rbinom(N, 1, pr)
  df <- data.frame(y = y, LP = LP)

  # fit the calibration model
  fit <- rms::lrm(y ~ LP, data = df, x = T, y = T)

  # get the calibration slope, its standard error and 95% CI width
  slope[i] <- as.numeric(coef(fit)[2])
  slope.se[i] <- as.numeric(sqrt(diag(vcov(fit)))[2])
  slope.width[i] <- 2 * qnorm(0.025, lower = F) * slope.se[i]

  # log ratio of mean predicted to mean observed risk, its standard error and 95% CI width
  lnOE[i] <- log(mean(pr) / mean(y))
  lnOE.se[i] <- sqrt((1 - mean(pr)) / sum(y))
  lnOE.width[i] <- 2 * qnorm(0.025, lower = F) * lnOE.se[i]

  # integrated calibration index
  ici[i] <- ici.f(Y = y, P = pr)

  # get the c-statistic, its standard error and 95% CI width
  c.statistic[i] <- as.numeric(fit$stats['C'])
  c.statistic.se[i] <- sqrt(pROC::var(pROC::roc(y, LP, levels = c(0, 1), quiet = T))) # based on DeLong et al. (1988)
  c.statistic.width[i] <- 2 * qnorm(0.025, lower = F) * c.statistic.se[i]

  # store the number of events simulated
  events[i] <- sum(y)
}
mean(c.statistic)
mean(slope)
mean(exp(lnOE))
mean(ici)

mean(c.statistic.width)
mean(slope.width)
mean(lnOE.width)
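
## --- optional addition (a sketch only, not part of the original script) ---
## One way to remove the manual "change N" step: re-run the step-4 simulation
## over a grid of candidate N and tabulate the mean 95% CI widths, so the
## smallest adequate N can be read off. find_widths() is a hypothetical helper
## introduced here for illustration; it reuses mu.LP and sd.LP from step 1.
find_widths <- function(N, n.sim = 200, mu = mu.LP, sd = sd.LP){
  slope.w <- cstat.w <- lnOE.w <- numeric(n.sim)
  for(i in seq_len(n.sim)){
    LP <- rnorm(N, mu, sd)
    pr <- 1 / (1 + exp(-LP))
    y <- rbinom(N, 1, pr)
    fit <- rms::lrm(y ~ LP, x = T, y = T)
    slope.w[i] <- 2 * qnorm(0.975) * as.numeric(sqrt(diag(vcov(fit)))[2])
    cstat.w[i] <- 2 * qnorm(0.975) * sqrt(pROC::var(pROC::roc(y, LP, levels = c(0, 1), quiet = T)))
    lnOE.w[i]  <- 2 * qnorm(0.975) * sqrt((1 - mean(pr)) / sum(y))
  }
  c(N = N, slope.width = mean(slope.w), cstat.width = mean(cstat.w), lnOE.width = mean(lnOE.w))
}
## e.g. compare a few candidate sample sizes against the step-3 targets:
# t(sapply(c(385, 500, 750, 1000), find_widths))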
--------------------------------------------------------------------------------
/Logistic regression_validation_sample_size.R:
--------------------------------------------------------------------------------
# simulation-based approach to calculate the sample size required for
# external validation of a particular prediction model for a binary outcome
# Gary Collins (02-December-2020)

set.seed(200520)

## needs the rms and boot packages to be installed

E <- c(50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000)  # number of events to target
base.prob <- 0.05                     # baseline outcome probability
mu.v <- qlogis(base.prob, lower = T)  # mean of the linear predictor (logit of base.prob)
sd.v <- c(0.2, 0.4, 0.6, 0.8, 1.0)    # standard deviation of the linear predictor
n <- E / boot::inv.logit(mu.v)        # sample sizes: events divided by the baseline probability
NSIM <- 500                           # number of simulations
n.E.scenarios <- length(E)
n.sd.scenarios <- length(sd.v)

hanley_se_auc <- function(x, y){
  ### standard error for the C-statistic based on Hanley & McNeil's formula
  n1 <- sum(y == 0)
  n2 <- sum(y == 1)
  Q1 <- x / (2 - x)
  Q2 <- 2 * x^2 / (1 + x)
  sqrt((x * (1 - x) + (n1 - 1) * (Q1 - x^2) + (n2 - 1) * (Q2 - x^2)) / (n1 * n2))
}

events <- array(dim = c(NSIM, n.E.scenarios, n.sd.scenarios))          # store the number of events
c.statistic <- array(dim = c(NSIM, n.E.scenarios, n.sd.scenarios))     # store the C-statistic
c.statistic.se <- array(dim = c(NSIM, n.E.scenarios, n.sd.scenarios))  # store the standard error of the C-statistic (Hanley)
c.width <- array(dim = c(NSIM, n.E.scenarios, n.sd.scenarios))         # store the CI widths of the C-statistic
slope <- array(dim = c(NSIM, n.E.scenarios, n.sd.scenarios))           # store the calibration slope
slope.se <- array(dim = c(NSIM, n.E.scenarios, n.sd.scenarios))        # store the standard error of the calibration slope
slope.width <- array(dim = c(NSIM, n.E.scenarios, n.sd.scenarios))     # store the CI widths of the calibration slope

system.time(for(k in 1:n.sd.scenarios){
  for(j in 1:n.E.scenarios){
    for(i in 1:NSIM){

      # generate the linear predictor
      LP <- rnorm(n[j], mu.v, sd.v[k])
      pr <- 1 / (1 + exp(-LP))
      # generate the outcome
      y <- rbinom(n[j], 1, pr)

      df <- data.frame(y = y, LP = LP)

      # fit the calibration model
      fit <- rms::lrm(y ~ LP, data = df, x = T, y = T)

      # get the calibration slope, its standard error and 95% CI width
      slope[i,j,k] <- as.numeric(coef(fit)[2])
      slope.se[i,j,k] <- as.numeric(sqrt(diag(vcov(fit)))[2])
      slope.width[i,j,k] <- 2 * qnorm(0.025, lower = F) * slope.se[i,j,k]

      # get the c-statistic, its standard error and 95% CI width
      c.statistic[i,j,k] <- as.numeric(fit$stats['C'])
      c.statistic.se[i,j,k] <- hanley_se_auc(c.statistic[i,j,k], y)
      c.width[i,j,k] <- 2 * qnorm(0.025, lower = F) * c.statistic.se[i,j,k]

      # store the number of events simulated
      events[i,j,k] <- sum(y)
    }
  }
})
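
## --- optional summary (a sketch, not part of the original script) ---
## average number of events and average 95% CI widths per scenario, as a table
## complementing the plots below; summary.tab is a name introduced here
summary.tab <- expand.grid(E = E, sd.LP = sd.v)
summary.tab$mean.events <- as.vector(apply(events, c(2, 3), mean))
summary.tab$c.width     <- as.vector(apply(c.width, c(2, 3), mean))
summary.tab$slope.width <- as.vector(apply(slope.width, c(2, 3), mean))
summary.tab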
# plot the results
par(mfrow = c(1, 2))
plot(colMeans(events[,,1]), colMeans(c.width[,,1]), cex = 0.5, type = 'b', xlim = c(0, max(colMeans(events[,,5]))), ylim = c(0, max(c.width)), xlab = 'Average number of events', ylab = 'Average 95% CI width for the C-statistic', pch = 4, cex.lab = 0.75, xaxt = 'n')
axis(side = 1, at = E[-1], cex.axis = 0.75)
lines(colMeans(events[,,2]), colMeans(c.width[,,2]), cex = 0.5, type = 'b', col = 2, pch = 4)
lines(colMeans(events[,,3]), colMeans(c.width[,,3]), cex = 0.5, type = 'b', col = 3, pch = 4)
lines(colMeans(events[,,4]), colMeans(c.width[,,4]), cex = 0.5, type = 'b', col = 4, pch = 4)
lines(colMeans(events[,,5]), colMeans(c.width[,,5]), cex = 0.5, type = 'b', col = 5, pch = 4)
grid()
legend("topright", col = 1:5, legend = as.character(sd.v), lty = 1, title = expression(paste('SD(LP) ', sigma)), cex = 0.75)
abline(v = 100, lty = 2)
abline(v = 200, lty = 2)

plot(colMeans(events[,,1]), colMeans(slope.width[,,1]), cex = 0.5, type = 'b', xlim = c(0, max(colMeans(events[,,5]))), ylim = c(0, max(slope.width)), xlab = 'Average number of events', ylab = 'Average 95% CI width for the calibration slope', pch = 4, cex.lab = 0.75, xaxt = 'n')
axis(side = 1, at = E[-1], cex.axis = 0.75)
lines(colMeans(events[,,2]), colMeans(slope.width[,,2]), cex = 0.5, type = 'b', col = 2, pch = 4)
lines(colMeans(events[,,3]), colMeans(slope.width[,,3]), cex = 0.5, type = 'b', col = 3, pch = 4)
lines(colMeans(events[,,4]), colMeans(slope.width[,,4]), cex = 0.5, type = 'b', col = 4, pch = 4)
lines(colMeans(events[,,5]), colMeans(slope.width[,,5]), cex = 0.5, type = 'b', col = 5, pch = 4)
grid()
legend("topright", col = 1:5, legend = as.character(sd.v), lty = 1, title = expression(paste('SD(LP) ', sigma)), cex = 0.75)
abline(v = 100, lty = 2)
abline(v = 200, lty = 2)
--------------------------------------------------------------------------------
/Logistic regression_validation_sample_size_parallel.R:
--------------------------------------------------------------------------------
# simulation-based approach to calculate the sample size required for
# external validation of a particular prediction model for a binary outcome,
# coded in parallel
# Gary Collins (02-December-2020)

library(doParallel)
library(foreach)
library(rms)
rm(list = ls())
if ("RStudioGD" %in% names(dev.list())) dev.off(dev.list()["RStudioGD"])  # close the RStudio plotting device, if one is open

set.seed(200520)

## also needs the boot package to be installed (used below via boot::inv.logit)

hanley_se_auc <- function(x, y){
  ### standard error for the C-statistic based on Hanley & McNeil's formula
  n1 <- sum(y == 0)
  n2 <- sum(y == 1)
  Q1 <- x / (2 - x)
  Q2 <- 2 * x^2 / (1 + x)
  sqrt((x * (1 - x) + (n1 - 1) * (Q1 - x^2) + (n2 - 1) * (Q2 - x^2)) / (n1 * n2))
}

my_fun <- function(){
  # generate the linear predictor
  LP <- rnorm(n[k], mu.v, sd.v[j])
  pr <- 1 / (1 + exp(-LP))

  # generate the outcome
  y <- rbinom(n[k], 1, pr)

  df <- data.frame(y = y, LP = LP)

  # fit the calibration model
  fit <- lrm(y ~ LP, data = df, x = T, y = T)

  # get the c-statistic, its standard error and 95% CI width
  c.statistic <- as.numeric(fit$stats['C'])
  c.statistic.se <- hanley_se_auc(c.statistic, y)
  c.width <- 2 * qnorm(0.025, lower = F) * c.statistic.se

  # get the calibration slope, its standard error and 95% CI width
  slope <- as.numeric(coef(fit)[2])
  slope.se <- as.numeric(sqrt(diag(vcov(fit)))[2])
  slope.width <- 2 * qnorm(0.025, lower = F) * slope.se

  # number of events simulated
  events <- sum(y)

  myList <- list(n = n[k], sd.v = sd.v[j], events = events, c.statistic, c.width, slope, slope.width)

  return(myList)
}

E <- c(50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000)  # number of events to target
base.prob <- 0.01                     # baseline outcome probability
mu.v <- qlogis(base.prob, lower = T)  # mean of the linear predictor (logit of base.prob)
sd.v <- c(0.2, 0.4, 0.6, 0.8, 1.0)    # standard deviation of the linear predictor
n <- E / boot::inv.logit(mu.v)        # sample sizes: events divided by the baseline probability
n.E.scenarios <- length(E)
n.sd.scenarios <- length(sd.v)

NSIM <- 500 # number of simulations
n_cores <- detectCores()
cl <- makeCluster(n_cores - 1)
registerDoParallel(cl)
system.time(tmp <- foreach(k = 1:length(n), .combine = 'cbind') %:%
              foreach(j = 1:length(sd.v), .combine = 'cbind') %:%
              foreach(i = 1:NSIM, .packages = 'rms', .combine = 'rbind') %dopar% {
                rslt <- my_fun()
              })
stopCluster(cl)
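
## --- reproducibility note (an aside, not in the original script) ---
## set.seed() on the master does not control the random-number streams on the
## parallel workers, so repeated runs of the %dopar% loop above can differ.
## One option, assuming the doRNG package is installed, is to register a
## parallel-safe RNG before the foreach call, e.g.
# library(doRNG)
# registerDoRNG(200520)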
# average over the NSIM replicates: tmp holds 7 quantities per run for each of
# the 55 (= 11 E values x 5 SD values) scenarios
OUT <- matrix(colMeans(matrix(unlist(tmp), ncol = 7 * n.E.scenarios * n.sd.scenarios, byrow = F)), ncol = 7, byrow = T)
OUT <- data.frame(OUT)
names(OUT) <- c("n", "sd(LP)", "events", "c.statistic", "c (width)", "slope", "slope (width)")

# rows of OUT corresponding to the first value of sd.v (0.2);
# idx + 1, idx + 2, ... give the rows for the remaining values of sd.v
idx <- seq(1, 51, by = 5)

par(mfrow = c(1, 2))
plot(OUT[idx, 3], OUT[idx, 5], type = 'b', xlab = 'Average number of events', ylab = 'Average 95% CI width for the C-statistic', ylim = c(0, max(OUT[, 5])), xlim = c(0, max(OUT[, 3])), pch = 4, cex.lab = 0.75, xaxt = 'n')
axis(side = 1, at = E[-1], cex.axis = 0.75)
lines(OUT[idx + 1, 3], OUT[idx + 1, 5], type = 'b', col = 2, cex = 0.5, pch = 4)
lines(OUT[idx + 2, 3], OUT[idx + 2, 5], type = 'b', col = 3, cex = 0.5, pch = 4)
lines(OUT[idx + 3, 3], OUT[idx + 3, 5], type = 'b', col = 4, cex = 0.5, pch = 4)
lines(OUT[idx + 4, 3], OUT[idx + 4, 5], type = 'b', col = 5, cex = 0.5, pch = 4)
grid()
legend("topright", col = 1:5, legend = as.character(sd.v), lty = 1, title = expression(paste('SD(LP) ', sigma)), cex = 0.75)
abline(v = 100, lty = 2)
abline(v = 200, lty = 2)

plot(OUT[idx, 3], OUT[idx, 7], type = 'b', xlab = 'Average number of events', ylab = 'Average 95% CI width for the calibration slope', ylim = c(0, max(OUT[, 7])), xlim = c(0, max(OUT[, 3])), pch = 4, cex.lab = 0.75, xaxt = 'n')
axis(side = 1, at = E[-1], cex.axis = 0.75)
lines(OUT[idx + 1, 3], OUT[idx + 1, 7], type = 'b', col = 2, cex = 0.5, pch = 4)
lines(OUT[idx + 2, 3], OUT[idx + 2, 7], type = 'b', col = 3, cex = 0.5, pch = 4)
lines(OUT[idx + 3, 3], OUT[idx + 3, 7], type = 'b', col = 4, cex = 0.5, pch = 4)
lines(OUT[idx + 4, 3], OUT[idx + 4, 7], type = 'b', col = 5, cex = 0.5, pch = 4)
grid()
legend("topright", col = 1:5, legend = as.character(sd.v), lty = 1, title = expression(paste('SD(LP) ', sigma)), cex = 0.75)
abline(v = 100, lty = 2)
abline(v = 200, lty = 2)
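
## --- optional helper (a sketch, not part of the original workflow) ---
## read off the smallest simulated scenario whose average CI widths meet
## pre-specified targets; the 0.10 values below are placeholders, not recommendations
ok <- subset(OUT, `c (width)` <= 0.10 & `slope (width)` <= 0.10)
if (nrow(ok) > 0) ok[which.min(ok$events), ] else message("no scenario meets these targets; extend E")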