├── .gitignore
├── .gitattributes
├── baroda_0102_1obs.dta
├── LICENSE.txt
├── README.md
├── J-PAL_Power_by_simulation_no_clusters.R
├── J-PAL_Power_by_simulation_no_clusters.do
├── J-PAL_Power_by_simulation_clusters.R
├── J-PAL_Power_by_simulation_clusters.do
├── J-PAL_Power_built_in_commands.do
└── JPAL_Power_built_in_commands.R


/.gitignore:
--------------------------------------------------------------------------------
1 | 
2 | .Rhistory
3 | .DS_Store
4 | .Rhistory
5 | .Rapp.history
6 | 


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 


--------------------------------------------------------------------------------
/baroda_0102_1obs.dta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/J-PAL/Sample_Size_and_Power/HEAD/baroda_0102_1obs.dta


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 Sabhya Gupta
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Power_code
 2 |  Sample code for conducting power calculations using in-built commands and simulations
 3 | 
 4 | ## Description
 5 | 
 6 | This repository contains some sample code on conducting power calculations in either STATA or R. All files are self contained and can be  run independently from the other scripts. Please read the code preamble for more details on each file. Please refer to the longer [Power calculations research resource](https://www.povertyactionlab.org/resource/power-calculations) to learn more about the intuition behind the power calculations. 
 7 | 
 8 | 
 9 | ## Files
10 | 
11 | J-PAL_Power_built_in_commands: Uses in-built power commands in STATA and R to calculate sample size and minimum detectable effect size with or without covariates and with or without imperfect compliance in individual and clustered models. Both files can be run with any baseline dataset with a continuous outcome and binary treatment variable. See the code preamble for more instructions on how to adapt the code to your context. The sample code uses the Balsakhi dataset (baroda_0102_1obs.dta) for illustration purposes. 
12 | 
13 | J-PAL_Power_by_simulation_no_clusters: Calculates power using a dummy dataset simulated using an underlying sample distribution and a few design parameters. The underlying distribution and the design factors can be changed to suit the context of use. 
14 | 
15 | J-PAL_Power_by_simulation_clusters: Calculates power with simulated dataset as the previous file but with a clustered design. 
16 | 
17 | baroda_0102_1obs.dta: Data file required to run J-PAL_Power_built_in_commands as written. The file can also be run with other similar datasets with a continuous outcome and treatment variable. You can learn more about the Balsakhi dataset from the documentation and data at https://doi.org/10.7910/DVN/UV7ERB. 
18 | 
19 | ## Support
20 | 
21 | Please use the [issue tracker](https://github.com/J-PAL/Sample_Size_and_Power/issues) for all support requests
22 | 
23 | 
24 | ## License
25 | 
26 | See [license file](https://github.com/J-PAL/Sample_Size_and_Power/blob/main/LICENSE.txt)
27 | 
28 | ## Authors
29 | Sabhya Gupta (sagupta@povertyactionlab.org) with input from Jack Cavanagh, Maya Duru, Mike Gibson and Sarah Kopper
30 | 
31 | 
32 | 


--------------------------------------------------------------------------------
/J-PAL_Power_by_simulation_no_clusters.R:
--------------------------------------------------------------------------------
  1 | # Power by Simulation for an experiment without clusters
  2 | ##################################################################################
  3 | # 
  4 | # This R-script:
  5 | # - Generates fake data for control and treatment with pre-determined effect size and error distribution
  6 | # - Runs the regression
  7 | # - Repeats this over and over and record how many times the treatment effect is statistically significant 
  8 | # - The percentage of simulations with a statistically significant effect is the power
  9 | # 
 10 | # Our model is:
 11 | #   
 12 | #   outcome = intercept + effect*treatment_dummy + individual_error
 13 | # 
 14 | # We assume that the error is normally distributed but the researcher can specify their own distribution
 15 | # 
 16 | # 
 17 | # Contents:
 18 | #   
 19 | #   1. SPECIFY design factors
 20 | #   2. Simulation on generated data
 21 | #   3. Load results of simulation and estimate power
 22 | # 
 23 | # 
 24 | # - To use this file for your own power calculations, change the dataset/variables/values marked as "SPECIFY" 
 25 | # - The seed and all the locals in section 1 can be changed based on the research design 
 26 | # 
 27 | # Created by: Sabhya Gupta with input from Jack Cavanagh, Mike Gibson, Sarah Kopper
 28 | # To report errors/make suggestions or ask questions contact: Sabhya Gupta - sagupta@povertyactionlab.org / sabhya154@gmail.com
 29 | # Last edited: 05/19/2021
 30 | 
 31 | ###############################################################################
 32 | 
 33 | 
 34 | #### 1. SPECIFY design factors ####################################
 35 | 
 36 | ## set seed, specify design factors and the number of simulations
 37 | set.seed(123456)                                                                # SPECIFY - fixes the random draws so that the results can be replicated
 38 | sample_size <- 500                                                              # SPECIFY - total number of individuals in the study
 39 | effect <- 0.2                                                                   # SPECIFY - hypothesized treatment effect, added to the outcome of treated individuals
 40 | prop <- 0.5                                                                     # SPECIFY - probability that an individual is assigned to the treatment group
 41 | alpha <- 0.05                                                                   # SPECIFY - significance level of the test
 42 | sim.size <- 2000                                                                # SPECIFY - number of simulated experiments
 43 | side <- "two"                                                                   # SPECIFY - kind of test: "two", "left" or "right"
 44 | 
 45 | 
 46 | ### 2. Calculate the power by Simulation ##########################
 47 | 
 48 | ## initialize vectors and data frames to store results of the simulation
 49 | ## Storage for the results of every simulated experiment
 50 | reject_t <- numeric(sim.size)                                                   #1 if the null is rejected in a simulation, 0 otherwise
 51 | t_value <- numeric(sim.size)                                                    #t value of the treatment coefficient
 52 | 
 53 | ## The critical values depend only on the design factors, so they are computed once,
 54 | ## before the loop. The regression estimates two coefficients (intercept and treatment),
 55 | ## so every kind of test uses sample_size-2 degrees of freedom.
 56 | df_resid <- sample_size - 2
 57 | 
 58 | if (side == "two") {
 59 |   critical_l <- qt(alpha/2, df_resid)                                           #the lower critical value
 60 |   critical_u <- qt(1-alpha/2, df_resid)                                         #the upper critical value
 61 | } else if (side == "right") {
 62 |   critical_l <- -Inf                                                            #no rejection region in the lower tail
 63 |   critical_u <- qt(1-alpha, df_resid)                                           #the upper critical value
 64 | } else if (side == "left") {
 65 |   critical_l <- qt(alpha, df_resid)                                             #the lower critical value
 66 |   critical_u <- Inf                                                             #no rejection region in the upper tail
 67 | } else {
 68 |   stop("side must be \"two\", \"left\" or \"right\"", call. = FALSE)            #an unknown value previously gave a power of 0 without any warning
 69 | }
 70 | 
 71 | ## The simulation loop
 72 | 
 73 | for (i in seq_len(sim.size)) {
 74 | 
 75 |   #random assignment: each individual is treated with probability prop
 76 |   treat <- ifelse(runif(n=sample_size, min=0, max=1) < prop, 1, 0)
 77 | 
 78 |   #SPECIFY - the simulated error is assumed to follow a normal distribution, but the
 79 |   #researcher can decide the appropriate distribution for the sample population.
 80 |   #The sd is the expected standard deviation of the outcome
 81 |   error <- rnorm(n=sample_size, mean=0, sd=1)
 82 | 
 83 |   #SPECIFY - assumed outcome for the control (10 + error) based on the design specification;
 84 |   #the potential outcome for the treatment is higher than the control by the effect size
 85 |   simulated_data <- data.frame(treat = treat, error = error,
 86 |                                outcome = 10 + error + effect*treat)
 87 | 
 88 |   #simple regression - the true effect is as specified above
 89 |   fit.sim <- lm(outcome ~ treat, data = simulated_data)
 90 | 
 91 |   #the t value for the t test (estimate / standard error), read from a single summary() call
 92 |   t_value[i] <- coef(summary(fit.sim))["treat", "t value"]
 93 | 
 94 |   #reject (1) if the t value lies in the critical region of the chosen test, otherwise 0
 95 |   reject_t[i] <- as.numeric(t_value[i] < critical_l || t_value[i] > critical_u)
 96 | 
 97 | }
 98 | 
 99 |   
100 | ### 3. Load results of simulation and estimate power ######################
101 |   
102 | power <- mean(reject_t)                                                         #share of the simulations in which the null was rejected
103 | 
104 | cat("If the treatment effect and the assumption about the distribution of the error are true, the study with the sample size",
105 |     sample_size, "will detect the true treatment effect of", effect, "with probability", power,  #paste0 avoids the space cat would put before "-sided"
106 |     "in a", paste0(side, "-sided"), "test. Hence the power of the study is", power, "\n")
107 | 
108 | ##### To improve the power, you can:
109 | #   increase sample size (sample_size)
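110 | #   in a clustered design (not simulated in this script): increase number of clusters (num_clusters). This will be more effective than increasing cluster size if the individuals are highly correlated
111 | #   in a clustered design (not simulated in this script): increase cluster size (cluster_size)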
112 | #   adjust the ratio of treatment to control (prop)
113 | #   increase significance level (alpha)
114 | #   add covariates to the regression (should be done carefully when simulating data)
115 | # 
116 | # note that increasing the effect size will mechanically increase power, but to ensure a study is adequately 
117 | # powered it is important to use a reasonable effect size (i.e., the effect size specified here should not be 
118 | # increased just to make the calculations turn out in your favor). So, to increase power, you should only increase
119 | # the effect size specified if you believe you can increase the effect size in the real world, such as by tweaking
120 | # the intervention
121 | # 
122 | # in a clustered design (this script has no clusters and therefore no ICC) the same is true of the ICC: a lower ICC will 
123 | # mechanically increase study power, but it's important to use a reasonable ICC rather than one that is favorable in your calculations
124 | 


--------------------------------------------------------------------------------
/J-PAL_Power_by_simulation_no_clusters.do:
--------------------------------------------------------------------------------
  1 | *Power by Simulation for an experiment without clusters
  2 | 
  3 | *Created by: Sabhya Gupta with input from Jack Cavanagh, Mike Gibson, Sarah Kopper
  4 | *All errors are the author's alone
  5 | 
  6 | *To report errors/make suggestions or ask questions contact: Sabhya Gupta - sagupta@povertyactionlab.org
  7 | *Last edited: 06/02/2021
  8 | 
  9 | ********************************************************************************
 10 | 
 11 | /*
 12 | 
 13 | Our model is:
 14 | 
 15 | outcome = intercept + effect*treatment_dummy + individual_error
 16 | 
 17 | The error is assumed to be normally distributed, though the researcher can specify their own distribution
 18 |  
 19 | 
 20 | This do-file:
 21 | *Generates fake data for control and treatment with pre-determined effect size and error distribution
 22 | *Runs the regression
 23 | *Repeats this over and over and record how many times the treatment effect is statistically significant 
 24 | *The percentage of simulations with a statistically significant effect is the study's power
 25 | 
 26 | 
 27 | 
 28 | Contents:
 29 | 
 30 | 	0. Housekeeping and specify temp files 
 31 | 	1. SPECIFY design factors and the number of simulations
 32 | 	2. Simulation on generated data
 33 | 	3. Load results of simulation and estimate power
 34 | 	
 35 | 	
 36 | To use this file for your own power calculations, change the dataset/variables/values marked as "SPECIFY" 
 37 | The seed and all the locals in section 1 can be changed based on the research design 
 38 | 
 39 | 
 40 | */
 41 | 
 42 | ****************************************************************************************
 43 | ***************************** 0. Housekeeping and specify temp files *******************
 44 | ****************************************************************************************
 45 | 
 46 | 
 47 | cd "" 																			//SPECIFY - working directory
 48 | capture log close																//close the unnamed log if one is still open (e.g. after an interrupted run); the log opened below has no name(), so closing by name would leave it open and make "log using" fail
 49 | log using "power_by_simulation_no_clusters", replace
 50 |  
 51 | 
 52 | *Create a temporary file that will store the results of the simulation
 53 | tempname sim_name																//handle used by postfile, post and postclose
 54 | tempfile sim_results															//temporary dataset that receives one row per simulation
 55 | postfile `sim_name' reject_t using `sim_results'
 56 | 
 57 | clear
 58 | set seed 123456																	//SPECIFY - seed for replication 
 59 | 
 60 | 
 61 | ******************************************************************************************
 62 | ************* 1. SPECIFY design factors and the number of simulations ********************
 63 | ******************************************************************************************
 64 | 
 65 | local sample_size = 500															//SPECIFY - sample size (number of observations generated in every simulation)
 66 | local effect = 0.2																//SPECIFY - hypothesized treatment effect, added to the outcome of treated observations
 67 | local prop = 0.5 																//SPECIFY - probability that an observation is assigned to treatment (treat = runiform() < prop)
 68 | local alpha = 0.05																//SPECIFY - define the significance value. 
 69 | local side "two"																//SPECIFY - "two", "left", "right" for a two-sided, left-sided or right-sided test
 70 | 
 71 | local sims=1000																	//SPECIFY - number of simulations
 72 | 
 73 | 
 74 | ******************************************************************************************
 75 | ******************************* 2. Simulation on generated data **************************
 76 | ******************************************************************************************
 77 | 
 78 | *Generate fake data with specified distribution and effect, regress outcome on treatment and record if significant
 79 | local it = 1																	//iteration number
 80 | *Each pass of the loop below simulates one experiment and posts one 0/1 rejection indicator to the results file
 81 | while `it' <=`sims'{
 82 | 		clear
 83 | 		quietly set obs `sample_size'
 84 | 		quietly gen treat= runiform()<`prop'									//random assignment: treat is 1 with probability prop and 0 otherwise
 85 | 
 86 | 		drawnorm error, m(0) sd(1)												//SPECIFY - the error is assumed to follow a standard normal but the researcher can decide the appropriate distribution for the sample population
 87 | 		quietly gen outcome = 10 + error										//SPECIFY - assumed outcome for control. This can change based on the design specification
 88 | 		
 89 | 		quietly replace outcome = outcome + `effect' if treat					//potential outcome for the treatment is higher than the control by the effect size
 90 | 		
 91 | 		quietly regress outcome treat											//simple regression - the true effect is as specified above
 92 | 
 93 | 		local t_value = _b[treat]/_se[treat]									//the t-value for the t-test: coefficient on treat divided by its standard error
 94 | 		local df=`sample_size'-2												//degrees of freedom: sample size minus the two estimated coefficients (constant and treat)
 95 | 			
 96 | 	if "`side'" == "two" {
 97 | 		local critical_l = invt(`df', `alpha'/2)								//the lower critical value
 98 | 		local critical_u = invt(`df', 1-`alpha'/2)								//the upper critical value
 99 | 		local reject_t=(`t_value'>`critical_u')|(`t_value'<`critical_l')		//reject if the t-value lies in either tail of the critical region. It takes value 1 if the null is rejected and 0 if not
100 | 	}
101 | 	
102 | 	else if "`side'" == "left" {
103 | 		local critical_l = invt(`df', `alpha')									//the lower critical value
104 | 		local reject_t=(`t_value'<`critical_l')									//reject if the t-value is less than the lower critical level. It takes value 1 if the null is rejected and 0 if not
105 | 	}
106 | 	
107 | 	else if "`side'" == "right" {
108 | 		local critical_u = invt(`df', 1-`alpha')								//the upper critical value
109 | 		local reject_t=(`t_value'>`critical_u')									//reject if the t-value is more than the upper critical level. It takes value 1 if the null is rejected and 0 if not
110 | 	}
111 | 	
112 | 		*NOTE(review): reject_t is only set when side is "two", "left" or "right" - confirm that side has one of these values
113 | 		post `sim_name' (`reject_t')											//write output from simulation to the temporary file
114 | 		
115 | 		tempfile simulated_data_no_clusters`it'									//save the data from the iterations: one temporary file per iteration, named with the iteration number
116 | 		save `simulated_data_no_clusters`it'', replace
117 | 	
118 | 		
119 | 		local it = `it' +1 
120 | 
121 | }
122 | *The line below loads the dataset simulated in iteration 4
123 | use `simulated_data_no_clusters4', clear										//load any of the simulated data files by changing the number - this may be useful to make sure that the simulation is working as intended
124 | 
125 | 
126 | 
127 | *****************************************************************************************
128 | ******************** 3. Load results of simulation and estimate power *******************
129 | *****************************************************************************************
130 | 
131 | 
132 | postclose `sim_name'															//end the posting before the results file is loaded
133 | use `sim_results',clear															//load the results: one value of reject_t (0 or 1) per simulation
134 | 
135 | sum reject_t																	//the mean of reject_t, left in r(mean), is the share of simulations in which the null was rejected, i.e. the power
136 | 
137 | di as error "If the treatment effect and the assumption about the distribution of the error is true, the study with sample size `sample_size' will detect the true treatment effect of `effect' with probability `r(mean)' in a `side'-sided test. Hence the power of the study is `r(mean)'"
138 | *NOTE(review): "as error" above appears to be used only to make the message stand out; no error is raised
139 | cap log close
140 | 
141 | /* To improve the power, you can:
142 | *increase sample size (sample_size)
143 | *adjust the ratio of treatment to control (prop)
144 | *increase significance level (alpha)
145 | *add covariates to the regression (should be done carefully when simulating data)
146 | 
147 | *note that increasing the effect size will mechanically increase power, but to ensure a study is adequately 
148 | powered it is important to use a reasonable effect size (i.e., the effect size specified here should not be 
149 | increased just to make the calculations turn out in your favor). So, to increase power, you should only increase
150 | the effect size specified if you believe you can increase the effect size in the real world, such as by tweaking
151 | the intervention
152 | 
153 | ********************************************************************************/
154 | 


--------------------------------------------------------------------------------
/J-PAL_Power_by_simulation_clusters.R:
--------------------------------------------------------------------------------
  1 | # Power by Simulation for an experiment with equal clusters
  2 | ##################################################################################
  3 | 
  4 | # This R-script:
  5 | # - Generates fake data for control and treatment clusters with pre-determined cluster-specific errors, effect size and design features
  6 | # - Generates fake data for individuals with individual and cluster-specific errors
  7 | # - Runs the regression and clusters the errors
  8 | # - Repeats this over and over and record how many times the effect of the treatment is statistically significant 
  9 | # - The percentage of simulations with a statistically significant effect is the power
 10 | # 
 11 | # 
 12 | # In a clustered design, our model is:
 13 | #   
 14 | #   Outcome = intercept + effect*treatment_dummy + cluster_error + individual_error
 15 | # 
 16 | # The model can be changed based on the research design
 17 | # 
 18 | # The total variance of the outcome in the model = cluster-specific variance + individual specific variance
 19 | # We can define the ICC as the ratio of the variance of the cluster specific error and the total variance
 20 | # 
 21 | # The cluster-specific and individual-specific error are both assumed to be continuously and normally distributed but can be 
 22 | # changed based on the sample population. 
 23 | # 
 24 | # Contents:
 25 | #   0. Set-up
 26 | #   1. SPECIFY design factors
 27 | #   2. Simulation on generated data
 28 | #   3. Load results of simulation and estimate power
 29 | # 
 30 | # 
 31 | # - To use this file for your own power calculations, change the dataset/variables/values marked as "SPECIFY" 
 32 | # - The seed and all the locals in section 1 can be changed based on the research design 
 33 | # 
 34 | # Created by: Sabhya Gupta with input from Jack Cavanagh, Mike Gibson, Sarah Kopper
 35 | # All errors are the author's alone
 36 | 
 37 | # To report errors/make suggestions or ask questions contact: Sabhya Gupta - sagupta@povertyactionlab.org / sabhya154@gmail.com
 38 | # Last edited: 05/19/2021
 39 | 
 40 | ################################################################################
 41 | 
 42 | 
 43 | ### Set-up ###########################################################
 44 | 
 45 | #install.packages(c("Hmisc","dplyr","ICC", "plm","lmtest","multiwayvcov"))
 46 | 
 47 | library(Hmisc)
 48 | library(dplyr)
 49 | library(ICC)
 50 | library(plm)
 51 | library(lmtest)
 52 | library(multiwayvcov)
 53 | 
 54 | rm(list = ls())                                                                 # start from an empty workspace
 55 | 
 56 | #### 1. Specify design factors ############################################
 57 | 
 58 | set.seed(123456)                                                                # SPECIFY - fixes the random draws so that the results can be replicated
 59 | effect <- 0.25                                                                  # SPECIFY - hypothesized treatment effect
 60 | prop <- 0.5                                                                     # SPECIFY - probability that a cluster is assigned to the treatment group
 61 | alpha <- 0.05                                                                   # SPECIFY - significance level of the test
 62 | cluster_size <- 500                                                             # SPECIFY - number of individuals per cluster
 63 | num_clusters <- 200                                                             # SPECIFY - total number of clusters
 64 | sample_size <- num_clusters*cluster_size                                        # total number of individuals
 65 | control_intercept <- 10                                                         # SPECIFY - intercept of the outcome model (outcome of the control before errors)
 66 | 
 67 | ind_err_var <- 10                                                               # SPECIFY - variance of the individual-specific error (variation among individuals in a cluster)
 68 | icc <- 0.4                                                                      # SPECIFY - correlation between two randomly chosen individuals within a cluster
 69 | cluster_err_var <- (icc*ind_err_var)/(1-icc)                                    # variance of the cluster-specific error implied by icc and ind_err_var
 70 | side <- "two"                                                                   # SPECIFY - kind of test: "two", "left" or "right"
 71 | 
 72 | sim.size <- 1000                                                                # SPECIFY - number of simulated experiments
 73 | 
 74 | #### 2. Calculate the power by Simulation ####################################
 75 | 
 76 | ## initialize vectors and data frames to store results of the simulation
 77 | ## Storage for the results of every simulated experiment
 78 | reject_t <- numeric(sim.size)                                                   #1 if the null is rejected in a simulation, 0 otherwise
 79 | t_value <- numeric(sim.size)                                                    #cluster-robust t value of the treatment coefficient
 80 | rho_value <- numeric(sim.size)                                                  #ICC estimated from the simulated control group
 81 | 
 82 | ## The critical values depend only on the design factors, so they are computed once,
 83 | ## before the loop, with the same degrees of freedom for every kind of test.
 84 | ## NOTE(review): 2*(num_clusters-1) is kept from the original script; cluster-robust
 85 | ## tests are often based on num_clusters-1 degrees of freedom instead - confirm.
 86 | df_cluster <- 2*(num_clusters-1)
 87 | 
 88 | if (side == "two") {
 89 |   critical_l <- qt(alpha/2, df_cluster)                                         #the lower critical value
 90 |   critical_u <- qt(1-alpha/2, df_cluster)                                       #the upper critical value
 91 | } else if (side == "right") {
 92 |   critical_l <- -Inf                                                            #no rejection region in the lower tail
 93 |   critical_u <- qt(1-alpha, df_cluster)                                         #the upper critical value
 94 | } else if (side == "left") {
 95 |   critical_l <- qt(alpha, df_cluster)                                           #the lower critical value
 96 |   critical_u <- Inf                                                             #no rejection region in the upper tail
 97 | } else {
 98 |   stop("side must be \"two\", \"left\" or \"right\"", call. = FALSE)            #an unknown value previously gave a power of 0 without any warning
 99 | }
100 | 
101 | ## The simulation loop
102 | 
103 | for (i in seq_len(sim.size)) {
104 | 
105 |   ## Cluster-level data: one row per cluster
106 |   simulated_clusters <- data.frame(cluster = seq_len(num_clusters))
107 | 
108 |   simulated_clusters$cluster_error <- rnorm(n=num_clusters, mean=0,
109 |                                             sd=sqrt(cluster_err_var))           #SPECIFY distribution of cluster specific errors.
110 |                                                                                 #The distribution can be changed based on the sample
111 |   simulated_clusters$treat <- ifelse(runif(n=num_clusters, min=0, max=1) < prop,
112 |                                      1, 0)                                      #random assignment of clusters
113 | 
114 |   ## Individual-level data: a random draw u is cut into num_clusters quantile groups
115 |   ## (cut2 arguments g and m) to divide the individuals into clusters
116 |   simulated_data <- data.frame(u = rnorm(n=sample_size, mean=0, sd=1))
117 |   simulated_data$cluster <- as.numeric(cut2(simulated_data$u,
118 |                                             g=num_clusters, m=cluster_size))
119 | 
120 |   ## Attach the cluster error and the treatment status of the cluster to every individual
121 |   full_data <- left_join(simulated_data, simulated_clusters,
122 |                          by=c("cluster"="cluster"), suffix = c(".x", ".y"))
123 | 
124 |   full_data$individual_error <- rnorm(n=sample_size,
125 |                                       mean=0, sd=sqrt(ind_err_var))             #SPECIFY distribution of individual specific error
126 | 
127 |   full_data$outcome <- control_intercept + full_data$cluster_error +
128 |                                            full_data$individual_error           #SPECIFY - outcome for the control
129 | 
130 |   is_treated <- full_data$treat == 1
131 |   full_data$outcome[is_treated] <- full_data$outcome[is_treated] + effect       #the treated outcome is higher than the control by the effect size
132 | 
133 |   ## ICC in the simulated control group, as a check on the simulation
134 |   control_subset <- subset(full_data, treat == 0)
135 |   rownames(control_subset) <- seq_len(nrow(control_subset))
136 |   control_subset$cluster <- as.factor(control_subset$cluster)
137 | 
138 |   icc_est <- ICCest(cluster, outcome, data = control_subset)                    #kept under its own name so that the design factor icc is not overwritten
139 |   rho_value[i] <- icc_est$ICC                                                   #this is the ICC estimated from the simulated data.
140 |                                                                                 #As the sample size increases, this value will approach the ICC specified above
141 | 
142 |   ## Regression with standard errors clustered at the cluster level
143 |   fit.sim <- lm(outcome ~ treat, data = full_data)                              #simple regression - the true effect is as specified above
144 |   robust_SE <- cluster.vcov(fit.sim, full_data$cluster, df_correction = TRUE)   #cluster the standard errors
145 |   robust_coef <- coeftest(fit.sim, robust_SE)
146 |   t_value[i] <- robust_coef[2,3]                                                #the t value for the t test
147 | 
148 |   ## Reject (1) if the t value lies in the critical region of the chosen test, otherwise 0.
149 |   ## The signed t value is compared, so a "right" test rejects only in the upper tail
150 |   ## and a "left" test only in the lower tail; a "two" test rejects in both tails.
151 |   reject_t[i] <- as.numeric(t_value[i] < critical_l || t_value[i] > critical_u)
152 | 
153 | 
154 | }
155 | 
156 | ### 3. Load results of simulation and estimate power ######################
157 | 
158 | power <- mean(reject_t)                                                         #share of the simulations in which the null was rejected
159 | power
160 | 
161 | rho_calculated <- mean(rho_value)                                               #average of the ICCs estimated in the simulations
162 | 
163 | cat("If the treatment effect and the assumption about the distribution of the error are true, the study with",
164 |     num_clusters, "clusters of size", cluster_size, "will detect the true treatment effect of", effect,
165 |     "with probability", power, "in a", paste0(side, "-sided"), "test. Hence the power of the study is",
166 |     paste0(power, "."), "The calculated ICC is", rho_calculated, "\n")          #paste0 avoids the spaces cat would put before "-sided" and "."
167 | 
168 | ##### To improve the power, you can:
169 | #   increase sample size (sample_size)
170 | #   increase number of clusters (num_clusters). This will be more effective than increasing cluster size if the individuals are highly correlated
171 | #   increase cluster size (cluster_size)
172 | #   adjust the ratio of treatment to control (prop)
173 | #   increase significance level (alpha)
174 | #   add covariates to the regression (should be done carefully when simulating data)
175 | # 
176 | # note that increasing the effect size will mechanically increase power, but to ensure a study is adequately 
177 | # powered it is important to use a reasonable effect size (i.e., the effect size specified here should not be 
178 | # increased just to make the calculations turn out in your favor). So, to increase power, you should only increase
179 | # the effect size specified if you believe you can increase the effect size in the real world, such as by tweaking
180 | # the intervention
181 | 
182 | 
183 | 
184 | 
185 |   


--------------------------------------------------------------------------------
/J-PAL_Power_by_simulation_clusters.do:
--------------------------------------------------------------------------------
  1 | ** Power by Simulation for an experiment with equal clusters
  2 | 
  3 | *Created by: Sabhya Gupta with input from Jack Cavanagh, Mike Gibson, Sarah Kopper
  4 | *All errors are the author's alone
  5 | 
  6 | *To report errors/make suggestions or ask questions contact: Sabhya Gupta - sagupta@povertyactionlab.org
  7 | *Last edited: 06/02/2021
  8 | 
  9 | ********************************************************************************
 10 | 
 11 | /* 
 12 | In a clustered design, the model (which can be changed based on the research design) is:
 13 | 
 14 | outcome = intercept + effect*treatment_dummy + cluster_error + individual_error
 15 | 
 16 | Total outcome variance = cluster-specific variance + individual-specific variance
 17 | The intra-cluster correlation coefficient (ICC) is the ratio of the variance of the cluster-specific error and the total variance
 18 | 
 19 | The cluster-specific and individual-specific error are both assumed to be continuously and normally distributed but can be 
 20 | changed based on the sample population. 
 21 | 
 22 | 
 23 | This do-file:
 24 | *Generates fake data for control and treatment clusters with pre-determined cluster-specific errors, effect size and design features
 25 | *Generates fake data for individuals with individual and cluster-specific errors
 26 | *Runs the regression and clusters the errors
 27 | *Repeats this over and over and records how many times the effect of the treatment is statistically significant 
 28 | *The percentage of simulations with a statistically significant effect is the power
 29 | 
 30 | It contains sample code for the following:
 31 | 	0. Housekeeping and specify temp files 
 32 | 	1. SPECIFY design factors and the number of simulations
 33 | 	2. Simulation on generated data
 34 | 	3. Load results of simulation and estimate power
 35 | 
 36 | 
 37 | To use this file for your own power calculations, change the dataset/variables/values marked as "SPECIFY" 
 38 | The seed and all the locals in section 1 can be changed based on the research design 
 39 | */
 40 | 
 41 | 
 42 | 
 43 | ********************************************************************************************************************
 44 | ********************************* 0. Housekeeping and specify temp files  ******************************************
 45 | ********************************************************************************************************************
 46 | 
 47 | cd "" 																			//SPECIFY - working directory; the log file below is written here
 48 | capture log close "power_by_simulation_clusters"								//close a log left open by an earlier run; capture ignores the error if none is open
 49 | log using "power_by_simulation_clusters", replace
 50 |  
 51 | *Open a postfile that collects one row per simulation: the rejection indicator and the ICC estimated from the simulated data
 52 | tempname sim_name																//handle used by post/postclose
 53 | tempfile sim_results															//temporary dataset the postfile writes to
 54 | postfile `sim_name' reject_t rho_calculated  using `sim_results'
 55 | 
 56 | clear
 57 | set seed 123456																	//SPECIFY - seed for replication 
 58 | 
 59 | 
 60 | 
 61 | ********************************************************************************************************************
 62 | ************************* 1. SPECIFY design factors and the number of simulations **********************************
 63 | ********************************************************************************************************************
 64 | 
 65 | local effect = 0.5																//SPECIFY - hypothesized treatment effect, added to the outcome of treated clusters
 66 | local prop = 0.5 																//SPECIFY - probability that a cluster is assigned to treatment (a cluster is treated if runiform()<prop), so the realized share of treated clusters varies across simulations
 67 | local alpha = 0.05																//SPECIFY - the significance level of the test
 68 | local side "two"																//SPECIFY   "two", "left", or "right" for a two-sided, left-sided or right-sided test
 69 | 
 70 | local control_intercept = 10													//SPECIFY - intercept, i.e. the mean outcome of control units (both error terms have mean 0)
 71 | local num_clusters = 100														//SPECIFY - total number of clusters (treatment and control combined)
 72 | local cluster_size = 500														//SPECIFY - target size of each cluster							
 73 | local sample_size = `num_clusters'*	`cluster_size'								//derived - total number of units of observation across all clusters
 74 | 
 75 | local within_err_var = 1														//SPECIFY - variance of the individual-specific error (within-cluster variation)
 76 | local icc = 0.4																	//SPECIFY - correlation between two randomly chosen individuals within a cluster
 77 | local between_err_var = (`icc'*`within_err_var')/(1-`icc')						//derived - variance of the cluster-specific error implied by icc = between/(between + within)
 78 | 
 79 | /*Cluster-specific variance is 0 when the ICC is 0. That is, when individuals in a cluster are not correlated with each other, 
 80 | the variance in the outcome is only from the variation across individuals. Cluster-specific variance increases as the ICC increases. 
 81 | The ICC can also be specified as a function of the within-cluster variance and between-cluster variance by solving for it in the 
 82 | between_err_var equation above. 
 83 | 
 84 | The ICC can be calculated using data from a similar population or based on the literature. 
 85 | If using existing data, the ICC can be estimated as follows:
 86 | 
 87 | loneway outcome cluster_var if treatment==0										//The loneway command calculates the one-way ANOVA by a group variable. 
 88 | 																				//It gives the within-group variation and the between group variation of a variable. 
 89 | local rho = `r(rho)'															//rho gives the ICC 
 90 | */
 91 | 	
 92 | 
 93 | local sims=1000																	//SPECIFY number of simulations
 94 | 
 95 | 
 96 | 
 97 | ********************************************************************************************************************
 98 | ****************************************** 2. Simulation on generated data *****************************************
 99 | ********************************************************************************************************************
100 | 
101 | *Generate fake data with clusters, specified distribution and effect, regress outcome on treatment and record if significant
102 | 
103 | local it = 1																	//iteration counter
104 | 
105 | while  `it'<=`sims'{
106 | 	clear
107 | 	
108 | 	*cluster-level file: one row per cluster with its error draw and treatment status
109 | 	quietly set obs `num_clusters'
110 | 	quietly gen cluster_group= _n
111 | 	quietly gen cluster_error=rnormal(0,sqrt(`between_err_var'))				//we assume that the cluster-specific error is distributed normally with mean 0 
112 | 																				//SPECIFY - should be decided by the researcher based on the sample population
113 | 
114 | 	quietly gen treat= runiform()<`prop'										//random assignment at the cluster level: each cluster is treated with probability prop
115 | 	
116 | 	sort cluster_group
117 | 	tempfile cluster_error_g
118 | 	quietly save `cluster_error_g', replace										//save the file with cluster-specific errors
119 | 
120 | 	
121 | 	*Assign individuals to clusters
122 | 	clear
123 | 	quietly set obs `sample_size'
124 | 	quietly gen u=invnormal(uniform())											//create a random number to randomly divide into clusters
125 | 	quietly egen cluster_group = cut(u), group(`num_clusters')					//divide individuals into num_clusters groups of roughly equal size
126 | 	quietly replace cluster_group = cluster_group+1								//egen cut numbers groups from 0; shift to 1..num_clusters to match the cluster-level file
127 | 	sort cluster_group															//sort by cluster
128 | 	
129 | 	*merge in cluster errors
130 | 	quietly merge m:1 cluster_group using  `cluster_error_g'					//merge with the dataset with cluster-specific errors and treatment status
131 | 	quietly gen individual_error=rnormal(0,sqrt(`within_err_var'))				//SPECIFY - individual specific error that is assumed to be normally distributed with mean 0. 
132 | 																				//This should be decided based on the sample population
133 | 																				
134 | 	quietly gen outcome = `control_intercept' + cluster_error + individual_error //SPECIFY - assumed outcome for control
135 | 	
136 | 	quietly replace outcome = outcome + `effect' if treat==1					//potential outcome for the treatment is higher than the control by the effect size
137 | 	
138 | 	quietly loneway outcome cluster_group if treat==0							//The loneway command calculates the one-way ANOVA by a group variable. 
139 | 																				//It gives the 	within-group variation and the between group variation of a variable. 
140 | 																				//It also  produces the intra-cluster correlation coefficient (ICC). 
141 | 																				//This is computed at the baseline or for the control (as a proxy for the baseline)
142 | 																				
143 | 	local rho_calculated=`r(rho)'												//this is the ICC estimated from the simulated data. As the sample size increase, this value will approach the ICC specified above 
144 | 	
145 | 	quietly regress outcome treat, vce(cluster cluster_group)					//simple regression - the true effect is as specified above. The errors are clustered
146 | 	
147 | 	local t_value = _b[treat]/_se[treat]
148 | 	local df = e(df_r)															//degrees of freedom regress itself uses with clustered errors (number of clusters - 1), so the test here matches the regression's own t-test
149 | 	
150 | 	if "`side'" == "two" {
151 | 		local critical_l = invt(`df', `alpha'/2)								//the lower critical value
152 | 		local critical_u = invt(`df', 1-`alpha'/2)								//the upper critical value
153 | 		local reject_t=(`t_value'>`critical_u')|(`t_value'<`critical_l')		//reject if the t-value lies in the critical level. It takes value 1 if the null is rejected and 0 if not
154 | 	}
155 | 	
156 | 	else if "`side'" == "left" {
157 | 		local critical_l = invt(`df', `alpha')									//the lower critical value
158 | 		local reject_t=(`t_value'<`critical_l')									//reject if the t-value is less than the lower critical level. It takes value 1 if the null is rejected and 0 if not
159 | 	}
160 | 	
161 | 	else if "`side'" == "right" {
162 | 		local critical_u = invt(`df', 1-`alpha')								//the upper critical value
163 | 		local reject_t=(`t_value'>`critical_u')									//reject if the t-value is more than the upper critical level. It takes value 1 if the null is rejected and 0 if not
164 | 	}
165 | 	
166 | 	
167 | 	
168 | 	quietly post `sim_name' (`reject_t') (`rho_calculated') 					//write output from simulation to the temporary file
169 | 	
170 | 	tempfile simulated_data_clusters`it'										//save the data from the iterations
171 | 	quietly save `simulated_data_clusters`it'', replace							//quietly: avoids printing a "file saved" message on every iteration
172 | 	
173 | 	local it=`it'+1
174 | 	clear
175 | }
176 | 
177 | use `simulated_data_clusters4', clear											//load any of the simulated data files by changing the number (must not exceed sims) - this may be useful to make sure that the simulation is working as intended
178 | 
179 | 
180 | *****************************************************************************************
181 | ******************** 3. Load results of simulation and estimate power *******************
182 | *****************************************************************************************
183 | 
184 | postclose `sim_name'															//flush and close the postfile before reading it
185 | use `sim_results', clear
186 | 
187 | summarize reject_t																//share of simulations that rejected the null
188 | local power = `r(mean)'
189 | 
190 | summarize rho_calculated														//average ICC estimated across simulations
191 | local rho_calculated = round(`r(mean)',0.01)
192 | 
193 | display as error "If the treatment effect and the assumption about the distribution of the cluster and individual specific error is true, the study with `num_clusters' clusters of size `cluster_size' will detect the true treatment effect of `effect' with probability `power' in a `side'-sided test. Hence the power of the study is `power'. The calculated ICC from the simulation is `rho_calculated'"
194 | 
195 | 
196 | 
197 | /* Ways to raise power:
198 | *a larger sample (sample_size)
199 | *more clusters (num_clusters) - when individuals within a cluster are highly correlated this helps more than larger clusters
200 | *larger clusters (cluster_size)
201 | *a different split between treatment and control (prop)
202 | *a higher significance level (alpha)
203 | *covariates in the regression (should be done carefully when simulating data)
204 | 
205 | *A larger effect size raises power mechanically, but an adequately powered study 
206 | needs a reasonable effect size: do not inflate the value specified here just to 
207 | make the calculations turn out in your favor. Only raise it if you believe you can 
208 | raise the effect size in the real world, for example by tweaking
209 | the intervention
210 |  
211 | *The ICC works the same way: a lower ICC raises power mechanically, but use a 
212 | reasonable ICC rather than one that is favorable in your calculations
213 | 
214 | */
215 | 
216 | capture log close
217 | 


--------------------------------------------------------------------------------
/J-PAL_Power_built_in_commands.do:
--------------------------------------------------------------------------------
  1 | ** Power calculations using built-in Stata commands
  2 | 
  3 | *Created by: Sabhya Gupta with input from Jack Cavanagh, Maya Duru, Mike Gibson, Sarah Kopper
  4 | *All errors are the author's alone
  5 | 
  6 | *To report errors/make suggestions or ask questions contact: Sabhya Gupta - sagupta@povertyactionlab.org
  7 | *Last edited: 06/24/2021
  8 | 
  9 | ********************************************************************************
 10 | /* This do-file:
 11 | *Computes sample size and effect size for a given power and treatment to control size ratio
 12 | *Includes variations with controls, clusters and take-up rate 
 13 | *Uses the Balsakhi dataset to illustrate how to calculate power
 14 | 
 15 | About the data: The Balsakhi program was a remedial education program that was conducted in Indian schools to increase literacy and numeracy skills. 
 16 | You can learn more about the Balsakhi dataset from the documentation and data here at https://doi.org/10.7910/DVN/UV7ERB
 17 | 
 18 | Variables:
 19 | - Outcome of interest is in the "normalised total score." This is represented by: 
 20 | 	- "pre_totnorm" at baseline
 21 | 	- "post_totnorm" at the endline
 22 | - Treatment: bal
 23 | 	- 0=control
 24 | 	- 1=treatment
 25 | - Clustering variable (by school): divid
 26 | 
 27 | Note: key inputs for calculating power like the mean and the standard deviation at baseline, ICC, etc. are calculated 
 28 | using the specified dataset, but they can also be specified manually.
 29 | 
 30 | About the file:
 31 | - This file contains sample code for the following:
 32 | 	0. Housekeeping and load data
 33 | 	1. No covariates
 34 | 		1a. Sample size for a given effect size
 35 | 		1b. MDE for a given sample size
 36 | 	2. With covariates (not applicable for binary data)
 37 | 		2a. Sample size for a specified effect - with covariates 
 38 | 		2b. MDE for a given sample size - with covariates
 39 | 	3. Sample size with Partial Take-up
 40 | 	4. Overview of how MDE and sample size change as we add covariates and take-up changes
 41 | 	5. Clustered designs
 42 | 		5a. Compute number of clusters for a given effect size and size of cluster 
 43 | 		5b. Compute cluster size given the number of clusters and effect size 
 44 | 		5c. Compute effect size for a given cluster size and number of clusters
 45 | 
 46 | - With the exception of module 4, each module is self-contained and can be run on its own after running module 0
 47 | - To use this file for your own power calculations, change the dataset/variables/values marked as "SPECIFY" 
 48 | 
 49 | 
 50 | ** Binary outcome variable: 
 51 | 
 52 | This code assumes a continuous outcome. If the outcome variable is a binary variable, use the "power twoproportions" command.
 53 | - The baseline mean is the proportion of 1s in the outcome variable 
 54 | - The effect size is the change in the treatment of the proportion of the outcome variable from the control group 
 55 | - Standard deviation is a function of the proportion of the outcome variable in the control dataset
 56 | - Documentation can be found by typing "help power twoproportions" or "help power twoproportions cluster"
 57 | 
 58 | The section on covariates is not applicable to binary outcome variables due to the different model specification for binary variables. 
 59 | See McConnell and Vera-Hernandez (2015) (https://www.ifs.org.uk/uploads/publications/wps/WP201517_update_Sep15.pdf)
 60 | for a discussion of how the power calculations change with covariates when the outcome variable is binary.
 61 | 
 62 | */
 63 | 
 64 | 
 65 | 
 66 | ****************************************************************************************
 67 | ***************************** 0. Housekeeping and load data ****************************
 68 | ****************************************************************************************
 69 | 
 70 | cd ""																			//SPECIFY - working directory; the dataset is read from here and the log is written here
 71 | capture log close "power_built_in_commands"										//close a log left open by an earlier run; capture ignores the error if none is open
 72 | log using "power_built_in_commands", replace
 73 | 
 74 | use "baroda_0102_1obs.dta", clear												//SPECIFY - the dataset
 75 | 
 76 | global outcome "pre_totnorm"													//SPECIFY the outcome variable (globals, so every module below can refer to them as $outcome and $treatment)
 77 | global treatment "bal"															//SPECIFY the treatment variable
 78 | 
 79 | 
 80 | ****************************************************************************************
 81 | *********************************** 1. No covariates ***********************************
 82 | ****************************************************************************************
 83 | 
 84 | * The following code assumes the unit of randomization is the same as the unit of observation 
 85 | 
 86 | 
 87 | ****************************************************************************************
 88 | ************************* 1a. Sample size for a given effect size **********************
 89 | ****************************************************************************************
 90 | 
 91 | local power = 0.8																//SPECIFY - desired power
 92 | local nratio = 1																//SPECIFY - the ratio of experimental group to control group (1=equal allocation)
 93 | local alpha = 0.05																//SPECIFY - the significance level (passed to every power command below)
 94 | 
 95 | sum $outcome  if !missing($outcome)												//sum the outcome at baseline and record the mean and the standard deviation
 96 | local sd = `r(sd)'
 97 | local baseline = `r(mean)'
 98 | 
 99 | local effect = `sd'*0.3															//SPECIFY - the expected effect. Here we specify 0.3 standard deviations, but this should be updated based on what is reasonable for the study
100 | local treat = `baseline' + `effect'												//treatment mean implied by the expected effect
101 | 
102 | power twomeans `baseline' `treat', power(`power') sd(`sd') nratio(`nratio') alpha(`alpha') table
103 | 
104 | local effect = round(`effect',0.0001)											//rounded for display only; treat above was computed from the unrounded value
105 | 
106 | local samplesize = r(N)															//total sample size returned by the power command above
107 | 
108 | di as error "The minimum sample size needed is `samplesize' to detect an effect size of `effect' with a probability of `power' if the effect is true and the ratio of units in treatment and control is `nratio'"
109 | 
110 | 
111 | * How does the sample size change when standard deviation and the effect size changes?
112 | 
113 | power twomeans `baseline' `treat', power(`power') sd(0.5(0.1)2) nratio(`nratio') alpha(`alpha') table    	//SPECIFY sd range
114 | 
115 | power twomeans `baseline', power(`power') sd(`sd') nratio(`nratio') alpha(`alpha') diff(0.1(0.15)2) table   //SPECIFY diff range to indicate the different possible effect sizes
116 | 
117 | 
118 | ****************************************************************************************
119 | **************************** 1b. MDE for a given sample size ****************************
120 | ****************************************************************************************
121 | 
122 | local power = 0.8																//SPECIFY - desired power
123 | local nratio = 1																//SPECIFY - the ratio of experimental group to control group (1=equal allocation)
124 | local alpha = 0.05																//SPECIFY - the significance level (passed to every power command below)
125 | local N = _N																	//SPECIFY - the total sample size. This is taken from the Balsakhi dataset but can be changed based on the study
126 | 
127 | quietly sum $outcome if !missing($outcome)										//sum the baseline level and record the mean and the standard deviation
128 | local sd = `r(sd)'
129 | local baseline = `r(mean)'
130 | 
131 | power twomeans `baseline', n(`N') power(`power') sd(`sd') nratio(`nratio') alpha(`alpha') table
132 | 
133 | local mde= round(`r(delta)',0.0001)												//minimum detectable effect returned by the power command above
134 | 
135 | di as error "The MDE is `mde' given a sample size of `N', ratio of units in treatment and control of `nratio', and power `power'"
136 | 
137 | 
138 | 
139 | * How does MDE change when sample size and the ratio of allocation between the two groups changes
140 | 
141 | power twomeans `baseline', power(`power') sd(`sd') n(10000(2000)20000) nratio(`nratio') alpha(`alpha') table    //SPECIFY N range to indicate the different possible sample sizes
142 | 
143 | power twomeans `baseline', n(`N') power(`power') sd(`sd') nratio(1(-0.2)0.1) alpha(`alpha') table		       //SPECIFY range of ratios of treatment group size to control group size
144 | 																									   //nratio = 1  means an equal allocation between treatment and control groups. A decrease in the ratio means that a larger proportion of the sample size is allocated to the control group
145 | 
146 | 
147 | 	
148 | ****************************************************************************************
149 | ********************************* 2. Adding covariates *********************************
150 | ****************************************************************************************
151 | 
152 | /* To see how potential controls affect power,  we would ideally have access to a sample data set 
153 | (e.g. historical or pilot data). With these data, we would want to:
154 | 	1. Regress Y_i (the outcome) on X_i (the controls) 
155 | 	2. Use the residual standard deviation of the outcome variable from this regression to evaluate 
156 | 	how much variance is explained by the set of covariates we plan to include
157 | 		- In practice, this residual SD becomes the new SD we include in our parametric power calculations
158 | 
159 | With access to historical data, for example, this would involve regressing last year's test scores 
160 | on test scores from the year before. Using balsakhi data, this would be as follows. 
161 | 
162 | Note that this section is not applicable for power calculations with a binary outcome variable. 
163 | See McConnell and Vera-Hernandez 2015 (https://www.ifs.org.uk/uploads/publications/wps/WP201517_update_Sep15.pdf)
164 | for a discussion of covariates for binary outcomes and accompanying sample code */
165 | 
166 | 
167 | ****************************************************************************************
168 | ************** 2a. Sample size for a given effect size - with covariates  **************
169 | ****************************************************************************************
170 | 
171 | local power = 0.8																//SPECIFY - desired power
172 | local nratio = 1																//SPECIFY - the ratio of experimental group to control group
173 | local alpha =0.05																//SPECIFY - the significance level
174 | 
175 | 	
176 | local covariates "female std sessiond"											//SPECIFY the covariates - use baseline values of covariates
177 | local number_covariates: word count `covariates'								//number of covariates listed above									
178 | 
179 | regress $outcome `covariates' 												    //SPECIFY outcome and control variables
180 | 
181 | local res_sd =round(sqrt(`e(rss)'/`e(df_r)'),0.0001)							//residual sd of the outcome after the regression above: sqrt(residual sum of squares / residual degrees of freedom), rounded to 4 decimals. 
182 | 																				//This replaces the raw sd in the power calculation.
183 | 	
184 | quietly sum $outcome if  !missing($outcome)					    				//sum the outcome at baseline and record the mean and the standard deviation
185 | local baseline = `r(mean)'
186 | local sd = `r(sd)'
187 | local effect_cov = `sd'*0.3														//SPECIFY - the expected effect. Here we specify 0.3 standard deviations of the raw (not residual) outcome, but this should be updated based on what is reasonable for the study
188 | 	
189 | local treat = `baseline' + `effect_cov'											//treatment mean implied by the expected effect
190 | 
191 | power twomeans `baseline' `treat', power(`power') sd(`res_sd') nratio(`nratio') alpha(`alpha') table
192 | 	
193 | local effect_cov = round(`effect_cov',0.0001)									//rounded for display only
194 | local samplesize_cov = `r(N)'													//total sample size returned by the power command above
195 | 	
196 | di as error "The minimum sample size needed is `samplesize_cov' to detect an effect of `effect_cov' with a probability of `power' if the effect is true, the ratio of units in treatment and control is `nratio', and the residual standard deviation is `res_sd' after accounting for covariates: `covariates'"
197 | 
198 | 	
199 | 
200 | ****************************************************************************************
201 | ****************** 2b. MDE for a given sample size - with covariates  ******************
202 | ****************************************************************************************
203 | 
204 | local power = 0.8																//SPECIFY - desired power
205 | local nratio = 1																//SPECIFY - the ratio of experimental group to control group
206 | local alpha =0.05																//SPECIFY - the significance level
207 | local N_cov= _N																	//SPECIFY - the total sample size. 
208 | 																				//This is taken from the Balsakhi dataset but can be changed based on the study
209 | 
210 | local covariates "female std sessiond"											//SPECIFY the covariates - use baseline values of covariates
211 | regress $outcome `covariates' 													//SPECIFY outcome and control variables
212 | 
213 | local res_sd = round(sqrt(`e(rss)'/`e(df_r)'),0.0001)							//residual sd of the outcome after the regression above: sqrt(residual sum of squares / residual degrees of freedom), rounded to 4 decimals. 
214 | 																				//This replaces the raw sd in the power calculation.
215 | 	
216 | quietly sum $outcome if  !missing($outcome)					   					//sum the outcome at baseline and record the mean
217 | local baseline = `r(mean)'
218 | 	
219 | power twomeans `baseline', n(`N_cov') power(`power') sd(`res_sd') nratio(`nratio') alpha(`alpha')  table 
220 | 	
221 | local mde_cov= round(`r(delta)',0.0001)											//minimum detectable effect returned by the power command above
222 | 
223 | di as error "The MDE is `mde_cov' given a sample size of `N_cov', ratio of treatment and control group of `nratio', power `power', and the residual standard deviation of `res_sd' after accounting for covariates: `covariates'"
224 | 	
225 | 
226 | ****************************************************************************************
227 | ********************* 3. Sample size with partial take-up   ****************************
228 | ****************************************************************************************
229 | 
230 | /* When there is imperfect compliance in the treatment or the control group, the expected effect is 
231 | reduced by a factor of the effective take-up, where effective take-up = take-up in treatment - take-up in control */
232 | 	
233 | local power = 0.8																//SPECIFY - desired power
234 | local nratio = 1																//SPECIFY - the ratio of experimental group to control group
235 | local alpha = 0.05																//SPECIFY - the significance level (passed to the power command below)
236 | 	
237 | local takeup_treat = 0.9														//SPECIFY - take-up in the treatment
238 | local takeup_control =  0.1														//SPECIFY - take-up in the control
239 | 	
240 | quietly sum $outcome if !missing($outcome)										//sum the outcome at baseline and record the mean and the standard deviation with perfect take-up
241 | local sd_tu = `r(sd)'
242 | local baseline = `r(mean)'
243 | 
244 | local effect= `sd_tu'*0.3														//SPECIFY - the expected effect with perfect take-up. Here we specify 0.3 standard deviations, but this should be updated based on what is reasonable for the study
245 | 
246 | local tu = `takeup_treat' - `takeup_control'									//effective take-up
247 | local effect_tu = `effect'*`tu'													//effect size after adjusting for take-up: what you expect to measure when the true effect is effect and the effective take-up is tu. effect_tu < effect for imperfect take-up rates. 
248 | local treat_tu = `baseline' + `effect_tu'										//treatment mean after adjusting for take-up
249 | 
250 | power twomeans `baseline' `treat_tu', power(`power') sd(`sd_tu') nratio(`nratio') alpha(`alpha') table
251 | local samplesize_tu = `r(N)'													//total sample size returned by the power command above
252 | local effect_tu = round(`effect_tu',0.01)										//rounded for display only
253 | 	
254 | di as error "A minimum sample size of `samplesize_tu' is needed to detect an effect of `effect_tu' (the true effect of `effect' adjusted for the effective take-up of `tu') with a probability of `power' if the effect is true and the ratio of units in treatment and control is `nratio'"
255 | 
256 | 
257 | 
258 | ****************************************************************************************
259 | * 4. Overview of how MDE and sample size change as we add covariates and take-up changes
260 | ****************************************************************************************	
261 | 
262 | *Note: This module calls on locals in modules 1-3, so you'll have to run them too
263 | 
264 | //how sample_size changes when we change the design
265 | matrix input sample_size = (1,0,`effect',`samplesize', `sd'\ 1,`number_covariates',`effect_cov',`samplesize_cov', `res_sd' \ `tu',0,`effect_tu',`samplesize_tu', `sd_tu')
266 | matrix colnames sample_size = take_up_rate number_covariates effect_given_take_up sample_size standard_dev
267 | matrix list sample_size															//rows: no covariates, with covariates, partial take-up
268 | 
269 | //how MDE changes when we add more covariates
270 | matrix input mde = (0,`mde',`N',`sd'\ `number_covariates',`mde_cov',`N_cov', `res_sd')
271 | matrix colnames mde = number_covariates MDE N standard_dev
272 | matrix list mde																	//rows: no covariates, with covariates
273 | 
274 | ****************************************************************************************
275 | ********************************* 5. Clustered designs *********************************
276 | ****************************************************************************************
277 | 
278 | /* The code presented so far has been for when the unit of randomization is the same
279 | as the unit of observation. The following code is for clustered designs, when there are
280 | multiple units of observation contained in a single unit of randomization (e.g., randomization
281 | is at the school level but outcomes measured at the student level) */
282 | 
283 | global cluster_var "divid"														//SPECIFY - the cluster variable
284 | 
285 | 
286 | ****************************************************************************************
287 | ****** 5a.Compute number of clusters for a given effect size and size of clusters ******
288 | ****************************************************************************************
289 | 
290 | local power = 0.8																//SPECIFY - desired power
291 | local nratio = 1																//not passed to the clustered power command below, which takes kratio and mratio instead
292 | local alpha = 0.05																//SPECIFY - the significance level
293 | 	
294 | quietly sum $outcome if  !missing($outcome)										//sum the outcome at baseline and record the mean and the standard deviation
295 | local sd = `r(sd)'
296 | local baseline = `r(mean)'
297 | 
298 | local cluster_size_control = 50													//SPECIFY - number of people in each control cluster. 
299 | 																				//This should be specified by the researcher
300 | local mratio=1																	//SPECIFY - the ratio of the cluster size in the treatment and the control
301 | local cluster_size_treatment = `cluster_size_control'*`mratio'					//implied treatment cluster size (informational; the power command is given m1 and mratio)
302 | 
303 | local kratio = 1																//SPECIFY - The ratio of the number of treatment clusters to the number of control clusters
304 | 
305 | local effect_cluster = `sd'*0.3													//SPECIFY - the expected effect. Here we specify 0.3 standard deviations, but this should be updated based on what is reasonable for the study
306 | 																				//i.e. the treatment mean is 0.3 standard deviations above the baseline mean
307 | local treat= `baseline' + `effect_cluster' 										//define treatment mean
308 | 
309 | loneway $outcome $cluster_var													//The loneway command calculates the one-way ANOVA by a group variable. 
310 | 																				//It gives the within-group variation and the between group variation of a variable. 
311 | 																				//It also produces the intra-cluster correlation coefficient (ICC)
312 | 	
313 | local rho = `r(rho)'															//ICC estimated by loneway, passed to the power command below
314 | 
315 | power twomeans `baseline' `treat', cluster m1(`cluster_size_control') mratio(`mratio') kratio(`kratio') power(`power') sd(`sd') rho(`rho') alpha(`alpha') table
316 | 	
317 | local effect_cluster = round(`effect_cluster',0.0001)							//rounded for display only
318 | 
319 | local n_clus_t = `r(K2)'														//number of treatment clusters returned by the power command
320 | local n_clus_c = `r(K1)'														//number of control clusters returned by the power command
321 | 	
322 | di as error "A minimum of `n_clus_c' control clusters and `n_clus_t' treatment clusters is needed to detect an effect of `effect_cluster' with a probability of `power' if the effect is true, given the size of each control cluster as `cluster_size_control' units and ratio of the treatment to control cluster size of `mratio'"
323 | 	
324 | 	
325 | ****************************************************************************************
326 | ********* 5b. Compute cluster size given the number of clusters and effect size ********
327 | ****************************************************************************************
328 | 
// 5b. Cluster size needed in each arm, given the number of control clusters,
// the treatment-to-control cluster ratio and the expected effect size.
// Reads the globals $outcome, $cluster_var and $treatment.

local power = 0.8																//SPECIFY - desired power
local nratio = 1																//SPECIFY - the ratio of experimental group to control group
local alpha = 0.05																//SPECIFY - the significance level

quietly sum $outcome if !missing($outcome)										//sum the outcome at baseline and record the mean and the standard deviation
local sd = `r(sd)'
local baseline = `r(mean)'

capture drop control_cluster													//remove a leftover flag from an interrupted earlier run so that gen below cannot fail
bysort $cluster_var: gen control_cluster = _n==1								//flag the first observation of every cluster
count if control_cluster & $treatment==0										//count the number of control clusters

local num_clusters_control = `r(N)'												//SPECIFY number of clusters in the control group -
																				//Taken from Balsakhi but can be specified by researcher

local kratio = 1																//SPECIFY - The ratio of the number of treatment clusters to the number of control clusters
local num_clusters_treatment = `num_clusters_control'*`kratio'

local effect_cluster = `sd'*0.3													//SPECIFY - the expected effect. Here we specify 0.3 standard deviations, but this should be updated based on what is reasonable for the study
																				//Here the expected change in the treatment group is 0.3 times the standard deviation

local treat = `baseline' + `effect_cluster'										//define treatment mean

loneway $outcome $cluster_var													//The loneway command calculates the one-way ANOVA by a group variable.
																				//It gives the within-group variation and the between group variation of a variable.
																				//It also produces the intra-cluster correlation coefficient (ICC)

local rho = `r(rho)'

//alpha() is passed explicitly so that the SPECIFY value above is actually used (as in 5a and 5c)
power twomeans `baseline' `treat', cluster k1(`num_clusters_control') kratio(`kratio') power(`power') sd(`sd') rho(`rho') alpha(`alpha')

local clus_size_t = `r(M2)'
local clus_size_c = `r(M1)'

local effect_cluster = round(`effect_cluster',0.0001)							//rounded for display only; the power calculation above used the exact value

di as error "The minimum size of each cluster should be `clus_size_c' in the control and `clus_size_t' in the treatment to detect an effect of `effect_cluster' with a probability of `power' if the effect is true, given `num_clusters_control' clusters in the control and the ratio of the number of treatment and control clusters as `kratio'"

drop control_cluster
366 | 
367 | 	
368 | ****************************************************************************************
369 | ******* 5c. Compute effect size for a given cluster size and number of clusters ********
370 | ****************************************************************************************
371 | 
// 5c. Minimum detectable effect for a clustered design with a fixed number of
// clusters and a fixed cluster size. Reads $outcome, $cluster_var, $treatment.

// ---- researcher inputs ----
local power = 0.8								//SPECIFY - desired power
local nratio = 1								//SPECIFY - the ratio of experimental group to control group
local alpha = 0.05								//SPECIFY - the significance level
local kratio = 1								//SPECIFY - The ratio of the number of treatment clusters to the number of control clusters
local cluster_size_control = 50					//SPECIFY - number of people in each cluster. This should be specified by the researcher
local mratio = 1								//SPECIFY - the ratio of the cluster size in the treatment and the control

// ---- baseline mean and standard deviation of the outcome ----
quietly summarize $outcome if !missing($outcome)
local sd = `r(sd)'
local baseline = `r(mean)'

// ---- number of control clusters: flag one observation per cluster, count the flagged control rows ----
bysort $cluster_var: generate control_cluster = _n==1
count if control_cluster & $treatment==0
local num_clusters_control = `r(N)'				//SPECIFY number of clusters in the control group - Taken from dataset but can be specified by researcher

// ---- intra-cluster correlation from a one-way ANOVA on the cluster variable ----
loneway $outcome $cluster_var
local rho = `r(rho)'

// ---- only the control mean is supplied, so power twomeans solves for the effect size ----
power twomeans `baseline', cluster k1(`num_clusters_control') kratio(`kratio') mratio(`mratio') m1(`cluster_size_control') power(`power') sd(`sd') rho(`rho')  alpha(`alpha') table

local mde_cluster = round(`r(delta)',0.0001)

display as error "The MDE is `mde_cluster' given `num_clusters_control' clusters in the control, ratio of the number of treatment and control clusters as `kratio', `cluster_size_control' units in the control, the ratio of units in each treatment and control cluster of `mratio', and power `power'."

drop control_cluster

capture log close
407 | 


--------------------------------------------------------------------------------
/JPAL_Power_built_in_commands.R:
--------------------------------------------------------------------------------
  1 | # This R script:
  2 | #
  3 | # Computes sample size and effect size for a given power and treatment to control size ratio
  4 | # Includes variations with controls, clusters and take-up rate
  5 | # Uses the Balsakhi dataset to illustrate how to calculate power
  6 | # About the data: The Balsakhi program was a remedial education program that was conducted in  Indian schools to increase literacy and numeracy skills.
  7 | # You can learn more about the Balsakhi dataset from the documentation and data here at https://doi.org/10.7910/DVN/UV7ERB
  8 | 
  9 | 
 10 | # Variables:
 11 | # - Outcome of interest is in the "normalised total score." This is represented by: 
 12 | #     - "pre_totnorm" at baseline
 13 | #     - "post_totnorm" at the endline
 14 | # - Treatment: bal
 15 | #     - 0=control
 16 | #     - 1=treatment
 17 | # - Clustering variable (by school): divid
 18 | 
 19 | # Note: key inputs for calculating power like the mean and the standard deviation at baseline, icc etc. are calculated using the 
 20 | # specified dataset but they can also be specified manually
 21 | #
 22 | 
 23 | # Binary outcome variable:
 24 | # Use power.prop.test https://stat.ethz.ch/R-manual/R-devel/library/stats/html/power.prop.test.html to calculate the sample 
 25 | # size or Minimum effect size for a binary outcome variable. The command is not suitable for clustered data. 
 26 | # The section on covariates is not applicable to binary outcome variables due to the different
 27 | # model specification for binary variables. See McConnell and Vera-Hernandez (2015) 
 28 | # for a discussion of how the power calculations change with covariates when the outcome variable is binary
 29 | 
 30 | 
 31 | # About the file:
 32 | 
 33 | # - This file contains sample code for the following:
 34 | #   0. Housekeeping and load data
 35 | #   1. No covariates
 36 | #     1a. Sample size for a given effect size
 37 | #     1b. MDE for a given sample size
 38 | #   2. With covariates (not applicable to binary outcome variable)
 39 | #     2a. Sample size for a specified effect - with covariates 
 40 | #     2b. MDE for a given sample size - with covariates
 41 | #   3. Sample size with Partial Take-up
 42 | #   4. Clustered designs
 43 | #     4a. Compute number of clusters for a given effect size and size of cluster 
 44 | #     4b. Compute cluster size given the number of clusters and effect size 
 45 | #     4c. Compute effect size for a given cluster size and number of clusters
 46 | # 
 47 | # - To use this file for your own power calculations, change the dataset/variables/values marked as "SPECIFY" 
 48 | # 
 49 | # Created by: Sabhya Gupta with input from Jack Cavanaugh, Maya Duru, Mike Gibson, Sarah Kopper
 50 | # All errors are the author's alone
 51 | 
 52 | 
 53 | # Contact: sagupta@povertyactionlab.org
 54 | # Last edited: 07/20/2021
 55 | 
 56 | 
 57 | ### 0. Housekeeping and load data ##################################
 58 | 
 59 | #It may take a long time to run the install command. You can try to install them separately
 60 | 
 61 | # install.packages(c("haven", "ICC", "randomizr", "multiwayvcov", "lmtest","devtools", "Hmisc", "gsubfn"), 
 62 | #      dependencies=TRUE, INSTALL_opts = c('--no-lock'))
 63 | 
 64 | library(devtools)
 65 | 
 66 | #remotes::install_github("tidyverse/magrittr")
 67 | #devtools::install_github('vikjam/pwrcalc')
 68 | 
 69 | # The pwrcalc package was developed by vikjam: https://github.com/vikjam/pwrcalc
 70 | # Documentation: https://pwrcalc.readthedocs.io/en/latest/intro.html 
 71 | # If devtools fails, download the package file from the github page and install 
 72 | # by following the instructions here: http://outmodedbonsai.sourceforge.net/InstallingLocalRPackages.html
 73 | 
 74 | 
 75 | # Once you installed the packages, load them to get started:
 76 | library(haven)
 77 | library(ICC)
 78 | library(randomizr)
 79 | library(multiwayvcov)
 80 | library(lmtest)
 81 | library(magrittr)
 82 | library(pwrcalc)
 83 | library(Hmisc)
 84 | library(gsubfn)
 85 | 
 86 | 
 87 | # Load the dataset and specify the outcome and treatment variables
 88 | # The balsakhi data is already included in the pwrcalc package
 89 | 
 90 | rm(list=ls())
 91 | data(balsakhi)                                                                  #SPECIFY - load the dataset                                                                  
 92 | 
 93 | dataset = balsakhi                                                              #SPECIFY - the dataset
 94 | outcome = dataset$pre_totnorm                                                   #SPECIFY - the outcome variable
 95 | treatment = dataset$bal                                                         #SPECIFY - treatment variable
 96 | 
 97 | 
 98 | ### 1. No covariates ######################################################
 99 | 
100 | 
101 | # The following code assumes the unit of randomization is the same as the unit of observation 
102 | 
103 | 
104 | ### 1a. Sample size for a given minimum detectable effect size ############
105 | 
power <- 0.8                                                                    #SPECIFY - desired power
nratio <- 1                                                                     #SPECIFY - the ratio of the size of the treatment group to control group
alpha <- 0.05                                                                   #SPECIFY - significance level
p <- nratio / (1 + nratio)                                                      #share of the sample in the treatment group implied by nratio
110 | 
# Sample size needed to detect an effect of 0.3 baseline standard deviations.
#
# Arguments:
#   dataset, treatment - accepted for a uniform interface; not used in the body
#   outcome            - numeric vector with the baseline outcome
# Reads `nratio`, `power` and `alpha` from the calling environment.
# Prints the pwrcalc model and a summary sentence, then returns the model.
N_base <- function(dataset, outcome, treatment) {

  # Baseline moments of the outcome, ignoring missing values
  control_mean <- mean(outcome, na.rm = TRUE)
  outcome_sd <- sd(outcome, na.rm = TRUE)

  # The expected effect should be specified based on the intervention and the
  # cost; here it is 0.3 times the sd
  expected_effect <- 0.3 * outcome_sd

  # The same sd is assumed for both arms; twomeans() also accepts separate ones
  base_model <- twomeans(
    m1 = control_mean,
    m2 = expected_effect + control_mean,
    sd = outcome_sd,
    nratio = nratio,
    power = power,
    sig.level = alpha
  )

  print(base_model)

  cat("n1 and n2 are the control and treatment sample sizes respectively. \nm1 and m2 are the control and treatment means \n\n")

  cat(
    "We need a minimum treatment size of", base_model$n2,
    "and control size of", base_model$n1,
    "to detect an effect of", expected_effect,
    "with a probability of", power,
    "if the effect is true and the ratio of the treatment and control is", nratio
  )

  base_model
}
138 | 
139 | 
140 | 
base_model <- N_base(dataset = dataset, outcome = outcome, treatment = treatment)
142 | 
143 | # Change the parameters of the function to see how the sample size changes
144 | 
145 | 
146 | ### 1b. MDE for a given sample size ####################################
147 | 
power <- 0.8                                                                    #SPECIFY - desired power
nratio <- 1                                                                     #SPECIFY - the ratio of the size of the treatment group to control group
alpha <- 0.05                                                                   #SPECIFY - significance level
p <- nratio / (1 + nratio)                                                      #share of the sample in the treatment group implied by nratio
152 | 
# Minimum detectable effect (MDE) for a total sample of N units.
#
# Arguments:
#   dataset, treatment - accepted for a uniform interface; not used in the body
#   outcome            - numeric vector with the baseline outcome
#   N                  - total sample size
# Reads `power`, `alpha`, `p` and `nratio` from the calling environment.
# Prints and returns the MDE rounded to two decimals.
mde_base <- function(dataset, outcome, treatment, N) {
  resid_df <- N - 2

  # Sum of the t quantiles for the requested power and the two-sided test
  t_sum <- qt(power, df = resid_df) + qt(1 - alpha / 2, df = resid_df)

  # Standard deviation of the outcome variable at baseline
  baseline_sd <- sd(outcome, na.rm = TRUE)

  mde <- round(
    t_sum * sqrt(1 / (p * (1 - p))) * sqrt(1 / N) * baseline_sd,
    digits = 2
  )
  print(mde)

  cat(
    "Given our sample size of", N,
    "and ratio of treatment and control group as,", nratio,
    ",the effect needs to be higher than", mde,
    "for us to detect it with a probability of", power
  )

  mde
}
171 | 
N <- nrow(dataset)                                                              #SPECIFY N - this is taken from the specified dataset but can be specified by the researcher

mde <- mde_base(dataset = dataset, outcome = outcome, treatment = treatment, N = N)
175 | 
176 | # Change the parameters of the function to see how the MDE changes
177 | 
178 | ### 2. Adding covariates #################################################
179 | 
180 | 
181 | # To see how potential controls affect power,  we would ideally have access to a sample data set 
182 | # (e.g. historical or pilot data). With these data, we would want to:
183 | 
184 | #   1. Regress Y_i (the outcome) on X_i (the controls) 
185 | #   2. Use the residual standard deviation of the outcome variable from this regression to evaluate 
186 | #      how much variance is explained by the set of covariates we plan to include
187 | 
188 | # - In practice, this residual SD becomes the new SD we include in our parametric power calculations
189 | # 
190 | # With access to historical data, for example, this would involve regressing last year's test scores 
191 | # on test scores from the year before. Using balsakhi data, this would be as follows.
192 | 
193 | # Note that this section is not applicable for power calculations with a binary outcome variable. 
194 | # See McConnell and Vera-Hernandez 2015 for a discussion of covariates for binary outcomes
195 | 
196 | ### 2a. Sample size for a given effect size - with covariates #############
197 | 
power <- 0.8                                                                    #SPECIFY - desired power
nratio <- 1                                                                     #SPECIFY - the ratio of the size of the treatment group to control group
alpha <- 0.05                                                                   #SPECIFY - significance level
p <- nratio / (1 + nratio)                                                      #share of the sample in the treatment group implied by nratio
202 | 
203 | 
# Sample size needed to detect an effect of 0.3 residual standard deviations
# once the outcome has been regressed on a set of covariates.
#
# Arguments:
#   dataset    - data frame holding the covariate columns
#   covariates - character vector of covariate column names in `dataset`
#   outcome    - numeric vector with the baseline outcome, one value per row
#   treatment  - accepted for a uniform interface; not used in the body
# Reads `nratio`, `power` and `alpha` from the calling environment.
# Prints the pwrcalc model and a summary sentence, then returns the model.
N_cov <- function(dataset, covariates, outcome, treatment) {
  stopifnot(is.character(covariates), length(covariates) > 0)

  # Build the formula from the `covariates` argument. The original body read a
  # global called `cov` here, which ignored the argument and fell back to the
  # stats::cov function whenever no such global existed.
  cov_list <- paste(covariates, collapse = " + ")
  # `outcome` in the formula resolves to this function's argument because the
  # formula is created inside the function
  formula <- as.formula(paste0("outcome ~ ", cov_list))
  fit <- lm(formula, data = dataset)

  # The residual sd is the new SD for power calculations
  res_baseline_sd <- sd(summary(fit)$residuals, na.rm = TRUE)

  # The expected effect should be specified based on the intervention and the
  # cost; here it is 0.3 times the residual sd
  expected_effect <- 0.3 * res_baseline_sd

  # Mean of the outcome variable at baseline
  baseline_mean <- mean(outcome, na.rm = TRUE)
  treated_mean <- expected_effect + baseline_mean

  # Here we assume that the standard deviation does not change with the
  # treatment but you can also specify different standard deviations for the
  # control and treatment groups
  cov_model <- twomeans(m1 = baseline_mean, m2 = treated_mean,
                        sd = res_baseline_sd, power = power,
                        nratio = nratio, sig.level = alpha)

  print(cov_model)

  cat("We need a minimum treatment size of", cov_model$n2,
      "and control size of", cov_model$n1, "to detect an effect of",
      expected_effect, "with a probability of",
      power, "if the effect is true and the ratio of the treatment and control is", nratio)

  return(cov_model)
}
238 | 
cov <- c("gender", "std", "sessiond")                                           #SPECIFY- a vector of covariate names- use baseline values of covariates

cov_model <- N_cov(dataset = dataset, covariates = cov, outcome = outcome, treatment = treatment)
242 | # Change the parameters of the function to see how the sample size changes
243 | 
244 | 
245 | ### 2b. MDE for a given sample size - with covariates ###################
246 | 
power <- 0.8                                                                    #SPECIFY - desired power
nratio <- 1                                                                     #SPECIFY - the ratio of the size of the treatment group to control group
alpha <- 0.05                                                                   #SPECIFY - significance level
p <- nratio / (1 + nratio)                                                      #share of the sample in the treatment group implied by nratio

cov <- c("gender", "std", "sessiond")                                           #SPECIFY- a vector of covariate names - use baseline values of covariates
253 | 
# Minimum detectable effect (MDE) for a total sample of N units when the
# outcome is first regressed on a set of covariates.
#
# Arguments:
#   dataset    - data frame holding the covariate columns
#   outcome    - numeric vector with the baseline outcome, one value per row
#   covariates - character vector of covariate column names in `dataset`
#   treatment  - accepted for a uniform interface; not used in the body
#   N          - total sample size
# Reads `power`, `alpha`, `p` and `nratio` from the calling environment.
# Prints and returns the MDE rounded to two decimals.
mde_cov <- function(dataset, outcome, covariates, treatment, N) {
  resid_df <- N - 2
  t_sum <- qt(power, df = resid_df) + qt(1 - alpha / 2, df = resid_df)

  # Regress the outcome on the covariates; `outcome` in the formula resolves
  # to this function's argument
  rhs <- paste(covariates, collapse = " + ")
  model_formula <- as.formula(paste("outcome ~ ", rhs, sep = ""))
  fit <- lm(model_formula, data = dataset)

  # The residual sd is the new SD for power calculations
  res_baseline_sd <- sd(summary(fit)$residuals, na.rm = TRUE)

  mde_res <- round(
    t_sum * sqrt(1 / (p * (1 - p))) * sqrt(1 / N) * res_baseline_sd,
    digits = 2
  )

  print(mde_res)

  cat(
    "Given our sample size of", N,
    "and ratio of treatment and control group as,", nratio,
    ",the effect needs to be higher than", mde_res,
    "for us to detect it with a probability of", power
  )

  mde_res
}
279 | 
280 | 
N <- nrow(dataset)                                                              #SPECIFY N - this is taken from the specified dataset but can be specified by the researcher

mde_res <- mde_cov(dataset = dataset, outcome = outcome, covariates = cov,
                   treatment = treatment, N = N)
284 | 
285 | # Change the parameters of the function to see how the MDE changes
286 | 
287 | 
288 | ### 3.Sample size with Partial Take-up ################################
289 | 
290 | ## when we have imperfect compliance in the treatment or the control group, 
291 | ## the expected effect is reduced by a factor of the effective take-up 
292 | ## effective take-up = take-up in treatment - take-up in control
293 | 
power <- 0.8                                                                    #SPECIFY - desired power
nratio <- 1                                                                     #SPECIFY - the ratio of the size of the treatment group to control group
alpha <- 0.05                                                                   #SPECIFY - significance level
p <- nratio / (1 + nratio)                                                      #share of the sample in the treatment group implied by nratio
298 | 
299 | 
# Sample size needed when take-up of the treatment is imperfect.
#
# Arguments:
#   dataset, treatment - accepted for a uniform interface; not used in the body
#   outcome            - numeric vector with the baseline outcome
#   takeup_treat       - take-up rate in the treatment group
#   takeup_control     - take-up rate in the control group
# Reads `nratio`, `power` and `alpha` from the calling environment.
# Prints the pwrcalc model and a summary sentence, then returns the model.
N_partial <- function(dataset, outcome, treatment, takeup_treat, takeup_control) {

  baseline_mean <- mean(outcome, na.rm = TRUE)                                  #Record the mean of the outcome variable at baseline
  baseline_sd <- sd(outcome, na.rm = TRUE)                                      #Record the standard deviation of the outcome variable at baseline

  # The expected effect with perfect take-up should be specified based on the
  # intervention and the cost; here it is 0.3 times the sd
  expected_effect <- 0.3 * baseline_sd

  tu <- takeup_treat - takeup_control                                           #effective take-up
  # Effect size after adjusting for take-up: the effect you expect to measure
  # with a true effect of `expected_effect` and an effective take-up of `tu`.
  # effect_tu < expected_effect for imperfect take-up rates.
  effect_tu <- expected_effect * tu
  treat_tu <- baseline_mean + effect_tu                                         #treatment mean after adjusting for take-up

  # Here we assume that the standard deviation does not change with the
  # treatment but you can also specify different standard deviations for the
  # control and treatment groups.
  # `power = power` forwards the requested power; the original passed an empty
  # `power=` argument, so the value specified by the user was never used.
  partial_model <- twomeans(m1 = baseline_mean, m2 = treat_tu, nratio = nratio,
                            sd = baseline_sd, power = power, sig.level = alpha)

  print(partial_model)

  cat("we need a higher sample size to have the same power because \n  the expected effect has decreased by the take-up rate \n\n")

  cat("We need a minimum treatment size of", partial_model$n2,
      "and control size of", partial_model$n1, "to detect an effect of",
      expected_effect, "with a probability of",
      power, "if the ratio of the treatment and control is",
      nratio, "and take-up rate is", tu, "and the effect is true")

  return(partial_model)
}
336 | 
337 | 
338 | 
takeup_treat <- 0.75                                                            #SPECIFY - take up in the treatment
takeup_control <- 0.2                                                           #SPECIFY - take up in the control

partial_model <- N_partial(dataset = dataset, outcome = outcome, treatment = treatment,
                           takeup_treat = takeup_treat, takeup_control = takeup_control)
343 | 
344 | # Change the parameters of the function to see how the sample size changes
345 | 
346 | 
347 | ### 4.Clustered RCTs ####################################################
348 | 
349 | # The code presented so far has been for when the unit of randomization is the same
350 | # as the unit of observation. The following code is for clustered designs, when there are
351 | # multiple units of observation contained in a single unit of randomization 
352 | # (e.g., randomization is at the school level but outcomes measured at the student level)
353 | 
354 | 
355 | # Calculate ICC 
356 | 
357 | # Here the ICC is calculated from the dataset but it can also be manually defined
358 | 
# Keep only the rows whose baseline outcome is observed
baseline_subset <- subset(dataset, !is.na(outcome))                             #remove the NA values 

# Cluster identifier (as a factor) and outcome, taken from the filtered rows
cluster_var_subset <- as.factor(baseline_subset$divid)                           #SPECIFY- change "divid" to the cluster variable
outcome_subset <- baseline_subset$pre_totnorm                                    #SPECIFY- change "pre_totnorm" to the outcome variable

# Estimate the intra-cluster correlation and keep the estimate as `rho`,
# which the clustered calculations in section 4 receive as their `icc` argument
icc <- ICCest(cluster_var_subset, outcome_subset, data = baseline_subset)
rho <- icc$ICC                                                                

# Show the estimated ICC when the script is run interactively
rho
368 | 
369 | # rho = 0.1                                                                     #SPECIFY - Manually define rho if required
370 | 
371 | ### 4a. The number of clusters given cluster size and effect size #############
372 | 
power <- 0.8                                                                    #SPECIFY - desired power
nratio <- 1                                                                     #SPECIFY - the ratio of the total number of units in the treatment and the control group
alpha <- 0.05                                                                   #SPECIFY - significance level
376 | 
377 | # maybe an iterative process is more appropriate since the k is a part of the df calc
# Number of clusters needed to detect an effect of 0.3 baseline standard
# deviations when every cluster holds `m` units.
#
# Arguments:
#   dataset, treatment - accepted for a uniform interface; not used in the body
#   outcome            - numeric vector with the baseline outcome
#   m                  - number of units in each cluster
#   icc                - intra-cluster correlation
# Reads `nratio`, `power` and `alpha` from the calling environment.
# Prints the clustered pwrcalc result and a summary sentence, then returns it.
number_of_clusters <- function(dataset, outcome, treatment, m, icc) {

  baseline_mean <- mean(outcome, na.rm = TRUE)                                  #Record the mean of the outcome variable at baseline
  baseline_sd <- sd(outcome, na.rm = TRUE)                                      #Record the standard deviation of the outcome variable at baseline

  # The expected effect should be specified based on the intervention and the
  # cost; here it is 0.3 times the sd
  expected_effect <- 0.3 * baseline_sd
  treated_mean <- baseline_mean + expected_effect

  # Here we assume that the standard deviation does not change with the
  # treatment but you can also specify different standard deviations for the
  # control and treatment groups
  unclustered <- twomeans(m1 = baseline_mean, m2 = treated_mean, sd = baseline_sd,
                          nratio = nratio, sig.level = alpha, power = power)

  # Use the `m` argument for the cluster size. The original read a global
  # named `cluster_size`, which ignored the argument and breaks once that
  # name is rebound to the function defined in section 4b.
  cluster_number <- clustered(unclustered, obsclus = m, rho = icc)

  print(cluster_number)

  cat("Adjusted n1 and n2 indicate the sample size in the control and the treatment group respectively. \n    sample size is the total number of units across the clusters \n\n")

  cat("Given the size of each cluster as",
      m, "and ratio of the number of units in the treatment to control as", nratio,
      "we need a minimum of", cluster_number$`Minimum number of clusters`,
      "clusters to detect an effect of",
      expected_effect, "with a probability of", power, "if the effect is true")

  return(cluster_number)
}
410 | 
cluster_size <- 50                                                              #SPECIFY - number of people in each cluster

cluster_number <- number_of_clusters(dataset = dataset, outcome = outcome,
                                     treatment = treatment,
                                     m = cluster_size, icc = rho)
415 | 
416 | 
417 | ### 4b. The cluster size given number of clusters and effect size ##############
418 | 
419 | # This assumes that the number of clusters in the control and the treatment arm is the same
420 | 
power <- 0.8                                                                    #SPECIFY - desired power
nratio <- 2                                                                     #SPECIFY - the ratio of the total number of units in the treatment and the control group
alpha <- 0.05                                                                   #SPECIFY - significance level
424 | 
# Cluster size needed to detect an effect of 0.3 baseline standard deviations
# with a fixed total number of clusters.
#
# Arguments:
#   dataset, treatment - accepted for a uniform interface; not used in the body
#   outcome            - numeric vector with the baseline outcome
#   total_clusters     - total number of clusters
#   icc                - intra-cluster correlation
# Reads `nratio`, `power` and `alpha` from the calling environment.
# Prints the clustered pwrcalc result and a summary sentence, then returns it.
cluster_size <- function(dataset, outcome, treatment, total_clusters, icc) {

  # Baseline moments of the outcome, ignoring missing values
  control_mean <- mean(outcome, na.rm = TRUE)
  outcome_sd <- sd(outcome, na.rm = TRUE)

  # The expected effect should be specified based on the intervention and the
  # cost; here it is 0.3 times the sd
  expected_effect <- 0.3 * outcome_sd

  # Individual-level calculation first, then the clustering adjustment
  unclustered <- twomeans(
    m1 = control_mean,
    m2 = control_mean + expected_effect,
    sd = outcome_sd,
    power = power,
    nratio = nratio,
    sig.level = alpha
  )
  cluster_size_model <- clustered(unclustered, numclus = total_clusters, rho = icc)

  print(cluster_size_model)

  cat(
    "Given", total_clusters,
    "clusters, and the ratio of units in the treatment and the control as", nratio,
    "the minimum size of each cluster\n    should be",
    cluster_size_model$`Average per cluster`,
    "for us to detect an effect\n    of", expected_effect,
    "with a probability of", power,
    "if the effect is true"
  )

  cluster_size_model
}
450 | 
total_clusters <- 50                                                            #SPECIFY - the total number of clusters

cluster_size_model <- cluster_size(dataset = dataset, outcome = outcome,
                                   treatment = treatment,
                                   total_clusters = total_clusters, icc = rho)
455 | 
456 | 
457 | ### 4c. The effect size in a clustered design ##########################
458 | 
459 | # This assumes that the number of clusters in the control and the treatment arm is the same
460 | 
power <- 0.8                                                                    #SPECIFY - desired power
nratio <- 1                                                                     #SPECIFY - the ratio of the total number of units in the treatment and the control group
alpha <- 0.05                                                                   #SPECIFY - significance level
p <- nratio / (1 + nratio)                                                      #share of the sample in the treatment group implied by nratio
465 | 
466 | 
# Minimum detectable effect (MDE) in a clustered design.
#
# Arguments:
#   dataset, treatment - accepted for a uniform interface; not used in the body
#   outcome            - numeric vector with the baseline outcome
#   cluster_size       - number of units in each cluster
#   number_clusters    - number of clusters used in the formula below
#   icc                - intra-cluster correlation
# Reads `power`, `alpha`, `p` and `nratio` from the calling environment.
# Prints the MDE and a summary sentence, then returns the (unrounded) MDE.
mde_cluster <- function(dataset, outcome, treatment, cluster_size, number_clusters, icc) {
  # NOTE(review): the degrees of freedom are kept as in the original,
  # 2 * (number_clusters - 1). The script passes the TOTAL number of clusters
  # here, for which number_clusters - 2 may be the intended value - confirm.
  cluster_df <- 2 * (number_clusters - 1)
  t_power <- qt(power, df = cluster_df)
  t_alpha <- qt(1 - alpha / 2, df = cluster_df)

  t_stat <- t_power + t_alpha

  baseline_sd <- sd(outcome, na.rm = TRUE)                                      #Record the standard deviation of the outcome variable at baseline

  # Use the `number_clusters` argument. The original read the global
  # `total_clusters` here and in the message, so the argument was ignored in
  # the formula and the call failed when no such global existed.
  cluster_mde <- t_stat * sqrt(1 / (p * (1 - p) * number_clusters)) *
    sqrt(icc + (1 - icc) / cluster_size) * baseline_sd

  print(cluster_mde)

  cat("Given", number_clusters, "clusters and the ratio of the treatment and control size as",
      nratio, "and",
      cluster_size, "people in each cluster",
      "the effect needs to be higher than", cluster_mde,
      "for us to detect it with a probability of", power)

  return(cluster_mde)
}
490 | 
total_clusters <- 100                                                           #SPECIFY - the number of total number of clusters
cluster_size <- 50                                                              #SPECIFY - number of people in each cluster

cluster_mde <- mde_cluster(dataset = dataset, outcome = outcome, treatment = treatment,
                           cluster_size = cluster_size,
                           number_clusters = total_clusters, icc = rho)
496 | 


--------------------------------------------------------------------------------