├── .Rhistory ├── .gitignore ├── README.md ├── Reading List.md ├── Tutorials └── write.txt.tutorial.R ├── biostatistics.ca ├── Justin-35(1).jpg ├── abuse_of_power │ ├── observed-power-in-spss-glm.png │ ├── survey_monkey.png │ ├── the_abuse_of_power.Rmd │ └── the_abuse_of_power.html ├── collider_bias │ ├── collider_bias.Rmd │ └── collider_bias.html ├── common_DAG_structures │ ├── common_DAG_structures_blog.Rmd │ └── common_DAG_structures_blog.html ├── influential_observations │ ├── influential_observations.Rmd │ └── influential_observations.html ├── kronecker_product │ ├── kronecker_product.Rmd │ └── kronecker_product.pdf ├── powerTOST_tutorial │ ├── Sample size.xlsx │ └── sample size.R ├── power_with_uncertainty │ ├── power_with_uncertainty.Rmd │ └── power_with_uncertainty.html ├── pvalue_distributions │ ├── output_plot.png │ ├── p.value_distribution_simulations.Rmd │ └── p.value_distribution_simulations.html └── theme.css ├── causal_inference └── battle_for_the_soul_of_causal_inference.pdf.pdf └── contact.png /.Rhistory: -------------------------------------------------------------------------------- 1 | chrome_print(file.path(path, "the_abuse_of_power.html"), file.path(path, "the_abuse_of_power.pdf"), options=list(paperWidth = 14.063/2, paperHeight = 11.25/2)) 2 | knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE) 3 | path <- "C:/Users/Utilisateur/Documents/JB Consulting/GitHub/consultation/Content/power" 4 | setwd(path) 5 | library(ggplot2) 6 | library(pagedown) 7 | chrome_print(file.path(path, "the_abuse_of_power.html"), file.path(path, "the_abuse_of_power.pdf"), options=list(paperWidth = 14.063/2, paperHeight = 11.25/2)) 8 | chrome_print(file.path(path, "the_abuse_of_power.html"), file.path(path, "the_abuse_of_power.pdf"), options=list(paperWidth = 14.063/2, paperHeight = 11.25/2)) 9 | chrome_print(file.path(path, "the_abuse_of_power.html"), file.path(path, "the_abuse_of_power.pdf"), options=list(paperWidth = 14.063/2, paperHeight = 11.25/2)) 10 | chrome_print(file.path(path, "the_abuse_of_power.html"), file.path(path, "the_abuse_of_power.pdf"), options=list(paperWidth = 14.063/2, paperHeight = 11.25/2)) 11 | chrome_print(file.path(path, "the_abuse_of_power.html"), file.path(path, "the_abuse_of_power.pdf"), options=list(paperWidth = 14.063/2, paperHeight = 11.25/2)) 12 | chrome_print(file.path(path, "the_abuse_of_power.html"), file.path(path, "the_abuse_of_power.pdf"), options=list(paperWidth = 14.063/2, paperHeight = 11.25/2)) 13 | chrome_print(file.path(path, "the_abuse_of_power.html"), file.path(path, "the_abuse_of_power.pdf"), options=list(paperWidth = 14.063/2, paperHeight = 11.25/2)) 14 | chrome_print(file.path(path, "the_abuse_of_power.html"), file.path(path, "the_abuse_of_power.pdf"), options=list(paperWidth = 14.063/2, paperHeight = 11.25/2)) 15 | chrome_print(file.path(path, "the_abuse_of_power.html"), file.path(path, "the_abuse_of_power.pdf"), options=list(paperWidth = 14.063/2, paperHeight = 11.25/2)) 16 | chrome_print(file.path(path, "the_abuse_of_power.html"), file.path(path, "the_abuse_of_power.pdf"), options=list(paperWidth = 14.063/2, paperHeight = 11.25/2)) 17 | chrome_print(file.path(path, "the_abuse_of_power.html"), file.path(path, "the_abuse_of_power.pdf"), options=list(paperWidth = 14.063/2, paperHeight = 11.25/2)) 18 | chrome_print(file.path(path, "the_abuse_of_power.html"), file.path(path, "the_abuse_of_power.pdf"), options=list(paperWidth = 14.063/2, paperHeight = 11.25/2)) 19 | 158/34.2 20 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rhistory -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Welcome to [biostatistics.ca's](https://www.biostatistics.ca/?utm_source=github&utm_medium=readme&utm_campaign=header) biostatistics and causal inference repo, in partnership with [JB Statistical Consulting](https://www.justinbelair.ca/?utm_source=GitHub&utm_medium=read_mew&utm_campaign=welcome)! 2 | 3 | In it, you will find 4 | - The [Biostatistics and Causal Inference Reading List](https://github.com/JB-Statistical-Consulting/biostatistics/blob/main/Reading%20List.md) 5 | - A curated selection of papers (including DOI links), books, and other materials to help improve your understanding of biostatistics 6 | - Tutorials on technical tools such as R, Github, and others 7 | - RMarkdown source notebooks for the [biostatistics.ca](https://www.biostatistics.ca/?utm_source=github&utm_medium=readme&utm_campaign=header) blog 8 | - And more... 9 | 10 | # Resources 11 | - Check out the complete [Biostatistics Roadmap, or How to Become a Great Biostatistician](https://www.biostatistics.ca/the-biostatistics-roadmap-or-how-to-become-a-great-biostatistician/?utm_source=github&utm_medium=readme&utm_campaign=roadmap) 12 | - [Sign up for my monthly Causal Inference in Biostatistics Newsletter](https://causal-inference-in-statistics.beehiiv.com/subscribe?utm_source=github&utm_medium=read_me&utm_campaign=footer) to receive free content directly in your inbox. 13 | - Visit [biostatistics.ca](https://www.biostatistics.ca/?utm_source=Github&utm_medium=read_me&utm_campaign=footer) for blog articles about biostatistics/causal inference, comprehensive guides about education/careers in biostatistics/causal inference, free downloadable resources, and more. 14 | 15 | # Contact 16 | - Visit [www.justinbelair.ca](https://www.justinbelair.ca/?utm_source=GitHub&utm_medium=read_mew&utm_campaign=footer) if you have any questions or need help with statistics. 17 | - Or [email Justin Belair](mailto:belairjustin@gmail.com) directly. 18 | 19 | ![alt text](https://github.com/JB-Statistical-Consulting/biostatistics/blob/main/contact.png) 20 | -------------------------------------------------------------------------------- /Reading List.md: -------------------------------------------------------------------------------- 1 | # A free Causal Inference in Biostatistics reading list 2 | 3 | - Learn more about my upcoming book: [Causal Inference in Statistics, with Exercises, Practice Projects, and R Code Notebooks](https://justinbelair.ca/causal-inference-in-statistics-book/?utm_source=github&utm_medium=reading_list&utm_campaign=header) 4 | - Learn Statistics the Right Way with my new online course sequence. 
[Learn more and pre-order here.](https://justinbelair.ca/introduction-to-biostatistics/?utm_source=github&utm_medium=reading_list&utm_campaign=header) 5 | - Check out the complete [Biostatistics Roadmap, or How to Become a Great Biostatistician](https://www.biostatistics.ca/the-biostatistics-roadmap-or-how-to-become-a-great-biostatistician/?utm_source=github&utm_medium=reading_list&utm_campaign=roadmap) 6 | - [Sign up for my monthly Causal Inference in Biostatistics Newsletter](https://causal-inference-in-statistics.beehiiv.com/subscribe?utm_source=github&utm_medium=reading_list&utm_campaign=header) to receive free content directly in your inbox. 7 | - Visit [www.justinbelair.ca](https://www.justinbelair.ca/?utm_source=GitHub&utm_medium=reading_list&utm_campaign=header) if you have any questions or need help with statistics. 8 | - Visit [www.biostatistics.ca](https://www.biostatistics.ca/?utm_source=Github&utm_medium=reading_list&utm_campaign=header) for blog articles about biostatistics/causal inference, comprehensive guides about education/careers in biostatistics/causal inference, free downloadable resources, and more. 9 | 10 | ## Fundamentals 11 | 12 | ### Hypothesis Testing 13 | 14 | - [Huang et al., Formulating Appropriate Statistical Hypotheses for Treatment Comparison in Clinical Trial Design and Analysis, 2014](https://doi.org/10.1016%2Fj.cct.2014.09.005) 15 | - This paper is quite technical, but truly amazing! Anybody with a background in pure statistics should read this paper to widen their theoretical understanding of designing hypothesis tests to be applied in research settings. I can't recommend this paper enough! 16 | 17 | - [Delacre, Lakens & Leys, Why Psychologists Should by Default Use Welch's t-test Instead of Student's t-test, 2017](https://doi.org/10.5334/irsp.82) 18 | - A must-read! Take-home message: the t-test is great, especially in its robust forms! 19 | 20 | - [Lakens et al., Equivalence Testing for Psychological Research: A Tutorial, 2018](https://doi.org/10.1177/2515245918770963) 21 | - A great gentle introduction to minimal-effects testing, equivalence testing, and inferiority testing. These are underutilized tools that should be taught to any applied researchers using hypothesis tests for experimental data! A must-read. 22 | - Related to this paper is an R package called TOSTER, developed by Lakens and others. Check out the [package vignette here](https://cran.rstudio.com/web/packages/TOSTER/vignettes/IntroductionToTOSTER.html). 23 | 24 | ### Misinterpretations of p-values, power analysis, and other concepts 25 | - [Sander Greenland et al., Statistical tests, P values, confidence intervals, and power: a guide to misinterpretations, 2016](https://doi.org/10.1007%2Fs10654-016-0149-3) 26 | - THE resource for all p-value misinterpretations by a collection of eminent statisticians. Must be read and re-read! 27 | - [Sander Greenland, Nonsignificance Plus High Power Does Not Imply Support for the Null Over the Alternative, 2012](https://doi.org/10.1016/j.annepidem.2012.02.007) 28 | - The title says it all...it's an easy mistake to make! 29 | - [John M. Hoenig & Dennis M. Heisey, The Abuse of Power: The Pervasive Fallacy of Power Calculations for Data Analysis, 2001](https://doi.org/10.1198/000313001300339897) 30 | - Power is a widely misunderstood statistical concept. Study it! 31 | - [Steven Goodman, A Dirty Dozen: Twelve P-Value Misconceptions, 2008](https://doi.org/10.1053/j.seminhematol.2008.04.003) 32 | - The title says it all. 33 | - [Harvey J. 
Motulsky, Common misconceptions about data analysis and statistics, 2014](https://doi.org/10.1007%2Fs00210-014-1037-6) 34 | - Goes over many misconceptions. It is quite beginner-friendly, take a look! 35 | 36 | ### Dichotomania: How Dichotomizing Continuous Variables is Detrimental to Science 37 | 38 | - Key Articles and Papers 39 | - [Dichotomania: An Obsessive-Compulsive Disorder that is Rampant in Clinical Research](https://www.semanticscholar.org/paper/Dichotomania%3A-An-Obsessive-Compulsive-Disorder-that-Senn/8c390cb90465ad1f08b86cee52474859e4dffd77) 40 | - [Statistical Errors in the Medical Literature](https://www.fharrell.com/post/errmed/) 41 | - [Consequences of Dichotomization](https://pubmed.ncbi.nlm.nih.gov/24475020/) 42 | - [Three Problems with Dichotomization](https://journals.sagepub.com/doi/10.1177/014662168300700301) 43 | 44 | - Real-World Examples of the Disservice Dichotomization Causes: 45 | - [Quality & Safety in Healthcare](https://qualitysafety.bmj.com/content/26/10/799) 46 | - [British Medical Journal (BMJ)](https://www.bmj.com/content/357/bmj.j2353) 47 | - [Circulation: Arrhythmia and Electrophysiology](https://www.ahajournals.org/doi/full/10.1161/circep.117.006091) 48 | 49 | ### Design of Experiments 50 | 51 | - B.J. Winer, Donald R. Brown, Kenneth M. Michels, Statistical Principles in Experimental Design, Third edition, 1991 (Book) 52 | - A very thick book that thoroughly covers a wide array of experimental designs. A great reference manual for anyone working on experimental research, especially with human subjects (e.g. psychology). 53 | 54 | - [Leducq et al., Research Techniques Made Simple: Randomized Controlled Trials for Topical Drugs in Dermatology: When and How Should We Use a Within-Person Design?, 2020](https://doi.org/10.1016/j.jid.2020.03.945) 55 | - A great introduction to the within-person (aka within-patient, within-subject) randomized controlled trial. This type of trial is used when it is possible to use a patient as their own control (e.g. in ophthalmology, where each eye can be randomized to a different treatment). 56 | 57 | - [Pandis et al., CONSORT 2010 statement: extension checklist for reporting within person randomised trials, 2017](https://doi.org/10.1136/bmj.j2835) 58 | - Anyone running a within-person trial should consult these guidelines for maximizing the utility of what they report from the trial! 59 | 60 | ### Lord's Paradox 61 | - [Frederic M. Lord, A Paradox in the Interpretation of Group Comparisons, 1967](https://psycnet.apa.org/doi/10.1037/h0025105) 62 | - The first very influential paper describing what is now known as Lord's Paradox. 63 | - [Frederic M. Lord, Statistical Adjustments When Comparing Preexisting Groups, 1969](https://psycnet.apa.org/doi/10.1037/h0028108) 64 | - Lord's second paper, which goes into more detail on his 'paradox'. 65 | - [Judea Pearl, Lord's Paradox Revisited - (Oh Lord! Kumbaya!), 2016](https://doi.org/10.1515/jci-2016-0021) 66 | - A causal inference perspective by one of its main contributors, Judea Pearl. (See Causal Inference section of this reading list) 67 | 68 | ### Survey Sampling 69 | 70 | - [W.G. Cochran, The Use of the Analysis of Variance In Enumeration By Sampling, 1938](https://doi.org/10.2307/2279483) 71 | - A landmark paper on survey sampling where the idea of using ANOVA to estimate properties of the sampling design is introduced. Cochran also mentions the idea of an infinite population in relation to sampling theory. 72 | 73 | - [V.P. Godambe and M.E. 
Thompson, Parameters of Superpopulation and Survey Population: Their Relationships and Estimation, 1986](https://doi.org/10.2307/1403139) 74 | - A highly technical paper on the use of estimating functions to relate superpopulation parameters with those of a survey population under study. 75 | 76 | - [J.N.K. Rao, Interplay Between Sample Survey Theory and Practice: An Appraisal, 2005](https://www150.statcan.gc.ca/n1/en/catalogue/11-522-X20050019443) 77 | - A thorough historical overview of the evolution of survey sampling theory through the interplay between theory and practice. 78 | 79 | ### Philosophical Questions 80 | - [Jose D. Perezgonzalez, *Fisher, Neyman-Pearson or NHST? A tutorial for teaching data testing*, 2015](https://doi.org/10.3389/fpsyg.2015.00223) 81 | - An interesting overview of the two foundational classical approaches to testing in statistics: Fisher's approach and the Neyman-Pearson framework. The third approach, Null Hypothesis Significance Testing (NHST), is presented as a loose and controversial approach lacking rigour. A must-read! 82 | 83 | ### Classics 84 | - R. A. Fisher, *The Design of Experiments*, 1935 (Book) 85 | 86 | - Contains Fisher's famous lady-tasting tea experiment, the first example I know of permutation testing, many groundbreaking examples of Analysis of Variance (ANOVA), and some disparaging (and very funny) remarks towards Pearson. 87 | 88 | ## Missing Data 89 | - [Stef van Buuren, *Flexible Imputation of Missing Data*, 2018 (Book, with online version)](https://stefvanbuuren.name/fimd/) 90 | - A must-have for any applied statistician dealing with missing data problems. This book presents the state-of-the-art in multiple imputation (MI), a field where van Buuren made his name. Contains lots of concrete examples with code, discusses trade-offs in complex situations, and gives lots of references to literature with simulation studies to back any claims up. 91 | 92 | - Gert Molenberghs and Michael G. Kenward, *Missing Data in Clinical Studies*, 2007 (Book) 93 | - A deep and thorough exposition of missing data in clinical studies. A complex book for advanced statisticians, especially those working in clinical studies. 94 | 95 | - Roderick J. A. Little & Donald B. Rubin, *Statistical Analysis with Missing Data*, 2002 (Book) 96 | - The first textbook put together to reflect the growing literature on missing data methodology. Still useful, although van Buuren, 2018 is probably better suited for applied statisticians. 97 | 98 | ## Causal Inference 99 | 100 | ### Fundamentals 101 | 102 | - Justin Belair, [Causal Inference in Statistics, with Exercises, Practice Projects, and R Code Notebooks](https://justinbelair.ca/causal-inference-in-statistics-book/?utm_source=github&utm_medium=reading_list&utm_campaign=header) 103 | - Part I with Chapters 1-4 is being released soon. Think of it as the result if Imbens & Rubin, Pearl, and Hernan & Robins had a baby. The first chapter is available for free. 104 | 105 | - Judea Pearl, *Causality: Models, Reasoning and Inference*, 2000, updated in 2009 (Book) 106 | - A true masterpiece. A technical and deep exposition of Pearl's life work on Directed Acyclic Graphs (DAGs) as Structural Causal Models (SCMs) that got me started on my causal inference journey. His viewpoint is an alternative to the Neyman-Rubin causal model based on potential outcomes. This book can also be seen as the academic version of The Book of Why, a famous general-audience book on causality. 107 | 108 | - Guido W. Imbens and Donald B. 
Rubin, *Causal Inference for Statistics, Social, and Biomedical Sciences*, 2015 (Book) 109 | - A true masterpiece. The most accomplished and thorough exposition of the Neyman-Rubin causal model based on potential outcomes. It is an alternative to Pearl's DAG and SCM framework (see above). A beautiful book that I find myself going back to often, for its depth and breadth of insights into thinking about causal inference. Imbens is an economist who contributed much to this field, most notably through his Local Average Treatment Effect identification in cases of non-compliance. Rubin is one of the greatest living statisticians. 110 | 111 | - Judea Pearl, Madelyn Glymour & Nicholas P. Jewell, *Causal Inference in Statistics: A Primer*, 2016 (Book) 112 | - A gentle introduction to Directed Acyclic Graphs (DAGs) and Structural Causal Models (SCMs) at about the level of an undergraduate statistics student. 113 | 114 | - Bill Shipley, *Cause and Correlation in Biology: A User's Guide to Path Analysis, Structural Equations and Causal Inference*, 2000 (Book) 115 | - A well-written introduction to causal inference for biologists, with an emphasis on Structural Equation Models (SEMs) and Path Analysis. There is also a little bit of interesting history sprinkled in. I took a class with this professor (who just retired from a university close to my hometown) in 2023 and his focus on biological applications without sacrificing rigour is great for any non-statistician looking to tackle complex statistical methods! 116 | 117 | - Paul R. Rosenbaum, *Observational Studies*, 1995 (Book) 118 | - A compact monograph by one of the inventors of the propensity score (PS) with a clear writing style. I've always enjoyed Rosenbaum's incisive perspectives on problems related to causality in observational studies! 119 | 120 | - Tyler J. VanderWeele, *Explanations in Causal Inference: Methods for Mediation and Interaction*, 2015 (Book) 121 | - A thick, somewhat terse book by the leading researcher in mediation and interaction analysis in causal inference. This book is aimed at advanced users of causal inference methods, but it is truly unique in the depth of its treatment of the subject! It is written by an epidemiologist but he also targets a social science audience. 122 | 123 | ### Tutorials 124 | 125 | - [Andy Wilson & Aimee Harrison, Crash course on confounding, bias, and deconfounding remedies using R, 2024](https://www.biostatistics.ca/crash-course-on-confounding-bias-and-deconfounding-remedies-using-r/) 126 | - A great blog post about practical, hands-on methods (including R code) to remove confounding and estimate unbiased treatment effects. 127 | 128 | ### Propensity Score 129 | 130 | - [Peter C. Austin, An Introduction to Propensity Score Methods for Reducing the Effects of Confounding in Observational Studies, 2011](https://doi.org/10.1080%2F00273171.2011.568786) 131 | - A great introduction to propensity score (PS) methods used for matching, stratification, and IPTW methods with comparisons between them. There is also a discussion about computing covariate balance and choosing predictors for the propensity score model. It ends with a discussion about PS methods vs. regression adjustment. 132 | - [Brookhart et al., Variable Selection for Propensity Score Models, 2006](https://doi.org/10.1093/aje/kwj149) 133 | - A thorough discussion grounded in simulation studies about when to include variables in the propensity score (PS) model, especially by considering them as related to the treatment, the outcome, or both. 
134 | - [King & Nielsen, Why Propensity Scores Should Not Be Used for Matching, 2019](https://gking.harvard.edu/files/gking/files/pan1900011_rev.pdf) 135 | - A much-discussed paper that criticizes propensity score matching techniques on the grounds that the method tries to approximate complete randomization, while approximating block (or stratified) randomization gives better results. 136 | - [Schuster et al., Propensity score model overfitting led to inflated variance of estimated odds ratios, 2016](https://doi.org/10.1016/j.jclinepi.2016.05.017) 137 | - A Monte Carlo simulation showing why propensity score (PS) models shouldn't be overfit. As a matter of fact, the quality of fit is not a primary concern when fitting a PS model. 138 | - [Setoguchi et al., Evaluating uses of data mining techniques in propensity score estimation: a simulation study, 2008](https://doi.org/10.1002/pds.1555) 139 | - A simulation study evaluating logistic regression (LR), tree-based methods with and without pruning, and neural networks as the model classes used to fit propensity score (PS) models. The main takeaway is that results based on an LR PS model are robust. 140 | - [Elizabeth Stuart, Matching methods for causal inference: A review and a look forward, 2010](https://doi.org/10.1214%2F09-STS313) 141 | - The title says it all: a review and look forward about different matching techniques used to emulate randomized trials, including propensity score matching. 142 | - [Rosenbaum & Rubin, The Central Role of the Propensity Score in Observational Studies for Causal Effects, 1983](https://doi.org/10.2307/2335942) 143 | - The landmark paper where the propensity score is introduced and its properties derived, most notably that conditioning on the propensity score also "deconfounds" the treatment assignment. A classic. 144 | 145 | ## Clinical Prediction Modelling 146 | 147 | When building prediction models, we are less interested in inference on the parameters and more focused on the values and uncertainty of the predictions. In clinical settings, robust prediction models can mean the difference between life and death! 148 | 149 | - [Ewout W. Steyerberg, Clinical Prediction Models, 2010 (Book, with online content)](https://www.clinicalpredictionmodels.org/) 150 | - A pretty thick reference manual by a leader in the field. Aimed especially towards prediction models in clinical settings, it's a must-have for any advanced modeller looking to make a difference in healthcare with novel technologies. 151 | 152 | - [Collins et al., Evaluation of clinical prediction models (part 1): from development to external validation, 2024](https://doi.org/10.1136/bmj-2023-074819) 153 | - Part 1 of a step-by-step tutorial on rigorous and robust clinical prediction model-building, focused on the early stages of model-building. 154 | 155 | - [Riley et al., Evaluation of clinical prediction models (part 2): how to undertake an external validation study, 2024](https://doi.org/10.1136/bmj-2023-074820) 156 | - Part 2 of a step-by-step tutorial on rigorous and robust clinical prediction model-building, focused on conducting external validation of the model built following the steps outlined in part 1. 
157 | 158 | - [Riley et al., Evaluation of clinical prediction models (part 3): calculating the sample size required for an external validation study, 2024](https://doi.org/10.1136/bmj-2023-074821) 159 | - Part 3 of a step-by-step tutorial on rigorous and robust clinical prediction model-building, focused on power and sample size calculations. It is often difficult to know the sample size required for adequate external validation data. This guide offers detailed instructions on conducting these estimations, once we've built a model. 160 | 161 | ## Epidemiology 162 | 163 | ### Fundamentals 164 | 165 | Epidemiology is a discipline distinct from biostatistics, but there is strong overlap in the methods. 166 | Epidemiology relies on many difficult design principles to obtain valid inferences. Here are a few textbooks that are must-haves for epidemiologists. 167 | 168 | - Kenneth J. Rothman & Sander Greenland, Modern Epidemiology, Second Edition, 1998 (Book) 169 | - The bible of modern epidemiology. An authoritative textbook on study design principles. Its sections on analysis techniques are a bit dated. Also, it doesn't discuss much of the causal inference techniques and principles that have come to slowly dominate the field through the works of VanderWeele, Hernán, Robins and others. Still, anybody wishing to understand how to think like an epidemiologist must tackle this book. Its explanation of case-control studies and their peculiarities is particularly illuminating. 170 | 171 | - Leon Gordis, Epidemiology, Fifth Edition, 2014 (Book) 172 | - A very popular introduction to Epidemiology in color with many images and illustrations. A good tool to learn the basics of epidemiological design principles. 173 | 174 | ### Bradford Hill Criteria And Their Legacy 175 | 176 | In 1965, Bradford Hill proposed a series of 9 criteria that should be considered when trying to uncover a causal relationship among the correlational noise. Causal inference has come a long way since, but these 9 criteria are still widely discussed and serve as guiding principles in epidemiology and its subfields. 177 | 178 | - [Sir Austin Bradford Hill, The Environment and Disease: Association or Causation?, 1965](https://doi.org/10.1177/003591576505800503) 179 | - The classic President's Address delivered to the newly formed Section of Occupational Medicine of the Royal Society of Medicine by Sir Bradford Hill, in which he presents his famous 9 criteria for an association to be deemed causal. The paper went on to become tremendously influential and it's still commented on to this day. 180 | 181 | - [Glass, Goodman, Hernan & Samet, Causal Inference in Public Health, 2013](https://doi.org/10.1146/annurev-publhealth-031811-124606) 182 | - A modern discussion of causal inference through the lens of policymaking in public health areas. 183 | 184 | - [Fedak et al., Applying the Bradford Hill criteria in the 21st century: how data integration has changed causal inference in molecular epidemiology, 2015](https://doi.org/10.1186%2Fs12982-015-0037-4) 185 | - Discusses how our understanding of Bradford Hill's original 9 criteria has evolved over time through a review of examples taken from molecular epidemiology. 
186 | 187 | ## Time-series Models 188 | 189 | ### Generalized Linear Autoregressive Moving Average (GLARMA) Models for Count Data (Poisson, Binomial, Negative Binomial) 190 | 191 | - [Zeger, A regression model for time series of counts, 1988](https://doi.org/10.2307/2336303) 192 | - A classic paper discussing the problem of modelling time series of counts, with the famous example of U.S. Polio incidence data, now part of the `glarma` package. 193 | - [Davis, Wang & Dunsmuir, Modeling Time Series of Count Data, 1999. In S Ghosh (ed.), *Asymptotics, Nonparametrics, and Time Series*, volume 158 of *Statistics Textbooks and Monographs*, pp. 63-114](https://doi.org/10.1201/9781482269772) 194 | - A theoretical paper discussing differences between parameter-driven and observation-driven state-space models, with many example analyses at the end. 195 | - [Davis, Dunsmuir and Streett, Observation-driven models for Poisson counts, 2003](https://doi.org/10.1093/biomet/90.4.777) 196 | - A theoretical paper with an interesting example application to the Asthma dataset. 197 | - [Dunsmuir & Scott, The `glarma` Package for Observation Driven Time Series Regression of Counts, 2015](https://doi.org/10.18637/jss.v067.i07) 198 | - The `glarma` package vignette with theory, code, and examples. 199 | 200 | ## Modeling ordinal data 201 | ### Proportional Odds Regression AKA Ordinal Regression AKA many other names 202 | 203 | - [Rune Haubo B Christensen, Cumulative Link Models for Ordinal Regression with the `R` Package `ordinal`, 2018](https://cran.r-project.org/web/packages/ordinal/vignettes/clm_article.pdf) 204 | - The `ordinal` package vignette with theory, code, and examples. It's rather lengthy and extensive! 205 | 206 | - [Peter McCullagh, Regression Models for Ordinal Data, 1980](https://doi.org/10.1111/j.2517-6161.1980.tb01109.x) 207 | - A foundational paper for the proportional odds model, relating it mathematically to the famous Cox proportional hazards model. 208 | 209 | - [Christopher Winship & Robert D. Mare, Regression Models With Ordinal Variables, 1984](https://doi.org/10.2307/2095465) 210 | - A foundational paper describing techniques to handle ordinal data, especially aimed at eliminating bad practices in the sociology literature. 211 | 212 | - [UCLA: Statistical Consulting Group., Ordinal Logistic Regression | R Data Analysis Examples, Accessed April 12, 2024](https://stats.oarc.ucla.edu/r/dae/ordinal-logistic-regression/) 213 | - A tutorial for ordinal logistic regression using the `MASS` package. Whether you use this package or `clm`, this tutorial is interesting as it addresses how to analyze the data before modeling, namely by checking the proportional odds assumption. 
214 | 215 | 216 | ## Modeling proportions data (in the 0-1 interval) 217 | ### Beta regression 218 | 219 | - Foundational papers 220 | - [Kieschnick & McCullough, *Regression analysis of variates observed on (0, 1): percentages, proportions and fractions*, 2003](https://doi.org/10.1191/1471082X03st053oa) 221 | - [Ferrari & Cribari-Neto, *Beta Regression for Modelling Rates and Proportions*, 2004](https://doi.org/10.1080/0266476042000214501) 222 | 223 | - Regression tools 224 | - [Cribari-Neto & Zeileis, *Beta Regression in R*, 2010](https://doi.org/10.18637/jss.v034.i02) 225 | - The paper accompanying the R package `betareg`. 226 | - [Kubinec, *Ordered Beta Regression: A Parsimonious, Well-Fitting Model for Continuous Data with Lower and Upper Bounds*, 2021](https://doi.org/10.1017/pan.2022.20) 227 | - A paper proposing an alternative to the Zero-One Inflated Beta (ZOIB) model. 228 | 229 | ### Simplex Regression 230 | - [Zhang, Qiu & Shi, *simplexreg: An R Package for Regression Analysis of Proportional Data Using the Simplex Distribution*, 2016](https://doi.org/10.18637/jss.v071.i11) 231 | - Simplex Regression, an alternative to beta regression using the Simplex distribution. Incorporates both MLE and GEE techniques. 232 | 233 | # Contact 234 | - Visit [www.justinbelair.ca](https://www.justinbelair.ca/?utm_source=GitHub&utm_medium=reading_list&utm_campaign=footer) if you have any questions or need help with statistics. 235 | - Or, email me directly at belairjustin@gmail.com 236 | 237 | ![alt text](https://github.com/JB-Statistical-Consulting/biostatistics/blob/main/contact.png) 238 | -------------------------------------------------------------------------------- /Tutorials/write.txt.tutorial.R: -------------------------------------------------------------------------------- 1 | write.txt <- function(name, content){ 2 | # Takes a file name/path (string) and content (a character vector) and writes the content to that file in the working directory 3 | options(max.print=1000000000) # raise max.print so long captured outputs are not truncated 4 | fileConn <- file(name) 5 | on.exit(close(fileConn)) # register cleanup first, so the connection is closed even if writeLines() errors 6 | writeLines(content, fileConn) 7 | } 8 | 9 | # Replace data.analysis with your own list of data.frames! 
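# For illustration only: a minimal, made-up data.analysis object so the steps
# below run end-to-end. It is assumed to be a named list of data.frames, each
# with columns value, time, and treatment; replace it with your own data.
data.analysis <- list(
  outcome1 = data.frame(value = rnorm(40), time = rep(0:1, 20), treatment = rep(c("A", "B"), each = 20)),
  outcome2 = data.frame(value = rnorm(40), time = rep(0:1, 20), treatment = rep(c("A", "B"), each = 20))
)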
10 | 11 | class(data.analysis) 12 | names(data.analysis) 13 | 14 | class(data.analysis$outcome1) 15 | names(data.analysis$outcome1) 16 | 17 | lm.models <- lapply(data.analysis, \(x) lm(value ~ time*treatment, data = x)) 18 | class(lm.models) 19 | 20 | lm.summaries <- lapply(lm.models, summary.lm) 21 | 22 | write.txt("outputs/lm_model_summaries.txt", capture.output(lm.summaries)) 23 | -------------------------------------------------------------------------------- /biostatistics.ca/Justin-35(1).jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JB-Statistical-Consulting/biostatistics/b50e58b1c4ce6d0629bcf8d263438cd86707159a/biostatistics.ca/Justin-35(1).jpg -------------------------------------------------------------------------------- /biostatistics.ca/abuse_of_power/observed-power-in-spss-glm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JB-Statistical-Consulting/biostatistics/b50e58b1c4ce6d0629bcf8d263438cd86707159a/biostatistics.ca/abuse_of_power/observed-power-in-spss-glm.png -------------------------------------------------------------------------------- /biostatistics.ca/abuse_of_power/survey_monkey.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JB-Statistical-Consulting/biostatistics/b50e58b1c4ce6d0629bcf8d263438cd86707159a/biostatistics.ca/abuse_of_power/survey_monkey.png -------------------------------------------------------------------------------- /biostatistics.ca/abuse_of_power/the_abuse_of_power.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Don't Compute the Statistical Power of Your Experiment...Even if SPSS Allows It And Your Editor Requires It! 3 | date: \today 4 | author: 5 | output: 6 | html_document: 7 | #css: "C:/Users/justi/Documents/GitHub/Content/power/theme.css" 8 | #toc: TRUE 9 | #toc_float : TRUE 10 | #toc_depth : 2 11 | --- 12 | 13 | ## Introduction 14 | 15 | *[Download the R Markdown notebook here](https://github.com/JB-Statistical-Consulting/biostatistics/tree/main/biostatistics.ca/abuse_of_power) used to generate this blog post and practice your R skills for free!* 16 | ```{r setup, echo = FALSE, warning=FALSE} 17 | knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE) 18 | path <- "C:/Users/justi/Documents/Github/biostatistics/biostatistics.ca/abuse_of_power" 19 | setwd(path) 20 | 21 | library(ggplot2) 22 | library(pagedown) 23 | ``` 24 | 25 | Some widely used statistical tools, e.g. the AB testing significance calculator from Survey Monkey or even respectable (?) software like SPSS, provide a value of observed (or post-hoc) power computed from the data - see images below. 26 | 27 | ```{r survey monkey, echo = FALSE, fig.show = 'hold', out.width = "25%", out.height = "25%", fig.align = "center"} 28 | 29 | ``` 30 |
31 | ![](survey_monkey.png){width=440px} ![](observed-power-in-spss-glm.png){width=440px} 32 |
33 | 34 | \newline 35 | A company like Survey Monkey may be forgiven for making statistical mistakes...But SPSS, you should know better! 36 | 37 |

**THIS IS NOT ONLY COMPLETELY USELESS, BUT DOWNRIGHT DANGEROUS, AS IT STRONGLY ENCOURAGES FALLACIOUS STATISTICAL LOGIC!**

38 | 39 | Let's dig into this. 40 | 41 | Disclaimer: there will be some mathematical symbols and code, but they are not essential to the message; hopefully they will help though! 42 | 43 | \newpage 44 | 45 | ### A refresher on hypothesis testing 46 | 47 | We must first recall the basics of **hypothesis testing**. 48 | 49 | Examples: 50 | 51 | - We run an AB test and want to know if the A channel converted at a higher rate than the B channel. 52 | - We test a treatment vs. a placebo and want to know if the treatment improves a health outcome, e.g. reduces blood pressure. 53 | 54 | We are interested in $\mu_A$ and $\mu_B$, the means of our groups. We do the following hypothesis test: 55 | \[\begin{cases}H_0 : \mu_A = \mu_B, \text{ both groups have the same mean}\\ 56 | H_1 : \mu_A \neq \mu_B, \text{ the means are different.}\end{cases}\] 57 | 58 | We collect data (using sound methodology, which is way harder than it seems) and want to *reject the null hypothesis* $H_0$. 59 | 60 | 

**Goal of hypothesis testing:** We want our data to convince us that we can comfortably affirm that the means are **not** equal in both groups, using some statistical test, often the *two-sample t-test* (see below).

61 | 62 | \newpage 63 | 64 | ## Ok, but what is power? 65 | 66 | Power is the **probability of rejecting the null hypothesis when it is *false*** 67 | \[\mathbb{P}(\text{reject } H_0|H_0 \text{ is false}).\] 68 | 69 | This measures how *sensitive* the statistical test is to deviations from the null hypothesis. 70 | Let's call $\delta := \mu_A - \mu_B$, the difference in the means. 71 | 72 | If $H_0$ is false, then $\delta \neq 0.$ 73 | There is an immediate difficulty: $H_0$ can be false in infinite ways! 74 | 75 | - $\delta$ can be minuscule; 76 | - $\delta$ can be huge; 77 | - in fact, $\delta$ can be any non-zero number! 78 | 79 | This means that the power of the test depends on the difference of the means...but if we knew the difference of the means, we wouldn't need statistics... what gives? 80 | 81 | \newpage 82 | 83 | ### Power 84 | 85 | Power depends on: 86 | 87 | 1. How large the true difference $\delta$ is 88 | + A larger difference is easier to detect. 89 | 2. How uncertain our estimate of this difference is 90 | + A less variable estimate (i.e. smaller variances $\sigma_A^2, \sigma_B^2$ for groups $A$ and $B$, respectively) gives us more confidence. 91 | 3. How much data (i.e. $n_A, n_B$ data points for groups $A$ and $B$, respectively) is available in each group to run the test. 92 | + More data gives us more confidence. 93 | 94 | Mathematically, we write 95 | \[\mathbb{P}(\text{reject } H_0|H_0 \text{ is false}) = f(\delta, \sigma^2, n),\] 96 | 97 | where $f$ is some function that depends on the test. 98 | 99 | In practice, we summarize all this in an *effect size* (there are many ways to do this). 100 | 101 | \newpage 102 | 103 | ### Power analysis, the right and wrong ways 104 | 105 | :::: {style="display: flex;"} 106 | ::: {} 107 | 108 | 

109 | **Correct power analysis** is used **BEFORE running the experiment** to *determine the sample size needed*, roughly as follows:

110 | 111 | 112 | 1. We use an estimate of *effect size* from the literature 113 | + Or we use a Minimum Significant Difference (MSD), that is, the smallest *effect size* that is worth the trouble 114 | 2. We fix levels of power and significance (conventionally, $80\%$ and $5\%$, respectively). 115 | 3. We use some math (or software) to calculate the sample size required to adequately power our test. 116 | 117 | 118 | ::: 119 | 120 | 121 | ::: {} 122 | 123 | ::: 124 | 125 | ::: {} 126 | 

127 | 128 | **Post-hoc** power proceeds INCORRECTLY as follows, **using the data collected from the experiment**:

129 | 130 | 1. We compute the effect size measured in the experiment. 131 | 2. We compute the power associated with this effect size. 132 | 133 | But, as we will see, this is worthless, considering we always ALSO calculate a *p-value* (we will not dive into the p-value rabbit hole, lest we never come back). 134 | 135 | ::: 136 | :::: 137 | 138 | \newpage 139 | 140 | ## Example of Power Calculation for Two-Sample t-test 141 | 142 | We run an AB test, or a Randomized Controlled Trial, and are now ready to perform the *two-sample t-test*, which uses the *t-statistic* \[t = \frac{\bar{x}_A - \bar{x}_B}{s},\] 143 | where 144 | 145 | - $\bar{x}_A$ is the mean computed in the first group, 146 | - $\bar{x}_B$ is the mean computed in the second group, and 147 | - $s$ is a standard deviation estimate (the estimation approach can vary depending on different assumptions; it's a nuanced discussion for another time). 148 | 149 | We will run a simulation of this scenario, where we know the ground truth, showing along the way why computing observed power makes no sense! 150 | 151 | \newpage 152 | 153 | ### Simulation Parameters 154 | 155 | 

156 | *You can safely skip these sections and head directly to the plots if you're not familiar with code or technical probability and statistics stuff.* 157 |

158 | 159 | We fix identical parameters for the two groups: there are no differences to detect with the test. 160 | 161 | - $x_A \sim \mathcal{N}(\mu_A, sd_A),\quad x_B \sim \mathcal{N}(\mu_B, sd_B)$, 162 | where 163 | - $\mu_A = \mu_B = 0$ 164 | - $s_A = s_B = 1$ 165 | - $n_A = n_B = 20$ 166 | 167 | ```{r parameters} 168 | set.seed(1) #for reproducibility 169 | n_sim <- 10000 #We will simulate a large number of experiments 170 | #normal variate simulation parameters 171 | n_A <- 20 #group A 172 | n_B <- 20 #group B 173 | mu_A <- 0 #group A's real mean we are trying to estimate 174 | mu_B <- 0 #group B' real mean we are trying to estimate 175 | sd_A <- 1 #group A's standard deviation 176 | sd_B <- 1 #group B's standard deviation 177 | ``` 178 | \newpage 179 | 180 | ### Simulation 181 | 182 | ```{r simulation} 183 | #t-test critical value, we will reject H_0 when the absolute value of the t-statistic exceeds this threshold 184 | crit <- qt(0.975, n_A + n_B -2) 185 | 186 | #simulation 187 | significant <- c() 188 | t.stat <- c() 189 | 190 | for (simulation in 1:n_sim){ 191 | x_A <- rnorm(n_A, mu_A, sd_A) #we simulate our group A 192 | x_B <- rnorm(n_B, mu_B, sd_B) #we simulate our group B 193 | diff.means <- mean(x_A) - mean(x_B) #we estimate the difference in means 194 | # print(diff.means) 195 | s <- sqrt((var(x_A)/n_A + var(x_B)/n_B)) #we estimate s 196 | 197 | t.stat[simulation] <- diff.means/s #we compute the t-statistic 198 | significant[simulation] <- abs(t.stat[simulation]) >= crit #we check if the test is significant 199 | } 200 | ``` 201 | 202 | \newpage 203 | 204 | ### Simulation Results 205 | 206 | From our 10000 simulations, there is no difference to detect between our groups. We therefore expect about $5\%$ of them to be significant, by definition of testing at the $5\%$ level. 207 | 208 | ```{r alpha, echo=FALSE} 209 | print(paste(round(sum(significant) / n_sim, 4)*100, "% of simulations are significant, when the null hypothesis is true.")) 210 | ``` 211 | ```{r cohen_d hist, echo = FALSE, out.width = "65%", out.height = "65%", fig.align = "center"} 212 | knitr::opts_chunk$set(warning = FALSE, message = FALSE) 213 | 214 | t.stat.hist <- ggplot(data.frame(t.stat, significant), aes(x=t.stat, fill = factor(1-significant))) + 215 | geom_histogram(bins=100) + 216 | theme(legend.position = "none") + 217 | geom_vline(xintercept = crit, alpha = 1/2, color = "#F8766D") + 218 | geom_vline(xintercept = -crit, alpha = 1/2, color = "#F8766D") + 219 | annotate("label", x = -4, y = 20, color = "#F8766D", alpha = 1/3, label = "significant") + 220 | annotate("label", x = 4, y = 20, color = "#F8766D", alpha = 1/3, label = "significant") + 221 | xlim(c(-5,5)) 222 | 223 | t.stat.hist 224 | ``` 225 | 226 | ```{r effect size, echo = FALSE, out.width = "50%", out.height = "50%", fig.align = "center"} 227 | #visualize results 228 | results <- data.frame( 229 | t.stat = abs(t.stat), 230 | p.value = 1-pt(abs(t.stat),n_A + n_B -2)) 231 | 232 | #gives the quantile function of 233 | #the n-1 df Student distribution 234 | effect_size_plot <- results |> 235 | ggplot(aes(x = t.stat, y = p.value)) + 236 | geom_point(alpha = 0.1) + 237 | ggtitle("Nonsignificant Results Are Those With Lower Estimated Effect Sizes... \n and the relationship is 1 to 1!") 238 | ``` 239 | 240 | \newpage 241 | 242 | ### Power and P-Values 243 | 244 | Observed power is then computed for every simulation by using the estimated t-statistics. 245 | 246 | ```{r power} 247 | #what about power? 
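# Observed (post-hoc) power below plugs the *estimated* t-statistic in as the
# noncentrality parameter (ncp), i.e. it treats the observed effect as if it
# were the true effect. This is why it carries no information beyond the p-value.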
248 | observed.power <- 1-pt(crit,n_A + n_B - 2, ncp=abs(t.stat)) 249 | ``` 250 | 251 | 252 | ```{r power plot creation, echo = FALSE} 253 | results$observed.power <- observed.power 254 | 255 | power.plot <- results |> 256 | ggplot(aes(x = observed.power, y = p.value, color = factor(p.value > 0.05))) + 257 | geom_point(size= 0.25, alpha = 1/3) + 258 | geom_hline(yintercept = 0.05, alpha = 1/2, color = "#F8766D") + 259 | annotate("label", x = 0.12, y = 0.07, color = "#00BFC4", alpha = 1/4, label = "non-significant") + 260 | annotate("label", x = 0.12, y = 0.03, color = "#F8766D", alpha = 1/4, label = "significant") + 261 | theme(legend.position = "none") + 262 | ggtitle("Nonsignificant Results Are Those With Lower Power... \nand the relationship is 1 to 1!") 263 | ``` 264 | 265 | ```{r power plot, echo = FALSE, out.width = "65%", out.height = "65%", fig.align = "center"} 266 | power.plot 267 | ``` 268 | 269 | \newpage 270 | 271 | ## The Takeaways 272 | 273 | 1. Observed power and the p-value are in a 1-to-1 relationship: observed power adds no new information. 274 | + If we have a low p-value, we have high power. 275 | + If we have a high p-value, we have low power. 276 | 2. By definition, tests with low observed power will be those for which we have not rejected $H_0$. 277 | + Using observed power when our test is not significant is simply wrong. 278 | + Power should be computed BEFORE any data analysis, not after. 279 | 3. By reporting observed power, tools like SPSS and Survey Monkey tempt users to interpret non-significant tests with "high" power as evidence for the null hypothesis. 280 | + But we simply cannot make this logically fallacious claim. 281 | 282 | In Sander Greenland's 2012 paper *Nonsignificance Plus High Power Does Not Imply Support For the Null Over the Alternative*, he reports being "asked to review pooled data from randomized trials as used in a U.S. Food and Drug Administration (FDA) alert and report" where a "defense expert statistician (a full professor of biostatistics at a major university and ASA Fellow)" made this logical mistake! 283 | 284 | **No one is immune from statistical error, so spread the word and be vigilant :)** 285 | 286 | ## Some References 287 | 288 | - Daniel Lakens has a [great blog post](https://daniellakens.blogspot.com/2014/12/observed-power-and-what-to-do-if-your.html) about this. 289 | - If you want to learn how to use power correctly to calculate required sample sizes, I recommend [this book](https://www.amazon.ca/Design-Analysis-Experiments-Douglas-Montgomery/dp/1118146921) by Douglas Montgomery. I will eventually write more about this, but in the meantime don't hesitate to [reach out here](https://justinbelair.ca/contact/) or by [Linkedin DM](https://www.linkedin.com/in/justinbelair/) if you need help with this. 
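To make the correct, prospective use of power concrete, here is a minimal sketch using base R's `power.t.test()`. The assumed effect size $\delta = 0.5$ (in standard-deviation units) is for illustration only; in practice it would come from the literature or from a minimum significant difference.

```{r a priori power}
# A priori power analysis, done BEFORE collecting any data: solve for the
# per-group sample size needed to detect an assumed effect of 0.5 SDs
# with 80% power at the 5% significance level.
power.t.test(delta = 0.5, sd = 1, sig.level = 0.05, power = 0.80,
             type = "two.sample", alternative = "two.sided")
# n comes out to roughly 64 per group.
```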
290 | -------------------------------------------------------------------------------- /biostatistics.ca/collider_bias/collider_bias.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | 9 | set.seed(1) 10 | 11 | library(ggplot2) 12 | library(dplyr) 13 | library(ggdag) 14 | 15 | knitr::opts_chunk$set( 16 | warning = FALSE, # Suppress warnings 17 | message = FALSE, # Suppress messages 18 | echo = TRUE, # Show code (optional) 19 | fig.align = 'center' # Center plots 20 | ) 21 | ``` 22 | 23 | [Find the RMarkdown Notebook on Github and Run the Code Yourself!](https://github.com/JB-Statistical-Consulting/biostatistics/tree/main/biostatistics.ca) 24 | 25 | ## Introduction - What is Collider Bias? 26 | 27 | Collider bias occurs when we condition on (or select based on) a variable that is influenced by both the exposure and outcome of interest. This seemingly innocent action can create spurious associations between variables that are actually independent. Let's explore this through some concrete examples. 28 | 29 | ### Example: College Admissions 30 | Consider college admissions where students can be admitted based on either high intellectual ability or high athletic ability. Let's simulate some data where these abilities are actually independent in the population. Next, we create an indicator for `admission` based on whether a student has high intellectual ability or high athletic ability. We then plot the data to see how the selection process affects the relationship between intellectual and athletic ability. 31 | 32 | ```{r} 33 | selection.bias <- data.frame("intellectual.ability" = rnorm(500, 0, 1), 34 | "athletic.ability" = rnorm(500, 0, 1) 35 | ) %>% 36 | mutate(admission = (intellectual.ability > 1) | (athletic.ability > 1.5) 37 | ) 38 | ``` 39 | 40 | If you want to access the code used to create the plot below, the RMarkdown notebook for this blog post is [available for free on Github](https://github.com/JB-Statistical-Consulting/biostatistics/tree/main/biostatistics.ca). 41 | 42 | ```{r, echo = FALSE} 43 | select.bias.plot <- selection.bias %>% 44 | ggplot(aes(x = intellectual.ability, y = athletic.ability)) + 45 | geom_point(aes(colour = admission)) + 46 | scale_color_manual(values = c("FALSE" = "#989898", "TRUE" = "#000000"))+ 47 | geom_smooth(method = "lm", se = FALSE, colour = "#000000") + 48 | geom_label(x = 2, y = 2, label = "Admitted Students", colour = "#000000", label.size = 0.5) + 49 | geom_label(x = -2, y = -2, label = "Rejected Students", colour = "#989898", label.size = 0.5) + 50 | theme(legend.position = "none", 51 | axis.ticks = element_blank(), 52 | axis.text = element_blank()) + 53 | xlab("Intellectual Ability") + 54 | ylab("Athletic Ability") + 55 | theme(plot.title = element_text(size = 20), 56 | axis.title.x = element_text(size = 20), 57 | axis.title.y = element_text(size = 20)) 58 | 59 | print(select.bias.plot) 60 | ``` 61 | 62 | From this plot, we see that there is basically no relationship between intellectual and athletic ability in the population. Indeed, when fitting a linear regression model, we get a slightly negative slope. 63 | 64 | ```{r} 65 | lm(athletic.ability ~ intellectual.ability, data = selection.bias) %>% 66 | summary() 67 | ``` 68 | Yet, the coefficient is not significantly different from 0. We know (because we generated the data) that the real value is 0. 
69 | 70 | However, when we condition on `admission`, we see a strong negative relationship between intellectual and athletic ability. 71 | 72 | 73 | ```{r, echo = FALSE} 74 | selection.bias.selected.plot <- selection.bias %>% 75 | filter(admission) %>% 76 | ggplot(aes(x = intellectual.ability, y = athletic.ability, colour = admission)) + 77 | geom_point() + 78 | geom_smooth(method = "lm", se = FALSE) + 79 | scale_color_manual(values = c("TRUE" = "#000000"))+ 80 | geom_smooth(method = "lm", se = FALSE) + 81 | geom_label(x = 2, y = 2, label = "Admitted Students", colour = "#000000", label.size = 0.5) + 82 | theme(legend.position = "none", 83 | axis.ticks = element_blank(), 84 | axis.text = element_blank(), 85 | axis.title.x = element_blank(), 86 | axis.title.y = element_blank(), 87 | plot.title = element_text(size = 20) 88 | ) 89 | 90 | print(selection.bias.selected.plot) 91 | ``` 92 | This is confirmed by estimating the coefficient of the linear regression model when conditioning on `admission`. 93 | 94 | ```{r} 95 | lm(athletic.ability ~ intellectual.ability, data = selection.bias %>% filter(admission)) %>% 96 | summary() 97 | ``` 98 | We obtain a highly significant negative coefficient. While this is visually intuitive in this specific example, it is not always so clear in real-world data. This is why it is important to learn to draw meaningful DAGs to represent background knowledge. In this specific example, it would look like this. 99 | 100 | ```{r, echo = FALSE} 101 | theme_set(theme_dag()) 102 | 103 | collider_dag <- collider_triangle(x_y_associated = TRUE, 104 | x = "Intellectual Ability", 105 | y = "Athletic Ability", 106 | m = "Admitted (selected) to college" 107 | ) %>% 108 | tidy_dagitty() %>% 109 | mutate(linetype = ifelse(to == "y", "dashed", "solid"), 110 | direction = ifelse(to == "y", "<->", "->")) %>% 111 | ggplot(aes(x = x, y = y, xend = xend, yend = yend)) + 112 | geom_dag_point() + 113 | geom_dag_label(aes(label = label), nudge_x = 0.01, nudge_y = -0.05, show.legend = FALSE) + 114 | geom_dag_edges(aes(edge_linetype = linetype), curvature = 0, 115 | arrow_directed = grid::arrow(length = grid::unit(12, "pt"), type = "closed"), 116 | arrow_bidirected = grid::arrow(length = grid::unit(12, "pt"), ends = "both", type = "closed"), 117 | show.legend = FALSE) + 118 | geom_text(x=1, y=1.05, label = "Spurious Correlation ?") 119 | 120 | print(collider_dag) 121 | ``` 122 | 123 | The theory of graphical causal models developed by Judea Pearl and others tells us that when there is a path between two variables of the form `athletic_ability -> admission <- intellectual_ability`, the variable `admission` is a *collider*. The concept of *d-separation* tells us that conditioning on a collider induces spurious correlation and biases the (causal) estimate we are seeking to establish[^1]. 124 | 125 | It is not always obvious that our analysis is 'conditional' on a given variable when the conditioning happens through a selection mechanism. Oftentimes, we simply work with the dataset at hand and do not immediately know the selection process that generated it. In this example, we implicitly condition on the admission variable if we simply select a sample from a university. I hope it is obvious that this form of bias can be lurking in many real-world problems, and it is one of the reasons why any experienced statistician always advises studying the data-generating process in depth. 
126 | 127 | By this, it is meant that before doing any sort of analysis, a deep investigation of the origin of the data is warranted. This means understanding how and why the data was collected, who was responsible for collecting it, whether any processing was done before it was sent to the statistician, whether a data management protocol was specified in advance, whether there is specific domain knowledge the statistician should be aware of, etc. In many cases, data is handled by research assistants, is generated by software, or is collected by a third party. In these cases, the statistician should be in open discussion with anybody responsible for the data and any domain expert who can help gain insight into the data. 128 | 129 | ## Low Birth Weight Paradox 130 | 131 | An important real-world example of collider bias that baffled scientists for a long time is the so-called "Low Birth Weight Paradox". Surprisingly, low birth-weight babies born to smoking mothers have a lower infant mortality rate than low birth-weight babies of non-smoking mothers. This is counterintuitive, because smoking is known to be an important risk factor for infant mortality, and infant mortality is thought to be partly induced by low birth weight. A minimal simulated sketch of this selection mechanism is given at the end of this post. 132 | 133 | In my upcoming book, [Causal Inference in Statistics, with Exercises, Practice Projects, and R Code Notebooks](https://justinbelair.ca/causal-inference-in-statistics-book?utm_source=biostatistics&utm_medium=blog&utm_campaign=collider_bias), I discuss this paradox in detail using a real dataset. It forms the case study of Chapter 4 on Observational Studies, where I give the reader the dataset and a code notebook to walk through the analysis. Visit the book page to learn more and download the first chapter for free. 134 | 135 | ## Why Does This Matter? 136 | Collider bias is not just a theoretical concern. It appears in many real-world scenarios: 137 | 138 | - Hospital-based studies (selecting on being hospitalized) 139 | - Social media analysis (selecting on platform usage) 140 | - Survey response bias (selecting on willingness to respond) 141 | - Scientific publication bias (selecting on significant results) 142 | 143 | Understanding collider bias helps researchers avoid drawing incorrect conclusions when analyzing data that has been subject to selection processes. 144 | 145 | ## Key Takeaways 146 | 147 | - Selection can create associations that don't exist in the full population 148 | - When analyzing data, we must be careful about conditioning on colliders 149 | - DAGs help us identify potential collider bias in our analyses 150 | 151 | ## Conclusion 152 | 153 | Collider bias is a common and often overlooked source of bias in observational studies, especially when no explicit adjustment was performed but the sample was nonetheless distorted by a selection mechanism. Understanding how collider bias arises in real-world data helps researchers avoid drawing incorrect conclusions, and drawing directed acyclic graphs (DAGs) of the relationships between variables makes it possible to identify potential sources of collider bias and adjust the analysis accordingly. Being aware of this possibility is a prerequisite for accurate and reliable analyses. 
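As promised, here is a minimal simulation sketch of the selection mechanism behind the low birth weight paradox. All numbers below are made up for illustration; they are not estimates from real data (the real-data analysis is the subject of the book chapter mentioned above).

```{r low birth weight sketch}
set.seed(1)
n <- 100000
smoke <- rbinom(n, 1, 0.3) # maternal smoking
defect <- rbinom(n, 1, 0.01) # rare birth defect, independent of smoking
# Low birth weight (the collider) is caused by both smoking and the defect
lbw <- rbinom(n, 1, plogis(-2 + 1.5*smoke + 4*defect))
# Mortality is driven strongly by the defect and mildly by smoking
death <- rbinom(n, 1, plogis(-5 + 0.2*smoke + 4*defect))

# Conditioning on the collider: among low birth-weight babies only,
# babies of smokers show a LOWER mortality rate (the "paradox")...
tapply(death[lbw == 1], smoke[lbw == 1], mean)

# ...even though in the whole population smoking raises mortality.
tapply(death, smoke, mean)
```

Among low birth-weight babies, smoking predicts a lower chance of the defect, the other (far deadlier) cause of low birth weight; this is exactly the spurious association created by selecting on a collider.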
154 | 155 | If you want to receive monthly insights about Causal Inference in Statistics, please consider [subscribing to my newsletter](https://causal-inference-in-statistics.beehiiv.com/subscribe?utm_source=biostatistics&utm_medium=blog&utm_campaign=collider_bias). You will receive updates about my upcoming book, blog posts, and other exclusive resources to help you learn more about causal inference in statistics. 156 | 157 | Join the stats nerds🤓! 158 | 159 | [^1]: I discuss this theory in detail with examples, exercises, data, and code in my upcoming book [Causal Inference in Statistics, with Exercises, Practice Projects, and R Code Notebooks](https://justinbelair.ca/causal-inference-in-statistics-book?utm_source=biostatistics&utm_medium=blog&utm_campaign=collider_bias). -------------------------------------------------------------------------------- /biostatistics.ca/common_DAG_structures/common_DAG_structures_blog.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | 9 | set.seed(1) 10 | 11 | library(ggplot2) 12 | library(dplyr) 13 | library(ggdag) 14 | 15 | knitr::opts_chunk$set( 16 | warning = FALSE, # Suppress warnings 17 | message = FALSE, # Suppress messages 18 | echo = FALSE, # Hide code 19 | fig.align = 'center' # Center plots 20 | ) 21 | 22 | theme_set(theme_dag()) 23 | ``` 24 | 25 | ## Introduction 26 | 27 | Directed Acyclic Graphs (DAGs) are powerful tools for visualizing and understanding causal relationships. In this blog post, we'll explore common DAG structures that frequently appear in causal inference problems, simulate data according to these structures, and demonstrate how different analytical approaches can lead to correct or incorrect causal estimates. If you want to begin your journey of learning causal inference and don't know where to start, visit our [Causal Inference Guide: Books, Courses, and More](https://www.biostatistics.ca/causal-inference-guide-books-courses-and-more/?utm_source=biostatistics&utm_medium=blog&utm_campaign=common_DAG_structures_intro). 28 | 29 | If you're interested in obtaining the R code for this blog post, consider purchasing my upcoming book, [Causal Inference in Statistics, with Exercises, Practice Projects, and R Code Notebooks](https://justinbelair.ca/causal-inference-in-statistics-book?utm_source=biostatistics&utm_medium=blog&utm_campaign=common_DAG_structures_intro). Each chapter contains a complete case study with an extensive code notebook that you can use to grasp the principles using code. There are also exercises and practice projects to help you solidify your understanding of the material. 30 | 31 | Let's jump in! 32 | 33 | ## Confounding 34 | 35 | One of the most basic causal structures is confounding, where a third variable affects both the treatment and the outcome. Here, $W$ is the treatment, $Y$ is the outcome, and $Z$ is a confounder that affects both $W$ and $Y$. 
36 | 
37 | ```{r}
38 | confounder_dag <- dagify(Y ~ W + Z,
39 |                          W ~ Z,
40 |                          exposure = "W",
41 |                          outcome = "Y",
42 |                          coords = list(x = c(Y = 1, W = -1, Z = 0)/2,
43 |                                        y = c(Y = 0, W = 0, Z = 1)/2)
44 | )
45 | 
46 | confounder_dag <- ggdag(confounder_dag, text = TRUE, node_size = 16*1.5, text_size = 3.88*1.5) +
47 |   geom_dag_edges_link(arrow = grid::arrow(length = grid::unit(15, "pt"), type = "closed"))
48 | 
49 | print(confounder_dag)
50 | ```
51 | 
52 | Let's simulate 200 data points that follow this structure and see what happens when we analyze it. The true treatment effect will be set at a value of 5. Since we simulate data that we know has a true treatment effect of 5, we will be able to assess the bias in our methods, i.e. the difference between our estimates and the ground-truth value of 5.
53 | 
54 | Here is a snapshot of what the dataset looks like.
55 | 
56 | ```{r, echo = FALSE}
57 | n <- 200 # number of observations
58 | treatment_effect <- 5 # true treatment effect
59 | 
60 | # Generate confounder
61 | Z <- rnorm(n, 5, 1) + rnorm(n, 0, 1)
62 | 
63 | # Treatment depends on confounder
64 | W <- rbinom(n, 1, plogis(Z-5))
65 | 
66 | # Outcome depends on treatment and confounder
67 | Y <- rnorm(n, 2 + treatment_effect*W + 20*Z, 3) + rnorm(n, 0, 1)
68 | 
69 | simulated_data_confounder <- data.frame(W, Z, Y)
70 | ```
71 | 
72 | 
73 | ```{r}
74 | head(simulated_data_confounder)
75 | ```
76 | Now, let's fit two different models to this data and compare the results.
77 | 
78 | - **Model 1**: We fit a simple linear regression model of outcome on treatment, without adjusting for the confounder: \[Y \sim W\]
79 | - **Model 2**: We fit a linear regression model of outcome on treatment, adjusting for the confounder by adding it as a covariate: \[Y \sim W + Z\]
80 | 
81 | ```{r}
82 | Y_W <- simulated_data_confounder %>%
83 |   lm(Y ~ W, data = .)
84 | 
85 | # correct model, adjusting for the confounder
86 | Y_W_Z <- simulated_data_confounder %>%
87 |   lm(Y ~ W + Z, data = .)
88 | 
89 | Y_W_coef <- Y_W %>%
90 |   coef() %>%
91 |   c(., NA)
92 | 
93 | Y_W_Z_coef <- Y_W_Z %>%
94 |   coef() %>%
95 |   c(.)
96 | ```
97 | 
98 | 
99 | ```{r}
100 | coef_data <- as.data.frame(rbind(Y_W_coef, Y_W_Z_coef))
101 | 
102 | names(coef_data) <- c("Intercept", "W", "Z")
103 | rownames(coef_data) <- c("Y ~ W", "Y ~ W + Z")
104 | 
105 | coef_data
106 | ```
107 | 
108 | We see that when we correctly specify the model, the $W$ coefficient is close to the true treatment effect of 5. It is not exactly 5 due to sampling variability. However, when we fail to adjust for the confounder, we get a biased estimate.
109 | 
110 | It is not possible to determine beforehand the direction and magnitude of the bias based solely on the DAG. However, the DAG structure can help us identify the presence of bias and guide us in the right direction. Further structural knowledge about the relationship between the confounder and the treatment/outcome variables can help us assess the magnitude and direction of the bias if we were to omit adjusting for a confounder, e.g. if we did not measure it.
111 | 
112 | ## Collider Bias
113 | 
114 | Another important structure is the collider, where a variable is influenced by both the treatment and the outcome. Formally, the definition of a collider can be a bit tricky[^1]. Informally, a collider is a variable that has two arrows pointing into it (see illustration below, where $Z$ is now a collider).
115 | 
116 | 
117 | ```{r}
118 | collider_DAG <- dagify(Y ~ W,
119 |                        Z ~ W + Y,
120 |                        exposure = "W",
121 |                        outcome = "Y",
122 |                        coords = list(x = c(Y = 1, Z = 0, W = -1)/2,
123 |                                      y = c(Y = 0, Z = -1, W = 0)/2)
124 | )
125 | 
126 | collider_DAG <- ggdag(collider_DAG, text = TRUE, node_size = 16*1.5, text_size = 3.88*1.5) +
127 |   geom_dag_edges_link(arrow = grid::arrow(length = grid::unit(15, "pt"), type = "closed"))
128 | 
129 | print(collider_DAG)
130 | ```
131 | Different selection bias mechanisms, such as differential loss-to-follow-up, convenience sampling, etc., can all be represented as bias induced by conditioning on a collider (or one of its descendants) in a DAG[^2]. One such example, very common and not always easy to identify, arises when a sample is selected based on some of its characteristics. For example, when assessing the correlation of athletic ability and intellectual ability, selecting a sample of students from highly selective universities could induce a spurious correlation, leading to the false belief that intellectual ability leads students to achieve higher athletic ability, or vice versa. See my [previous blog post](https://www.biostatistics.ca/selection-bias-a-causal-inference-perspective-with-downloadable-code-notebook/) on selection bias for a detailed illustration of this example.
132 | 
133 | Let's simulate data and see what happens when we condition on a collider. The data looks like this.
134 | 
135 | ```{r}
136 | n <- 200 # number of observations to simulate
137 | treatment_effect <- 5 # true treatment effect
138 | 
139 | W <- rbinom(n, 1, 0.5) # binary treatment, independent of collider
140 | Y <- rnorm(n, 2 + treatment_effect*W, 3) + rnorm(n, 0, 1) # outcome model + noise
141 | 
142 | Z <- rnorm(n, 20*W + 20*Y, 3) + rnorm(n, 0, 1) # collider + noise
143 | ```
144 | 
145 | 
146 | ```{r}
147 | simulated_data_collider <- data.frame(W, Z, Y)
148 | 
149 | head(simulated_data_collider)
150 | ```
151 | 
152 | We then fit 2 models:
153 | 
154 | - **Model 1**: $Y \sim W$, correctly ignoring the collider
155 | - **Model 2**: $Y \sim W + Z$, erroneously adjusting for the collider
156 | 
157 | ```{r}
158 | # unadjusted model
159 | Y_W <- simulated_data_collider %>%
160 |   lm(Y ~ W, data = .)
161 | 
162 | # erroneously adjusted model
163 | Y_W_Z <- simulated_data_collider %>%
164 |   lm(Y ~ W + Z, data = .)
165 | 
166 | Y_W_coef <- Y_W %>%
167 |   coef() %>%
168 |   c(., NA)
169 | 
170 | Y_W_Z_coef <- Y_W_Z %>%
171 |   coef() %>%
172 |   c(.)
173 | 
174 | coef_data <- as.data.frame(rbind(Y_W_coef, Y_W_Z_coef))
175 | 
176 | names(coef_data) <- c("Intercept", "W", "Z")
177 | rownames(coef_data) <- c("Y ~ W", "Y ~ W + Z")
178 | 
179 | coef_data
180 | ```
181 | 
182 | Looking at these results, we see that the effect estimate for the model that does not adjust for $Z$ is close to 5, as expected, whereas the model that adjusts for $Z$ gives a biased estimate. This is because conditioning on a collider can introduce bias in our treatment effect estimate. This can be counterintuitive: controlling for more variables doesn't always improve your analysis!
183 | 
184 | ## Mediators
185 | 
186 | A mediator is a variable that lies on the causal pathway between exposure and outcome, such as $M$ in the DAG below.
187 | 
188 | ```{r}
189 | mediator_DAG <- dagify(Y ~ W + M,
190 |                        M ~ W,
191 |                        exposure = "W",
192 |                        outcome = "Y",
193 |                        coords = list(x = c(Y = 1, M = 0, W = -1)/2,
194 |                                      y = c(Y = 0, M = 1, W = 0)/2)
195 | )
196 | 
197 | mediator_DAG <- ggdag(mediator_DAG, text = TRUE, node_size = 16*1.5, text_size = 3.88*1.5) +
198 |   geom_dag_edges_link(arrow = grid::arrow(length = grid::unit(15, "pt"), type = "closed"))
199 | 
200 | print(mediator_DAG)
201 | ```
202 | When working with mediators, we can decompose the total effect into direct and indirect effects. When the model is linear (as we have assumed in this example), these effects work additively along distinct paths. That is, \[\text{Total Effect} = \text{Direct Effect} + \text{Indirect Effect}.\]
203 | 
204 | In this example, the direct effect of $W$ on $Y$ is 5, the effect of $W$ on $M$ is 2, and the effect of $M$ on $Y$ is 3. The indirect effect works multiplicatively along the path $W \rightarrow M \rightarrow Y$[^3]. Thus, the total effect is given by
205 | \begin{align*}
206 | \text{Total Effect} &= \text{Direct Effect} + \text{Indirect Effect} \\
207 | &= 5 + 2 \times 3 \\
208 | &= 11.
209 | \end{align*}
210 | 
211 | The data looks like this.
212 | 
213 | ```{r simulating_mediator_DAG}
214 | n <- 200 # number of observations to simulate
215 | treatment_effect <- 5 # true (direct) treatment effect
216 | 
217 | W <- rbinom(n, 1, 0.5) # binary treatment
218 | M <- rnorm(n, 2*W, 1) + rnorm(n, 0, 1) # treatment effect on mediator + noise
219 | Y <- rnorm(n, 2 + treatment_effect*W + 3*M, 3) + rnorm(n, 0, 1) # outcome model + noise
220 | ```
221 | 
222 | 
223 | ```{r}
224 | simulated_data_mediator <- data.frame(W, M, Y)
225 | head(simulated_data_mediator)
226 | ```
227 | 
228 | We then fit 3 models:
229 | 
230 | - **Model 1**: $Y \sim W$, ignoring the mediation component
231 | - **Model 2**: $Y \sim W + M$, incorporating an adjustment for the mediator
232 | - **Model 3**: $M \sim W$, the mediation model, where we model the relationship between the mediator and the treatment indicator
233 | 
234 | ```{r}
235 | # unadjusted model
236 | Y_W <- simulated_data_mediator %>%
237 |   lm(Y ~ W, data = .)
238 | 
239 | # model for direct effect
240 | Y_W_M <- simulated_data_mediator %>%
241 |   lm(Y ~ W + M, data = .)
242 | 
243 | # model for mediator
244 | M_W <- simulated_data_mediator %>%
245 |   lm(M ~ W, data = .)
246 | 
247 | Y_W_coef <- Y_W %>%
248 |   coef() %>%
249 |   c(., NA)
250 | 
251 | Y_W_M_coef <- Y_W_M %>%
252 |   coef() %>%
253 |   c(.)
254 | 
255 | M_W_coef <- M_W %>%
256 |   coef() %>%
257 |   c(., NA)
258 | 
259 | coef_data <- as.data.frame(rbind(Y_W_coef, Y_W_M_coef, M_W_coef))
260 | 
261 | names(coef_data) <- c("Intercept", "W", "M")
262 | rownames(coef_data) <- c("Y ~ W", "Y ~ W + M", "M ~ W")
263 | coef_data
264 | ```
265 | 
266 | We see that when we regress $Y$ on $W$, we get an estimate close to 11, as expected. The direct effect of $W$ on $Y$ can be obtained via the regression adjusted for $M$, which *blocks* the effect that passes through the mediator; we obtain an estimate close to 5, as expected. The effect of $W$ on $M$ is close to 2, as given by the coefficient of $W$ in the $M \sim W$ regression. The effect of $M$ on $Y$ is close to 3, as given by the $M$ coefficient in the $Y \sim W + M$ regression. Multiplying the latter two effects, we obtain an indirect effect close to 6, as expected; see the quick check below.
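As a quick numerical check, this decomposition can be read directly off the fitted model objects above; the snippet below simply combines their coefficients (the product and sum just described, with no new assumptions).

```{r}
direct   <- coef(Y_W_M)["W"]                  # W -> Y, holding M fixed
indirect <- coef(M_W)["W"] * coef(Y_W_M)["M"] # (W -> M) x (M -> Y)
total    <- coef(Y_W)["W"]                    # W -> Y, ignoring M

c(direct = unname(direct),
  indirect = unname(indirect),
  direct_plus_indirect = unname(direct + indirect),
  total = unname(total))
```

The `direct + indirect` sum matches the `total` coefficient, exactly as the linear path-analysis rules predict.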
267 | 
268 | ## Conclusion
269 | 
270 | Understanding these common DAG structures is crucial for accurate causal inference:
271 | 
272 | - **Confounding**: Requires adjustment for common causes of treatment and outcome
273 | - **Collider bias**: Avoid adjusting for variables affected by both treatment and outcome
274 | - **Mediation**: Be clear about whether you're estimating direct, indirect, or total effects. In cases with linear models, path analysis rules can be used to quickly decompose the total effect into direct and indirect effects
275 | 
276 | DAGs provide a powerful visual language for communicating causal assumptions and guiding proper statistical analysis. By understanding these common structures, researchers can better design studies, analyze data, and interpret results.
277 | 
278 | If you want to receive monthly insights about Causal Inference in Statistics, please consider [subscribing to my newsletter](https://causal-inference-in-statistics.beehiiv.com/subscribe?utm_source=biostatistics&utm_medium=blog&utm_campaign=common_DAG_structures). You will receive updates about my upcoming book, blog posts, and other exclusive resources to help you learn more about causal inference in statistics.
279 | 
280 | Join the stats nerds🤓!
281 | 
282 | [^1]: See [Pearl, J. (2009). Causality: Models, Reasoning, and Inference. Cambridge University Press](https://amzn.to/3Nq0RYY) for a formalization of a collider. The previous link is an affiliate link and we may earn a small commission on a purchase. I also discuss this idea in detail with examples, exercises, data, and code in my upcoming book [Causal Inference in Statistics, with Exercises, Practice Projects, and R Code Notebooks](https://justinbelair.ca/causal-inference-in-statistics-book?utm_source=biostatistics&utm_medium=blog&utm_campaign=collider_bias_footnote)
283 | 
284 | [^2]: See Hernán MA, Hernández-Díaz S, Robins JM (2004). A structural approach to selection bias. Epidemiology, 15(5), 615-625.
285 | 
286 | [^3]: This technique is known as Path Analysis. I discuss it in detail in Part II of my upcoming book [Causal Inference in Statistics, with Exercises, Practice Projects, and R Code Notebooks](https://justinbelair.ca/causal-inference-in-statistics-book?utm_source=biostatistics&utm_medium=blog&utm_campaign=path_analysis_footnote).
--------------------------------------------------------------------------------
/biostatistics.ca/influential_observations/influential_observations.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: How to Handle Influential Observations Using R
3 | date: \today
4 | author:
5 | output:
6 |   html_document:
7 |     css: "C:/Users/belai/Documents/JBConsulting/GitHub/biostatistics/biostatistics.ca/theme.css"
8 |     #toc: TRUE
9 |     #toc_float : TRUE
10 |     #toc_depth : 2
11 | header-includes:
12 |   - \usepackage{amsmath, amssymb, amsthm, mathtools}
13 |   - \newcommand{\training}{\mathcal{X} \times \mathcal{Y}}
14 |   - \newcommand{\defeq}{:=}
15 | ---
16 | 
17 | By Justin Bélair © | Biostatistician at [JB Statistical Consulting](https://www.justinbelair.ca)
18 | 
19 | [Find the RMarkdown Notebook on Github and Run the Code Yourself!](https://github.com/JB-Statistical-Consulting/biostatistics/tree/main/biostatistics.ca)
20 | 
21 | ![](C:/Users/belai/Documents/JBConsulting/GitHub/Content/Justin-35(1).jpg){height=50%}
22 | 
23 | \newpage
24 | 
25 | # Influential Observations vs. Outliers
26 | 
27 | ```{r, echo = FALSE, include = FALSE}
28 | 
29 | library(dplyr)
30 | library(ggplot2)
31 | 
32 | ```
33 | 
34 | Much has been said about handling outliers and influential observations, but what exactly do these terms mean, and how can we go about dealing with such issues in a pragmatic way?
35 | 
36 | We'll start by simulating data that follows a simple linear model plus noise, and then add observations that deviate from the model in different ways to determine the impact of such deviations on our statistical estimates.
37 | 
38 | ## An Influential Observation
39 | 
40 | ```{r, warning = FALSE}
41 | set.seed(1) # for reproducibility
42 | 
43 | n <- 39 # number of observations
44 | 
45 | true.slope.coefficient <- 1
46 | 
47 | x <- rnorm(n, 1, 1) # x values
48 | 
49 | y <- true.slope.coefficient*x + rnorm(n, 0, 1) # y = x + epsilon
50 | 
51 | data <- data.frame(x = x, y = y) # data generated according to the model
52 | 
53 | influential <- c(6,3) # a value that doesn't follow the general pattern
54 | 
55 | data.influential <- rbind(data, influential) # we sneak the influential observation in the data
56 | 
57 | data.influential %>%
58 |   ggplot(aes(x = x, y = y)) +
59 |   geom_point() +
60 |   geom_smooth(method="lm", se = FALSE) +
61 |   geom_smooth(aes(x=x, y=y), data, method = "lm", se = FALSE, color = "red")
62 | 
63 | ```
64 | 
65 | Here, the red line is the "true" unbiased OLS regression slope, whereas the blue line is influenced by the data point we added that doesn't follow the main data pattern.
66 | 
67 | ### Residuals
68 | 
69 | Typically, visual checks of model residuals would help in finding outliers.
70 | 
71 | ```{r}
72 | lm.influential <- lm(y ~ x, data=data.influential)
73 | 
74 | lm.influential.fitted <- predict(lm.influential)
75 | lm.influential.residuals <- residuals(lm.influential)
76 | 
77 | residuals.influential.df <- data.frame(fitted = lm.influential.fitted, residuals = lm.influential.residuals)
78 | 
79 | # We look at residuals vs fitted
80 | 
81 | residuals.influential.df %>%
82 |   ggplot(aes(x=fitted, y = residuals)) +
83 |   geom_point()
84 | 
85 | # We look at QQ plot of residuals against standard gaussian
86 | 
87 | residuals.influential.df %>%
88 |   ggplot(aes(sample = residuals)) +
89 |   geom_abline(intercept = 0, slope = 1) +
90 |   stat_qq()
91 | 
92 | ```
93 | 
94 | Here, the typical visual diagnostics do not reveal any issues with the residuals, except that one data point does not follow the pattern of fitted values of the data cloud - this is our first clue that something has gone awry!
95 | 
96 | ## Using `stats::influence.measures` in `R`
97 | 
98 | It is apparent from the plot of the blue regression line superimposed on the data cloud that the value at $x=6$ does not conform to the pattern predicted by the main group of data points. Indeed, the departure is so egregious as to completely undermine the validity of the estimated linear trend.
99 | 
100 | Let's first compute the linear model without the influential data point and look at its residual diagnostics plots.
101 | 
102 | ```{r}
103 | 
104 | lm.wo.influential <- lm(y ~ x, data=data)
105 | 
106 | lm.wo.influential.fitted <- predict(lm.wo.influential)
107 | lm.wo.influential.residuals <- residuals(lm.wo.influential)
108 | 
109 | residuals.wo.influential.df <- data.frame(fitted = lm.wo.influential.fitted, residuals = lm.wo.influential.residuals)
110 | 
111 | # We look at residuals vs fitted
112 | 
113 | residuals.wo.influential.df %>%
114 |   ggplot(aes(x=fitted, y = residuals)) +
115 |   geom_point()
116 | 
117 | # We look at QQ plot of residuals against standard gaussian
118 | 
119 | residuals.wo.influential.df %>%
120 |   ggplot(aes(sample = residuals)) +
121 |   geom_abline(intercept = 0, slope = 1) +
122 |   stat_qq()
123 | ```
124 | 
125 | There is nothing notable in the regression diagnostics.
126 | 
127 | To properly see the difference made by removing the influential observation, we can look more closely at the model coefficients with and without the influential observation.
128 | 
129 | 
130 | ```{r}
131 | coefficients(lm.influential)
132 | 
133 | coefficients(lm.wo.influential)
134 | ```
135 | 
136 | Indeed, looking at the model outputs with and without the so-called influential observation, we see that the estimated model coefficients are discrepant. These sorts of changes in model estimates attributable to one single data point can be measured using different influence measures. The default `stats::influence.measures` in `R` computes a handful of such useful measures.
137 | 
138 | ### Difference in Betas and Difference in Fits
139 | 
140 | The change in a fitted model parameter when we remove a data point from the dataset is referred to as DFBETA. For each beta coefficient of a given model, the DFBETA associated with data point $i$ is simply
141 | 
142 | \[\text{DFBETA}_i = \hat{\beta} - \hat{\beta}_{(-i)},\]
143 | 
144 | where $\hat{\beta}$ is the standard estimated coefficient and $\hat{\beta}_{(-i)}$ is the coefficient estimated from exactly the same model, except that the $i$-th data point is removed. For a model with $p$ parameters fitted to $n$ data points, there are thus $n \times p$ DFBETA measures available.
145 | 
146 | The `influence.measures` function returns a list with 2 important elements:
147 | 
148 | 1. `infmat` contains a matrix of various influence measures computed on every single data point, including the DFBETAs defined above.
149 | 
150 | 2. `is.inf` contains a matrix of logical values determining if, according to a given measure, a data point is deemed influential. Obviously, the cutoff at which a point is deemed influential according to a given measure is somewhat arbitrary.
151 | 
152 | Let's take a look at our data.
153 | 
154 | ```{r}
155 | # Without the influential observation
156 | influence.measures.wo.influential <- influence.measures(lm.wo.influential)
157 | 
158 | influence.measures.wo.influential$is.inf
159 | ```
160 | 
161 | We see that in the data without the influential observation, most measures consider the data points to not be influential, as expected. There are a few `TRUE` values in the matrix, which shows that even with data that perfectly agree with the assumptions needed for OLS regression, the influence measures are not perfect - this is a case of false positives.
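If you want the numbers behind these logical flags, they live in the `infmat` element. Here is a small sketch reusing the object computed above (the DFBETA column names follow the model's terms, with `dfb.1_` for the intercept and `dfb.x` for the slope):

```{r}
inf.mat <- influence.measures.wo.influential$infmat

# Columns include dfb.1_, dfb.x, dffit, cov.r, cook.d, and hat
head(inf.mat)

# Data points ranked by the absolute size of their slope DFBETA
head(inf.mat[order(abs(inf.mat[, "dfb.x"]), decreasing = TRUE), ])
```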
162 | 
163 | ```{r}
164 | # With the influential observation
165 | influence.measures.influential <- influence.measures(lm.influential)
166 | 
167 | influence.measures.influential$is.inf
168 | 
169 | ```
170 | 
171 | Here, we see that the last data point is flagged by multiple measures as being influential - which agrees with what we saw by looking at the plot. Indeed, when comparing the blue regression line with the red regression line above, we already had an idea that the estimated model was highly sensitive to the influential data point and that the red regression line seemed to lie closer to the data - here, we know this is the right model, since we simulated the data from it.
172 | 
173 | ## DFFITS
174 | 
175 | Going back to our influence measure matrix, we notice that the data point is also considered influential according to the DFFITS measure, which is simply a Studentized measure of the difference between model predictions with and without the data point:
176 | 
177 | \[\text{DFFITS}_i = \frac{\hat{y}_i - \hat{y}_{(-i)}}{s},\]
178 | 
179 | where $s$ is an appropriate Studentization term, which we will discuss in a later, more advanced article. Indeed, looking at the regression lines above with and without the influential data point clearly shows that the predicted $y$ value at $x=6$ is highly sensitive to the inclusion of the influential data point in the model fit:
180 | 
181 | \[ \hat{y}_{40} = \hat{\beta}_0 + \hat{\beta}_1 \times 6 = 0.256 + 0.851 \times 6 = 5.361 \]
182 | 
183 | \[ \hat{y}_{(-40)} = \hat{\beta}_{0(-40)} + \hat{\beta}_{1(-40)} \times 6 = -0.098 + 1.236 \times 6 = 7.319 \]
184 | 
185 | \[ \hat{y}_{40} - \hat{y}_{(-40)} = -1.957 \]
186 | 
187 | This value would then be Studentized to determine if it is large according to a given measure of standard deviation (again, we will get back to this in a later, more advanced article).
188 | 
189 | ## An Outlier
190 | 
191 | ```{r}
192 | outlier <- c(1,5) # a value that doesn't follow the general pattern
193 | 
194 | data.outlier <- rbind(data, outlier) # we sneak the outlier in the data
195 | 
196 | data.outlier %>%
197 |   ggplot(aes(x = x, y = y)) +
198 |   geom_point() +
199 |   geom_smooth(method="lm", se = FALSE) +
200 |   geom_smooth(aes(x=x, y=y), data, method = "lm", se = FALSE, color = "red")
201 | ```
202 | 
203 | We see from this plot that there is a data point located at $(1,5)$ that does not conform to the pattern in the data cloud, but removing it barely alters the regression line (in red) - this point is thus not very influential. Below, a call to `influence.measures` confirms this!
204 | 
205 | ```{r}
206 | lm.outlier <- lm(y ~ x, data=data.outlier)
207 | 
208 | influence.measures.outlier <- influence.measures(lm.outlier)
209 | 
210 | influence.measures.outlier$is.inf
211 | ```
212 | 
213 | Indeed, the outlier (observation number 40) does not seem particularly influential when compared with other data points, say observations 14 and 24.
214 | 
215 | Yet, we clearly see that this point has a problematic residual in the following diagnostics.
216 | 
217 | ```{r}
218 | 
219 | lm.outlier.fitted <- predict(lm.outlier)
220 | lm.outlier.residuals <- residuals(lm.outlier)
221 | 
222 | residuals.df <- data.frame(fitted = lm.outlier.fitted, residuals = lm.outlier.residuals)
223 | 
224 | # We look at residuals vs fitted
225 | 
226 | residuals.df %>%
227 |   ggplot(aes(x=fitted, y = residuals)) +
228 |   geom_point()
229 | 
230 | # We look at QQ plot of residuals against standard gaussian
231 | 
232 | residuals.df %>%
233 |   ggplot(aes(sample = residuals)) +
234 |   geom_abline(intercept = 0, slope = 1) +
235 |   stat_qq()
236 | ```
237 | 
238 | This point would typically be flagged as an outlier, and further study might indicate a proper way of handling it - we never automatically remove it!
239 | 
240 | ### Takeaways
241 | 
242 | - An **influential observation** is one that, when removed, significantly alters the model under investigation. It is discovered by using influence measures, although there is a subjective element in choosing a proper threshold for these measures.
243 | - An **outlier** is a data point that significantly departs from the pattern represented by the model under investigation. It is discovered using model residuals, although there is a subjective element in deciding when a residual is considered too large.
244 | - These two characteristics **need not coincide**! Indeed, an outlier can be influential or not, and an influential observation can be an outlier or not.
245 | 
246 | # References
247 | 
248 | - This article was partly inspired by a great post on influential observations available at https://online.stat.psu.edu/stat462/node/170/.
--------------------------------------------------------------------------------
/biostatistics.ca/kronecker_product/kronecker_product.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title:
3 | output: pdf_document
4 | header-includes:
5 |   - \usepackage{amsmath}
6 |   - \usepackage{amssymb}
7 |   - \usepackage[colorlinks=true, urlcolor=blue, allbordercolors={blue}, pdfborderstyle={/S/U/W 1}]{hyperref}
8 |   - \usepackage{fancyhdr}
9 |   - \fancypagestyle{plain}{\pagestyle{fancy}}
10 |   - \pagestyle{fancy}
11 |   - \fancyhead[LO,LE]{\href{https://justinbelair.ca/?utm_source=github&utm_medium=pdf&utm_campaign=kronecker_product}{JB Statistical Consulting}}
12 |   - \fancyhead[RO,RE]{\href{https://www.biostatistics.ca/?utm_source=github&utm_medium=pdf&utm_campaign=kronecker_product}{biostatistics.ca}}
13 |   - \renewcommand{\footrulewidth}{0.4pt}
14 |   - \fancyfoot[LO,LE]{Visit the biostatistics.ca \href{https://www.biostatistics.ca/github-library/?utm_source=github&utm_medium=pdf&utm_campaign=kronecker_product}{\underline{\textbf{Github Library}}} for a full collection of our FREE R Markdown code notebooks.}
15 |   - \fancyfoot[CO,CE]{}
16 | ---
17 | 
18 | # Kronecker Product
19 | 
20 | The Kronecker product is a special operation on matrices (arrays). It is defined as follows.
21 | 
22 | If $A$ is an $m \times n$ matrix and $B$ is a $p \times q$ matrix, then the Kronecker product $A \otimes B$ is the $pm \times qn$ block matrix
23 | 
24 | $$A \otimes B = \begin{bmatrix}
25 | a_{11}B & \cdots & a_{1n}B \\
26 | \vdots & \ddots & \vdots \\
27 | a_{m1}B & \cdots & a_{mn}B
28 | \end{bmatrix},$$
29 | 
30 | where $a_{ij}$ is the $(i,j)$-th entry of $A$. More explicitly, let's use:
31 | 
32 | - $A = \begin{bmatrix}
33 | 1 & 2 & 3 \\
34 | 4 & 5 & 6
35 | \end{bmatrix}$, a 2×3 matrix, and
36 | $B = \begin{bmatrix}
37 | 1 & 2 \\
38 | 3 & 4
39 | \end{bmatrix}$, a 2×2 matrix. Then, the Kronecker product $A \otimes B$ is computed as follows:
40 | 
41 | $$A \otimes B =
42 | \begin{bmatrix}
43 | 1 & 2 & 3 \\
44 | 4 & 5 & 6
45 | \end{bmatrix} \otimes
46 | \begin{bmatrix}
47 | 1 & 2 \\
48 | 3 & 4
49 | \end{bmatrix} = \begin{bmatrix} 1
50 | \begin{bmatrix}
51 | 1 & 2 \\ 3 & 4
52 | \end{bmatrix} & 2
53 | \begin{bmatrix}
54 | 1 & 2 \\ 3 & 4
55 | \end{bmatrix} & 3
56 | \begin{bmatrix}
57 | 1 & 2 \\ 3 & 4
58 | \end{bmatrix} \\
59 | 4
60 | \begin{bmatrix}
61 | 1 & 2 \\ 3 & 4
62 | \end{bmatrix} & 5
63 | \begin{bmatrix}
64 | 1 & 2 \\ 3 & 4
65 | \end{bmatrix} & 6
66 | \begin{bmatrix}
67 | 1 & 2 \\ 3 & 4
68 | \end{bmatrix}
69 | \end{bmatrix}$$
70 | 
71 | $$= \begin{bmatrix}
72 | 1 & 2 & 2 & 4 & 3 & 6 \\
73 | 3 & 4 & 6 & 8 & 9 & 12 \\
74 | 4 & 8 & 5 & 10 & 6 & 12 \\
75 | 12 & 16 & 15 & 20 & 18 & 24
76 | \end{bmatrix}$$
77 | 
78 | In R, this is easy to implement:
79 | 
80 | ```{r}
81 | A <- matrix(c(1, 2, 3, 4, 5, 6), nrow = 2, byrow = TRUE)
82 | B <- matrix(c(1, 2, 3, 4), nrow = 2, byrow = TRUE)
83 | A %x% B
84 | ```
85 | 
86 | ## Kronecker Product Use-Case From My Consulting Practice
87 | 
88 | Suppose I want to simulate a quasi-Poisson variable with a given mean parameter $\lambda$ and dispersion parameter $\phi$. For a quasi-Poisson variable, the variance is equal to $\phi \cdot \lambda$. Suppose I have a matrix of means $\lambda$, determined by the combination of two variables, and a set of dispersion parameters $\phi$. Now, I want all the possible combinations of $\phi \cdot \lambda$, i.e. all the possible variances. I can create these using the Kronecker product:
89 | 
90 | ```{r}
91 | lambda <- matrix(c(1, 2, 3, 4, 5, 6), nrow = 2, byrow = TRUE)
92 | phi <- c(3, 4)
93 | phi %x% lambda
94 | ```
95 | 
96 | Now, I can iterate over these variance parameters for my analysis.
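As an illustration of how such mean-variance pairs might then be used, here is a minimal sketch (my own addition, not part of the original workflow): a negative binomial with mean $\lambda$ and size $\lambda/(\phi - 1)$ has variance exactly $\phi \cdot \lambda$ when $\phi > 1$, so it is one convenient way to draw overdispersed counts matching each pair.

```{r}
lambda <- matrix(c(1, 2, 3, 4, 5, 6), nrow = 2, byrow = TRUE)
phi <- c(3, 4)

# Every (lambda, phi) pair, flattened
grid <- expand.grid(lambda = as.vector(lambda), phi = phi)

# NB with size = lambda/(phi - 1): variance = lambda + lambda^2/size = phi*lambda
sims <- mapply(function(l, p) rnbinom(10000, mu = l, size = l / (p - 1)),
               grid$lambda, grid$phi)

# Empirical variances should approximate phi * lambda
round(apply(sims, 2, var), 2)
grid$phi * grid$lambda
```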
--------------------------------------------------------------------------------
/biostatistics.ca/kronecker_product/kronecker_product.pdf:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/JB-Statistical-Consulting/biostatistics/b50e58b1c4ce6d0629bcf8d263438cd86707159a/biostatistics.ca/kronecker_product/kronecker_product.pdf
--------------------------------------------------------------------------------
/biostatistics.ca/powerTOST_tutorial/Sample size.xlsx:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/JB-Statistical-Consulting/biostatistics/b50e58b1c4ce6d0629bcf8d263438cd86707159a/biostatistics.ca/powerTOST_tutorial/Sample size.xlsx
--------------------------------------------------------------------------------
/biostatistics.ca/powerTOST_tutorial/sample size.R:
--------------------------------------------------------------------------------
1 | library(PowerTOST)
2 | library(dplyr)
3 | library(ggplot2)
4 | 
5 | BESS <- function(power = 0.8, alpha = 0.05, r = 0.9, ll = 0.8, ul = 1.25, cv = 0.14) {
6 |   ss <- data.frame() # Initialize an empty dataframe
7 | 
8 |   b <- 1 - power # type II error budget
9 |   mse <- log(cv^2 + 1) # log-scale variance implied by the CV
10 | 
11 |   for (n in seq(2, 10000)) {
12 |     t1 <- sqrt(n) * (log(r) - log(ll)) / sqrt(mse) # noncentrality, lower BE limit
13 |     t2 <- sqrt(n) * (log(ul) - log(r)) / sqrt(mse) # noncentrality, upper BE limit
14 |     t <- ifelse(is.nan(qt(1 - alpha, 2 * n - 2)), 0, qt(1 - alpha, 2 * n - 2)) # critical t value
15 |     p1 <- pt(t, 2 * n - 2, t2)
16 |     p2 <- pt(t, 2 * n - 2, t1)
17 |     p <- p1 + p2
18 | 
19 |     if (b >= p) {
20 |       cat("Date:", format(Sys.Date()), "\n")
21 |       cat("Sample size calculation for a two-way BE design using R", "\n")
22 |       cat("Assumed T/R Ratio:", r, "\n")
23 |       cat("BE limits:", ll, "-", ul, "\n")
24 |       cat("Assumed Power:", power, "\n")
25 |       cat("Assumed level of significance:", alpha, "\n")
26 |       cat("Assumed CV:", cv, "\n")
27 |       cat("The required sample size:", 2 * n, "\n")
28 | 
29 |       ss <- data.frame(alpha = alpha, power = power, ratio = r, size = 2 * n, cv = cv)
30 |       break
31 |     }
32 |   }
33 | 
34 |   if (n == 10000) {
35 |     cat("Warning: Sample size not found within the range [4, 20000]. Consider revising assumptions or increasing the maximum sample size limit.\n")
36 |   }
37 | 
38 |   return(ss) # Return the dataframe
39 | }
40 | 
41 | # Example usage:
42 | result <- BESS(power = 0.8, alpha = 0.05, r = 0.85, cv = 0.05)
43 | print(result)
44 | 
45 | ss <- data.frame()
46 | 
47 | # Nested loops to iterate over combinations of parameters
48 | for (i in c(0.8, 0.9)) {
49 |   for (j in seq(0.05, 1, 0.05)) {
50 |     for (k in seq(0.81, 1.24, 0.05)) {
51 |       # Call the BESS function for each combination of parameters
52 |       s_size <- BESS(power = i, r = k, cv = j)
53 | 
54 |       # Append s_size to ss using rbind and assign back to ss
55 |       ss <- rbind(ss, s_size)
56 |     }
57 |   }
58 | }
59 | 
60 | # Print the resulting dataframe ss
61 | print(ss)
62 | 
63 | 
64 | ss1 <- data.frame()
65 | 
66 | for (i in c(0.8, 0.9)) {
67 |   for (j in seq(0.05, 1, 0.05)) {
68 |     for (k in seq(0.81, 1.24, 0.05)) {
69 |       # Call PowerTOST's sampleN.TOST for each combination of parameters
70 |       s_size <- sampleN.TOST(targetpower = i, theta0 = k, CV = j)
71 | 
72 |       # Append s_size to ss1 using rbind and assign back to ss1
73 |       ss1 <- rbind(ss1, s_size)
74 |     }
75 |   }
76 | }
77 | setwd("C:\\Users\\Hanu\\OneDrive\\Desktop\\Anil\\R programes")
78 | write.csv(ss1, "ptss.csv")
79 | write.csv(ss, "mss.csv")
80 | 
81 | ss %>% ggplot(aes(cv, size, color = ratio)) +
82 |   geom_line(aes(group = ratio)) +
83 |   facet_wrap(~power, labeller = labeller(power = c("0.8" = "Power = 0.8", "0.9" = "Power = 0.9")),
84 |              ncol = 2) +
85 |   labs(
86 |     x = "CV",
87 |     y = "Sample Size",
88 |     color = "T/R Ratio"
89 |   )
90 | 
91 | ss %>% ggplot(aes(ratio, size, color = cv)) +
92 |   geom_line(aes(group = cv)) +
93 |   facet_wrap(~power, labeller = labeller(power = c("0.8" = "Power = 0.8", "0.9" = "Power = 0.9")),
94 |              ncol = 2) +
95 |   labs(
96 |     x = "T/R Ratio",
97 |     y = "Sample Size",
98 |     color = "CV"
99 |   )
100 | 
--------------------------------------------------------------------------------
/biostatistics.ca/power_with_uncertainty/power_with_uncertainty.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: Sample Size Estimates Done Via Power Calculation Are Uncertain
3 | date: \today
4 | author:
5 | output:
6 |   html_document
7 | ---
8 | [Find the RMarkdown Notebook on Github and Run the Code Yourself!](https://github.com/JB-Statistical-Consulting/biostatistics/tree/main/biostatistics.ca)
9 | 
10 | \newpage
11 | 
12 | ## Introduction
13 | 
14 | I often see the results of power calculations used as if there were no uncertainty around the required sample size they produce. This is a misconception, made almost invisible by the fact that traditional software tools like G*Power, SPSS, etc. don't provide any hint of the uncertainty around the sample size number they provide.
15 | 
16 | In short, estimates of the anticipated effect size from the study are by definition uncertain, or else the study wouldn't be needed - this uncertainty is transferred to the sample size number obtained via a power calculation.
17 | 
18 | Let's give a few quick examples.
19 | 
20 | ## A Simple Example - the t-test
21 | 
22 | With an effect size of 0.5, we need ~64 patients per group for a two-sample t-test with common standard deviation 1 to have 80% power at the 5% significance level.
23 | 
24 | ```{r}
25 | delta <- 0.5
26 | sd <- 1
27 | 
28 | n <- power.t.test(delta = delta, sd = sd, sig.level = 0.05, power = 0.8)$n
29 | 
30 | n
31 | ```
32 | ### The t-test with an uncertain delta parameter
33 | 
34 | In practice, the delta is uncertain. Let's suppose there is uncertainty of about 5\% around the estimated value. For now, we assume the standard deviation is known.
35 | 
36 | ```{r}
37 | delta.range <- c(1-0.05, 1, 1+0.05)*delta
38 | n.delta.range <- sapply(delta.range, \(x) power.t.test(delta = x, sd = sd, sig.level = 0.05, power = 0.8)$n)
39 | 
40 | n.delta.range
41 | ```
42 | We see that a range of ~58 to ~71 patients per group would be compatible with even this small uncertainty around the estimate of delta.
43 | 
44 | ### The t-test with uncertain delta and standard deviation parameters
45 | 
46 | Let's add uncertainty around the standard deviation as well, using the same approach.
47 | 
48 | ```{r}
49 | sd.range <- c(1+0.05, 1, 1-0.05)*sd
50 | 
51 | n.delta.sd.range <- mapply(FUN = function(x,y){power.t.test(delta = x, sd = y, sig.level = 0.05, power = 0.8)$n},
52 |                            delta.range, sd.range)
53 | 
54 | n.delta.sd.range
55 | ```
56 | This additional uncertainty widens the range to ~53 to ~78 patients per group. The $n=64$ per group we found without considering the uncertainty inherent in estimating the effect size and the standard deviation beforehand could seriously mislead us if we do not account for it.
57 | 
58 | ## Conclusion
59 | 
60 | Be vigilant! Test a range of plausible values - you might be shocked by what you see. Your study might rest on a false sense of confidence around the parameter values used to estimate the required sample size.
61 | 
62 | Using a range of estimates for the effect size and correspondingly determining a range of plausible sample sizes might give you a better understanding of the power you have to detect interesting effects.
63 | 
64 | Even better, consult a statistician to help you navigate the uncertainty in your study design.
65 | 
66 | If you need help with statistics, don't hesitate to reach out to me at [JB Statistical Consulting](https://www.justinbelair.ca).
67 | 
--------------------------------------------------------------------------------
/biostatistics.ca/pvalue_distributions/output_plot.png:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/JB-Statistical-Consulting/biostatistics/b50e58b1c4ce6d0629bcf8d263438cd86707159a/biostatistics.ca/pvalue_distributions/output_plot.png
--------------------------------------------------------------------------------
/biostatistics.ca/pvalue_distributions/p.value_distribution_simulations.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title:
3 | output: html_document
4 | ---
5 | 
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | 
9 | library(ggplot2)
10 | library(dplyr)
11 | library(gridExtra)
12 | 
13 | knitr::opts_chunk$set(
14 |   warning = FALSE,      # Suppress warnings
15 |   message = FALSE,      # Suppress messages
16 |   echo = TRUE,          # Show code
17 |   fig.align = 'center'  # Center plots
18 | )
19 | ```
20 | 
21 | ## Introduction
22 | 
23 | In this simulation, we will investigate the distribution of p-values, both when the null hypothesis is true and when it is false. The idea is simply to simulate samples of size $n$ from normal distributions with standard deviation 1 that get progressively shifted as we change the mean (feel free to modify the simulation parameters and rerun the simulations yourself by downloading the [free R Code Notebook here!](https://github.com/JB-Statistical-Consulting/biostatistics/tree/main/biostatistics.ca)). Then, we test each sample's mean against $H_0 : \mu_0 = 0$ using a t-test.
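Before simulating thousands of experiments, a single draw shows the basic ingredient of the simulation (a minimal warm-up sketch: one sample of size 20 generated under the null):

```{r}
set.seed(1)
x <- rnorm(20, mean = 0, sd = 1) # one sample generated under H0
t.test(x, mu = 0)$p.value        # one realization of the p-value
```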
24 | 
25 | ## Simulating P-values Distribution
26 | 
27 | ```{r}
28 | # number of simulations
29 | n_experiment <- 25000
30 | 
31 | # simulation parameters
32 | n <- 20
33 | sd <- 1
34 | means <- c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 1, 1.25, 1.5)
35 | ```
36 | 
37 | We then run the simulation and plot the results. The code below will generate both histograms and empirical cumulative distribution function (ecdf) plots to visualize the distribution of the p-values.
38 | 
39 | ### Computer Simulation Results
40 | 
41 | Notice how, when the null hypothesis is true, the p-value is *uniformly distributed* between 0 and 1. This might be surprising at first thought, but it makes a lot of sense. If the null hypothesis is true, what is the probability that $p<0.05$? Well, by definition it's 0.05, since there is absolutely no effect under the null hypothesis: we would declare significance erroneously around 5 times out of 100, or 0.05. This is true for any value: the probability that $p<x$ is exactly $x$, for any $x$ between 0 and 1, which is precisely the definition of a uniform distribution.
42 | 
43 | ```{r}
44 | p.value_hist <- list()
45 | p.value_ecdf <- list()
46 | 
47 | for (j in seq_along(means)) {
48 |   mean <- means[j]
49 | 
50 |   # For each true mean, simulate n_experiment experiments: draw a sample of
51 |   # size n from N(mean, sd) and run a t-test of H0: mu = 0
52 |   p_values <- replicate(n_experiment, t.test(rnorm(n, mean, sd), mu = 0)$p.value)
53 | 
54 |   data <- data.frame(p_values = p_values)
55 | 
56 |   # Plotting the histogram of p-values
57 |   p.value_hist[[j]] <- data %>%
58 |     ggplot(aes(x = p_values)) +
59 |     geom_histogram(aes(y = after_stat(density)), bins = 25, fill = "#bd93f9", alpha = 0.6) +
60 |     geom_density(color = "#ff5555", size = 1) +
61 |     labs(
62 |       title = paste("Pval for t.test H0: mu = 0, when true mean mu = ", mean, " (n =", n, ")"),
63 |       x = "P-values",
64 |       y = "Density"
65 |     ) +
66 |     theme_minimal()
67 | 
68 |   # Plotting the empirical cumulative distribution function (ecdf)
69 | 
70 |   p.value_ecdf[[j]] <- data %>%
71 |     ggplot(aes(x = p_values)) +
72 |     stat_ecdf(geom = "step", color = "#44475a", size = 1) +
73 |     labs(
74 |       title = paste0("Pval for t.test H0: mu = 0, when true mean mu = ", mean, " (n =", n, ")"),
75 |       x = "P-values",
76 |       y = "ECDF"
77 |     ) +
78 |     theme_minimal()
79 | 
80 |   # Print in a 2x1 layout
81 |   grid.arrange(p.value_hist[[j]], p.value_ecdf[[j]], nrow = 2)
82 | }
83 | ```
84 | 
85 | ## Conclusion
86 | 
87 | Under the null hypothesis, by definition, the p-value has a uniform distribution. As we move away from the null hypothesis, the p-value skews towards smaller and smaller values. This is obviously desirable: we want the p-value to help us detect effects when they are present, in spite of sampling uncertainty.
88 | 
89 | If you want to learn more about p-values and statistical inference, how to run simulations like this, and how to do statistics the right way, consider joining my [*Introduction to Biostatistics : Learn Statistics the Right Way!* Course](https://justinbelair.ca/introduction-to-biostatistics/?utm_source=biostatistics&utm_medium=blog&utm_campaign=p-values-distribution) where I teach you everything you need to know about statistics and data analysis.
--------------------------------------------------------------------------------
/biostatistics.ca/theme.css:
--------------------------------------------------------------------------------
1 | .main-container {
2 |   max-width: 1080px !important;
3 |   color: #000000 !important;
4 |   background-color: #FFFFFF !important;
5 | }
6 | 
7 | body, p, em, strong, ul li, ol li {
8 |   color: #000000 !important;
9 |   font-size: 12px !important;
10 | }
11 | 
12 | h1 {
13 |   font-size: 20px !important;
14 |   color: #ff79c6 !important;
15 |   font-weight: bold !important;
16 | }
17 | 
18 | 
19 | h2 {
20 |   font-size: 16px !important;
21 |   color: #ff79c6 !important;
22 | }
23 | 
24 | h3 {
25 |   font-size: 14px !important;
26 |   color: #ff79c6 !important;
27 | }
28 | 
29 | .math, .inlineMath {
30 |   color: #000000 !important;
31 | }
32 | 
33 | .rcode {
34 |   background-color: #f8f8f2 !important;
35 |   color: #000000 !important;
36 |   font-size: 11px !important;
37 |   border: 3px solid #bd93f9;
38 |   font-weight: bold;
39 | }
40 | 
41 | hr {
42 |   page-break-after: always !important;
43 | }
--------------------------------------------------------------------------------
/causal_inference/battle_for_the_soul_of_causal_inference.pdf.pdf:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/JB-Statistical-Consulting/biostatistics/b50e58b1c4ce6d0629bcf8d263438cd86707159a/causal_inference/battle_for_the_soul_of_causal_inference.pdf.pdf
--------------------------------------------------------------------------------
/contact.png:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/JB-Statistical-Consulting/biostatistics/b50e58b1c4ce6d0629bcf8d263438cd86707159a/contact.png
--------------------------------------------------------------------------------