├── .gitignore ├── EH6126_data_analysis_tutorials.Rproj ├── LICENSE ├── Linear_models ├── Linear_models.Rmd ├── Linear_models.html ├── Linear_models.md ├── Linear_models_files │ └── figure-html │ │ ├── alt_data_2-1.png │ │ ├── alt_data_le_05-1.png │ │ ├── alt_data_repeat-1.png │ │ ├── many_sample_means-1.png │ │ ├── null_data-1.png │ │ ├── null_data_repeat-1.png │ │ ├── p_distribution_null_1-1.png │ │ ├── p_distribution_null_2-1.png │ │ ├── power_sim-1.png │ │ ├── sim_t_test_data-1.png │ │ ├── simulate_population-1.png │ │ ├── t_null_dist-1.png │ │ ├── t_one_sided-1.png │ │ ├── t_two_sidedd-1.png │ │ ├── unnamed-chunk-12-1.png │ │ ├── unnamed-chunk-13-1.png │ │ ├── unnamed-chunk-14-1.png │ │ ├── unnamed-chunk-14-2.png │ │ ├── unnamed-chunk-14-3.png │ │ ├── unnamed-chunk-15-1.png │ │ ├── unnamed-chunk-18-1.png │ │ ├── unnamed-chunk-19-1.png │ │ ├── unnamed-chunk-20-1.png │ │ ├── unnamed-chunk-21-1.png │ │ ├── unnamed-chunk-23-1.png │ │ ├── unnamed-chunk-24-1.png │ │ ├── unnamed-chunk-25-1.png │ │ ├── unnamed-chunk-5-1.png │ │ └── unnamed-chunk-6-1.png └── example.csv ├── README.md ├── Unit_1_Review ├── Change_scores │ ├── Change_scores.Rmd │ ├── Change_scores.html │ ├── Change_scores.md │ └── Change_scores_files │ │ └── figure-html │ │ ├── unnamed-chunk-11-1.png │ │ ├── unnamed-chunk-12-1.png │ │ ├── unnamed-chunk-14-1.png │ │ ├── unnamed-chunk-17-1.png │ │ ├── unnamed-chunk-19-1.png │ │ ├── unnamed-chunk-22-1.png │ │ ├── unnamed-chunk-23-1.png │ │ ├── unnamed-chunk-24-1.png │ │ ├── unnamed-chunk-26-1.png │ │ ├── unnamed-chunk-27-1.png │ │ ├── unnamed-chunk-31-1.png │ │ ├── unnamed-chunk-35-1.png │ │ └── unnamed-chunk-8-1.png ├── Frequentist_inference.Rmd ├── Frequentist_inference.html ├── Frequentist_inference.md ├── Frequentist_inference_files │ └── figure-html │ │ ├── alt_data_2-1.png │ │ ├── alt_data_le_05-1.png │ │ ├── alt_data_repeat-1.png │ │ ├── many_sample_means-1.png │ │ ├── null_data-1.png │ │ ├── null_data_repeat-1.png │ │ ├── p_distribution_null_1-1.png │ │ ├── p_distribution_null_2-1.png │ │ ├── power_sim-1.png │ │ ├── sample_10k-1.png │ │ ├── shift_to_effect-1.png │ │ ├── sim_t_test_data-1.png │ │ ├── simulate_population-1.png │ │ ├── t_null_dist-1.png │ │ ├── t_one_sided-1.png │ │ ├── t_two_sided-1.png │ │ ├── t_two_sidedd-1.png │ │ ├── unnamed-chunk-11-1.png │ │ ├── unnamed-chunk-12-1.png │ │ ├── unnamed-chunk-2-1.png │ │ ├── unnamed-chunk-3-1.png │ │ ├── unnamed-chunk-4-1.png │ │ ├── unnamed-chunk-5-1.png │ │ └── unnamed-chunk-9-1.png └── Review_main_points_6124.md ├── Unit_2_Equivalance_Trials ├── Equivalance_trials.Rmd ├── Equivalance_trials.html ├── Equivalance_trials.md ├── Equivalance_trials_files │ └── figure-html │ │ ├── new_interval_1-1.png │ │ ├── new_interval_2-1.png │ │ ├── non_inferiority-1.png │ │ ├── non_superiority-1.png │ │ ├── plot_mean_diffs-1.png │ │ ├── plot_raw_pain_data-1.png │ │ ├── shift-1.png │ │ ├── shift_to_effect-1.png │ │ ├── start-1.png │ │ ├── t_two_sided-1.png │ │ └── tost_primary-1.png ├── data │ └── data.csv ├── plots │ ├── figure2.pdf │ ├── figure_1.pdf │ └── figure_2.pdf └── scripts │ └── ggplot2_primer.R ├── Unit_3_Crossover_Trials ├── Crossover_trials.Rmd ├── Crossover_trials.html ├── Crossover_trials.md ├── Crossover_trials_files │ └── figure-html │ │ ├── cross_over_plot-1.png │ │ ├── cross_over_plot_2-1.png │ │ ├── distribution_plot-1.png │ │ ├── distribution_plot_2-1.png │ │ └── unnamed-chunk-3-1.png └── data │ └── data.csv ├── Unit_4_Cluster_randomized_trials ├── Cluster_randomized_trials.Rmd ├── Cluster_randomized_trials.html ├── 
Cluster_randomized_trials.md └── Cluster_randomized_trials_files │ └── figure-html │ ├── dependent-1.png │ └── independent-1.png ├── collapsability ├── collapsability.Rmd ├── collapsability.html ├── collapsability.md ├── collapsability_files │ └── figure-html │ │ ├── unnamed-chunk-11-1.png │ │ ├── unnamed-chunk-12-1.png │ │ ├── unnamed-chunk-4-1.png │ │ ├── unnamed-chunk-5-1.png │ │ └── unnamed-chunk-9-1.png └── figs │ ├── eq_1.JPG │ ├── fig_1_a.JPG │ └── fig_1_b.JPG └── testing_testing ├── .Rproj.user ├── 1503004 │ ├── pcs │ │ ├── files-pane.pper │ │ ├── packages-pane.pper │ │ ├── source-pane.pper │ │ ├── windowlayoutstate.pper │ │ └── workbench-pane.pper │ ├── rmd-outputs │ ├── saved_source_markers │ └── sources │ │ └── prop │ │ ├── 28A47032 │ │ ├── 36475FDC │ │ ├── 5318C6C7 │ │ ├── 6937C0F1 │ │ ├── 6FDF2087 │ │ ├── B5ECED0D │ │ ├── B65F4002 │ │ ├── E01D0251 │ │ └── INDEX └── shared │ └── notebooks │ ├── patch-chunk-names │ └── paths ├── README.md ├── docs ├── R-RStudio_installation_steps.pdf └── R_update_presentation.pdf ├── plots └── successful_test.png ├── scripts └── test_script.R └── testing_testing.Rproj /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | -------------------------------------------------------------------------------- /EH6126_data_analysis_tutorials.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | StripTrailingWhitespace: Yes 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Attribution-ShareAlike 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 
30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. More_considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution-ShareAlike 4.0 International Public 58 | License 59 | 60 | By exercising the Licensed Rights (defined below), You accept and agree 61 | to be bound by the terms and conditions of this Creative Commons 62 | Attribution-ShareAlike 4.0 International Public License ("Public 63 | License"). To the extent this Public License may be interpreted as a 64 | contract, You are granted the Licensed Rights in consideration of Your 65 | acceptance of these terms and conditions, and the Licensor grants You 66 | such rights in consideration of benefits the Licensor receives from 67 | making the Licensed Material available under these terms and 68 | conditions. 69 | 70 | 71 | Section 1 -- Definitions. 72 | 73 | a. Adapted Material means material subject to Copyright and Similar 74 | Rights that is derived from or based upon the Licensed Material 75 | and in which the Licensed Material is translated, altered, 76 | arranged, transformed, or otherwise modified in a manner requiring 77 | permission under the Copyright and Similar Rights held by the 78 | Licensor. For purposes of this Public License, where the Licensed 79 | Material is a musical work, performance, or sound recording, 80 | Adapted Material is always produced where the Licensed Material is 81 | synched in timed relation with a moving image. 82 | 83 | b. Adapter's License means the license You apply to Your Copyright 84 | and Similar Rights in Your contributions to Adapted Material in 85 | accordance with the terms and conditions of this Public License. 86 | 87 | c. BY-SA Compatible License means a license listed at 88 | creativecommons.org/compatiblelicenses, approved by Creative 89 | Commons as essentially the equivalent of this Public License. 90 | 91 | d. 
Copyright and Similar Rights means copyright and/or similar rights 92 | closely related to copyright including, without limitation, 93 | performance, broadcast, sound recording, and Sui Generis Database 94 | Rights, without regard to how the rights are labeled or 95 | categorized. For purposes of this Public License, the rights 96 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 97 | Rights. 98 | 99 | e. Effective Technological Measures means those measures that, in the 100 | absence of proper authority, may not be circumvented under laws 101 | fulfilling obligations under Article 11 of the WIPO Copyright 102 | Treaty adopted on December 20, 1996, and/or similar international 103 | agreements. 104 | 105 | f. Exceptions and Limitations means fair use, fair dealing, and/or 106 | any other exception or limitation to Copyright and Similar Rights 107 | that applies to Your use of the Licensed Material. 108 | 109 | g. License Elements means the license attributes listed in the name 110 | of a Creative Commons Public License. The License Elements of this 111 | Public License are Attribution and ShareAlike. 112 | 113 | h. Licensed Material means the artistic or literary work, database, 114 | or other material to which the Licensor applied this Public 115 | License. 116 | 117 | i. Licensed Rights means the rights granted to You subject to the 118 | terms and conditions of this Public License, which are limited to 119 | all Copyright and Similar Rights that apply to Your use of the 120 | Licensed Material and that the Licensor has authority to license. 121 | 122 | j. Licensor means the individual(s) or entity(ies) granting rights 123 | under this Public License. 124 | 125 | k. Share means to provide material to the public by any means or 126 | process that requires permission under the Licensed Rights, such 127 | as reproduction, public display, public performance, distribution, 128 | dissemination, communication, or importation, and to make material 129 | available to the public including in ways that members of the 130 | public may access the material from a place and at a time 131 | individually chosen by them. 132 | 133 | l. Sui Generis Database Rights means rights other than copyright 134 | resulting from Directive 96/9/EC of the European Parliament and of 135 | the Council of 11 March 1996 on the legal protection of databases, 136 | as amended and/or succeeded, as well as other essentially 137 | equivalent rights anywhere in the world. 138 | 139 | m. You means the individual or entity exercising the Licensed Rights 140 | under this Public License. Your has a corresponding meaning. 141 | 142 | 143 | Section 2 -- Scope. 144 | 145 | a. License grant. 146 | 147 | 1. Subject to the terms and conditions of this Public License, 148 | the Licensor hereby grants You a worldwide, royalty-free, 149 | non-sublicensable, non-exclusive, irrevocable license to 150 | exercise the Licensed Rights in the Licensed Material to: 151 | 152 | a. reproduce and Share the Licensed Material, in whole or 153 | in part; and 154 | 155 | b. produce, reproduce, and Share Adapted Material. 156 | 157 | 2. Exceptions and Limitations. For the avoidance of doubt, where 158 | Exceptions and Limitations apply to Your use, this Public 159 | License does not apply, and You do not need to comply with 160 | its terms and conditions. 161 | 162 | 3. Term. The term of this Public License is specified in Section 163 | 6(a). 164 | 165 | 4. Media and formats; technical modifications allowed. 
The 166 | Licensor authorizes You to exercise the Licensed Rights in 167 | all media and formats whether now known or hereafter created, 168 | and to make technical modifications necessary to do so. The 169 | Licensor waives and/or agrees not to assert any right or 170 | authority to forbid You from making technical modifications 171 | necessary to exercise the Licensed Rights, including 172 | technical modifications necessary to circumvent Effective 173 | Technological Measures. For purposes of this Public License, 174 | simply making modifications authorized by this Section 2(a) 175 | (4) never produces Adapted Material. 176 | 177 | 5. Downstream recipients. 178 | 179 | a. Offer from the Licensor -- Licensed Material. Every 180 | recipient of the Licensed Material automatically 181 | receives an offer from the Licensor to exercise the 182 | Licensed Rights under the terms and conditions of this 183 | Public License. 184 | 185 | b. Additional offer from the Licensor -- Adapted Material. 186 | Every recipient of Adapted Material from You 187 | automatically receives an offer from the Licensor to 188 | exercise the Licensed Rights in the Adapted Material 189 | under the conditions of the Adapter's License You apply. 190 | 191 | c. No downstream restrictions. You may not offer or impose 192 | any additional or different terms or conditions on, or 193 | apply any Effective Technological Measures to, the 194 | Licensed Material if doing so restricts exercise of the 195 | Licensed Rights by any recipient of the Licensed 196 | Material. 197 | 198 | 6. No endorsement. Nothing in this Public License constitutes or 199 | may be construed as permission to assert or imply that You 200 | are, or that Your use of the Licensed Material is, connected 201 | with, or sponsored, endorsed, or granted official status by, 202 | the Licensor or others designated to receive attribution as 203 | provided in Section 3(a)(1)(A)(i). 204 | 205 | b. Other rights. 206 | 207 | 1. Moral rights, such as the right of integrity, are not 208 | licensed under this Public License, nor are publicity, 209 | privacy, and/or other similar personality rights; however, to 210 | the extent possible, the Licensor waives and/or agrees not to 211 | assert any such rights held by the Licensor to the limited 212 | extent necessary to allow You to exercise the Licensed 213 | Rights, but not otherwise. 214 | 215 | 2. Patent and trademark rights are not licensed under this 216 | Public License. 217 | 218 | 3. To the extent possible, the Licensor waives any right to 219 | collect royalties from You for the exercise of the Licensed 220 | Rights, whether directly or through a collecting society 221 | under any voluntary or waivable statutory or compulsory 222 | licensing scheme. In all other cases the Licensor expressly 223 | reserves any right to collect such royalties. 224 | 225 | 226 | Section 3 -- License Conditions. 227 | 228 | Your exercise of the Licensed Rights is expressly made subject to the 229 | following conditions. 230 | 231 | a. Attribution. 232 | 233 | 1. If You Share the Licensed Material (including in modified 234 | form), You must: 235 | 236 | a. retain the following if it is supplied by the Licensor 237 | with the Licensed Material: 238 | 239 | i. identification of the creator(s) of the Licensed 240 | Material and any others designated to receive 241 | attribution, in any reasonable manner requested by 242 | the Licensor (including by pseudonym if 243 | designated); 244 | 245 | ii. a copyright notice; 246 | 247 | iii. 
a notice that refers to this Public License; 248 | 249 | iv. a notice that refers to the disclaimer of 250 | warranties; 251 | 252 | v. a URI or hyperlink to the Licensed Material to the 253 | extent reasonably practicable; 254 | 255 | b. indicate if You modified the Licensed Material and 256 | retain an indication of any previous modifications; and 257 | 258 | c. indicate the Licensed Material is licensed under this 259 | Public License, and include the text of, or the URI or 260 | hyperlink to, this Public License. 261 | 262 | 2. You may satisfy the conditions in Section 3(a)(1) in any 263 | reasonable manner based on the medium, means, and context in 264 | which You Share the Licensed Material. For example, it may be 265 | reasonable to satisfy the conditions by providing a URI or 266 | hyperlink to a resource that includes the required 267 | information. 268 | 269 | 3. If requested by the Licensor, You must remove any of the 270 | information required by Section 3(a)(1)(A) to the extent 271 | reasonably practicable. 272 | 273 | b. ShareAlike. 274 | 275 | In addition to the conditions in Section 3(a), if You Share 276 | Adapted Material You produce, the following conditions also apply. 277 | 278 | 1. The Adapter's License You apply must be a Creative Commons 279 | license with the same License Elements, this version or 280 | later, or a BY-SA Compatible License. 281 | 282 | 2. You must include the text of, or the URI or hyperlink to, the 283 | Adapter's License You apply. You may satisfy this condition 284 | in any reasonable manner based on the medium, means, and 285 | context in which You Share Adapted Material. 286 | 287 | 3. You may not offer or impose any additional or different terms 288 | or conditions on, or apply any Effective Technological 289 | Measures to, Adapted Material that restrict exercise of the 290 | rights granted under the Adapter's License You apply. 291 | 292 | 293 | Section 4 -- Sui Generis Database Rights. 294 | 295 | Where the Licensed Rights include Sui Generis Database Rights that 296 | apply to Your use of the Licensed Material: 297 | 298 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 299 | to extract, reuse, reproduce, and Share all or a substantial 300 | portion of the contents of the database; 301 | 302 | b. if You include all or a substantial portion of the database 303 | contents in a database in which You have Sui Generis Database 304 | Rights, then the database in which You have Sui Generis Database 305 | Rights (but not its individual contents) is Adapted Material, 306 | 307 | including for purposes of Section 3(b); and 308 | c. You must comply with the conditions in Section 3(a) if You Share 309 | all or a substantial portion of the contents of the database. 310 | 311 | For the avoidance of doubt, this Section 4 supplements and does not 312 | replace Your obligations under this Public License where the Licensed 313 | Rights include other Copyright and Similar Rights. 314 | 315 | 316 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 317 | 318 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 319 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 320 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 321 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 322 | IMPLIED, STATUTORY, OR OTHER. 
THIS INCLUDES, WITHOUT LIMITATION, 323 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 324 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 325 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 326 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 327 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 328 | 329 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 330 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 331 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 332 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 333 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 334 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 335 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 336 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 337 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 338 | 339 | c. The disclaimer of warranties and limitation of liability provided 340 | above shall be interpreted in a manner that, to the extent 341 | possible, most closely approximates an absolute disclaimer and 342 | waiver of all liability. 343 | 344 | 345 | Section 6 -- Term and Termination. 346 | 347 | a. This Public License applies for the term of the Copyright and 348 | Similar Rights licensed here. However, if You fail to comply with 349 | this Public License, then Your rights under this Public License 350 | terminate automatically. 351 | 352 | b. Where Your right to use the Licensed Material has terminated under 353 | Section 6(a), it reinstates: 354 | 355 | 1. automatically as of the date the violation is cured, provided 356 | it is cured within 30 days of Your discovery of the 357 | violation; or 358 | 359 | 2. upon express reinstatement by the Licensor. 360 | 361 | For the avoidance of doubt, this Section 6(b) does not affect any 362 | right the Licensor may have to seek remedies for Your violations 363 | of this Public License. 364 | 365 | c. For the avoidance of doubt, the Licensor may also offer the 366 | Licensed Material under separate terms or conditions or stop 367 | distributing the Licensed Material at any time; however, doing so 368 | will not terminate this Public License. 369 | 370 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 371 | License. 372 | 373 | 374 | Section 7 -- Other Terms and Conditions. 375 | 376 | a. The Licensor shall not be bound by any additional or different 377 | terms or conditions communicated by You unless expressly agreed. 378 | 379 | b. Any arrangements, understandings, or agreements regarding the 380 | Licensed Material not stated herein are separate from and 381 | independent of the terms and conditions of this Public License. 382 | 383 | 384 | Section 8 -- Interpretation. 385 | 386 | a. For the avoidance of doubt, this Public License does not, and 387 | shall not be interpreted to, reduce, limit, restrict, or impose 388 | conditions on any use of the Licensed Material that could lawfully 389 | be made without permission under this Public License. 390 | 391 | b. To the extent possible, if any provision of this Public License is 392 | deemed unenforceable, it shall be automatically reformed to the 393 | minimum extent necessary to make it enforceable. If the provision 394 | cannot be reformed, it shall be severed from this Public License 395 | without affecting the enforceability of the remaining terms and 396 | conditions. 397 | 398 | c. 
No term or condition of this Public License will be waived and no 399 | failure to comply consented to unless expressly agreed to by the 400 | Licensor. 401 | 402 | d. Nothing in this Public License constitutes or may be interpreted 403 | as a limitation upon, or waiver of, any privileges and immunities 404 | that apply to the Licensor or You, including from the legal 405 | processes of any jurisdiction or authority. 406 | 407 | 408 | ======================================================================= 409 | 410 | Creative Commons is not a party to its public 411 | licenses. Notwithstanding, Creative Commons may elect to apply one of 412 | its public licenses to material it publishes and in those instances 413 | will be considered the “Licensor.” The text of the Creative Commons 414 | public licenses is dedicated to the public domain under the CC0 Public 415 | Domain Dedication. Except for the limited purpose of indicating that 416 | material is shared under a Creative Commons public license or as 417 | otherwise permitted by the Creative Commons policies published at 418 | creativecommons.org/policies, Creative Commons does not authorize the 419 | use of the trademark "Creative Commons" or any other trademark or logo 420 | of Creative Commons without its prior written consent including, 421 | without limitation, in connection with any unauthorized modifications 422 | to any of its public licenses or any other arrangements, 423 | understandings, or agreements concerning use of licensed material. For 424 | the avoidance of doubt, this paragraph does not form part of the 425 | public licenses. 426 | 427 | Creative Commons may be contacted at creativecommons.org. 428 | -------------------------------------------------------------------------------- /Linear_models/Linear_models_files/figure-html/alt_data_2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Linear_models/Linear_models_files/figure-html/alt_data_2-1.png -------------------------------------------------------------------------------- /Linear_models/Linear_models_files/figure-html/alt_data_le_05-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Linear_models/Linear_models_files/figure-html/alt_data_le_05-1.png -------------------------------------------------------------------------------- /Linear_models/Linear_models_files/figure-html/alt_data_repeat-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Linear_models/Linear_models_files/figure-html/alt_data_repeat-1.png -------------------------------------------------------------------------------- /Linear_models/Linear_models_files/figure-html/many_sample_means-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Linear_models/Linear_models_files/figure-html/many_sample_means-1.png -------------------------------------------------------------------------------- /Linear_models/Linear_models_files/figure-html/null_data-1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Linear_models/Linear_models_files/figure-html/null_data-1.png -------------------------------------------------------------------------------- /Linear_models/Linear_models_files/figure-html/null_data_repeat-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Linear_models/Linear_models_files/figure-html/null_data_repeat-1.png -------------------------------------------------------------------------------- /Linear_models/Linear_models_files/figure-html/p_distribution_null_1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Linear_models/Linear_models_files/figure-html/p_distribution_null_1-1.png -------------------------------------------------------------------------------- /Linear_models/Linear_models_files/figure-html/p_distribution_null_2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Linear_models/Linear_models_files/figure-html/p_distribution_null_2-1.png -------------------------------------------------------------------------------- /Linear_models/Linear_models_files/figure-html/power_sim-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Linear_models/Linear_models_files/figure-html/power_sim-1.png -------------------------------------------------------------------------------- /Linear_models/Linear_models_files/figure-html/sim_t_test_data-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Linear_models/Linear_models_files/figure-html/sim_t_test_data-1.png -------------------------------------------------------------------------------- /Linear_models/Linear_models_files/figure-html/simulate_population-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Linear_models/Linear_models_files/figure-html/simulate_population-1.png -------------------------------------------------------------------------------- /Linear_models/Linear_models_files/figure-html/t_null_dist-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Linear_models/Linear_models_files/figure-html/t_null_dist-1.png -------------------------------------------------------------------------------- /Linear_models/Linear_models_files/figure-html/t_one_sided-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Linear_models/Linear_models_files/figure-html/t_one_sided-1.png -------------------------------------------------------------------------------- /Linear_models/Linear_models_files/figure-html/t_two_sidedd-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Linear_models/Linear_models_files/figure-html/t_two_sidedd-1.png -------------------------------------------------------------------------------- /Linear_models/Linear_models_files/figure-html/unnamed-chunk-12-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Linear_models/Linear_models_files/figure-html/unnamed-chunk-12-1.png -------------------------------------------------------------------------------- /Linear_models/Linear_models_files/figure-html/unnamed-chunk-13-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Linear_models/Linear_models_files/figure-html/unnamed-chunk-13-1.png -------------------------------------------------------------------------------- /Linear_models/Linear_models_files/figure-html/unnamed-chunk-14-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Linear_models/Linear_models_files/figure-html/unnamed-chunk-14-1.png -------------------------------------------------------------------------------- /Linear_models/Linear_models_files/figure-html/unnamed-chunk-14-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Linear_models/Linear_models_files/figure-html/unnamed-chunk-14-2.png -------------------------------------------------------------------------------- /Linear_models/Linear_models_files/figure-html/unnamed-chunk-14-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Linear_models/Linear_models_files/figure-html/unnamed-chunk-14-3.png -------------------------------------------------------------------------------- /Linear_models/Linear_models_files/figure-html/unnamed-chunk-15-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Linear_models/Linear_models_files/figure-html/unnamed-chunk-15-1.png -------------------------------------------------------------------------------- /Linear_models/Linear_models_files/figure-html/unnamed-chunk-18-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Linear_models/Linear_models_files/figure-html/unnamed-chunk-18-1.png 
-------------------------------------------------------------------------------- /Linear_models/Linear_models_files/figure-html/unnamed-chunk-19-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Linear_models/Linear_models_files/figure-html/unnamed-chunk-19-1.png -------------------------------------------------------------------------------- /Linear_models/Linear_models_files/figure-html/unnamed-chunk-20-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Linear_models/Linear_models_files/figure-html/unnamed-chunk-20-1.png -------------------------------------------------------------------------------- /Linear_models/Linear_models_files/figure-html/unnamed-chunk-21-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Linear_models/Linear_models_files/figure-html/unnamed-chunk-21-1.png -------------------------------------------------------------------------------- /Linear_models/Linear_models_files/figure-html/unnamed-chunk-23-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Linear_models/Linear_models_files/figure-html/unnamed-chunk-23-1.png -------------------------------------------------------------------------------- /Linear_models/Linear_models_files/figure-html/unnamed-chunk-24-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Linear_models/Linear_models_files/figure-html/unnamed-chunk-24-1.png -------------------------------------------------------------------------------- /Linear_models/Linear_models_files/figure-html/unnamed-chunk-25-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Linear_models/Linear_models_files/figure-html/unnamed-chunk-25-1.png -------------------------------------------------------------------------------- /Linear_models/Linear_models_files/figure-html/unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Linear_models/Linear_models_files/figure-html/unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /Linear_models/Linear_models_files/figure-html/unnamed-chunk-6-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Linear_models/Linear_models_files/figure-html/unnamed-chunk-6-1.png -------------------------------------------------------------------------------- /Linear_models/example.csv: -------------------------------------------------------------------------------- 1 | 
id,demo_age,demo_gender,demo_ht_cm,demo_wt_kg,bmi,arm,pre_timi,eos_glvef,bl_glvef 2 | 101,63,Male,172.72,90,30.1687454585982,Arm C (HD),2,33.3,41.9 3 | 104,36,Male,173.73,76.2,25.2467414234608,Arm C (HD),3,58.4,53.5 4 | 106,65,Male,165,76,27.9155188246097,Arm B (LD),3,61,47.6 5 | 107,63,Male,185,108,31.5558802045289,Arm A (P),1,46,47.7 6 | 108,70,Male,173,90,30.0711684319556,Arm C (HD),3,NA,36.2 7 | 109,48,Male,173,90,30.0711684319556,Arm B (LD),0,NA,NA 8 | 110,58,Male,170,80,27.681660899654,Arm C (HD),2,55.4,49.5 9 | 111,61,Male,177.8,58,18.3469754694611,Arm A (P),1,58.4,46.9 10 | 112,45,Female,160,71.1,27.7734375,Arm A (P),3,40,35.1 11 | 113,62,Male,170,66.7,23.0795847750865,Arm B (LD),3,48.3,45 12 | 114,74,Male,172,66,22.3093564088697,Arm B (LD),0,58.3,NA 13 | 115,41,Male,170,64,22.1453287197232,Arm A (P),0,42.6,28.9 14 | 116,62,Male,180,70,21.6049382716049,Arm C (HD),1,NA,NA 15 | 117,67,Female,155,59,24.5577523413111,Arm B (LD),3,61.5,53.2 16 | 118,55,Male,182.9,73,21.8220451082598,Arm C (HD),3,51.5,34.7 17 | 120,39,Male,182,82,24.7554643159039,Arm B (LD),3,34.2,34.3 18 | 121,71,Male,180,82,25.3086419753086,Arm C (HD),1,46.2,36.7 19 | 122,61,Female,165,69.8,25.6382001836547,Arm C (HD),3,62.8,52.3 20 | 123,69,Male,183,75,22.3954134193317,Arm B (LD),0,34.9,24.9 21 | 124,69,Male,178,70,22.0931700542861,Arm A (P),0,41.3,32.4 22 | 125,31,Male,180.34,80,24.5983431859742,Arm B (LD),0,56.3,41 23 | 126,54,Female,170,43.5,15.0519031141869,Arm A (P),0,52.6,36.6 24 | 127,64,Male,178,83.8,26.448680722131,Arm C (HD),3,56.2,51.8 25 | 128,39,Female,152,76,32.8947368421053,Arm B (LD),0,36.1,27.9 26 | 129,63,Female,167,50.8,18.2150668722435,Arm A (P),0,40.9,48 27 | 130,63,Male,173,76,25.3934311203181,Arm C (HD),0,52.6,49 28 | 131,53,Male,183,92,27.4717071277136,Arm A (P),0,38,28.7 29 | 132,74,Female,163,67.3,25.3302721216455,Arm C (HD),0,50.6,46.3 30 | 133,35,Male,163,91,34.2504422447213,Arm B (LD),0,44.3,52.9 31 | 134,69,Male,178,80.3,25.3440222194167,Arm C (HD),3,55.8,46.4 32 | 135,71,Male,188,102,28.8592123132639,Arm A (P),0,43.8,43 33 | 136,59,Male,170,71.2,24.636678200692,Arm B (LD),0,51.9,39.2 34 | 137,50,Male,176,70,22.5981404958678,Arm A (P),0,44.7,37.7 35 | 138,64,Male,165,70,25.7116620752984,Arm B (LD),0,26.4,NA 36 | 139,50,Male,183,95.6,28.5466869718415,Arm C (HD),2,57.7,54.4 37 | 140,51,Male,170,82.5,28.5467128027682,Arm B (LD),0,54.1,44.1 38 | 141,71,Male,167,79,28.3265803721898,Arm A (P),0,53.3,50.7 39 | 142,47,Male,178,95,29.9835879308168,Arm C (HD),0,36.7,31.9 40 | 143,56,Female,163,50.8,19.120027099251,Arm B (LD),2,57.8,41.7 41 | 105,55,Female,163,61,22.9590876585494,Arm A (P),2,48.3,40.7 42 | 201,68,Female,162.02,60.3,22.9710081801017,Arm B (LD),3,69.9,53.2 43 | 202,71,Male,170.18,83,28.6590014035452,Arm A (P),0,49.3,43.5 44 | 203,53,Male,167,108,38.724945318943,Arm C (HD),3,53.3,53.8 45 | 204,54,Male,170,112,38.7543252595156,Arm A (P),2,NA,NA 46 | 144,41,Male,176,86.4,27.8925619834711,Arm C (HD),0,32.6,35.6 47 | 102,52,Male,180.34,92.07,28.3096182141581,Arm B (LD),0,31.8,31.1 48 | 103,28,Male,213,93.34,20.573519363442,Arm A (P),0,43.6,31.4 49 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Clinical trial design and analysis 2 | 3 | ## Summary 4 | 5 | These are the course materials from our modules (EH6124 and EH6126) on clinical trial design and analysis. 
For the official module descriptions, please see:

[EH 6124 - Introduction to Clinical Trial Design and Analysis](https://www.ucc.ie/admin/registrar/modules?mod=EH6124)

[EH 6126 - Advanced Clinical Trial Design and Analysis](https://www.ucc.ie/admin/registrar/modules?mod=EH6126)

These modules are offered as part of our Postgraduate Programme in Clinical Trials (Certificate, Diploma, and MSc), and delivered by Darren Dahly ([@statsepi](https://twitter.com/statsepi)) and Brendan Palmer ([@B_A_Palmer](https://twitter.com/B_A_Palmer)) of the [HRB Clinical Research Facility Cork](https://crfc.ucc.ie/) - [Statistics and Data Analysis Unit](https://crfcsdau.github.io/).

## Purpose

We are posting these materials online for two reasons. The first is that we think properly designed and analysed randomized controlled trials are critical for evaluating the impact of clinical interventions, and so we hope these open materials are useful for anyone wanting to run effective clinical trials. The second is that we acknowledge that we don't (and can't) know everything there is to know about clinical trials, and so we hope that the wider community of experienced medical statisticians and trialists might contribute their suggestions on how we might improve our modules.

## Course structure

Each of the two modules consists of eight units. The introductory module focuses on relatively simple two-arm parallel superiority trials with fixed designs, and frequentist statistical inference. The second, advanced module then introduces deviations from this, such as non-inferiority trials, cluster and cross-over designs, adaptive trials, and Bayesian statistical inference.

Generally the units include some required reading (papers, book chapters, blog posts), usually some additional reading, and one or more video lectures covering the key points for that unit. To help learners understand key concepts, we also use R/RStudio to run simulations and analyse trial data, and provide data analysis exercises for the learners to do on their own. This material makes up the bulk of this course page.

Shield: [![CC BY-SA 4.0][cc-by-sa-shield]][cc-by-sa]

This work is licensed under a
[Creative Commons Attribution-ShareAlike 4.0 International License][cc-by-sa].
27 | 28 | [![CC BY-SA 4.0][cc-by-sa-image]][cc-by-sa] 29 | 30 | [cc-by-sa]: http://creativecommons.org/licenses/by-sa/4.0/ 31 | [cc-by-sa-image]: https://licensebuttons.net/l/by-sa/4.0/88x31.png 32 | [cc-by-sa-shield]: https://img.shields.io/badge/License-CC%20BY--SA%204.0-lightgrey.svg 33 | 34 | -------------------------------------------------------------------------------- /Unit_1_Review/Change_scores/Change_scores_files/figure-html/unnamed-chunk-11-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Unit_1_Review/Change_scores/Change_scores_files/figure-html/unnamed-chunk-11-1.png -------------------------------------------------------------------------------- /Unit_1_Review/Change_scores/Change_scores_files/figure-html/unnamed-chunk-12-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Unit_1_Review/Change_scores/Change_scores_files/figure-html/unnamed-chunk-12-1.png -------------------------------------------------------------------------------- /Unit_1_Review/Change_scores/Change_scores_files/figure-html/unnamed-chunk-14-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Unit_1_Review/Change_scores/Change_scores_files/figure-html/unnamed-chunk-14-1.png -------------------------------------------------------------------------------- /Unit_1_Review/Change_scores/Change_scores_files/figure-html/unnamed-chunk-17-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Unit_1_Review/Change_scores/Change_scores_files/figure-html/unnamed-chunk-17-1.png -------------------------------------------------------------------------------- /Unit_1_Review/Change_scores/Change_scores_files/figure-html/unnamed-chunk-19-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Unit_1_Review/Change_scores/Change_scores_files/figure-html/unnamed-chunk-19-1.png -------------------------------------------------------------------------------- /Unit_1_Review/Change_scores/Change_scores_files/figure-html/unnamed-chunk-22-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Unit_1_Review/Change_scores/Change_scores_files/figure-html/unnamed-chunk-22-1.png -------------------------------------------------------------------------------- /Unit_1_Review/Change_scores/Change_scores_files/figure-html/unnamed-chunk-23-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Unit_1_Review/Change_scores/Change_scores_files/figure-html/unnamed-chunk-23-1.png -------------------------------------------------------------------------------- 
/Unit_1_Review/Change_scores/Change_scores_files/figure-html/unnamed-chunk-24-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Unit_1_Review/Change_scores/Change_scores_files/figure-html/unnamed-chunk-24-1.png -------------------------------------------------------------------------------- /Unit_1_Review/Change_scores/Change_scores_files/figure-html/unnamed-chunk-26-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Unit_1_Review/Change_scores/Change_scores_files/figure-html/unnamed-chunk-26-1.png -------------------------------------------------------------------------------- /Unit_1_Review/Change_scores/Change_scores_files/figure-html/unnamed-chunk-27-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Unit_1_Review/Change_scores/Change_scores_files/figure-html/unnamed-chunk-27-1.png -------------------------------------------------------------------------------- /Unit_1_Review/Change_scores/Change_scores_files/figure-html/unnamed-chunk-31-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Unit_1_Review/Change_scores/Change_scores_files/figure-html/unnamed-chunk-31-1.png -------------------------------------------------------------------------------- /Unit_1_Review/Change_scores/Change_scores_files/figure-html/unnamed-chunk-35-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Unit_1_Review/Change_scores/Change_scores_files/figure-html/unnamed-chunk-35-1.png -------------------------------------------------------------------------------- /Unit_1_Review/Change_scores/Change_scores_files/figure-html/unnamed-chunk-8-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Unit_1_Review/Change_scores/Change_scores_files/figure-html/unnamed-chunk-8-1.png -------------------------------------------------------------------------------- /Unit_1_Review/Frequentist_inference.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Frequentist Inference" 3 | output: 4 | html_document: 5 | keep_md: true 6 | --- 7 | 8 | ```{r setup, include = FALSE} 9 | 10 | knitr::opts_chunk$set(message = FALSE, warning = FALSE) 11 | 12 | # Install/load packages 13 | 14 | packs <- c("tidyverse", "knitr", "viridis", "broom", "pwr") 15 | install.packages(packs[!packs %in% installed.packages()]) 16 | lapply(packs, library, character.only = TRUE) 17 | 18 | ``` 19 | 20 | ## Part 1. Estimands, estimators, and estimates 21 | 22 | The primary aim of a clinical trial is to arrive at some number that tells us something useful about the "true" causal effect of the treatment we are testing. 
This number, which is calculated from the study data, is called an **estimate** of some corresponding "truth" called an **estimand**. The actual calculation or algorithm we use to provide this estimate of the estimand is called the **estimator**.

Much of clinical science is about deciding what the important estimands are, asking, "What do we want to know?" On the other hand, much of statistics is about understanding and evaluating the properties of estimators, asking, "What is the optimal way to learn it?"

### A simple example

Let's say we want to estimate the mean systolic blood pressure in the population of Irish women, to help plan health services. One way to do this would be to take a random sample from this population, measure their blood pressures, and calculate the mean of the observed values. In this example, the estimand, estimator, and estimate are as follows:

- Estimand - The mean systolic blood pressure in the **population** of Irish women.
- Estimator - The mean systolic blood pressure in our **sample** of this population.
- Estimate - The actual value that arises from our estimator.

Estimators have different properties that we can use to evaluate how useful they are. Importantly, our understanding of an estimator's properties will usually rely on assumptions, for example, that the sample was truly a random draw from the population.

To better understand all of this, let's simulate a large number of SBP measurements from a normal distribution with a known mean and standard deviation. This will be our population. The idea of simulating data might sound a little advanced to the unwary, but it's actually really easy to do. We will use simulations a lot in this module, since we can generate data with "known" properties (like the mean and SD of SBP in a population), against which we can compare the results of various statistical methods. Hopefully the examples that follow will make this clear.

```{r simulate_population}

# First we will simulate our population

# Set the seed for the random number generator to preserve reproducibility
set.seed(1209)

# Simulate a large population of values (10 million) from a normal distribution
# with a mean of 124.5 and a SD of 18.
pop_sbp <- rnorm(n = 1e7, mean = 124.5, sd = 18.0)

hist(pop_sbp) # Plot the distribution of those values

summary(pop_sbp) # Summarize the distribution

```

Next, let's draw a random sample from this population, and calculate the mean. The full procedure of taking a random sample of the population and calculating the mean of that sample is our estimator **for** the population mean. The actual number that results is our estimate **of** the population mean.

```{r sample_1}

sample_sbp <- sample(pop_sbp, 50, replace = FALSE) # Draw a sample of n = 50

mean(sample_sbp) # Calculate the mean of the sample

```

We can immediately see that the sample mean is close to, but not exactly, the population mean (which we know was 124.5, from our simulation). Importantly, if we were to repeat the process using the **same** estimator, we would expect a **different** estimate.

```{r sample_2}

# Draw another sample of n = 50 and calculate the mean
mean(sample(pop_sbp, 50, replace = FALSE))
# Notice how I've nested the call to the function `sample` within the call to
# the function `mean`

```

So what happens if we repeat this process many times? We get a **sampling distribution** of sample means. This concept of a sampling distribution is critical to understanding the **frequentist** school of statistical inference that is most often used to interpret the results of clinical trials, as we will see below.

The **sampling distribution** for a **statistic** (anything we calculate from the observed data) is the probability distribution we would observe for that statistic if we were able to repeatedly sample from the underlying population an infinite number of times.

```{r many_sample_means}

# This is a custom function that draws a sample from a population and then
# calculates the mean of those values. It takes 2 arguments: pop, which is the
# numeric vector of population values we simulated; and n, which is the size of
# the sample we want to draw.

rep_means <- function(pop, n, ...){
  round(mean(sample(pop, n, replace = FALSE)), 2)
}

# Use this new function 100 times and keep the results from each replicate
many_sample_means <- replicate(100, rep_means(pop_sbp, 50))

hist(many_sample_means) # Plot the resulting distribution

summary(many_sample_means) # Summarize it

```

### Try it yourself

What do you think would happen to the distribution of the sample mean if we increased the number of observations that we sample from the population? Copy and modify the code above to see what would happen for samples of n = 100, 200, and 500.
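If you want to check your answer afterwards, here is one possible approach (a sketch only; the chunk name and object names below are ours, and the "theoretical" row assumes the simple random sampling used above). It reuses the `rep_means()` function from the previous chunk, and compares the spread of each sampling distribution against the value sd(pop_sbp)/sqrt(n) that sampling theory predicts for the standard error of a mean.

```{r sampling_dist_by_n_sketch}

# For each sample size, build a sampling distribution of 100 sample means
# and calculate its standard deviation
ns <- c(50, 100, 200, 500)

sd_by_n <- sapply(ns, function(n){
  sd(replicate(100, rep_means(pop_sbp, n)))
})

# The spread of the sample means shrinks as n grows, roughly tracking
# sd(pop_sbp) / sqrt(n)
round(rbind(observed = sd_by_n,
            theoretical = sd(pop_sbp) / sqrt(ns)), 2)

```

The point to notice is that larger samples give a tighter sampling distribution, so the sample mean becomes a more precise estimator of the population mean as n increases.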
## Part 2. Pesky p-values

Now we are going to simulate a clinical trial with a known effect of the intervention, and evaluate our estimate of that effect using the ubiquitous p-value.

```{r sim_t_test_data}

set.seed(0236)

# Simulate some study data into a dataframe. Tx effect = 0.5
study_df <- tibble(
  y = c(rnorm(20, 0, 1),        # Control arm outcomes
        rnorm(20, 0, 1) + 0.5), # Active arm outcomes
  arm = rep(c("Control", "Active"), each = 20) # Arm labels
)

study_df$arm <- relevel(factor(study_df$arm), ref = "Control")

# Plot the distribution of outcomes in each arm
ggplot(study_df, aes(x = y, fill = arm)) +
  geom_histogram() +
  facet_wrap(~arm, ncol = 1) +
  scale_fill_viridis(guide = FALSE, discrete = TRUE, end = 0.85)

# Note: When we plotted the distribution before, we used so-called "base" R
# plotting. Going forward, we will more often use a package called ggplot2
# (which is included in the tidyverse family of packages).

```

Now we will pretend that we don't know what the true effect of the treatment is (0.5). The difference in means between the two arms in this sample is `r round(mean(study_df$y[study_df$arm == "Active"]) - mean(study_df$y[study_df$arm == "Control"]), 2)`. This of course isn't exactly 0.5, since the tx effect is added to a variable outcome that is measured in a random sample. Thus some sampling variability should be expected.

The actual test statistic we will use is called Student's t. It is a ratio of this difference in means to the standard error (which is what we, somewhat confusingly, call the standard deviation of a sampling distribution). You can think of it as a "signal to noise ratio".

Let's do it "by hand" so we can see how the p-value is calculated.

The first step is to calculate the value for t from our sample data. This is our estimate.

```{r calculate_t}

# https://en.wikipedia.org/wiki/Welch's_t-test

# First, calculate t "by hand", just to see how everything fits together
m_act <- mean(study_df$y[study_df$arm == "Active"])  # mean outcome active arm
m_con <- mean(study_df$y[study_df$arm == "Control"]) # mean outcome control arm
v_act <- var(study_df$y[study_df$arm == "Active"])   # outcome variance active
v_con <- var(study_df$y[study_df$arm == "Control"])  # outcome variance control
n <- 20 # Sample size in each group
se <- sqrt(v_act/n + v_con/n) # Standard error of the difference in means
df_s <- (v_act/n + v_con/n)^2 / # Need this for the sampling dist. below
  ((v_act^2 / (n^2 * (n - 1))) +
   (v_con^2 / (n^2 * (n - 1))))

t_sample <- round((m_act - m_con) / se, 2)

t_sample

```

Then we want to plot the sampling distribution of t under the null hypothesis that there is no difference in the means (i.e. t = 0). The key parameter for this distribution is the degrees of freedom we calculated above (df_s).

```{r t_null_dist}

# Get the expected sampling distribution under a null hypothesis of no difference

g1 <- ggplot(tibble(x = c(-4, 4)), aes(x = x)) +
  stat_function(fun = dt, args = list(df = df_s),
                geom = "area", fill = viridis(1), alpha = 0.3) +
  xlim(c(-4, 4)) +
  xlab("t") +
  ylab("Density") +
  theme_minimal()

g1

```

Then we plot the position of t for our sample, and calculate the area of the t distribution (under the null) for values of t as big or bigger than the value we actually observed.

```{r t_one_sided}

g2 <- g1 +
  geom_vline(xintercept = t_sample) +
  stat_function(xlim = c(t_sample, 4),
                fun = dt, args = list(df = df_s),
                geom = "area", fill = viridis(1)) +
  scale_x_continuous(breaks = c(-3, -1.5, 0, 1.5, t_sample, 3)) +
  theme(panel.grid.minor = element_blank()) +
  ggtitle(paste0("The proportion of the total area in the darker part of the distribution\n for t (given the null is true) is ", signif(pt(t_sample, df_s, lower.tail = FALSE), 2)))

g2

```

This means that **IF** there was no treatment effect, **AND** the assumptions underlying our calculation of the t distribution under the null hold, **AND** we were able to repeat our experiment many, many times, **THEN** we would only expect to see a value as large or larger than the one we calculated in *our* sample, 1.2% of the time. This is our p-value. Please note that there is no way to correctly explain what this p-value is in a single sentence.

So what can we do with this p-value? That's where things start to get tricky. At the risk of over-simplifying things, Fisher saw the p-value as a continuous measure of how surprised we should be if the data were truly generated under a mechanism of "no effect". So then a small p-value indicates there might be an effect worth further exploring or looking for in continued experiments. Then, if you were able to repeat an experiment several times and consistently produce a small p-value, you might finally conclude that your intervention "works".
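To make that idea concrete, here is a small sketch of what "repeating the experiment" might look like, using the same data-generating setup as our simulated trial above (true effect = 0.5, n = 20 per arm). The function name and the seed here are ours, purely for illustration.

```{r repeat_experiments_sketch}

# Simulate one trial like the one above and return its Welch t-test p-value
one_trial_p <- function(n = 20, effect = 0.5){
  y_con <- rnorm(n, 0, 1)          # Control arm outcomes
  y_act <- rnorm(n, 0, 1) + effect # Active arm outcomes
  t.test(y_act, y_con)$p.value
}

# Five "replications" of the experiment: the p-values bounce around from
# trial to trial, but with a real effect they tend to be small
set.seed(0807)
round(replicate(5, one_trial_p()), 3)

```

The p-value is itself a random quantity, so no single small p-value is conclusive; it is the pattern across repeated experiments that Fisher found convincing.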
Let's do it "by hand" so we can see how the p-value is calculated.

The first step is to calculate the value for t from our sample data. This is our estimate.

```{r calculate_t}

# https://en.wikipedia.org/wiki/Welch's_t-test

# First, calculate t "by hand"
m_act <- mean(study_df$y[study_df$arm == "Active"])  # mean outcome active arm
m_con <- mean(study_df$y[study_df$arm == "Control"]) # mean outcome control arm
v_act <- var(study_df$y[study_df$arm == "Active"])   # outcome variance active
v_con <- var(study_df$y[study_df$arm == "Control"])  # outcome variance control
n <- 20 # Sample size in each group
se <- sqrt(v_act/n + v_con/n) # Standard error of the difference in means
df_s <- (v_act/n + v_con/n)^2 / # Need this for the sampling dist. below
  ((v_act^2 / (n^2 * (n - 1))) +
     (v_con^2 / (n^2 * (n - 1))))

t_sample <- round((m_act - m_con) / se, 2)

t_sample

```

Then we want to plot the sampling distribution of t under the null hypothesis that there is no difference in the means (i.e. t = 0). The key parameter for this distribution is the degrees of freedom we calculated above (df_s).

```{r t_null_dist}

# Get the expected sampling distribution under a null hypothesis of no difference

g1 <- ggplot(data_frame(x = c(-4 , 4)), aes(x = x)) +
  stat_function(fun = dt, args = list(df = df_s),
                geom = "area", fill = viridis(1), alpha = 0.3) +
  xlim(c(-4 , 4)) +
  xlab("t") +
  ylab("Density") +
  theme_minimal()

g1

```

Then we plot the position of t for our sample, and calculate the area of the t distribution (under the null) for values of t as big or bigger than the value we actually observed.

```{r t_one_sided}

g2 <- g1 +
  geom_vline(xintercept = t_sample) +
  stat_function(xlim = c(t_sample , 4),
                fun = dt, args = list(df = df_s),
                geom = "area", fill = viridis(1)) +
  scale_x_continuous(breaks = c(-3, -1.5, 0, 1.5, t_sample, 3)) +
  theme(panel.grid.minor = element_blank()) +
  ggtitle(paste0("The proportion of the total area in the darker part of the distribution\n for t (given the null is true) is ", signif(pt(t_sample, df_s, lower.tail = FALSE), 2)))

g2

```

This means that **IF** there was no treatment effect, **AND** the assumptions underlying our calculation of the t distribution under the null hold, **AND** we were able to repeat our experiment many, many times, **THEN** we would only expect to see a value as large or larger than the one we calculated in *our* sample, 1.2% of the time. This is our p-value. Please note that there is no way to correctly explain what this p-value is in a single sentence.

So what can we do with this p-value? That's where things start to get tricky. At the risk of over-simplifying things, Fisher saw the p-value as a continuous measure of how surprised we should be if the data were truly generated under a mechanism of "no effect". So then a small p-value indicates there might be an effect worth further exploring or looking for in continued experiments.
Then, if you were able to repeat an experiment several times and consistently produce a small p-value, you might finally conclude that your intervention "works".

Others would use the p-value as a hypothesis test. In the example above, it would be the null hypothesis of t ≤ 0 vs the alternative of t > 0. Then you would set some threshold for p (e.g. 5%) at which you would "reject the null". We'll come back to this way of thinking when we discuss power and type 1 and 2 errors below.

In the example above, we only considered our estimate of t with respect to one tail of the sampling distribution of t under the null. This is what we refer to as a one-sided test. However, it usually makes more sense to make it relative to both tails, so that the null is t = 0 vs. t < 0 or t > 0.

```{r t_two_sidedd}

g3 <- g2 +
  stat_function(xlim = c(t_sample , 4),
                fun = dt, args = list(df = df_s),
                geom = "area", fill = viridis(1)) +
  stat_function(xlim = c(-4, -t_sample),
                fun = dt, args = list(df = df_s),
                geom = "area", fill = viridis(1)) +
  ggtitle(paste0("The proportion of the total area in the darker part of the distribution\n for t is ", signif(2 * pt(t_sample, df_s, lower.tail = FALSE), 3)))

g3

```

Now this was a fair bit of work just to do a simple t-test. So now let's let R do this for us (and confirm our result).

```{r t_test}

t_1 <- t.test(y ~ arm, data = study_df)

t_1

```

### Side-note: Linear models

Next week we will learn that most commonly used statistical tests can be represented by linear models. The advantage of this is two-fold. First, linear models can incorporate covariate information, which often leads to more efficient estimators by integrating out variability in the outcome predicted by the covariates. It's also useful to think in terms of linear models because it's much easier than trying to remember a multitude of statistical tests.

For example, the t-test is used when we are interested in a difference in means and when we have a fairly small sample size (the specific problem the t-test addresses is that when we have a small sample and we need to use the sample standard deviation to work out the distribution of the sample mean, then we should have more uncertainty in that distribution, i.e. it should be wider - it should have "fatter tails").

```{r linear_model}

lm_1 <- lm(y ~ arm, data = study_df)

pvalue <- filter(tidy(lm_1), term == "armActive")$p.value

summary(lm_1)

```

So here we get a p-value of `r signif(pvalue, 3)` vs. `r signif(t_1$p.value, 2)` from the t-test. Of course the difference in means, the effect estimate, is exactly the same (`r round(t_1$estimate[2] - t_1$estimate[1], 2)`). It's the sampling distributions of the two estimators of that effect (t vs the regression coefficient) that are different.

## Part 3. Error control and power

In the example above, we focused mostly on Fisher's interpretation of the p-value given by a particular set of observations as a continuous measure of evidence for/against a null hypothesis/model (though he still viewed being able to repeatedly procure a small p-value as an important step before declaring the existence of an experimental effect). However, this idea was met with resistance from the very beginning.
Much of this resistance came from the Neyman-Pearson (NP) school of thought, whose proponents instead thought that p-values should be used to conduct the binary hypothesis tests we noted above.

When we choose to use such tests, we are using a relaxed version of the following logic (called [modus tollens](https://en.wikipedia.org/wiki/Modus_tollens)):

If A, then B. Not B; therefore, not A.

These hypothesis tests are used to make a similar argument: **If** the null hypothesis/model is true, **then** the data should be more likely to look one way. If however the data **don't** look that way, **then** we should consider rejecting the null.

This means we can incorrectly reject the null when it's true; or incorrectly fail to reject the null when it isn't. From this point of view, our goal when using statistics is to control the probability of making these errors.

Let's look at some simulations.

First we will generate some data under a null, normal model, with a random allocation to study arms.

```{r null_data}

n <- 100

data <- data_frame(
  sbp = rnorm(n, 124.5, 18.0),
  arm = sample(c("Active", "Control"), size = n, replace = TRUE)
)

# Plot the distribution of outcomes in each arm
ggplot(data, aes(x = sbp, fill = arm)) +
  geom_histogram() +
  facet_wrap(~arm, ncol = 1) +
  scale_fill_viridis(guide = FALSE, discrete = TRUE, end = 0.85)

# Regress SBP on the treatment arm
summary(lm(sbp ~ arm, data = data))

```

Now let's look at what happens if we repeat this process many times.

```{r null_data_repeat}

make_pvalues <- function(n, ...){

  data <- data_frame(
    sbp = rnorm(n, 124.5, 18.0),
    arm = sample(c("Active", "Control"), size = n, replace = TRUE)
  )

  data$arm <- relevel(factor(data$arm), ref = "Control")

  result <- tidy(lm(sbp ~ arm, data = data)) %>%
    filter(term == "armActive")

  return(result$p.value)

}

k <- 2000
results <- as_data_frame(replicate(k, make_pvalues(100))) %>%
  rename(p_value = value)

ggplot(results, aes(x = p_value)) +
  geom_histogram() +
  geom_vline(xintercept = 0.05, color = "red", linetype = "dashed")

```

As some of you might have expected, about 5% of the p-values fall below the nominal 5% level (`r round(table(cut(results$p_value, c(0, 0.05, 1)))[1] / k, 3) *100`% in this set of simulations).

Just for fun, what will happen to the distribution of p-values if I decrease the sample size of the individual replicates?

```{r p_distribution_null_1}

k <- 2000
results <- as_data_frame(replicate(k, make_pvalues(20))) %>% # n = 20
  rename(p_value = value)

ggplot(results, aes(x = p_value)) +
  geom_histogram() +
  geom_vline(xintercept = 0.05, color = "red", linetype = "dashed")

```

Again, about 5% of the p-values fall below the nominal 5% level (`r round(table(cut(results$p_value, c(0, 0.05, 1)))[1] / k, 3) *100`% here - the difference from exactly 5% is just simulation error, and would shrink towards zero if we ran more replicates).

And if I increase the sample size?

```{r p_distribution_null_2}

k <- 2000
results <- as_data_frame(replicate(k, make_pvalues(1000))) %>% # n = 1000
  rename(p_value = value)

ggplot(results, aes(x = p_value)) +
  geom_histogram() +
  geom_vline(xintercept = 0.05, color = "red", linetype = "dashed")

```

The same result (`r round(table(cut(results$p_value, c(0, 0.05, 1)))[1] / k, 3) * 100`%). To put a finer point on this, **when the data are indeed generated by a null model**, the distribution of p-values is uniform; and because p is based on a sampling distribution, which already accounts for the sample size (via the standard error), 5% of p-values will fall below the 5% threshold, regardless of sample size.

The other important addition from the NP view is the introduction of some alternative hypothesis, so that when we reject the null (based on a small p-value) we are implicitly accepting some alternative. Thus we can restate the two types of possible error we noted above:

- We can reject the null when it's true, thus declaring there is an effect (the alternative) when there isn't: a false positive.

- We can accept the null when it's false, thus declaring there is no effect when there actually is one (the alternative): a false negative.

The first of these is what we just looked at above, when we simulated data under the null and saw how many times we would reject it when it was in fact true - and that we could "control" this error rate using the p-value. If I'm happy accepting making this kind of mistake 5% of the time, then my p-value threshold for "significance" should be 0.05. If I want to make that more or less stringent, I can. However, this only really makes sense if I am also happy thinking of what I'm doing as one experiment in a long succession of such experiments, like a machine making widgets.

So now let's look at the second kind of error, and generate some data with a known effect.

```{r alt_data_repeat}

make_pvalues <- function(n, effect, ...){

  data <- data_frame(
    sbp = rnorm(n, 124.5, 18.0),
    arm = sample(c("Active", "Control"), size = n, replace = TRUE)
  )

  data$arm <- relevel(factor(data$arm), ref = "Control")

  data$sbp[data$arm == "Active"] <- data$sbp[data$arm == "Active"] + effect

  result <- tidy(lm(sbp ~ arm, data = data)) %>%
    filter(term == "armActive")

  return(result$p.value)

}

k <- 2000
results <- as_data_frame(replicate(k, make_pvalues(100, -2))) %>%
  rename(p_value = value)

ggplot(results, aes(x = p_value)) +
  geom_histogram() +
  geom_vline(xintercept = 0.05, color = "red", linetype = "dashed")

```

Now if I use p = 0.05 to reject the null, which is now the correct decision (since I know I simulated data with "an effect"), I will do so `r round(table(cut(results$p_value, c(0, 0.05, 1)))[1] / k * 100, 3)`% of the time. That is my **power**. Alternatively, I will accept the null (wrongly) `r 100 - round(table(cut(results$p_value, c(0, 0.05, 1)))[1] / k * 100, 3)`% of the time. That is my type 2 error rate.

Most of the time people recommend a power of at least 80% (more on this later), so clearly this power is very low. That's because our effect (= -2) is small relative to the variability in the outcome.

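We can sanity-check that simulated power analytically with base R's `power.t.test()`. This is just a quick sketch, assuming a simple two-sample t-test with roughly 50 patients per arm (which is what a total n of 100, split at random, gives us on average).

```{r power_check_small_n}

# Analytic power for a two-sample t-test with effect = 2, SD = 18, and ~50 per
# arm. The reported power should be low, and close to the proportion of
# p-values falling under 0.05 in the simulation above.
power.t.test(n = 50, delta = 2, sd = 18, sig.level = 0.05)

```
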
So we need a bigger effect or a bigger sample to overcome that variability and increase the power. So let's greatly increase the sample size.

```{r alt_data_2}

k <- 2000
results <- as_data_frame(replicate(k, make_pvalues(2000, -2))) %>%
  rename(p_value = value)

ggplot(results, aes(x = p_value)) +
  geom_histogram() +
  geom_vline(xintercept = 0.05, color = "red", linetype = "dashed")

```

Whoa! What just happened? Now if I use p = 0.05 to (correctly) reject the null, my power is `r round(table(cut(results$p_value, c(0, 0.05, 1)))[1] / k * 100, 3)`%, and my type 2 error rate is `r 100 - round(table(cut(results$p_value, c(0, 0.05, 1)))[1] / k * 100, 3)`%.

Here is an important point - there is no such thing as **A** study's power. Power is specific to both a study's design and **AN** effect size. Thus the same study will produce a different power for different effect sizes. In these simulated examples, we **know** what the alternative hypothesis is (effect = -2), while for a real study, there are many plausible alternative hypotheses (i.e. many plausible potential effect sizes) and we can't know which is true, or else we wouldn't need to run the study.

However, one of these hypothetical effect sizes stands out among all the rest. It's the one Senn refers to as the smallest effect that we wouldn't want to miss. In other words, many of these plausible effect sizes are so small that we don't really care if they exist or not (when we are talking about clinical research where the goal is to intervene on people), but there are other, larger effect sizes we certainly would like to be able to detect with a high power - and if we design the study to have high power for the smallest of these, then we will have high power to detect all of them. So let's plan a study to detect an effect = -2, given what we know about the distribution of the outcome we used in the simulation.

```{r power_calc}

effect_size <- -2
sd <- 18
d <- effect_size / sd

# pwr.t.test() is from the pwr package; note that it reports the required n
# per group
pwr.t.test(d = d, sig.level = .05, power = 0.90)
# I am being a bit lazy here, using the power based on a t-test when I know
# that I am going to analyze the data with a linear model - but sample sizes are
# an educated guess at best - so close enough for now!

```

Ha! So to detect an effect this small (relative to the SD of the SBP distribution), I need about 3400 people in my study (roughly 1700 per group) to achieve a power of 90% (i.e. to produce a study that will detect this effect, based on a binary hypothesis test vs the null, 90% of the time). Now, let's confirm this with a simulation.

```{r power_sim}

k <- 2000
results <- as_data_frame(replicate(k, make_pvalues(3400, 2))) %>%
  rename(p_value = value)

ggplot(results, aes(x = p_value)) +
  geom_histogram() +
  geom_vline(xintercept = 0.05, color = "red", linetype = "dashed")

```

From the above I can see that `r round(table(cut(results$p_value, c(0, 0.05, 1)))[1] / k * 100, 3)`% of p-values from the simulations fall below 5% - that's my power, which is just what I expected (with a bit of simulation error).

I want to point out one more thing, which is the distribution of p-values that fall under 0.05 when the alternative is in fact true and we have a high-powered test, as we just simulated.

```{r alt_data_le_05}

filter(results, p_value <= 0.05) %>%
  ggplot(aes(x = p_value)) +
  geom_histogram() +
  geom_vline(xintercept = 0.05, color = "red", linetype = "dashed")

```

So how many of those p-values are close to 0.05? Very few! What does this mean? It means that when some alternative effect is true and the trial is very well powered to detect that effect, the distribution of p-values is heavily skewed towards very small values, values much smaller than 0.05. This is why many people are skeptical of papers and bodies of literature that report many "just significant" p-values, as it suggests publication bias and p-hacking.

## Part 4: Confidence intervals

In the previous sections, we discussed how to calculate and use p-values to make inferences, either by using the p-value directly, or by using it to perform binary hypothesis tests and focus on controlling errors. Another common way to use p-values is to create frequentist confidence intervals (CI).

To do this, first we take some level of alpha, our acceptable type 1 error rate (usually 0.05 (5%) due to thoughtless convention). Then we set the null to be the effect estimate we actually observed. Then we take the collection of values for which we would fail to reject that null at the set alpha level. This collection of values is our CI. Another way to think of it is that it is the middle 95% of values (for alpha = 0.05) from the estimator's sampling distribution, centered on our observed effect estimate. This can be tricky to think about, so let's work through it.

First, let's look at the two-sided p-value from a normal sampling distribution with an effect = 0 and a standard error (SE) = 1 (remember, an SE is just what we call the standard deviation of a sampling distribution).

```{r t_two_sided}

g1 <- ggplot(data_frame(x = c(-4 , 4)), aes(x = x)) +
  stat_function(fun = dnorm, args = list(0, 1),
                geom = "area", fill = viridis(1), alpha = 0.3) +
  xlim(c(-4 , 4)) +
  xlab("z") +
  ylab("Density") +
  theme_minimal() +
  stat_function(xlim = c(1.96 , 4),
                fun = dnorm, args = list(0, 1),
                geom = "area", fill = viridis(1)) +
  stat_function(xlim = c(-4, -1.96),
                fun = dnorm, args = list(0, 1),
                geom = "area", fill = viridis(1)) +
  geom_errorbarh(aes(x = 0, xmax = 1.96, xmin = -1.96, y = 0.1),
                 height = 0.05, size = 2, color = viridis(1)) +
  geom_point(aes(x = 0, y = 0.1), size = 4, color = viridis(1)) +
  ggtitle(paste0("The proportion of the total area in the darker part of the distribution\n for z is ", signif(2 * pnorm(1.96, 0, 1, lower.tail = FALSE), 3)))

g1

```

We can thus see the middle 95% of values, which would be our 95% CI. Values falling outside of this range are in the rejection region of the distribution.

Now, for sampling distributions where we are happy to assume a normal model (this is often true), we can just slide this same range over so that it's centered on our observed effect estimate. So let's see this for an effect = 2.

```{r shift_to_effect}

g2 <- g1 +
  stat_function(fun = dnorm, args = list(2, 1),
                geom = "area", fill = viridis(1, direction = -1), alpha = 0.3) +
  xlim(c(-4 , 6)) +
  xlab("z") +
  ylab("Density") +
  theme_minimal() +
  stat_function(xlim = c(2 + 1.96 , 6),
                fun = dnorm, args = list(2, 1),
                geom = "area", fill = viridis(1, direction = -1)) +
  stat_function(xlim = c(-4, -1.96 + 2),
                fun = dnorm, args = list(2, 1),
                geom = "area", fill = viridis(1, direction = -1)) +
  geom_errorbarh(aes(x = 2, xmax = 2 + 1.96, xmin = 2 + -1.96, y = 0.15),
                 height = 0.05, size = 2,
                 color = viridis(1, direction = -1)) +
  geom_point(aes(x = 2, y = 0.15), size = 4,
             color = viridis(1, direction = -1)) +
  ggtitle("")

g2

```

So now we have the same interval centered on our observed effect = 2 (yellow).

Because this effect of 2 is > 1.96 (1.96 being the critical value we used to calculate the p-value if we want alpha = 0.05; see the purple sampling distribution), we would thus "reject the null hypothesis of 0 effect".

We can also see that zero falls (just barely) outside of the yellow 95% CI. This means that if we set the null to 2, rather than 0, then we would reject this new null if we actually observed a 0 effect, just like we reject a null of 0 when we observe a 2. One test is just the mirror image of the other. We will use the same idea when we get to equivalence tests below.

So how should we interpret a CI centered on our observed effect? Many people think that an X% CI says that there is an X% chance that the "true" effect falls in the interval. This is incorrect, and reflects a Bayesian interpretation that is unwarranted here.

Instead, an X% CI will contain the "true" effect X% of the time over many replications of the procedure that led to the CI. In other words, the CI is still a completely frequentist tool, and one way to think of it is as a collection of statistical tests. You can also think of it as a "compatibility interval" - a collection of values that our observed data would be compatible with (at some pre-specified alpha) had the data indeed been generated by (any one of) those values. We will come back to Bayesian intervals, where we *can* conclude there is some % chance that the "truth" falls in our interval (given other assumptions), when we get to Bayesian statistics later in the module.

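Before we move on, here is a minimal simulation sketch of that coverage interpretation, reusing the population parameters from Part 1 (mean = 124.5, SD = 18): draw many samples, build a 95% CI for the mean from each one, and count how often the intervals capture the true mean.

```{r ci_coverage_sketch}

set.seed(1209)

# For each replicate: draw a sample of n = 50, build a t-based 95% CI for the
# mean, and check whether it contains the true population mean
covered <- replicate(2000, {
  x <- rnorm(50, 124.5, 18)
  ci <- mean(x) + c(-1, 1) * qt(0.975, df = 49) * sd(x) / sqrt(50)
  ci[1] < 124.5 & 124.5 < ci[2]
})

mean(covered) # Proportion of intervals covering the truth; should be ~0.95

```
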
--------------------------------------------------------------------------------
/Unit_1_Review/Review_main_points_6124.md:
--------------------------------------------------------------------------------

# Review of key points from EH 6124

These are the major points from [**EH6124: Introduction to Clinical Trial Design and Analysis**](https://www.ucc.ie/admin/registrar/modules/?mod=EH6124). Please ask any questions on the discussion board if you need to clarify anything, or even if you just want to argue with me.

## The role of the statistician

Successful clinical trials require teams with a variety of expertise. Though many scientists and clinical investigators receive some training in statistics and study design, this training is usually rudimentary, and often clumsy. The trial statistician is thus needed to provide expert guidance in these areas, and so should be heavily involved in the design, analysis, and reporting phases of the project. However, to do their job properly, the statistician must rely on the subject matter expertise of the clinical investigators. Thus the scientific goals of clinical trials are best met when there are close collaborations between clinical investigators and statisticians.

**Selected reading:**

[The Statistician's Role in Developing a Protocol for a Clinical Trial](https://amstat.tandfonline.com/doi/abs/10.1080/00031305.1979.10482674)

[Bridging Clinical Investigators and Statisticians: Writing the Statistical Methodology for a Research Proposal](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4415704/)

## Causal inference

Most clinical studies estimate and report an association between an outcome (e.g. mortality) and an exposure (such as being randomly allocated to one study arm or another). Usually we want to make some inferences about that estimated association, to help us decide whether the exposure caused the outcome, i.e. would the observed distribution of the outcome have been different had there been no exposure? Our confidence in any causal inferences largely rests on an assumption of “no confounding”, which means there aren’t any other extraneous factors that cause both the exposure and the outcome. This vaguely-stated condition can be explicitly described using [Directed Acyclic Graphs](http://dagitty.net/), which we can use to show that if an exposure is indeed randomly allocated, then there can be no confounding. Thus randomization allows us to make causal inferences with the fewest/most-palatable assumptions, compared to other methods (e.g. adjusting/controlling for suspected confounders).

**Selected reading:**

[A structural approach to selection bias](https://www.ncbi.nlm.nih.gov/pubmed/15308962)

## Regression to the mean

Patients often enter a clinical trial when they are ill, and most illnesses have some natural variability in their progression. This means that if I recruit a sample of sick people and do nothing, some will get better anyway, and thus the health of the group will have improved, on average. This is called regression to the mean. Unfortunately, many researchers set up similar scenarios, except they *do* intervene in some way, and then mistakenly attribute the improvement in their sample’s average health to an effect of the intervention. This is why we include concurrent control groups in RCTs. Then we can get an estimate of the magnitude of regression to the mean (from the control group), which we can then subtract from our estimated treatment effect in the active arm by literally looking at the difference in outcomes between the study arms. This doesn’t mean we can’t ever make inferences from within-group comparisons (pre vs post) or using “historical” controls (control groups external to our trial), but we should do so much more cautiously, and not expect many other people to trust our results even if we do.

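To see regression to the mean in action, here is a small R simulation sketch (all of the numbers are illustrative assumptions, not from any real trial): we screen a population with a noisy severity measure, enroll only the people who look sickest, do nothing at all, and still observe an average “improvement” at follow-up.

```r
set.seed(2021)

true_severity <- rnorm(10000, 50, 10)           # Stable underlying severity
baseline <- true_severity + rnorm(10000, 0, 10) # Noisy measure at screening
followup <- true_severity + rnorm(10000, 0, 10) # Noisy measure later; no treatment given

enrolled <- baseline > 65 # Enroll only those who looked sickest at screening

# The average "improvement", despite no intervention whatsoever
mean(baseline[enrolled]) - mean(followup[enrolled])
```
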
**Selected reading:**

[Sensitivity and Specificity of Clinical Trials - Randomized v Historical Controls](https://jamanetwork.com/journals/jamainternalmedicine/fullarticle/603025)

[The use of historical controls and concurrent controls to assess the effects of sulphonamides, 1936-1945. JLL Bulletin: Commentaries on the history of treatment evaluation](https://www.jameslindlibrary.org/articles/the-use-of-historical-controls-and-concurrent-controls-to-assess-the-effects-of-sulphonamides-1936-1945/)

[Workplace Wellness Programs Don’t Work Well. Why Some Studies Show Otherwise](https://www.nytimes.com/2018/08/06/upshot/employer-wellness-programs-randomized-trials.html)

## Equipoise and ethics

When we run an RCT, we are exposing patients to unknown risks in the hopes of learning something important about treatment that might benefit future patients, as well as those enrolled in the trial. For this to be ethical, it means that at a minimum, all patients enrolled in the trial must get at least the same quality of care as they would have if they hadn’t enrolled in the trial. It also means that there should be genuine uncertainty about the potential benefits of the new intervention – that is, we should be in a state of equipoise. Equipoise can be hard to demonstrate, so it’s important for investigators to clearly make their case that it exists (though this often doesn’t happen). In my opinion, this is an area where patient voices should have more prominence. The ethical obligations of the trial also mean that we should stand to learn something important, which means that shoddy comparators (known to be substandard, thus stacking the deck in favour of the new treatment) and other preventable design flaws are arguably unethical.

**Selected reading:**

[Equipoise in Research - Integrating Ethics and Science in Human Research](https://jamanetwork.com/journals/jama/fullarticle/2600451)

[Is the concept of clinical equipoise still relevant to research?](https://www.bmj.com/content/359/bmj.j5787)

[Choice of control group in randomised trials of cancer medicine: are we testing trivialities?](https://www.thelancet.com/journals/lanonc/article/PIIS1470-2045(18)30501-1/fulltext)

## Patient selection

We must carefully consider which patients to enroll in our RCT. This will largely depend on the overall goals of the trial. In the initial phase 3 trials for a new treatment, we are often most interested in seeing if the intervention can work in a “best-case” scenario. This means that we select patients in a manner that maximizes the internal validity of the trial, with little or no consideration for external validity or generalizability. In turn, this means recruiting patients that are most likely to benefit from, and adhere to, the proposed treatment (based on our current understanding). It also means recruiting a more homogeneous sample, to reduce natural variability in the outcome. This will make it easier to see the effects of our intervention, if there is one, using a smaller sample than would be required to see the same effect in a broader, more heterogeneous sample.
This best-case scenario makes sense for newly tested treatments, since it’s less costly (less money, fewer patients exposed to unknown risks), and because if we fail to see an effect under these favourable conditions, it’s probably safe to conclude that we can move on from this intervention to pursue other possibilities.

That said, once we’ve demonstrated the efficacy of an intervention in trials with a high internal validity (i.e. we’ve demonstrated that the intervention *can* work), we will likely want to see if it *does* work (paraphrasing Senn) when applied in something that looks more like normal clinical practice. This is where effectiveness (or [pragmatic](https://www.precis-2.org/)) trials come in, where we want more broadly representative samples and scenarios. The implication is that there might be practical issues with compliance in a broader sample, or that there might be heterogeneity in treatment effects (HTE). This means that some groups of patients will benefit more or less from the intervention than other groups, and that these different groups might have been disproportionately represented in the earlier efficacy trials we described above. Unfortunately, the sample size required to demonstrate such interactions can be much larger than that needed to demonstrate the marginal (on average) effect of the treatment, so concerns about HTE frequently go untested. Regardless, demonstrating a beneficial average treatment effect in a more generalizable sample is still very comforting, especially to people who make health technology assessments (i.e. the people who decide which treatments to fund with public money).

Importantly, even if we have reason to expect no HTE, we want to be mindful of representation. This means working to ensure that all patients have access to participate in clinical trials, as patients enrolled on trials, even when the new treatment doesn’t out-perform standard care, often have better outcomes than those who aren’t enrolled. Poor representation, which is unfortunately typical in clinical trials (for example, of women or underrepresented minorities), can also degrade trust in clinical research in general (and understandably so).

Lastly, we control the composition of the patient sample with inclusion and exclusion criteria. A common mistake is to make a list where the exclusions are just the conversely-stated inclusion criteria (e.g. inclusion: age >= 65 years; exclusion: age < 65 years). It’s better to think of the inclusion criteria as controlling entry into the trial to get a more/less homogeneous sample, and to precisely define the disease/problem we are trying to impact. Exclusions then, which should usually be fewer, are typically used to exclude people who can’t consent, who aren’t expected to possibly benefit from the new treatment, or who might face a higher than acceptable risk (in either arm). Patients are often excluded based on flawed reasoning about homogeneity (presumably to increase the internal validity of the trial), which can only be considered with respect to factors that predict variability in the outcome, i.e. there is zero point in excluding patients based on some demographic or other factor if it has no relationship to the outcome.

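As a quick sketch of that point about homogeneity (the effect size and SDs below are made-up values for illustration), the required sample size scales with the square of the outcome SD, so removing outcome-relevant heterogeneity directly shrinks the trial:

```r
# Required n per arm to detect the same 5-unit effect with 90% power,
# under two different outcome SDs
power.t.test(delta = 5, sd = 15, power = 0.9)$n # More homogeneous sample
power.t.test(delta = 5, sd = 25, power = 0.9)$n # More heterogeneous sample
```
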
**Selected reading:**

[Enrollment of Racial Minorities in Clinical Trials: Old Problem Assumes New Urgency in the Age of Immunotherapy](https://pubmed.ncbi.nlm.nih.gov/31099618/)

[Eligibility Criteria of Randomized Controlled Trials Published in High-Impact General Medical Journals](https://jamanetwork.com/journals/jama/fullarticle/206151)

[Evaluating inclusion and exclusion criteria](https://www.federalregister.gov/documents/2018/08/23/2018-18232/evaluating-inclusion-and-exclusion-criteria-in-clinical-trials-workshop-report-availability)

[Why representativeness should be avoided](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3888189/)

[Assessing and reporting heterogeneity in treatment effects in clinical trials: a proposal](https://trialsjournal.biomedcentral.com/articles/10.1186/1745-6215-11-85)

## Outcomes

We define the effects of interventions in terms of their impact on outcomes. In other words, outcomes are the variables we want to change in response to an intervention. So first and foremost, we must ensure that the outcomes we use in a trial are actually important, and be cautious about using so-called surrogate outcomes, which may reflect that the intervention did something, but not necessarily what we wanted it to do. Outcomes must also be precisely defined and, of course, measurable. Your description of an outcome should also avoid any qualitative statements (e.g. systolic blood pressure is an outcome, while “improved” systolic blood pressure is not).

Our choice of outcomes will also have important implications for the overall design of the trial. Generally, outcomes that are noisier (have more natural variance, which is typical of subjective measures), or rarer, will require a larger sample of patients to demonstrate the effect of an intervention. That said, we must often accommodate this if those are in fact the most important outcomes. However, one avoidable but still frequently made mistake is the categorization of inherently continuous outcomes, which always results in a loss of information and needlessly lowers the power of the study (and even if some categorized outcome is perceived as being more relevant, this can always be captured from the analysis of the underlying continuous outcome).

**Selected reading:**

[The perils of surrogate endpoints](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4554958/)

[Disease-free survival is not a surrogate endpoint for overall survival in adjuvant trials of pancreatic cancer: a systematic review of randomized trials]()

[Cardiology World Erupts Into Controversy Over Change In Major Clinical Trial (outcome switching)](https://www.cardiobrief.org/2018/03/18/cardiology-world-erupts-into-controversy-over-change-in-major-clinical-trial/)

[FDA draft guidance on multiple endpoints](https://www.fda.gov/regulatory-information/search-fda-guidance-documents/multiple-endpoints-clinical-trials-guidance-industry)

## Covariates

There is some disagreement among trialists about how to treat covariate information. In general, model-based adjustment for strong predictors of the outcome will result in a more efficient (more powerful) estimator of the treatment effect. This means we don’t need to enroll as many patients on the trial to detect the minimally important effect size (or we have even higher power to detect this effect with the same number of patients). This part is uncontroversial.

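Here is a minimal simulated sketch of that efficiency gain (a toy data generating model, not from the module materials): adjusting for a strong prognostic covariate shrinks the standard error of the estimated treatment effect.

```r
set.seed(7)

n <- 200
prognostic <- rnorm(n)   # A strong predictor of the outcome
arm <- rbinom(n, 1, 0.5) # Randomized treatment allocation
y <- 2 * prognostic + 0.5 * arm + rnorm(n) # True treatment effect = 0.5

# Compare the SE of the estimated treatment effect with and without adjustment
summary(lm(y ~ arm))$coefficients["arm", "Std. Error"]              # Unadjusted
summary(lm(y ~ arm + prognostic))$coefficients["arm", "Std. Error"] # Adjusted (smaller)
```
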
However, some people see any covariate adjustment as a problem, especially if they suspect that the choice of covariates to adjust for was made after seeing the data, with the intent to produce a small p-value for the effect of the intervention (p-hacking). Other people are fine with covariate adjustment, but they choose their covariates based on perceived imbalances in the covariate distributions between trial arms (so-called “table 1 tests”). However, this procedure is sub-optimal, recommended against by all competent authorities, and opens the investigator up to the accusations of p-hacking we just discussed.

The correct way to account for covariate information is to use your subject matter expertise and understanding of the outcome to select the strongest prognostic factors before the study begins, and to pre-register these decisions in the statistical analysis plan attached to the clinical trial registration. Then the reported analyses at the end of the study must match what was declared in the registration (and thus couldn’t have been p-hacked).

The final point is how to handle baseline information. **The** baseline is a measure of the outcome that is taken prior to randomization. Baselines are often powerful predictors of the later outcome, and thus a good choice for model-based adjustment. However, instead of this, some investigators calculate change scores (outcome minus baseline) and use that as the outcome in the eventual trial analysis. What they don’t realize is that such a change score will still be correlated with the baseline values (but now in the opposite direction), and thus still benefit from an adjustment for baseline – and that the estimated effect of an intervention on the change score adjusted for baseline will be exactly the same as that on the (raw) outcome adjusted for baseline. While there are some scenarios where the unadjusted estimator of the treatment effect will be more efficient using change scores vs the raw outcome, the baseline-adjusted estimator is always more efficient than unadjusted change scores.

**Selected reading:**

[The risks and rewards of covariate adjustment in randomized trials: an assessment of 12 outcomes from 8 studies](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4022337/)

[Substantial and confusing variation exists in handling of baseline covariates in randomized controlled trials: a review of trials published in leading medical journals](https://www.sciencedirect.com/science/article/pii/S0895435609001747)

[The use of percentage change from baseline as an outcome in a controlled trial is statistically inefficient: a simulation study](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC34605/)

[Out of balance](https://medium.com/@darren_dahly/out-of-balance-9021fd636d88)

## Randomization and allocation concealment

In the module, we discussed the importance of randomization for preventing any selection bias when recruiting patients into the trial. To actually accomplish this, we must maintain strict allocation concealment, which requires that the following are true:

- The allocation for a patient cannot possibly be known by the trial staff until after they are unambiguously and irreversibly enrolled onto the trial (this of course doesn't mean they can't drop out of the trial, they just can't disappear without a trace as if never enrolled).

- Once a patient is allocated, their allocation cannot be altered (again, this of course doesn't mean that they can't be moved onto another treatment, just that their initial allocation can't be changed without any indication).

Given the importance of allocation concealment for preventing selection bias, and for subsequent blinding, it's critically important that investigators use trustworthy **systems** that don't rely on the trustworthiness of investigators. Thus computer databases and remote services should be used in serious trials, and stuffed envelopes should generally be avoided.

Finally, with respect to the randomization list itself, we discussed restriction and stratification. Given that estimators are more efficient when there is an even split of participants across study arms, it can be a good idea to restrict the randomization list to force equal (or very nearly equal) allocation of patients across arms. However, in larger samples (n > 100 maybe), the probability of an imbalance in numbers large enough to appreciably affect the estimator's efficiency gets very small very quickly.

However, there is another reason to restrict a randomization list, and that is when we decide to stratify on one or more key factors that are prognostic for the outcome. There is a widespread misconception about stratification, which is that we use it to prevent covariate imbalances across study arms (i.e. to prevent the distribution of some important covariate from being notably different between arms). Importantly, however, this is an incomplete solution to that problem, because if we force balance in a covariate by stratifying on it, we then need to adjust for that covariate in our statistical model - to not do so would be analogous to treating matched data as if they were unmatched. In other words, you must "analyze as you randomize" (Senn). Further, by adjusting for the covariate, you fix the problem you were trying to solve with the stratification, even if you don't stratify! However, stratification, when feasible, is still a good idea, because it forces a balanced distribution of study arms within strata (which is why a stratified list must be restricted), which, as noted above, leads to more efficient estimators.

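As a small illustration of restriction (a sketch only; a real trial should use a validated system, per the point about trustworthy systems above), here is a blocked randomization list that forces a 1:1 split within every block of 4:

```r
set.seed(123)

# 25 blocks of 4, each containing exactly 2 Active and 2 Control in random order
blocks <- replicate(25, sample(rep(c("Active", "Control"), 2)))
alloc <- as.vector(blocks) # A restricted list of 100 allocations

table(alloc) # Guaranteed 50/50 split overall, and within each block
```
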
**Selected reading:**

[Seven myths of randomisation in clinical trials](https://onlinelibrary.wiley.com/doi/abs/10.1002/sim.5713)

[How to randomize](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2596474/)

## Module Text

[Statistical Issues in Drug Development (UCC Library Link)](https://library.ucc.ie/search?/Xstatstical+issues+in+drug+development&SORT=D/Xstatstical+issues+in+drug+development&SORT=D&SUBKEY=statstical+issues+in+drug+development/1%2C25682%2C25682%2CB/frameset&FF=Xstatstical+issues+in+drug+development&SORT=D&4%2C4%2C)

--------------------------------------------------------------------------------
/Unit_2_Equivalance_Trials/Equivalance_trials.Rmd:
--------------------------------------------------------------------------------

---
title: "Equivalence Trials"
output:
  html_document:
    keep_md: true
---

```{r setup, include = FALSE}

knitr::opts_chunk$set(message = FALSE, warning = FALSE)

# Install/load packages

packs <- c("tidyverse", "knitr", "viridis", "broom", "ggbeeswarm", "TOSTER")
install.packages(packs[!packs %in% installed.packages()])
lapply(packs, library, character.only = TRUE)

```

# Frequentist statistics continued + equivalence vs superiority tests

## Part 4: Confidence intervals

In the previous sections, we discussed how to calculate and use p-values to make inferences, either by using the p-value directly, or by using it to perform binary hypothesis tests and focus on controlling errors. Another common way to use p-values is to create frequentist confidence intervals (CI).

To do this, first we take some level of alpha, our acceptable type 1 error rate (usually 0.05 (5%) due to thoughtless convention). Then we set the null to be the effect estimate we actually observed. Then we take the collection of values for which we would fail to reject that null at the set alpha level. This collection of values is our CI. Another way to think of it is that it is the middle 95% of values (for alpha = 0.05) from the estimator's sampling distribution, centered on our observed effect estimate. This can be tricky to think about, so let's work through it.

First, let's look at the two-sided p-value from a normal sampling distribution with an effect = 0 and a standard error (SE) = 1 (remember, an SE is just what we call the standard deviation of a sampling distribution).

```{r t_two_sided}

g1 <- ggplot(data_frame(x = c(-4 , 4)), aes(x = x)) +
  stat_function(fun = dnorm, args = list(0, 1),
                geom = "area", fill = viridis(1), alpha = 0.3) +
  xlim(c(-4 , 4)) +
  xlab("z") +
  ylab("Density") +
  theme_minimal() +
  stat_function(xlim = c(1.96 , 4),
                fun = dnorm, args = list(0, 1),
                geom = "area", fill = viridis(1)) +
  stat_function(xlim = c(-4, -1.96),
                fun = dnorm, args = list(0, 1),
                geom = "area", fill = viridis(1)) +
  geom_errorbarh(aes(x = 0, xmax = 1.96, xmin = -1.96, y = 0.1),
                 height = 0.05, size = 2, color = viridis(1)) +
  geom_point(aes(x = 0, y = 0.1), size = 4, color = viridis(1)) +
  ggtitle(paste0("The proportion of the total area in the darker part of the distribution\n for z is ", signif(2 * pnorm(1.96, 0, 1, lower.tail = FALSE), 3)))

g1

```

We can thus see the middle 95% of values, which would be our 95% CI.
Values falling outside of this range are in the rejection region of the distribution.

Now, for sampling distributions where we are happy to assume a normal model (this is often true), we can just slide this same range over so that it's centered on our observed effect estimate. So let's see this for an effect = 2.

```{r shift_to_effect}

g2 <- g1 +
  stat_function(fun = dnorm, args = list(2, 1),
                geom = "area", fill = viridis(1, direction = -1), alpha = 0.3) +
  xlim(c(-4 , 6)) +
  xlab("z") +
  ylab("Density") +
  theme_minimal() +
  stat_function(xlim = c(2 + 1.96 , 6),
                fun = dnorm, args = list(2, 1),
                geom = "area", fill = viridis(1, direction = -1)) +
  stat_function(xlim = c(-4, -1.96 + 2),
                fun = dnorm, args = list(2, 1),
                geom = "area", fill = viridis(1, direction = -1)) +
  geom_errorbarh(aes(x = 2, xmax = 2 + 1.96, xmin = 2 + -1.96, y = 0.15),
                 height = 0.05, size = 2,
                 color = viridis(1, direction = -1)) +
  geom_point(aes(x = 2, y = 0.15), size = 4,
             color = viridis(1, direction = -1)) +
  ggtitle("")

g2

```

So now we have the same interval centered on our observed effect = 2 (yellow).

Because this effect of 2 is > 1.96 (1.96 being the critical value we used to calculate the p-value if we want alpha = 0.05; see the purple sampling distribution), we would thus "reject the null hypothesis of 0 effect".

We can also see that zero falls (just barely) outside of the yellow 95% CI. This means that if we set the null to 2, rather than 0, then we would reject this new null if we actually observed a 0 effect, just like we reject a null of 0 when we observe a 2. One test is just the mirror image of the other. We will use the same idea when we get to equivalence tests below.

So how should we interpret a CI centered on our observed effect? Many people think that an X% CI says that there is an X% chance that the "true" effect falls in the interval. This is incorrect, and reflects a Bayesian interpretation that is unwarranted here.

Instead, an X% CI will contain the "true" effect X% of the time over many replications of the procedure that led to the CI. In other words, the CI is still a completely frequentist tool, and one way to think of it is as a collection of statistical tests. You can also think of it as a "compatibility interval" - a collection of values that our observed data would be compatible with (at some pre-specified alpha) had the data indeed been generated by (any one of) those values. We will come back to Bayesian intervals, where we *can* conclude there is some % chance that the "truth" falls in our interval (given other assumptions), when we get to Bayesian statistics later in the module.

## Part 5: Equivalence

Thinking about equivalence tests can get tricky, so we will try to build up slowly. In a superiority trial, we usually construct a 95% CI around our effect estimate. If this interval excludes our null, then we might choose to reject that null, concluding that the data we observed would have been unusual had they actually been generated under a null model (which includes a null effect and the assumptions underlying the sampling distribution we use). In this case, our hypothesis test is one of H0: y = 0 vs H_Alt: y < 0 or y > 0 (assuming we are using a two-sided test).

97 | 98 | However, to test for non-inferiority and non-superiority (the two "sides" of establishing equivalence), we will use two one-sided tests. 99 | 100 | The non-inferiority test is H0: y >= margin vs H_Alt: y < margin, where higher values of y indicate a worse outcome, and the margin is some value indicating how much worse we are willing to let the new treatment be compared to the current standard of care (see the lecture for more on this). If we reject this test, we can declare non-inferiority - that is, we reject the notion that the data we observed were from a data generating model where the "true" effect was as large as our margin or larger. 101 | 102 | The non-superiority test is then H0: y <= margin vs H_Alt: y > margin. When we reject this hypothesis test, we declare non-superiority. 103 | 104 | If we reject both tests, we conclude there is equivalence between the treatments being compared. 105 | 106 | Let's start with our familiar null = 0 sampling distribution from a normal model with an SE = 1 and a 95% two-sided CI. 107 | 108 | ```{r start} 109 | 110 | # Get the expected sampling distribution under a null hypothesis of no difference 111 | 112 | g1 113 | 114 | ``` 115 | 116 | Now let's slide the CI so that it centers on our non-inferiority margin (= 3 in this example). 117 | 118 | ```{r shift} 119 | 120 | margin <- 3 121 | 122 | g2 <- ggplot(data_frame(x = c(-6 , 6)), aes(x = x)) + 123 | stat_function(fun = dnorm, args = list(0, 1), 124 | geom = "area", fill = viridis(1), alpha = 0.1) + 125 | stat_function(fun = dnorm, args = list(margin, 1), 126 | geom = "area", fill = viridis(1, end = 0.7), 127 | alpha = 0.5) + 128 | xlim(c(-6 , 6)) + 129 | xlab("Z") + 130 | ylab("Density") + 131 | theme_minimal() + 132 | stat_function(xlim = c(1.96 + margin , 6), 133 | fun = dnorm, args = list(margin, 1), 134 | geom = "area", fill = viridis(1)) + 135 | stat_function(xlim = c(-6, -1.96 + margin), 136 | fun = dnorm, args = list(margin, 1), 137 | geom = "area", fill = viridis(1)) + 138 | geom_errorbarh(aes(xmin = -1.96 + margin, xmax = 1.96 + margin, 139 | y = dnorm(-1.96)), height = 0.05) + 140 | geom_point(x = margin, y = dnorm(-1.96), size = 3) + 141 | geom_vline(xintercept = 0, color = "red", linetype = "dashed", size = 1) + 142 | geom_vline(xintercept = margin, color = "red", linetype = "dashed", 143 | size = 1) 144 | 145 | g2 146 | 147 | ``` 148 | 149 | But we aren't interested in a two-sided test of H0: Z = 3 vs H_Alt: Z < 3 or Z > 3, but rather we want the one-sided test of Z >= 3 vs Z < 3 - our non-inferiority test.
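Numerically, that one-sided test is easy to compute directly. Here is a small sketch (our own, reusing the `margin` of 3 defined above and an observed effect of 0):

```{r ni_test_sketch}

# One-sided non-inferiority test: H0: Z >= margin vs H_Alt: Z < margin
z_obs <- 0 # the observed effect in this example

pnorm(z_obs, mean = margin, sd = 1) # p = 0.0013, so we reject H0: Z >= 3

z_obs + qnorm(0.975) # one-sided 97.5% upper bound = 1.96, below the margin

```

The same test is shown graphically below.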
150 | 151 | ```{r non_inferiority} 152 | 153 | g3 <- ggplot(data_frame(x = c(-6 , 6)), aes(x = x)) + 154 | stat_function(fun = dnorm, args = list(0, 1), 155 | geom = "area", fill = viridis(1), alpha = 0.1) + 156 | stat_function(fun = dnorm, args = list(margin, 1), 157 | geom = "area", fill = viridis(1, end = 0.7), 158 | alpha = 0.5) + 159 | xlim(c(-6 , 6)) + 160 | xlab("Z") + 161 | ylab("Density") + 162 | theme_minimal() + 163 | stat_function(xlim = c(-6, -1.96 + margin), 164 | fun = dnorm, args = list(margin, 1), 165 | geom = "area", fill = viridis(1)) + 166 | geom_errorbarh(aes(xmin = -1.96 + margin, xmax = Inf, 167 | y = dnorm(-1.96)), height = 0.05) + 168 | geom_point(x = margin, y = dnorm(-1.96), size = 3) + 169 | geom_vline(xintercept = 0, color = "red", linetype = "dashed", size = 1) + 170 | geom_vline(xintercept = margin, color = "red", linetype = "dashed", 171 | size = 1) 172 | 173 | g3 174 | 175 | ``` 176 | Note: the right side of the 97.5% one-sided CI technically goes out to infinity, which is why the interval is asymmetric. 177 | 178 | As well as the one-sided test of Z <= -3 vs Z > -3 - our non-superiority test. 179 | 180 | ```{r non_superiority} 181 | 182 | g4 <- g3 + 183 | stat_function(fun = dnorm, args = list(-margin, 1), 184 | geom = "area", fill = viridis(1, direction = -1), 185 | alpha = 0.5) + 186 | stat_function(xlim = c(1.96 + -margin, 6), 187 | fun = dnorm, args = list(-margin, 1), 188 | geom = "area", fill = viridis(1, direction = -1)) + 189 | geom_errorbarh(aes(xmin = 1.96 + -margin, xmax = -Inf, 190 | y = dnorm(-1.96)), height = 0.05) + 191 | geom_point(x = -margin, y = dnorm(-1.96), size = 3) + 192 | geom_vline(xintercept = -margin, color = "red", linetype = "dashed", 193 | size = 1) 194 | 195 | g4 196 | 197 | ``` 198 | 199 | Based on these two one-sided 97.5% intervals, we can create a two-sided 95% interval as follows: 200 | 201 | First, take each bounded side of the two one-sided intervals. 202 | 203 | ```{r new_interval_1} 204 | 205 | g5 <- g4 + 206 | geom_segment(x = 1.96 + -margin, xend = -margin, 207 | y = 0.1, yend = 0.1, size = 2) + 208 | geom_segment(x = -1.96 + margin, xend = margin, 209 | y = 0.1, yend = 0.1, size = 2) 210 | 211 | g5 212 | 213 | ``` 214 | 215 | Then stick them together. 216 | 217 | ```{r new_interval_2} 218 | 219 | g6 <- g5 + 220 | geom_segment(x = 1.96, xend = 0, 221 | y = 0.15, yend = 0.15, size = 2) + 222 | geom_segment(x = -1.96, xend = 0, 223 | y = 0.15, yend = 0.15, size = 2) + 224 | geom_point(y = 0.15, x = 0, size = 3, color = "red") 225 | 226 | 227 | g6 228 | 229 | ``` 230 | 231 | So now I have a 95% CI centered on 0, and if I shifted it so that its upper limit crossed the non-inferiority margin, that would also drag the point estimate into the acceptance (light purple) region of the one-sided test, so we wouldn't be able to declare non-inferiority. Similarly, if I shifted it so that its lower limit crossed the non-superiority margin, that would drag the estimate into the acceptance region (light yellow) of the one-sided non-superiority test. 232 | 233 | One final point: here we focused on one-sided tests with alpha = 0.025 (2.5%) so that we could get to the 95% (100 - (2 * 2.5)) two-sided confidence intervals we are more used to. However, if we truly want to use a 5% type 1 error rate, then we would use two one-sided 95% intervals, resulting in a 90% two-sided CI.
Both 90% and 95% CIs are used in practice, but you need to understand that when used in equivalence tests the first has a 5% type 1 error rate, and the second has a 2.5% type 1 error rate. 234 | 235 | ## Analysis example 236 | 237 | Here is an example of this in practice. We start with a small trial looking at the effect of fentanyl vs morphine in controlling pain. 238 | 239 | ```{r load_data} 240 | 241 | data <- read_csv("data/data.csv") 242 | 243 | names(data) 244 | 245 | length(unique(data$subject)) == nrow(data) 246 | 247 | table(data$arm) 248 | 249 | ``` 250 | 251 | We can see the dataset contains a unique id for 31 observations, information about which arm the participant was in, and a series of pain scores measured over time. 252 | 253 | The first thing I usually do with a dataset is to plot the key variables, so let's have a look at these pain scores over time. To do this, first I want to "reshape" the dataset from a wide format (1 row with many columns per observation) to a long format (many rows - one for each time point per person). 254 | 255 | ```{r reshape_long} 256 | 257 | pain <- gather(data, time, pain, a_painscore0:h_painscore120) %>% 258 | mutate(time = as.numeric(gsub("_", "", gsub("[[:alpha:]]", "", time)))) 259 | 260 | head(pain) 261 | 262 | ``` 263 | 264 | Next I want to calculate some summary statistics, and then plot these alongside the data. 265 | 266 | ```{r plot_raw_pain_data} 267 | 268 | # Summary stats 269 | pain_df <- group_by(pain, arm, time) %>% 270 | summarise(median = quantile(pain, 0.5), 271 | upperq = quantile(pain, 0.75), 272 | lowerq = quantile(pain, 0.25), 273 | mean = mean(pain), 274 | sd = sd(pain), 275 | n = n(), 276 | se = sd / sqrt(n-1), # note: slightly conservative vs the usual sd / sqrt(n) 277 | ul = mean + (1.96 * se), 278 | ll = mean - (1.96 * se)) %>% 279 | gather(type, pain, median:ll) 280 | 281 | # Plot the summary stats and the raw data 282 | ggplot(pain_df, aes(x = time, y = pain)) + 283 | geom_line(data = filter(pain_df, type == "upperq"), 284 | alpha = 0.6, color = "black", size = 1, linetype = "dashed") + 285 | geom_line(data = filter(pain_df, type == "mean"), 286 | alpha = 0.6, color = "black", size = 1) + 287 | geom_line(data = filter(pain_df, type == "lowerq"), 288 | alpha = 0.6, color = "black", size = 1, linetype = "dashed") + 289 | geom_boxplot(data = pain, aes(y = pain, group = time), 290 | color = "grey20", alpha = 0) + 291 | geom_beeswarm(data = pain, aes(group = time, y = pain), 292 | color = "white", shape = 1) + 293 | facet_wrap(~arm, nrow = 1) + 294 | theme_minimal() + 295 | theme(panel.grid = element_blank()) + 296 | xlab("Minutes post treatment") + 297 | ylab("Pain score") + 298 | scale_y_continuous(breaks = seq(0, 10, by = 2)) + 299 | scale_x_continuous(breaks = c(0, 5, 10, 15, 20, 30, 60, 120)) + 300 | theme(panel.background = element_rect(fill = "grey")) + 301 | coord_cartesian(ylim = c(0, 10)) 302 | 303 | # Save a pdf version 304 | ggsave("plots/figure_1.pdf", height = 19.05, width = 33.86, 305 | units = "cm", scale = 0.8) 306 | 307 | ``` 308 | 309 | ### TOST for the primary outcome (10 minutes) 310 | 311 | Next we want to use the TOSTER package to carry out our two one-sided tests that we will use to evaluate if fentanyl is non-inferior to morphine. This means that we will accept that it's a little worse (e.g. is associated with higher pain scores), but not too much worse (not more than +0.36 SDs of pain worse - this is the margin).
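Before calling the packaged function, here is a rough sketch (our own code, not TOSTER's internals) of the non-inferiority half of the procedure: a one-sided Welch t-test with the null shifted from 0 to the margin, after converting the 0.36 SD margin to the raw pain scale with a pooled SD (which matches the raw-scale bound TOSTER reports):

```{r ni_by_hand}

# Sketch of the non-inferiority side of the TOST (not TOSTER's internal code)
fent <- filter(data, arm == "Fentanyl")$c_painscore10
morp <- filter(data, arm == "Morphine")$c_painscore10

# Convert the margin from SD units to the raw pain scale
margin_raw <- 0.36 * sqrt((sd(fent)^2 + sd(morp)^2) / 2)

# Welch t-statistic, with the null shifted from 0 to the margin
v1 <- var(fent) / length(fent)
v2 <- var(morp) / length(morp)
t_ni <- (mean(fent) - mean(morp) - margin_raw) / sqrt(v1 + v2)

# Welch-Satterthwaite degrees of freedom
df <- (v1 + v2)^2 / (v1^2 / (length(fent) - 1) + v2^2 / (length(morp) - 1))

pt(t_ni, df) # one-sided p for H0: diff >= margin; very small, so non-inferior

```

TOSTER wraps this test (and its non-superiority mirror image) up for us: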
312 | 313 | ```{r tost_primary} 314 | 315 | m2 <- mean(filter(data, arm == "Morphine")$c_painscore10, na.rm = TRUE) 316 | m1 <- mean(filter(data, arm == "Fentanyl")$c_painscore10, na.rm = TRUE) 317 | sd2 <- sd(filter(data, arm == "Morphine")$c_painscore10, na.rm = TRUE) 318 | sd1 <- sd(filter(data, arm == "Fentanyl")$c_painscore10, na.rm = TRUE) 319 | 320 | TOSTtwo( 321 | m1 = m1, m2 = m2, sd1 = sd1, sd2 = sd2, n1 = 15, n2 = 16, # arm sizes from table(data$arm): Fentanyl 15, Morphine 16 322 | low_eqbound_d = 0.36, # note: both bounds are set to +0.36; only the upper-bound 323 | high_eqbound_d = 0.36, # (non-inferiority) test matters here - a symmetric equivalence test would use low_eqbound_d = -0.36 324 | alpha = 0.05, 325 | var.equal = FALSE, 326 | plot = TRUE 327 | ) 328 | 329 | ``` 330 | 331 | # Two sample t-tests with unequal variances 332 | 333 | For our own learning, let's see if we can replicate what the TOSTER package does, and apply it to every time point. Below we will basically run 2 t-tests comparing the mean pain score between arms, at each time point. For one of these tests, we will set alpha to 0.05 (for a 95% CI), and for the other, we will set alpha to 0.10 (for a 90% CI, which is our two one-sided tests, each with alpha = 0.05). 334 | 335 | ```{r calculate_mean_diffs} 336 | 337 | # Function for running and reporting t-tests 338 | ttest_res <- function(x, data, level, ...){ 339 | form <- as.formula(paste0(x, " ~ arm")) 340 | x <- tidy(t.test(form, data, conf.level = level)) %>% 341 | mutate(time = x, level = level) %>% 342 | mutate(time = gsub("[[:alpha:]]", "", time)) %>% 343 | mutate(time = as.numeric(gsub("_", "", time))) 344 | 345 | return(x) 346 | } 347 | 348 | # Mean diffs with 95%CI (taken from t-test) 349 | ttests_95 <- list() 350 | for(i in 1:length(names(dplyr::select(data, contains("painscore"))))){ 351 | ttests_95[[i]] <- ttest_res( 352 | names(dplyr::select(data, contains("painscore")))[i], data, 0.95 353 | ) 354 | } 355 | 356 | ttests_95 <- do.call(bind_rows, ttests_95) 357 | 358 | # Mean diffs with 90%CI (taken from t-test) 359 | ttests_90 <- list() 360 | for(i in 1:length(names(dplyr::select(data, contains("painscore"))))){ 361 | ttests_90[[i]] <- ttest_res( 362 | names(dplyr::select(data, contains("painscore")))[i], data, 0.90 363 | ) 364 | } 365 | 366 | ttests_90 <- do.call(bind_rows, ttests_90) 367 | 368 | ``` 369 | 370 | Now let's plot the results. 371 | 372 | ```{r plot_mean_diffs} 373 | 374 | ggplot(ttests_95, aes(x = time, y = estimate, ymax = conf.high, 375 | ymin = conf.low)) + 376 | geom_rect(aes(ymin = 0, ymax = Inf, xmin = -Inf, xmax = Inf), 377 | fill = "white", alpha = 0.8) + 378 | geom_hline(yintercept = -0.6, linetype = "dashed", color = "black") + 379 | geom_hline(yintercept = 0.6, linetype = "dashed", color = "black") + 380 | geom_hline(yintercept = 0, color = "black") + 381 | geom_hline(yintercept = 1, color = "grey") + 382 | geom_pointrange(color = "black", size = 2) + 383 | geom_linerange(data = ttests_90, color = "white", size = 1) + 384 | theme_minimal() + 385 | theme(panel.grid.major.x = element_blank(), 386 | panel.grid.minor.x = element_blank(), 387 | panel.grid.minor.y = element_blank()) + 388 | xlab("Minutes post treatment") + 389 | ylab("Difference in mean pain score (Fentanyl vs Morphine)") + 390 | scale_x_continuous(breaks = c(0, 5, 10, 15, 20, 30, 60, 120)) + 391 | theme(panel.background = element_rect(fill = "grey")) + 392 | annotate("text", y = -3.4, x = 100, label = "Favours Fentanyl") + 393 | annotate("text", y = 1.2, x = 100, label = "Favours Morphine") 394 | 395 | ggsave("plots/figure_2.pdf", height = 19.05, width = 33.86, 396 | units = "cm", scale = 0.8) 397 | 398 | 399 | ``` 400 | 401 | And finally, a table of results.
402 | 403 | ```{r results table} 404 | 405 | # Take the long pain data, convert to wide, keep mean, n, sd by arm and time 406 | pain_df_wide <- spread(pain_df, type, pain) %>% 407 | dplyr::select(-ll, -ul, -median, -upperq, -lowerq, -se) %>% 408 | dplyr::select(arm, time, n, mean, sd) 409 | 410 | # Split the data by arm and cbind into dataset. 411 | 412 | df1 <- filter(pain_df_wide, arm == "Fentanyl") 413 | df2 <- filter(pain_df_wide, arm == "Morphine") 414 | names(df2) <- paste0(names(df2), "1") 415 | 416 | table_data <- bind_cols(df1, df2) %>% 417 | ungroup() %>% 418 | dplyr::select(-arm, -arm1, -time1) 419 | 420 | table_data <- full_join( 421 | table_data, 422 | dplyr::select(ttests_95, estimate, conf.low, conf.high, time), 423 | by = "time" 424 | ) 425 | 426 | tar <- c("mean", "sd", "mean1", "sd1", "estimate", "conf.low", "conf.high") 427 | table_data[tar] <- lapply(table_data[tar], round, 2) 428 | 429 | table_data <- mutate(table_data, effect = paste0( 430 | estimate, " (", conf.low, " to ", conf.high, ")" 431 | )) %>% 432 | dplyr::select(-estimate, -conf.low, -conf.high) 433 | # The first N/Mean/SD block below is Fentanyl (df1), the second Morphine (df2) 434 | knitr::kable( 435 | table_data, 436 | col.names = c( 437 | "Time (+min)", rep(c("N", "Mean", "SD"), 2), 438 | "Mean Difference (95%CI)" 439 | ), 440 | align = c("l", rep("c", ncol(table_data) - 1)) 441 | ) 442 | 443 | 444 | 445 | ``` 446 | 447 | ### Challenge 448 | 449 | Try to use what you've seen in the last two tutorials to 1) simulate some data (hint: see sim_t_test_data from the last tutorial) from a two-arm parallel trial with n = 40 where the effect of the new treatment on a normally distributed outcome (mean = 0, sd = 1) is 0.3 SDs worse than the standard treatment (i.e. higher scores are bad). Then test for non-inferiority using the TOST procedure based on a t-test with unequal variances and an NI margin of 0.4 SDs (hint: see tost_primary above). Then repeat your simulation many times to explore its performance. Finally, increase the sample size to 100 and repeat the exercise. 450 | 451 | -------------------------------------------------------------------------------- /Unit_2_Equivalance_Trials/Equivalance_trials.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Equivalence Trials" 3 | output: 4 | html_document: 5 | keep_md: true 6 | --- 7 | 8 | 9 | 10 | # Frequentist statistics continued + equivalence vs superiority tests 11 | 12 | ## Part 4: Confidence intervals 13 | 14 | In the previous sections, we discussed how to calculate and use p-values to make inferences, either by using the p-value directly, or by using it to perform binary hypothesis tests and focus on controlling errors. Another common way to use p-values is to create frequentist confidence intervals (CI). 15 | 16 | To do this, first we take some level of alpha, our acceptable type 1 error rate (usually 0.05 (5%) due to thoughtless convention). Then we set the null to be the effect estimate we actually observed. Then we take the collection of values for which we would fail to reject that null at the set alpha level. This collection of values is our CI. Another way to think of it is that it is the middle 95% of values (for alpha = 0.05) from the estimator's sampling distribution, centered on our observed effect estimate. This can be tricky to think about, so let's work through it.
17 | 18 | First, let's look at the two-sided p-value from a normal sampling distribution with an effect = 0 and a standard error (SE) = 1 (remember, an SE is just what we call the standard deviation for a sampling distribution). 19 | 20 | 21 | ```r 22 | g1 <- ggplot(data_frame(x = c(-4 , 4)), aes(x = x)) + 23 | stat_function(fun = dnorm, args = list(0, 1), 24 | geom = "area", fill = viridis(1), alpha = 0.3) + 25 | xlim(c(-4 , 4)) + 26 | xlab("z") + 27 | ylab("Density") + 28 | theme_minimal() + 29 | stat_function(xlim = c(1.96 , 4), 30 | fun = dnorm, args = list(0, 1), 31 | geom = "area", fill = viridis(1)) + 32 | stat_function(xlim = c(-4, -1.96), 33 | fun = dnorm, args = list(0, 1), 34 | geom = "area", fill = viridis(1)) + 35 | geom_errorbarh(aes(x = 0, xmax = 1.96, xmin = -1.96, y = 0.1), 36 | height = 0.05, size = 2, color = viridis(1)) + 37 | geom_point(aes(x = 0, y = 0.1), size = 4, color = viridis(1)) + 38 | ggtitle(paste0("The proportion of the total area in the darker part of the distribution\n for t is ", signif(2 * pnorm(1.96, 0, 1, lower.tail = FALSE), 3))) 39 | 40 | g1 41 | ``` 42 | 43 | ![](Equivalance_trials_files/figure-html/t_two_sided-1.png) 44 | 45 | We can thus see the middle 95% of values, which would be our 95% CI. Values falling outside of this range are in the rejection region of the distribution. 46 | 47 | Now, for sampling distributions where we are happy to assume a normal model (this is often true), we can just slide this same range over so that it's centered on our observed effect estimate. So let's see this for an effect = 2. 48 | 49 | 50 | ```r 51 | g2 <- g1 + 52 | stat_function(fun = dnorm, args = list(2, 1), 53 | geom = "area", fill = viridis(1, direction = -1), alpha = 0.3) + 54 | xlim(c(-4 , 6)) + 55 | xlab("z") + 56 | ylab("Density") + 57 | theme_minimal() + 58 | stat_function(xlim = c(2 + 1.96 , 6), 59 | fun = dnorm, args = list(2, 1), 60 | geom = "area", fill = viridis(1, direction = -1)) + 61 | stat_function(xlim = c(-4, -1.96 + 2), 62 | fun = dnorm, args = list(2, 1), 63 | geom = "area", fill = viridis(1, direction = -1)) + 64 | geom_errorbarh(aes(x = 2, xmax = 2 + 1.96, xmin = 2 + -1.96, y = 0.15), 65 | height = 0.05, size = 2, 66 | color = viridis(1, direction = -1)) + 67 | geom_point(aes(x = 2, y = 0.15), size = 4, 68 | color = viridis(1, direction = -1)) + 69 | ggtitle("") 70 | 71 | g2 72 | ``` 73 | 74 | ![](Equivalance_trials_files/figure-html/shift_to_effect-1.png) 75 | 76 | So now we have the same interval centered on our observed effect = 2 (yellow). 77 | 78 | Because this effect of 2 is > 1.96 (1.96 being the critical value we used to calculate the p-value if we want alpha = 0.05; see the purple sampling distribution), we would thus "reject the null hypothesis of 0 effect". 79 | 80 | We can also see that zero falls (just barely) outside of the yellow 95% CI. This means that if we set the null to 2, rather than 0, then we would reject this new null if we actually observed a 0 effect, just like we reject a null of 0 when we observe a 2. One test is just the mirror image of the other. We will use the same idea when we get to equivalence tests below. 81 | 82 | So how should we interpret a CI centered on our observed effect? Many people think that an X% CI says that there is an X% chance that the "true" effect falls in the interval. This is incorrect, and reflects a Bayesian interpretation that is unwarranted here.
83 | 84 | Instead, an X% CI will contain the "true" effect X% of the time over many replications of the procedure that led to the CI. In other words, the CI is still a completely frequentist tool, and one way to think of it is as a collection of statistical tests. You can also think of it as a "compatibility interval" - a collection of values that our observed data would be compatible with (at some pre-specified alpha) had the data indeed been generated by (any one of) those values. We will come back to Bayesian intervals, where we *can* conclude that there is some % chance that the "truth" falls in our interval (given other assumptions), when we get to Bayesian statistics later in the module. 85 | 86 | ## Part 5: Equivalence 87 | 88 | Thinking about equivalence tests can get tricky, so we will try to build up slowly. In a superiority trial, we usually construct a 95% CI around our effect estimate. If this interval excludes our null, then we might choose to reject that null, concluding that the data we observed would have been unusual had they actually been generated under a null model (which includes a null effect and the assumptions underlying the sampling distribution we use). In this case, our hypothesis test is one of H0: y = 0 vs H_Alt: y < 0 or y > 0 (assuming we are using a two-sided test). 89 | 90 | However, to test for non-inferiority and non-superiority (the two "sides" of establishing equivalence), we will use two one-sided tests. 91 | 92 | The non-inferiority test is H0: y >= margin vs H_Alt: y < margin, where higher values of y indicate a worse outcome, and the margin is some value indicating how much worse we are willing to let the new treatment be compared to the current standard of care (see the lecture for more on this). If we reject this test, we can declare non-inferiority - that is, we reject the notion that the data we observed were from a data generating model where the "true" effect was as large as our margin or larger. 93 | 94 | The non-superiority test is then H0: y <= margin vs H_Alt: y > margin. When we reject this hypothesis test, we declare non-superiority. 95 | 96 | If we reject both tests, we conclude there is equivalence between the treatments being compared. 97 | 98 | Let's start with our familiar null = 0 sampling distribution from a normal model with an SE = 1 and a 95% two-sided CI. 99 | 100 | 101 | ```r 102 | # Get the expected sampling distribution under a null hypothesis of no difference 103 | 104 | g1 105 | ``` 106 | 107 | ![](Equivalance_trials_files/figure-html/start-1.png) 108 | 109 | Now let's slide the CI so that it centers on our non-inferiority margin (= 3 in this example).
110 | 111 | 112 | ```r 113 | margin <- 3 114 | 115 | g2 <- ggplot(data_frame(x = c(-6 , 6)), aes(x = x)) + 116 | stat_function(fun = dnorm, args = list(0, 1), 117 | geom = "area", fill = viridis(1), alpha = 0.1) + 118 | stat_function(fun = dnorm, args = list(margin, 1), 119 | geom = "area", fill = viridis(1, end = 0.7), 120 | alpha = 0.5) + 121 | xlim(c(-6 , 6)) + 122 | xlab("Z") + 123 | ylab("Density") + 124 | theme_minimal() + 125 | stat_function(xlim = c(1.96 + margin , 6), 126 | fun = dnorm, args = list(margin, 1), 127 | geom = "area", fill = viridis(1)) + 128 | stat_function(xlim = c(-6, -1.96 + margin), 129 | fun = dnorm, args = list(margin, 1), 130 | geom = "area", fill = viridis(1)) + 131 | geom_errorbarh(aes(xmin = -1.96 + margin, xmax = 1.96 + margin, 132 | y = dnorm(-1.96)), height = 0.05) + 133 | geom_point(x = margin, y = dnorm(-1.96), size = 3) + 134 | geom_vline(xintercept = 0, color = "red", linetype = "dashed", size = 1) + 135 | geom_vline(xintercept = margin, color = "red", linetype = "dashed", 136 | size = 1) 137 | 138 | g2 139 | ``` 140 | 141 | ![](Equivalance_trials_files/figure-html/shift-1.png) 142 | 143 | But we aren't interested in a two-sided test of H0: Z = 3 vs H_Alt: Z < 3 or Z > 3, but rather we want the one-sided test of Z >= 3 vs Z < 3 - our non-inferiority test. 144 | 145 | 146 | ```r 147 | g3 <- ggplot(data_frame(x = c(-6 , 6)), aes(x = x)) + 148 | stat_function(fun = dnorm, args = list(0, 1), 149 | geom = "area", fill = viridis(1), alpha = 0.1) + 150 | stat_function(fun = dnorm, args = list(margin, 1), 151 | geom = "area", fill = viridis(1, end = 0.7), 152 | alpha = 0.5) + 153 | xlim(c(-6 , 6)) + 154 | xlab("Z") + 155 | ylab("Density") + 156 | theme_minimal() + 157 | stat_function(xlim = c(-6, -1.96 + margin), 158 | fun = dnorm, args = list(margin, 1), 159 | geom = "area", fill = viridis(1)) + 160 | geom_errorbarh(aes(xmin = -1.96 + margin, xmax = Inf, 161 | y = dnorm(-1.96)), height = 0.05) + 162 | geom_point(x = margin, y = dnorm(-1.96), size = 3) + 163 | geom_vline(xintercept = 0, color = "red", linetype = "dashed", size = 1) + 164 | geom_vline(xintercept = margin, color = "red", linetype = "dashed", 165 | size = 1) 166 | 167 | g3 168 | ``` 169 | 170 | ![](Equivalance_trials_files/figure-html/non_inferiority-1.png) 171 | Note: the right side of the 97.5% one-sided CI technically goes out to infinity, which is why the interval is asymmetric. 172 | 173 | As well as the one-sided test of Z <= -3 vs Z > -3 - our non-superiority test. 174 | 175 | 176 | ```r 177 | g4 <- g3 + 178 | stat_function(fun = dnorm, args = list(-margin, 1), 179 | geom = "area", fill = viridis(1, direction = -1), 180 | alpha = 0.5) + 181 | stat_function(xlim = c(1.96 + -margin, 6), 182 | fun = dnorm, args = list(-margin, 1), 183 | geom = "area", fill = viridis(1, direction = -1)) + 184 | geom_errorbarh(aes(xmin = 1.96 + -margin, xmax = -Inf, 185 | y = dnorm(-1.96)), height = 0.05) + 186 | geom_point(x = -margin, y = dnorm(-1.96), size = 3) + 187 | geom_vline(xintercept = -margin, color = "red", linetype = "dashed", 188 | size = 1) 189 | 190 | g4 191 | ``` 192 | 193 | ![](Equivalance_trials_files/figure-html/non_superiority-1.png) 194 | 195 | Based on these two one-sided 97.5% intervals, we can create a two-sided 95% interval as follows: 196 | 197 | First, take each bounded side of the two one-sided intervals.
198 | 199 | 200 | ```r 201 | g5 <- g4 + 202 | geom_segment(x = 1.96 + -margin, xend = -margin, 203 | y = 0.1, yend = 0.1, size = 2) + 204 | geom_segment(x = -1.96 + margin, xend = margin, 205 | y = 0.1, yend = 0.1, size = 2) 206 | 207 | g5 208 | ``` 209 | 210 | ![](Equivalance_trials_files/figure-html/new_interval_1-1.png) 211 | 212 | Then stick them together. 213 | 214 | 215 | ```r 216 | g6 <- g5 + 217 | geom_segment(x = 1.96, xend = 0, 218 | y = 0.15, yend = 0.15, size = 2) + 219 | geom_segment(x = -1.96, xend = 0, 220 | y = 0.15, yend = 0.15, size = 2) + 221 | geom_point(y = 0.15, x = 0, size = 3, color = "red") 222 | 223 | 224 | g6 225 | ``` 226 | 227 | ![](Equivalance_trials_files/figure-html/new_interval_2-1.png) 228 | 229 | So now I have a 95% CI centered on 0, and if I shifted it so that its upper limit crossed the non-inferiority margin, that would also drag the point estimate into the acceptance (light purple) region of the one-sided test, so we wouldn't be able to declare non-inferiority. Similarly, if I shifted it so that its lower limit crossed the non-superiority margin, that would drag the estimate into the acceptance region (light yellow) of the one-sided non-superiority test. 230 | 231 | One final point: here we focused on one-sided tests with alpha = 0.025 (2.5%) so that we could get to the 95% (100 - (2 * 2.5)) two-sided confidence intervals we are more used to. However, if we truly want to use a 5% type 1 error rate, then we would use two one-sided 95% intervals, resulting in a 90% two-sided CI. Both 90% and 95% CIs are used in practice, but you need to understand that when used in equivalence tests the first has a 5% type 1 error rate, and the second has a 2.5% type 1 error rate. 232 | 233 | ## Analysis example 234 | 235 | Here is an example of this in practice. We start with a small trial looking at the effect of fentanyl vs morphine in controlling pain. 236 | 237 | 238 | ```r 239 | data <- read_csv("data/data.csv") 240 | 241 | names(data) 242 | ``` 243 | 244 | ``` 245 | ## [1] "subject" "arm" "a_painscore0" "b_painscore5" 246 | ## [5] "c_painscore10" "d_painscore15" "e_painscore20" "f_painscore30" 247 | ## [9] "g_painscore60" "h_painscore120" 248 | ``` 249 | 250 | ```r 251 | length(unique(data$subject)) == nrow(data) 252 | ``` 253 | 254 | ``` 255 | ## [1] TRUE 256 | ``` 257 | 258 | ```r 259 | table(data$arm) 260 | ``` 261 | 262 | ``` 263 | ## 264 | ## Fentanyl Morphine 265 | ## 15 16 266 | ``` 267 | 268 | We can see the dataset contains a unique id for 31 observations, information about which arm the participant was in, and a series of pain scores measured over time. 269 | 270 | The first thing I usually do with a dataset is to plot the key variables, so let's have a look at these pain scores over time. To do this, first I want to "reshape" the dataset from a wide format (1 row with many columns per observation) to a long format (many rows - one for each time point per person). 271 | 272 | 273 | ```r 274 | pain <- gather(data, time, pain, a_painscore0:h_painscore120) %>% 275 | mutate(time = as.numeric(gsub("_", "", gsub("[[:alpha:]]", "", time)))) 276 | 277 | head(pain) 278 | ``` 279 | 280 | ``` 281 | ## # A tibble: 6 x 4 282 | ## subject arm time pain 283 | ## <dbl> <chr> <dbl> <dbl> 284 | ## 1 101 Morphine 0 10 285 | ## 2 102 Fentanyl 0 9 286 | ## 3 103 Morphine 0 9 287 | ## 4 104 Fentanyl 0 9 288 | ## 5 105 Morphine 0 9 289 | ## 6 106 Morphine 0 8 290 | ``` 291 | 292 | Next I want to calculate some summary statistics, and then plot these alongside the data.
293 | 294 | 295 | ```r 296 | # Summary stats 297 | pain_df <- group_by(pain, arm, time) %>% 298 | summarise(median = quantile(pain, 0.5), 299 | upperq = quantile(pain, 0.75), 300 | lowerq = quantile(pain, 0.25), 301 | mean = mean(pain), 302 | sd = sd(pain), 303 | n = n(), 304 | se = sd / sqrt(n-1), 305 | ul = mean + (1.96 * se), 306 | ll = mean - (1.96 * se)) %>% 307 | gather(type, pain, median:ll) 308 | 309 | # Plot the summary stats and the raw data 310 | ggplot(pain_df, aes(x = time, y = pain)) + 311 | geom_line(data = filter(pain_df, type == "upperq"), 312 | alpha = 0.6, color = "black", size = 1, linetype = "dashed") + 313 | geom_line(data = filter(pain_df, type == "mean"), 314 | alpha = 0.6, color = "black", size = 1) + 315 | geom_line(data = filter(pain_df, type == "lowerq"), 316 | alpha = 0.6, color = "black", size = 1, linetype = "dashed") + 317 | geom_boxplot(data = pain, aes(y = pain, group = time), 318 | color = "grey20", alpha = 0) + 319 | geom_beeswarm(data = pain, aes(group = time, y = pain), 320 | color = "white", shape = 1) + 321 | facet_wrap(~arm, nrow = 1) + 322 | theme_minimal() + 323 | theme(panel.grid = element_blank()) + 324 | xlab("Minutes post treatment") + 325 | ylab("Pain score") + 326 | scale_y_continuous(breaks = seq(0, 10, by = 2)) + 327 | scale_x_continuous(breaks = c(0, 5, 10, 15, 20, 30, 60, 120)) + 328 | theme(panel.background = element_rect(fill = "grey")) + 329 | coord_cartesian(ylim = c(0, 10)) 330 | ``` 331 | 332 | ![](Equivalance_trials_files/figure-html/plot_raw_pain_data-1.png) 333 | 334 | ```r 335 | # Save a pdf version 336 | ggsave("plots/figure_1.pdf", height = 19.05, width = 33.86, 337 | units = "cm", scale = 0.8) 338 | ``` 339 | 340 | ### TOST for the primary outcome (10 minutes) 341 | 342 | Next we want to use the TOSTER package to carry out our two one-sided tests that we will use to evaluate if fentanyl is non-inferior to morphine. This means that we will accept that it's a little worse (e.g. is associated with higher pain scores), but not too much worse (not more than +0.36 SDs of pain worse - this is the margin). 
343 | 344 | 345 | ```r 346 | m2 <- mean(filter(data, arm == "Morphine")$c_painscore10, na.rm = TRUE) 347 | m1 <- mean(filter(data, arm == "Fentanyl")$c_painscore10, na.rm = TRUE) 348 | sd2 <- sd(filter(data, arm == "Morphine")$c_painscore10, na.rm = TRUE) 349 | sd1 <- sd(filter(data, arm == "Fentanyl")$c_painscore10, na.rm = TRUE) 350 | 351 | TOSTtwo( 352 | m1 = m1, m2 = m2, sd1 = sd1, sd2 = sd2, n1 = 16, n2 = 15, 353 | low_eqbound_d = 0.36, 354 | high_eqbound_d = 0.36, 355 | alpha = 0.05, 356 | var.equal = FALSE, 357 | plot = TRUE 358 | ) 359 | ``` 360 | 361 | ![](Equivalance_trials_files/figure-html/tost_primary-1.png) 362 | 363 | ``` 364 | ## TOST results: 365 | ## t-value lower bound: -3.35 p-value lower bound: 0.999 366 | ## t-value upper bound: -3.35 p-value upper bound: 0.001 367 | ## degrees of freedom : 28.01 368 | ## 369 | ## Equivalence bounds (Cohen's d): 370 | ## low eqbound: 0.36 371 | ## high eqbound: 0.36 372 | ## 373 | ## Equivalence bounds (raw scores): 374 | ## low eqbound: 0.5952 375 | ## high eqbound: 0.5952 376 | ## 377 | ## TOST confidence interval: 378 | ## lower bound 90% CI: -2.413 379 | ## upper bound 90% CI: -0.387 380 | ## 381 | ## NHST confidence interval: 382 | ## lower bound 95% CI: -2.62 383 | ## upper bound 95% CI: -0.18 384 | ## 385 | ## Equivalence Test Result: 386 | ## The equivalence test was non-significant, t(28.01) = -3.351, p = 0.999, given equivalence bounds of 0.595 and 0.595 (on a raw scale) and an alpha of 0.05. 387 | ## Null Hypothesis Test Result: 388 | ## The null hypothesis test was significant, t(28.01) = -2.352, p = 0.026, given an alpha of 0.05. 389 | ## Based on the equivalence test and the null-hypothesis test combined, we can conclude that the observed effect is statistically different from zero and statistically not equivalent to zero. 390 | ``` 391 | 392 | # Two sample t-tests with unequal variances 393 | 394 | For our own learning, let's see if we can replicate what the TOSTER package does, and apply it to every time point. Below we will basically run 2 t-tests comparing the mean pain score between arms, at each time point. For one of these tests, we will set alpha to 0.05 (for a 95% CI), and for the other, we will set alpha to 0.10 (for a 90% CI, which is our two one-sided tests, each with alpha = 0.05). 395 | 396 | 397 | ```r 398 | # Function for running and reporting t-tests 399 | ttest_res <- function(x, data, level, ...){ 400 | form <- as.formula(paste0(x, " ~ arm")) 401 | x <- tidy(t.test(form, data, conf.level = level)) %>% 402 | mutate(time = x, level = level) %>% 403 | mutate(time = gsub("[[:alpha:]]", "", time)) %>% 404 | mutate(time = as.numeric(gsub("_", "", time))) 405 | 406 | return(x) 407 | } 408 | 409 | # Mean diffs with 95%CI (taken from t-test) 410 | ttests_95 <- list() 411 | for(i in 1:length(names(dplyr::select(data, contains("painscore"))))){ 412 | ttests_95[[i]] <- ttest_res( 413 | names(dplyr::select(data, contains("painscore")))[i], data, 0.95 414 | ) 415 | } 416 | 417 | ttests_95 <- do.call(bind_rows, ttests_95) 418 | 419 | # Mean diffs with 90%CI (taken from t-test) 420 | ttests_90 <- list() 421 | for(i in 1:length(names(dplyr::select(data, contains("painscore"))))){ 422 | ttests_90[[i]] <- ttest_res( 423 | names(dplyr::select(data, contains("painscore")))[i], data, 0.90 424 | ) 425 | } 426 | 427 | ttests_90 <- do.call(bind_rows, ttests_90) 428 | ``` 429 | 430 | Now let's plot the results. 
431 | 432 | 433 | ```r 434 | ggplot(ttests_95, aes(x = time, y = estimate, ymax = conf.high, 435 | ymin = conf.low)) + 436 | geom_rect(aes(ymin = 0, ymax = Inf, xmin = -Inf, xmax = Inf), 437 | fill = "white", alpha = 0.8) + 438 | geom_hline(yintercept = -0.6, linetype = "dashed", color = "black") + 439 | geom_hline(yintercept = 0.6, linetype = "dashed", color = "black") + 440 | geom_hline(yintercept = 0, color = "black") + 441 | geom_hline(yintercept = 1, color = "grey") + 442 | geom_pointrange(color = "black", size = 2) + 443 | geom_linerange(data = ttests_90, color = "white", size = 1) + 444 | theme_minimal() + 445 | theme(panel.grid.major.x = element_blank(), 446 | panel.grid.minor.x = element_blank(), 447 | panel.grid.minor.y = element_blank()) + 448 | xlab("Minutes post treatment") + 449 | ylab("Difference in mean pain score (Fentanyl vs Morphine)") + 450 | scale_x_continuous(breaks = c(0, 5, 10, 15, 20, 30, 60, 120)) + 451 | theme(panel.background = element_rect(fill = "grey")) + 452 | annotate("text", y = -3.4, x = 100, label = "Favours Fentanyl") + 453 | annotate("text", y = 1.2, x = 100, label = "Favours Morphine") 454 | ``` 455 | 456 | ![](Equivalance_trials_files/figure-html/plot_mean_diffs-1.png) 457 | 458 | ```r 459 | ggsave("plots/figure_2.pdf", height = 19.05, width = 33.86, 460 | units = "cm", scale = 0.8) 461 | ``` 462 | 463 | And finally, a table of results. 464 | 465 | 466 | ```r 467 | # Take the long pain data, convert to wide, keep mean, n, sd by arm and time 468 | pain_df_wide <- spread(pain_df, type, pain) %>% 469 | dplyr::select(-ll, -ul, -median, -upperq, -lowerq, -se) %>% 470 | dplyr::select(arm, time, n, mean, sd) 471 | 472 | # Split the data by arm and cbind into dataset. 473 | 474 | df1 <- filter(pain_df_wide, arm == "Fentanyl") 475 | df2 <- filter(pain_df_wide, arm == "Morphine") 476 | names(df2) <- paste0(names(df2), "1") 477 | 478 | table_data <- bind_cols(df1, df2) %>% 479 | ungroup() %>% 480 | dplyr::select(-arm, -arm1, -time1) 481 | 482 | table_data <- full_join( 483 | table_data, 484 | dplyr::select(ttests_95, estimate, conf.low, conf.high, time), 485 | by = "time" 486 | ) 487 | 488 | tar <- c("mean", "sd", "mean1", "sd1", "estimate", "conf.low", "conf.high") 489 | table_data[tar] <- lapply(table_data[tar], round, 2) 490 | 491 | table_data <- mutate(table_data, effect = paste0( 492 | estimate, " (", conf.low, " to ", conf.high, ")" 493 | )) %>% 494 | dplyr::select(-estimate, -conf.low, -conf.high) 495 | 496 | knitr::kable( 497 | table_data, 498 | col.names = c( 499 | "Time (+min)", rep(c("N", "Mean", "SD"), 2), 500 | "Mean Difference (95%CI)" 501 | ), 502 | align = c("l", rep("c", ncol(table_data) - 1)) 503 | ) 504 | ``` 505 | 506 | 507 | 508 | |Time (+min) | N | Mean | SD | N | Mean | SD | Mean Difference (95%CI) | 509 | |:-----------|:--:|:----:|:----:|:--:|:----:|:----:|:-----------------------:| 510 | |0 | 15 | 8.20 | 1.26 | 16 | 8.38 | 0.96 | -0.18 (-1.01 to 0.66) | 511 | |5 | 15 | 6.47 | 1.92 | 16 | 7.00 | 1.67 | -0.53 (-1.86 to 0.8) | 512 | |10 | 15 | 4.60 | 1.55 | 16 | 6.00 | 1.75 | -1.4 (-2.61 to -0.19) | 513 | |15 | 15 | 3.67 | 1.91 | 16 | 5.50 | 2.25 | -1.83 (-3.37 to -0.3) | 514 | |20 | 15 | 3.20 | 1.93 | 16 | 5.06 | 2.17 | -1.86 (-3.37 to -0.35) | 515 | |30 | 15 | 2.53 | 1.85 | 16 | 4.81 | 2.29 | -2.28 (-3.8 to -0.76) | 516 | |60 | 15 | 1.93 | 1.49 | 16 | 3.81 | 2.37 | -1.88 (-3.33 to -0.43) | 517 | |120 | 15 | 1.67 | 1.72 | 16 | 3.31 | 1.96 | -1.65 (-3 to -0.29) | 518 | 519 | ### Challenge 520 | 521 | Try to use what you've seen in the last
two tutorials to 1) simulate some data (hint: see sim_t_test_data from the last tutorial) from a two-arm parallel trial with n = 40 where the effect of the new treatment on a normally distributed outcome (mean = 0, sd = 1) is 0.3 SDs worse than the standard treatment (i.e. higher scores are bad). Then test for non-inferiority using the TOST procedure based on a t-test with unequal variances and an NI margin of 0.4 SDs (hint: see tost_primary above). Then repeat your simulation many times to explore its performance. Finally, increase the sample size to 100 and repeat the exercise. 522 | 523 | -------------------------------------------------------------------------------- /Unit_2_Equivalance_Trials/Equivalance_trials_files/figure-html/new_interval_1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Unit_2_Equivalance_Trials/Equivalance_trials_files/figure-html/new_interval_1-1.png -------------------------------------------------------------------------------- /Unit_2_Equivalance_Trials/Equivalance_trials_files/figure-html/new_interval_2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Unit_2_Equivalance_Trials/Equivalance_trials_files/figure-html/new_interval_2-1.png -------------------------------------------------------------------------------- /Unit_2_Equivalance_Trials/Equivalance_trials_files/figure-html/non_inferiority-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Unit_2_Equivalance_Trials/Equivalance_trials_files/figure-html/non_inferiority-1.png -------------------------------------------------------------------------------- /Unit_2_Equivalance_Trials/Equivalance_trials_files/figure-html/non_superiority-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Unit_2_Equivalance_Trials/Equivalance_trials_files/figure-html/non_superiority-1.png -------------------------------------------------------------------------------- /Unit_2_Equivalance_Trials/Equivalance_trials_files/figure-html/plot_mean_diffs-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Unit_2_Equivalance_Trials/Equivalance_trials_files/figure-html/plot_mean_diffs-1.png -------------------------------------------------------------------------------- /Unit_2_Equivalance_Trials/Equivalance_trials_files/figure-html/plot_raw_pain_data-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Unit_2_Equivalance_Trials/Equivalance_trials_files/figure-html/plot_raw_pain_data-1.png -------------------------------------------------------------------------------- /Unit_2_Equivalance_Trials/Equivalance_trials_files/figure-html/shift-1.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Unit_2_Equivalance_Trials/Equivalance_trials_files/figure-html/shift-1.png -------------------------------------------------------------------------------- /Unit_2_Equivalance_Trials/Equivalance_trials_files/figure-html/shift_to_effect-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Unit_2_Equivalance_Trials/Equivalance_trials_files/figure-html/shift_to_effect-1.png -------------------------------------------------------------------------------- /Unit_2_Equivalance_Trials/Equivalance_trials_files/figure-html/start-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Unit_2_Equivalance_Trials/Equivalance_trials_files/figure-html/start-1.png -------------------------------------------------------------------------------- /Unit_2_Equivalance_Trials/Equivalance_trials_files/figure-html/t_two_sided-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Unit_2_Equivalance_Trials/Equivalance_trials_files/figure-html/t_two_sided-1.png -------------------------------------------------------------------------------- /Unit_2_Equivalance_Trials/Equivalance_trials_files/figure-html/tost_primary-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Unit_2_Equivalance_Trials/Equivalance_trials_files/figure-html/tost_primary-1.png -------------------------------------------------------------------------------- /Unit_2_Equivalance_Trials/data/data.csv: -------------------------------------------------------------------------------- 1 | subject,arm,a_painscore0,b_painscore5,c_painscore10,d_painscore15,e_painscore20,f_painscore30,g_painscore60,h_painscore120 2 | 101,Morphine,10,9,8,8,7,8,7,6 3 | 102,Fentanyl,9,5,3,3,2,2,1,2 4 | 103,Morphine,9,7,6,6,5,8,6,5 5 | 104,Fentanyl,9,7,3,1,1,0,2,0 6 | 105,Morphine,9,8,8,7,6,5,4,3 7 | 106,Morphine,8,3,2,2,1,1,0,0 8 | 107,Fentanyl,10,10,8,6,4,2,1,1 9 | 108,Fentanyl,7,7,6,5,3,3,1,0 10 | 109,Morphine,8,8,8,7,7,5,4,4 11 | 110,Fentanyl,10,8,7,7,6,6,5,4 12 | 201,Morphine,9,8,8,8,6,6,5,5 13 | 202,Fentanyl,10,9,4,2,2,1,2,1 14 | 203,Morphine,8,6,5,3,3,3,1,2 15 | 204,Fentanyl,7,4,4,1,0,0,0,0 16 | 205,Fentanyl,9,6,5,3,3,3,3,6 17 | 206,Morphine,7,5,5,6,7,3,0,0 18 | 207,Morphine,9,9,7,8,8,8,7,5 19 | 208,Fentanyl,7,4,3,3,4,2,2,1 20 | 209,Fentanyl,9,8,4,4,5,4,2,3 21 | 210,Morphine,10,8,6,5,5,4,4,3 22 | 301,Morphine,7,5,5,4,4,5,4,5 23 | 302,Fentanyl,7,4,4,4,4,4,1,1 24 | 303,Morphine,8,8,6,6,5,5,5,3 25 | 304,Morphine,8,8,6,6,6,6,6,5 26 | 305,Fentanyl,8,8,6,6,6,5,4,3 27 | 306,Fentanyl,7,5,4,4,3,2,1,1 28 | 307,Fentanyl,7,5,3,1,0,0,0,0 29 | 308,Morphine,9,6,3,0,0,0,0,0 30 | 309,Fentanyl,7,7,5,5,5,4,4,2 31 | 310,Morphine,8,8,7,6,6,5,4,4 32 | 401,Morphine,7,6,6,6,5,5,4,3 33 | -------------------------------------------------------------------------------- /Unit_2_Equivalance_Trials/plots/figure2.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Unit_2_Equivalance_Trials/plots/figure2.pdf -------------------------------------------------------------------------------- /Unit_2_Equivalance_Trials/plots/figure_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Unit_2_Equivalance_Trials/plots/figure_1.pdf -------------------------------------------------------------------------------- /Unit_2_Equivalance_Trials/plots/figure_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Unit_2_Equivalance_Trials/plots/figure_2.pdf -------------------------------------------------------------------------------- /Unit_2_Equivalance_Trials/scripts/ggplot2_primer.R: -------------------------------------------------------------------------------- 1 | ########################################################################### 2 | # EH6126: Advanced Clinical Trial Design and Analysis 3 | # Unit 2 4 | # Data analysis tutorial - ggplot primer 5 | ########################################################################### 6 | 7 | 8 | # Example using the graphics package that comes with base R -------------- 9 | # Consider the following... 10 | 11 | set.seed(27) 12 | 13 | x <- rnorm(20) # Numeric 14 | y <- rnorm(20) # Numeric 15 | 16 | plot(x, y) 17 | 18 | z <- lm(y ~ x) # z is the output of the linear model 19 | 20 | # Note that z is an object that contains a lot of information 21 | typeof(z) 22 | str(z) 23 | 24 | plot(z) 25 | 26 | 27 | # ggplot2 is loaded when the tidyverse package is called ------------------ 28 | 29 | library('tidyverse') 30 | 31 | # The key thing to note is that all elements of the plot can be modified 32 | 33 | # Let's look at an example data set that comes with ggplot 34 | 35 | diamonds <- diamonds 36 | 37 | # There are three main elements needed to generate a ggplot graph 38 | # A. The data 39 | # B. The geom function 40 | # C. The mappings 41 | 42 | # The first part of building a plot is the data 43 | 44 | plot_a <- ggplot(data = diamonds) # Creates a coordinate system 45 | plot_a # i.e.
an empty graph 46 | 47 | # The second part is defining the coordinate system; 48 | # - the aesthetics that the data are mapped onto 49 | # - the geoms (geometric objects) that represent the data 50 | 51 | plot_b <- plot_a + 52 | aes(x = carat, y = price) # In ggplot2 we add layers to develop the plot 53 | plot_b 54 | 55 | plot_c <- plot_b + 56 | geom_point() 57 | plot_c 58 | 59 | # Using 'facet_wrap' we can create subplots 60 | 61 | plot_c + 62 | facet_wrap(~ clarity) 63 | 64 | # What this means for this week's tutorial --------------------------------- 65 | # Darren creates the plot "g1" 66 | 67 | library('viridis') # Package for adding colour schemes 68 | 69 | g1 <- ggplot(data.frame(x = c(-4, 4)), # The data 70 | aes(x = x)) + # The x-axis values 71 | 72 | stat_function(fun = dnorm, # Probability density function for the normal distribution 73 | args = list(0, 1), # Mean 0 and SD of 1 74 | geom = "area", # Coloring under density curve 75 | fill = viridis(1), # What colour to use 76 | alpha = 0.3) + # Transparency to apply 77 | 78 | xlim(c(-4, 4)) + # x-axis limits 79 | xlab("z") + # x-axis label 80 | ylab("Density") + # y-axis label 81 | theme_minimal() + # plot formatting 82 | 83 | stat_function(xlim = c(1.96, 4), # Area +1.96 SD from mean to 4 SD from mean 84 | fun = dnorm, 85 | args = list(0, 1), 86 | geom = "area", 87 | fill = viridis(1)) + # Solid fill, no transparency 88 | stat_function(xlim = c(-4, -1.96), # Area -4 SD from mean to -1.96 SD from mean 89 | fun = dnorm, args = list(0, 1), 90 | geom = "area", 91 | fill = viridis(1)) + # Solid fill, no transparency 92 | 93 | geom_errorbarh(aes(x = 0, xmax = 1.96, xmin = -1.96, y = 0.1), # Defining our error bar 94 | height = 0.05, 95 | size = 2, 96 | color = viridis(1)) + 97 | 98 | geom_point(aes(x = 0, y = 0.1), 99 | size = 4, 100 | color = viridis(1)) + 101 | 102 | ggtitle(paste0("The proportion of the total area in the darker part of the distribution\n for t is ", 103 | signif(2 * pnorm(1.96, 0, 1, lower.tail = FALSE), 3))) 104 | 105 | # Added treatment effect -------------------------------------------------- 106 | 107 | g2 <- g1 + 108 | 109 | stat_function(fun = dnorm, 110 | args = list(2, 1), # This distribution has a mean of 2 111 | geom = "area", 112 | fill = viridis(1, direction = -1), # Defining a different colour 113 | alpha = 0.3) + 114 | 115 | xlim(c(-4, 6)) + 116 | xlab("z") + 117 | ylab("Density") + 118 | theme_minimal() + 119 | 120 | stat_function(xlim = c(2 + 1.96, 6), 121 | fun = dnorm, 122 | args = list(2, 1), 123 | geom = "area", 124 | fill = viridis(1, direction = -1)) + 125 | stat_function(xlim = c(-4, -1.96 + 2), 126 | fun = dnorm, 127 | args = list(2, 1), 128 | geom = "area", 129 | fill = viridis(1, direction = -1)) + 130 | 131 | geom_errorbarh(aes(x = 2, xmax = 2 + 1.96, xmin = 2 + -1.96, y = 0.15), 132 | height = 0.05, 133 | size = 2, 134 | color = viridis(1, direction = -1)) + 135 | geom_point(aes(x = 2, y = 0.15), 136 | size = 4, 137 | color = viridis(1, direction = -1)) + 138 | 139 | ggtitle("") 140 | 141 | g2 142 | 143 | -------------------------------------------------------------------------------- /Unit_3_Crossover_Trials/Crossover_trials.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Crossover Trials" 3 | output: 4 | html_document: 5 | keep_md: true 6 | --- 7 | 8 | # Introduction 9 | 10 | For this tutorial we will be working with a dataset from a standard 2-period, 2-treatment AB:BA crossover trial of a treatment aimed at lowering blood
pressure in people who usually have mildly-elevated values. In other words, each person in the trial gets exposed to each intervention (active vs placebo), but in one of two possible sequences (active first vs placebo first). 11 | 12 | # Getting the data ready 13 | 14 | As usual, we will first load the necessary packages and bring in the dataset. 15 | 16 | ```{r setup, include = FALSE} 17 | 18 | knitr::opts_chunk$set(message = FALSE, warning = FALSE) 19 | 20 | # Install/load packages 21 | 22 | packs <- c("tidyverse", "knitr", "viridis", "broom", "lme4", "sjPlot", 23 | "summarytools", "readr", "flextable") 24 | install.packages(packs[!packs %in% installed.packages()]) 25 | lapply(packs, library, character.only = TRUE) 26 | 27 | # Import the dataset from github 28 | data <- read_csv("https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/master/Unit_3_Crossover_Trials/data/data.csv") 29 | 30 | ``` 31 | 32 | Have a look at the dataset. 33 | 34 | ```{r inspect_data_1} 35 | 36 | # View(data) 37 | 38 | # view(dfSummary(data)) 39 | 40 | ``` 41 | 42 | 43 | ```{r dfsummary_table1, eval = knitr::opts_knit$get("rmarkdown.pandoc.to") == "html"} 44 | 45 | print(summarytools::dfSummary(data, style = "grid", plain.ascii = FALSE), 46 | method = "render") 47 | 48 | ``` 49 | 50 | ```{r} 51 | 52 | # We should add labels for sex 53 | 54 | data$sex <- factor(data$sex, labels = c("Males", "Females")) 55 | 56 | # Challenge: Based on the information available in the dataset, how could you 57 | # confirm that 0 = Males and 1 = Females, as I have coded it here? 58 | 59 | ``` 60 | 61 | 62 | We can see there are 4 SBP values per patient (row). These are the start and end values for each of the two periods. To visualize and analyze these data correctly, we need to convert the dataset so that it's "long", i.e. one row for each patient/time-point (more on why we do this below). 63 | 64 | ```{r reshape} 65 | 66 | # Being able to switch between wide and long datasets is important for both 67 | # modeling and plotting data. 68 | 69 | # "Wide" data: 70 | # ID Time1Measure T2M T3M T4M 71 | # 1 85 82 83 81 72 | # 2 45 96 32 99 73 | 74 | # "Long" data: 75 | # ID Time Measure 76 | # 1 1 85 77 | # 1 2 82 78 | # 1 3 83 79 | # 1 4 81 80 | 81 | # Reshape the data on 4 SBP values in order to plot the within period changes 82 | # by tx group, get missing values. See functions.R 83 | 84 | data_long <- gather(data, time, value, starts_with("sbp")) %>% 85 | select(subj_id, time, value, sequence, treatment_p1, treatment_p2, sex, 86 | everything()) %>% 87 | mutate(sequence = factor(sequence), treatment_p1 = factor(treatment_p1), 88 | treatment_p2 = factor(treatment_p2)) %>% 89 | arrange(subj_id) 90 | 91 | # view(dfSummary(data_long)) 92 | 93 | ``` 94 | 95 | Now you can see there are five timepoints (screening, plus the start and end values for each of the two periods), each with 83 observations, which is the number of study participants. 96 | 97 | Now let's clean up the data a bit. 98 | 99 | ```{r clean} 100 | 101 | data_long$time <- gsub("sbp__|sbp", "", data_long$time) # remove extraneous info 102 | 103 | # Just reordering the levels so they match time. This will help when we plot 104 | # the data.
105 | times <- c("b_p1", "ep_p1", "b_p2", "ep_p2") 106 | 107 | data_long <- mutate(data_long, time = factor(time, levels = times)) 108 | # table(data_long$time) 109 | 110 | # Create a new variable to reflect the period 111 | data_long$period[grepl("_p1", data_long$time)] <- "First" 112 | data_long$period[grepl("_p2", data_long$time)] <- "Second" 113 | 114 | # Create a new variable to reflect start (baseline) or end of period 115 | bp <- grepl("b_p", data_long$time) # Baseline times 116 | ep <- grepl("ep_p", data_long$time) # End times 117 | 118 | data_long$timing[bp] <- "Baseline" 119 | data_long$timing[ep] <- "EoP" 120 | 121 | # with(data_long, table(period, timing)) 122 | 123 | # These are the same info but we'll use them in the models below where I'll 124 | # explain why we want them split into 2 columns like this. 125 | data_long$bl[bp] <- data_long$value[bp] # Baseline SBPs 126 | data_long$ep[ep] <- data_long$value[ep] # End SBPs 127 | 128 | # Treatment indicator 129 | p1 <- data_long$period == "First" & !is.na(data_long$period) 130 | p2 <- data_long$period == "Second" & !is.na(data_long$period) 131 | data_long$tx[p1] <- data_long$treatment_p1[p1] 132 | data_long$tx[p2] <- data_long$treatment_p2[p2] 133 | 134 | data_long <- filter(data_long, !is.na(time)) %>% 135 | arrange(subj_id, period, timing) %>% 136 | select(subj_id, sequence, period, timing, tx, value, everything()) %>% 137 | mutate(period = factor(period), 138 | timing = factor(timing), 139 | tx = factor(tx)) 140 | 141 | data_long$time2 <- factor( 142 | data_long$time, 143 | labels = c("p1_b", "p1_ep", "p2_b", "p2_ep") 144 | ) 145 | 146 | # View(data_long) 147 | 148 | # view(dfSummary(data_long)) 149 | 150 | 151 | ``` 152 | 153 | Now plot the data in a way that shows the overall structure. 154 | 155 | ```{r cross_over_plot} 156 | 157 | ggplot(data_long, aes(x = time2, y = value, group = subj_id)) + 158 | 159 | geom_line(data = filter(data_long, as.numeric(time) < 3), alpha = 0.2, 160 | aes(color = treatment_p1)) + 161 | 162 | geom_smooth(data = filter(data_long, as.numeric(time) < 3), method = "lm", 163 | aes(color = treatment_p1, linetype = treatment_p1, 164 | group = treatment_p1), 165 | se = FALSE, size = 2) + 166 | 167 | geom_line(data = filter(data_long, as.numeric(time) > 2), alpha = 0.2, 168 | aes(color = treatment_p2)) + 169 | 170 | geom_smooth(data = filter(data_long, as.numeric(time) > 2), method = "lm", 171 | aes(color = treatment_p2, linetype = treatment_p2, 172 | group = treatment_p2), 173 | se = FALSE, size = 2) + 174 | 175 | scale_linetype(guide = FALSE) + 176 | 177 | theme_minimal() + 178 | 179 | scale_color_brewer("Tx", palette = "Set1") + 180 | 181 | theme(panel.grid.major = element_blank(), 182 | panel.grid.minor = element_blank()) + 183 | 184 | scale_x_discrete(labels = c("Start P1", "End P1", "Start P2", "End P2")) + 185 | 186 | xlab("") + 187 | ylab("SBP mmHg") 188 | 189 | ``` 190 | 191 | Here we can get a sense of the variability in SBP both within- and between-people. There seems to be quite a lot of both as it happens. We'll dig into this a bit more below. We can also see that the mean change for both groups in both periods is pretty similar. 
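We can put rough numbers on that last impression (a quick sketch of ours, using the variables created above):

```{r mean_change_check}

# Mean within-period change (end of period minus baseline) by period and tx
data_long %>%
  group_by(subj_id, period, tx) %>%
  summarise(change = value[timing == "EoP"] - value[timing == "Baseline"]) %>%
  group_by(period, tx) %>%
  summarise(mean_change = mean(change, na.rm = TRUE), n = n())

```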
192 | 193 | ```{r distribution_plot} 194 | 195 | # Distribution plot 196 | 197 | ggplot(data_long, aes(x = value, fill = tx, color = tx)) + 198 | geom_density(alpha = 0.7) + 199 | geom_rug() + 200 | scale_fill_brewer("Tx", palette = "Set1") + 201 | scale_color_brewer("Tx", palette = "Set1") + 202 | facet_wrap(~period + timing) + 203 | theme_minimal() + 204 | theme(panel.grid.major = element_blank(), 205 | panel.grid.minor = element_blank()) + 206 | ylab("") + 207 | xlab("SBP mmHg") 208 | 209 | ``` 210 | 211 | This is just another look at the overall distribution of SBP by period and treatment, showing a great deal of overlap. 212 | 213 | Note on long/wide data: 214 | 215 | A key concept in plotting with ggplot is that we map variables onto different aesthetics. Aesthetics are things like distances along the x and y axes, colors, and facets. We needed to convert our data from wide to long, because we can only map one variable onto any one aesthetic. So by going from wide to long, we turned our 4 time-specific SBP variables into 2 variables - one for time and one for the actual SBP value. Then we could map the SBP values to the y axis, and time to the x axis. As an exercise, you might sit with a pen and paper and try to figure out how to map 4 separate SBP variables onto a set of aesthetics that would make for a sensible plot (but don't take too much time, because you can't!). 216 | 217 | Being able to switch between long and wide also matters for modeling the data. For example, for a linear regression of an outcome where you want to also adjust for the baseline values of that variable, you need the data in a wide format, where each row is an observation (e.g. a patient) and there is only one row per observation. Thus the two measures at baseline and the end of the study are contained in two different columns. However, if we were doing a paired analysis, as we will below, we would need the data in a long format, where each row corresponds to a specific patient and time, and thus the information for the outcome and baseline measurements can be contained in a single column. 218 | 219 | # Between vs within-person variance 220 | 221 | The main advantage of a crossover trial is that you get to evaluate your treatment against the backdrop of the within-person variability of the outcome, which is usually smaller than the between-person variability that comes into play in a parallel trial. Remember, the variance is just the average squared distance between each measurement and the mean; and the larger it is, the more "spread out" the data are from the mean. 222 | 223 | First, let's look at the between-person variability for the 4 SBP measurements. 224 | 225 | ```{r} 226 | 227 | data_long %>% 228 | group_by(time) %>% 229 | summarise( 230 | n = n(), 231 | variance = var(value, na.rm = TRUE) 232 | ) 233 | 234 | ``` 235 | Now, let's look at all the within-person variances. There are 83 of them, so we'll plot them to make it easier to take them all in. I've also added reference lines at 150 and 210, the lowest and highest of the between-person variances above. 236 | 237 | ```{r} 238 | 239 | data_long %>% 240 | group_by(subj_id) %>% 241 | summarise( 242 | n = n(), 243 | variance = var(value, na.rm = TRUE) 244 | ) %>% 245 | ggplot(aes(x = variance)) + 246 | geom_histogram() + 247 | geom_vline(xintercept = c(150, 210)) 248 | 249 | 250 | ``` 251 | 252 | It should be clear that the within-person variances are well below the observed between-person variances.
253 | 254 | We can also look at this from the perspective of a linear model, and adjust for person (just like we might adjust for any other categorical variable). This model will thus report 82 (83 - 1) coefficients for person (i.e. person-specific intercepts, expressed relative to the first person). Take note of the R2 value, which is the % of the outcome variance explained. 255 | 256 | ```{r} 257 | 258 | lm(value ~ subj_id, data = data_long) %>% 259 | tab_model() 260 | 261 | ``` 262 | 263 | Once you've scrolled to the bottom, you'll see that the model explains 75% of the variability. Just like we saw in the [change score tutorial](https://github.com/CRFCSDAU/EH6126_data_analysis_tutorials/tree/master/Unit_1_Review/Change_scores), adjusting for person like this (we call this using a "fixed-effect" for person) essentially eliminates all of that variability, leaving less variability against which to try and detect our treatment effect. Below we will use what's called a random effect for person, which is just a more sophisticated way of doing this. 264 | 265 | # Tests 266 | 267 | Now we can model the effect of the treatment, though I suspect that you already have some idea of what it might be! Before we move on to full models though, let's have a look in terms of statistical tests - [namely the t-test and paired t-test](https://www.bmj.com/about-bmj/resources-readers/publications/statistics-square-one/7-t-tests). 268 | 269 | In this design we have a set of end-of-period SBP values for 2 periods. Thus every participant has 2 outcomes - one at the end of a period where they were on the active treatment and one where they were on the control. 270 | 271 | First let's ignore the fact that we have paired measures (2 per participant) and just use the good old t-test (since our outcome is continuously measured without any excessive skewness). 272 | 273 | ```{r} 274 | 275 | # Get rid of period-specific start values 276 | dta <- filter(data_long, timing == "EoP") 277 | 278 | t.test(dta$value[dta$tx == 1], dta$value[dta$tx == 2]) 279 | 280 | ``` 281 | 282 | Very quickly, what happens if we use a linear regression instead of the t-test? 283 | 284 | ```{r} 285 | 286 | summary(lm(value ~ tx, data = dta)) 287 | 288 | ``` 289 | 290 | Same! All tests can be recast as special cases of a linear model, which is why we focus on models so much. 291 | 292 | Now let's look at the paired t-test. This will respect the fact that we have multiple measurements (2) per person. 293 | 294 | ```{r} 295 | 296 | dta <- filter(data_long, timing == "EoP") %>% 297 | arrange(subj_id) 298 | # I am just making sure they are in the right order since the function for the 299 | # paired t-test just wants 2 vectors of values and *assumes* they are in the 300 | # same order. So we want to double-check this. 301 | 302 | # table(dta$subj_id[dta$tx == 1] == dta$subj_id[dta$tx == 2]) # All true 303 | 304 | t.test(dta$value[dta$tx == 1], dta$value[dta$tx == 2], paired = TRUE) 305 | 306 | ``` 307 | 308 | Same basic result (no difference in the means comparing the two sets of observations), but you can see that the CI is now much narrower. This is because the paired t-test is based on the within-participant difference in outcomes under each treatment regime. By taking differences like this, we eliminate the between-person variation. For example, the following two pairs of values [(150, 153); (89, 92)] are different in that one pair is clearly much larger than the other, but the within-pair difference is the same.
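You can check this intuition directly with those two made-up pairs. A minimal sketch (toy numbers, not from our trial data):

```{r toy_pairs}

# Two toy pairs of SBP values: (150, 153) and (89, 92)

toy <- data.frame(t1 = c(150, 89),  # e.g. SBP under control
                  t2 = c(153, 92))  # e.g. SBP under active treatment

sd(c(toy$t1, toy$t2)) # plenty of spread across the 4 raw values

sd(toy$t2 - toy$t1)   # but both within-pair differences are 3, so the SD is 0

```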
So let's make a simple table with the values under each treatment and their differences, by patient. 309 | 310 | ```{r} 311 | 312 | table <- data_frame( 313 | id = unique(dta$subj_id), 314 | tx_1_sbp = dta$value[dta$tx == 1], 315 | tx_2_sbp = dta$value[dta$tx == 2], 316 | diff = tx_2_sbp - tx_1_sbp 317 | ) 318 | 319 | flextable(table) 320 | 321 | ``` 322 | 323 | Now let's look at the standard errors of each of the 3 columns of outcome data (the SBP under each tx, and their difference). 324 | 325 | ```{r} 326 | 327 | map(table[2:4], function(x) sd(x, na.rm = TRUE)/sqrt(83)) 328 | 329 | ``` 330 | 331 | Challenge: Go back and compare those SE values to the CIs from the unpaired and paired t-tests, remembering that the upper limit of the 95% CI will be the estimate + (1.96 * SE). Do they match? 332 | 333 | This is the strength of the crossover design - a smaller SE, which means a more precise estimate of our treatment effect. 334 | 335 | 336 | # Models 337 | 338 | Now we've demonstrated the basics of why the crossover design works by looking at the paired t-test. However, we also want to incorporate other information, like covariates. To do this, we'll use what are called [mixed-effects models or multi-level models](https://en.wikipedia.org/wiki/Multilevel_model). They are pretty much just like the linear regression models you are used to, but they include an effect for "clustering", which is where you have more than one observation per unit (e.g. 2 measurements for each participant). Just like the regression model above, where we included a fixed effect for participant (effectively fitting a separate intercept for each patient), the mixed-effects model also removes the between-participant variability from the outcome, resulting in effect estimates with narrower SEs. 339 | 340 | Now we'll use these models to make different adjustments for things like prognostic covariates (sex), period-specific effects, and period-specific baseline (start) values. 341 | 342 | ```{r models} 343 | 344 | # Models 345 | 346 | # Re-configure the data so we can adjust for period-specific baselines if we 347 | # want to. 348 | 349 | me_sbp_df <- full_join( 350 | select(data_long, subj_id, sex, period, bl, tx) %>% filter(!is.na(bl)), 351 | select(data_long, subj_id, sex, period, ep, tx) %>% filter(!is.na(ep)), 352 | by = c("subj_id", "period", "tx", "sex") 353 | ) %>% 354 | mutate(bl = scale(bl, scale = FALSE)) 355 | 356 | 357 | # 4 models. Adjust for sex; + period; +tx*period interaction; +bl 358 | me_sbp <- lmer(ep ~ tx + sex + (1 | subj_id), 359 | data = me_sbp_df) 360 | me_sbp_p <- lmer(ep ~ tx + sex + period + (1 | subj_id), 361 | data = me_sbp_df) 362 | me_sbp_int <- lmer(ep ~ tx * period + sex + (1 | subj_id), 363 | data = me_sbp_df) 364 | me_sbp_bl <- lmer(ep ~ tx + sex + period + bl + (1 | subj_id), 365 | data = me_sbp_df) 366 | 367 | labs <- c("Intercept", "Active Tx (vs Control)", "Female (vs Male)", 368 | "Period (2 vs 1)") 369 | 370 | tab_model( 371 | me_sbp, me_sbp_p, me_sbp_bl, 372 | show.se = TRUE, 373 | p.val = "kr", 374 | pred.labels = c(labs, "SBP Baseline"), 375 | dv.labels = c("Unadjusted", "+ Period effect", "+ Baselines") 376 | ) 377 | 378 | ``` 379 | 380 | When interpreting these tables, the top half (everything above "Random Effects") is what you are more used to seeing in a regression table. These are the "fixed effects" for all the predictors in the model, and they are interpreted the same way you are used to: the difference in the mean outcome for a 1-unit increase in the predictor.
So based on the first model, women have a mean SBP that is 2.94 mmHg lower than men (though the estimate has a very wide CI and a large p-value). 381 | 382 | All of the stuff below "Random Effects" is what we call the variance components of the model. Looking at the first model, you see $\sigma^{2}$ and $\tau_{00}$. The latter of these, 144.89, is the variance explained by the random effect of patient (subj_id), while $\sigma^{2}$ is the residual variance (the outcome variance not explained by the patient-level random effect). The ratio of $\tau_{00}$ to the total variance ($\tau_{00} + \sigma^{2}$) is 0.71, which is labeled as the ICC, and which we can now interpret as the % of total variance explained by the patient-level effect. This is, not coincidentally, close to the 75% of variance explained in the regression model we looked at before with a fixed effect for person. 383 | 384 | Returning to the fixed effects of the model, they confirm what we probably should have expected from the plotting, which is that there was no appreciable effect on the outcome. But just for fun, now we will add an effect. We are going to make it pretty big, equal to -10 mmHg (almost a full SD of the observed SBP values), and add it to the active tx end-of-period values. 385 | 386 | ```{r} 387 | 388 | effect <- -10 389 | 390 | data_long$value_2 <- data_long$value 391 | 392 | tar <- data_long$tx == 2 & data_long$timing == "EoP" 393 | 394 | data_long$value_2[tar] <- data_long$value_2[tar] + effect 395 | 396 | data_long$ep_2 <- data_long$ep 397 | 398 | data_long$ep_2[tar] <- data_long$ep_2[tar] + effect 399 | 400 | ``` 401 | 402 | Now just repeat everything we did before, substituting value_2 for value. 403 | 404 | ```{r cross_over_plot_2} 405 | 406 | ggplot(data_long, aes(x = time2, y = value_2, group = subj_id)) + 407 | geom_line(data = filter(data_long, as.numeric(time) < 3), alpha = 0.2, 408 | aes(color = treatment_p1)) + 409 | geom_smooth(data = filter(data_long, as.numeric(time) < 3), method = "lm", 410 | aes(color = treatment_p1, linetype = treatment_p1, 411 | group = treatment_p1), 412 | se = FALSE, size = 2) + 413 | geom_line(data = filter(data_long, as.numeric(time) > 2), alpha = 0.2, 414 | aes(color = treatment_p2)) + 415 | geom_smooth(data = filter(data_long, as.numeric(time) > 2), method = "lm", 416 | aes(color = treatment_p2, linetype = treatment_p2, 417 | group = treatment_p2), 418 | se = FALSE, size = 2) + 419 | scale_linetype(guide = FALSE) + 420 | theme_minimal() + 421 | scale_color_brewer("Tx", palette = "Set1") + 422 | theme(panel.grid.major = element_blank(), 423 | panel.grid.minor = element_blank()) + 424 | scale_x_discrete(labels = c("Start P1", "End P1", "Start P2", "End P2")) + 425 | xlab("") + 426 | ylab("SBP mmHg") 427 | 428 | ``` 429 | 430 | 431 | ```{r distribution_plot_2} 432 | 433 | ggplot(data_long, aes(x = value_2, fill = tx, color = tx)) + 434 | geom_density(alpha = 0.7) + 435 | geom_rug() + 436 | scale_fill_brewer("Tx", palette = "Set1") + 437 | scale_color_brewer("Tx", palette = "Set1") + 438 | facet_wrap(~period + timing) + 439 | theme_minimal() + 440 | theme(panel.grid.major = element_blank(), 441 | panel.grid.minor = element_blank()) + 442 | ylab("") + 443 | xlab("SBP mmHg") 444 | 445 | ``` 446 | 447 | 448 | ```{r models_2} 449 | 450 | # Models 451 | 452 | # Re-configure the data so we can adjust for period-specific baselines if we 453 | # want to.
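# (To spell out what the join below does: it matches each subject-period's
# baseline value to the corresponding end-of-period value, joining on subject,
# period, treatment, and sex, so we end up with one row per subject-period;
# the baselines are then mean-centered with scale(). This mirrors the
# re-configuration used for the first set of models above.)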
454 | 455 | me_sbp_df <- full_join( 456 | select(data_long, subj_id, sex, period, bl, tx) %>% filter(!is.na(bl)), 457 | select(data_long, subj_id, sex, period, ep_2, tx) %>% filter(!is.na(ep_2)), 458 | by = c("subj_id", "period", "tx", "sex") 459 | ) %>% 460 | mutate(bl = scale(bl, scale = FALSE)) 461 | 462 | 463 | # 4 models. Adjust for sex; + period; +tx*period interaction; +bl 464 | me_sbp <- lmer(ep_2 ~ tx + sex + (1 | subj_id), 465 | data = me_sbp_df) 466 | me_sbp_p <- lmer(ep_2 ~ tx + sex + period + (1 | subj_id), 467 | data = me_sbp_df) 468 | me_sbp_int <- lmer(ep_2 ~ tx * period + sex + (1 | subj_id), 469 | data = me_sbp_df) 470 | me_sbp_bl <- lmer(ep_2 ~ tx + sex + period + bl + (1 | subj_id), 471 | data = me_sbp_df) 472 | 473 | labs <- c("Intercept", "Treatment", "Sex", "Period") 474 | 475 | tab_model( 476 | me_sbp, me_sbp_p, me_sbp_bl, 477 | p.val = "kr", 478 | pred.labels = c(labs, "SBP Baseline"), 479 | dv.labels = c("Unadjusted", "+ Period effect", "+ Baselines") 480 | ) 481 | 482 | ``` 483 | 484 | 485 | 486 | -------------------------------------------------------------------------------- /Unit_3_Crossover_Trials/Crossover_trials_files/figure-html/cross_over_plot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Unit_3_Crossover_Trials/Crossover_trials_files/figure-html/cross_over_plot-1.png -------------------------------------------------------------------------------- /Unit_3_Crossover_Trials/Crossover_trials_files/figure-html/cross_over_plot_2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Unit_3_Crossover_Trials/Crossover_trials_files/figure-html/cross_over_plot_2-1.png -------------------------------------------------------------------------------- /Unit_3_Crossover_Trials/Crossover_trials_files/figure-html/distribution_plot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Unit_3_Crossover_Trials/Crossover_trials_files/figure-html/distribution_plot-1.png -------------------------------------------------------------------------------- /Unit_3_Crossover_Trials/Crossover_trials_files/figure-html/distribution_plot_2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Unit_3_Crossover_Trials/Crossover_trials_files/figure-html/distribution_plot_2-1.png -------------------------------------------------------------------------------- /Unit_3_Crossover_Trials/Crossover_trials_files/figure-html/unnamed-chunk-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Unit_3_Crossover_Trials/Crossover_trials_files/figure-html/unnamed-chunk-3-1.png -------------------------------------------------------------------------------- /Unit_3_Crossover_Trials/data/data.csv: -------------------------------------------------------------------------------- 1 | 
subj_id,sequence,treatment_p1,treatment_p2,sex,age_screening,days_p1,days_p2,height_m,weight_kg_screen,bmi_screening,sbpscreening,dbpscreening,sbp__b_p1,sbp__ep_p1,sbp__b_p2,sbp__ep_p2 2 | R001,1,1,0,0,68,43,48,1.64,76,28.256989886972047,149,92,146,152,145,142 3 | R002,1,1,0,0,54,43,42,1.7,85.5,29.584775086505193,127,89,NA,129,130,130 4 | R003,1,1,0,0,61,46,35,1.76,97.5,31.475981404958677,131,79,140,134,140,128 5 | R004,1,1,0,0,60,42,43,1.78,88.5,27.93207928291882,146,96,141,166,150,166 6 | R005,0,0,1,0,63,35,42,1.73,87.3,29.169033378996957,140,82,146,143,149,133 7 | R006,0,0,1,0,65,42,39,1.73,74,24.725182932941294,145,87,134,140,145,145 8 | R007,1,1,0,0,56,41,44,1.7,85.8,29.688581314878896,148,90,139,137,143,141 9 | R008,1,1,0,0,60,43,42,1.67,72.2,25.888343074330383,130,67,131,106,126,117 10 | R009,1,1,0,0,55,41,48,1.71,94.1,32.18084196846893,153,84,153,152,141,145 11 | R010,1,1,0,0,50,43,40,1.84,91.8,27.114839319470697,133,84,141,131,136,120 12 | R011,1,1,0,0,60,44,49,1.78,87.3,27.553339224845345,139,87,149,135,148,147 13 | R012,1,1,0,0,47,53,48,1.69,86.1,30.14600329120129,132,71,142,138,131,144 14 | R013,0,0,1,0,57,35,42,1.81,90.9,27.74640578736913,155,105,160,178,159,174 15 | R014,1,1,0,0,59,34,42,1.73,75,25.05930702662969,154,104,154,149,170,149 16 | R015,0,0,1,0,56,36,42,1.7,78.1,27.024221453287197,141,91,140,138,140,128 17 | R016,0,0,1,0,58,37,38,1.69,80.9,28.32533874864326,141,95,160,158,151,151 18 | R017,0,0,1,0,65,49,42,1.74,73.1,24.144536926938827,130,88,142,146,134,131 19 | R018,0,0,1,0,63,50,43,1.82,82.2,24.815843497162177,135,88,130,128,129,133 20 | R019,1,1,0,0,46,50,44,1.81,81.9,24.9992368975306,146,103,129,134,122,130 21 | R020,1,1,0,0,56,42,42,1.82,96.8,29.223523729018233,129,79,141,145,135,141 22 | R021,1,1,0,0,55,43,47,1.7,83.6,28.92733564013841,131,89,123,116,116,115 23 | R022,0,0,1,0,64,42,40,1.78,74,23.355636914530994,138,85,120,124,131,128 24 | R023,0,0,1,0,51,42,42,1.74,99.7,32.930373893513014,134,77,126,123,117,125 25 | R024,1,1,0,0,63,43,42,1.69,74.6,26.119533629774867,142,89,139,163,152,155 26 | R025,0,0,1,1,51,42,39,1.63,87.9,33.08366893748354,148,94,136,133,130,148 27 | R026,1,1,0,1,60,39,37,1.5,65.7,29.200000000000003,128,71,109,110,116,118 28 | R027,1,1,0,1,67,43,42,1.6,95.3,37.22656249999999,138,97,131,145,114,129 29 | R028,0,0,1,1,62,42,37,1.62,76,28.959000152415786,137,89,143,128,133,153 30 | R029,1,1,0,1,57,68,NA,1.63,82.7,31.12650080921375,142,82,130,114,NA,NA 31 | R030,1,1,0,1,51,44,49,1.56,70,28.763971071663377,142,89,132,129,125,150 32 | R031,0,0,1,1,59,51,47,1.71,77.3,26.43548442255737,131,79,138,129,124,127 33 | R032,0,0,1,1,62,42,46,1.55,56.8,23.642039542143596,135,86,117,119,141,123 34 | R033,1,1,0,1,50,35,48,1.78,80,25.24933720489837,136,89,111,121,112,104 35 | R034,1,1,0,1,52,42,42,1.61,67.4,26.00208325296092,132,98,155,141,124,143 36 | R035,1,1,0,1,70,42,49,1.63,62.2,23.410741841996312,130,77,127,125,119,125 37 | R036,0,0,1,1,56,35,50,1.65,74.3,27.291092745638203,137,88,154,142,137,152 38 | R037,0,0,1,1,57,42,49,1.65,63.8,23.434343434343436,140,95,145,137,147,148 39 | R038,0,0,1,1,55,35,42,1.6,68.2,26.640624999999996,145,101,134,151,134,133 40 | R039,1,1,0,1,66,49,42,1.7,65.4,22.62975778546713,137,74,136,142,124,131 41 | R040,0,0,1,1,69,49,42,1.62,53.5,20.385611949397955,143,93,141,130,134,124 42 | R041,0,0,1,1,52,42,42,1.69,81.1,28.39536430797241,144,94,152,150,143,139 43 | R042,1,1,0,1,61,42,42,1.7,67.7,23.425605536332185,143,97,125,136,130,123 44 | R043,0,0,1,1,61,42,44,1.58,57.6,23.07322544464028,132,108,121,121,133,117 45 | 
R044,1,1,0,1,65,43,38,1.63,63.6,23.937671722684332,147,90,141,164,136,149 46 | R045,1,1,0,1,56,42,45,1.7,97,33.564013840830455,147,91,158,154,138,159 47 | R046,0,0,1,1,57,35,41,1.57,91.2,36.99947259523713,124,93,142,140,124,126 48 | R047,0,0,1,1,55,35,42,1.62,64.3,24.500838286846513,138,96,144,137,136,145 49 | R048,1,1,0,1,50,42,42,1.72,81.6,27.58247701460249,125,82,117,128,135,134 50 | R049,1,1,0,0,51,42,43,1.72,75.8,25.621957815035156,126,94,130,123,126,140 51 | R050,1,1,0,0,56,42,40,1.76,91,29.3775826446281,135,87,126,132,131,125 52 | R051,0,0,1,0,53,42,42,1.73,97.6,32.610511543987435,145,104,141,130,132,133 53 | R052,0,0,1,0,52,42,42,1.87,106,30.312562555406213,126,92,123,134,126,126 54 | R053,0,0,1,0,58,47,42,1.83,89.7,26.784914449520734,155,102,149,169,159,136 55 | R054,1,1,0,0,50,43,42,1.78,93.8,29.60484787274334,131,78,117,116,117,127 56 | R055,1,1,0,0,52,42,42,1.79,79.2,24.71832964014856,144,91,132,134,122,119 57 | R056,0,0,1,0,65,42,45,1.77,81.9,26.141913243320886,142,90,132,132,137,135 58 | R057,0,0,1,0,65,42,45,1.81,68.2,20.81743536522084,133,81,143,152,144,130 59 | R058,0,0,1,0,50,42,42,1.79,97.1,30.304921818919507,133,76,133,112,118,126 60 | R059,1,1,0,0,49,42,41,1.83,104.1,31.084833826032423,140,87,130,133,137,125 61 | R060,0,0,1,0,50,42,43,1.73,100.8,33.6797086437903,137,91,132,NA,113,124 62 | R061,0,0,1,0,45,42,42,1.82,106.3,32.09153483878758,156,105,155,145,159,143 63 | R062,0,0,1,1,62,48,41,1.64,67.5,25.09666864961333,149,94,134,156,156,163 64 | R063,1,1,0,0,57,42,42,1.78,81,25.5649539199596,152,96,142,145,138,155 65 | R064,0,0,1,1,63,42,48,1.58,80.2,32.1262618170165,147,92,133,145,135,152 66 | R065,0,0,1,0,59,40,42,1.8,102.9,31.75925925925926,140,92,137,136,143,141 67 | R066,1,1,0,0,54,42,42,1.8,99.2,30.61728395061728,163,100,147,146,148,146 68 | R067,0,0,1,1,56,42,42,1.64,64.8,24.092801903628796,135,84,121,124,130,111 69 | R068,0,0,1,0,54,42,44,1.72,89.3,30.18523526230395,146,99,155,146,148,143 70 | R069,1,1,0,0,61,37,34,1.8,96.4,29.753086419753085,171,92,166,171,152,169 71 | R070,0,0,1,0,66,42,42,1.75,80.1,26.155102040816324,144,81,138,131,129,128 72 | R071,0,0,1,0,63,43,40,1.71,85.3,29.171368968229544,166,112,167,143,144,148 73 | R072,1,1,0,1,67,42,39,1.49,70,31.530111256249718,145,90,132,143,142,144 74 | R073,1,1,0,1,66,42,42,1.54,71.5,30.148423005565864,126,81,125,130,134,146 75 | R074,0,0,1,1,64,41,42,1.49,60.3,27.160938696455112,170,96,158,151,159,169 76 | R075,0,0,1,1,47,41,44,1.76,71.2,22.985537190082646,130,83,139,128,137,127 77 | R076,1,1,0,1,56,42,42,1.67,58.4,20.94015561690989,128,88,132,134,145,134 78 | R077,0,0,1,1,50,38,33,1.6,72.9,28.476562499999996,137,89,132,126,123,120 79 | R078,1,1,0,1,65,42,47,1.66,81.4,29.53984613151401,141,82,156,144,137,132 80 | R079,0,0,1,1,64,44,43,1.59,62.7,24.801234128396818,129,86,130,120,140,125 81 | R080,0,0,1,1,68,41,42,1.68,63.7,22.56944444444445,133,83,113,107,118,133 82 | R081,1,1,0,1,49,42,47,1.61,72.2,27.853863662667333,153,97,131,144,126,134 83 | R082,1,1,0,0,58,46,45,1.79,93.3,29.11894135638713,152,97,140,137,146,133 84 | R083,0,0,1,0,54,43,47,1.79,91.3,28.494741112949033,141,79,129,134,130,124 85 | -------------------------------------------------------------------------------- /Unit_4_Cluster_randomized_trials/Cluster_randomized_trials.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Cluster randomized trials" 3 | output: 4 | html_document: 5 | keep_md: true 6 | --- 7 | 8 | ```{r setup, include = FALSE} 9 | 10 | knitr::opts_chunk$set(message = FALSE, 
                       warning = FALSE) 11 | 12 | # Install/load packages 13 | 14 | packs <- c("tidyverse", "knitr", "viridis", "broom", "pwr", "lme4", "sjPlot") # lme4/sjPlot are needed for lmer() and tab_model() below 15 | install.packages(packs[!packs %in% installed.packages()]) 16 | lapply(packs, library, character.only = TRUE) 17 | 18 | set.seed(1209) 19 | 20 | ``` 21 | 22 | ```{r independent} 23 | 24 | data_frame( 25 | color = rnorm(1000), 26 | x = rnorm(1000), 27 | y = rnorm(1000) 28 | ) %>% 29 | ggplot(aes(x, y, color = color)) + 30 | geom_point(size = 2) + 31 | xlab("") + 32 | ylab("") + 33 | scale_color_viridis(guide = FALSE) 34 | 35 | ``` 36 | 37 | ```{r dependent} 38 | 39 | data_frame( 40 | x = rnorm(1000), 41 | y = rnorm(1000), 42 | color = x + y 43 | ) %>% 44 | ggplot(aes(x, y, color = color)) + 45 | geom_point(size = 2) + 46 | xlab("") + 47 | ylab("") + 48 | scale_color_viridis(guide = FALSE) 49 | 50 | ``` 51 | 52 | ```{r clustered_data_1} 53 | 54 | df <- data_frame( 55 | cluster = rep(1:10, each = 100), 56 | y = rnorm(1000) 57 | ) 58 | 59 | ``` 60 | 61 | ```{r clustered_icc_1} 62 | 63 | tab_model(lmer(y ~ (1|cluster), data = df)) 64 | # summary(lmer(y ~ (1|cluster), data = df)) 65 | 66 | ``` 67 | 68 | ```{r clustered_plot_1} 69 | 70 | ggplot(df, aes(x = cluster, y = y, color = y)) + 71 | geom_point(size = 3, alpha = 0.6) + 72 | geom_line(aes(group = cluster), color = "grey50") + 73 | scale_color_viridis(guide = FALSE) + 74 | ylab("Outcome") + 75 | xlab("Cluster") + 76 | ggtitle("ICC = ~0") 77 | 78 | ``` 79 | 80 | ```{r clustered_data_2} 81 | 82 | df_2 <- data_frame( 83 | mean = rep(rnorm(10, 0, 2), each = 100), 84 | cluster = rep(1:10, each = 100), 85 | y = rnorm(1000) + mean 86 | ) 87 | 88 | ``` 89 | 90 | ```{r clustered_icc_2} 91 | 92 | tab_model(lmer(y ~ (1|cluster), data = df_2)) 93 | # summary(lmer(y ~ (1|cluster), data = df_2)) 94 | 95 | ``` 96 | 97 | ```{r clustered_plot_2} 98 | ggplot(df_2, aes(x = cluster, y = y, color = y)) + 99 | geom_point(size = 3, alpha = 0.6) + 100 | geom_line(aes(group = cluster), color = "grey50") + 101 | scale_color_viridis(guide = FALSE) + 102 | ylab("Outcome") + 103 | xlab("Cluster") + 104 | ggtitle("ICC = 0.58") 105 | 106 | ``` 107 | 108 | ```{r simple_power} 109 | 110 | pwr.t.test(d = 0.5, sig.level = .05, power = 0.90) 111 | 112 | ``` 113 | ```{r sim_sample_data} 114 | 115 | sample <- data_frame( 116 | rho = rep(c(0.01, 0.02, 0.05), each = 20), 117 | m = rep(seq(5, 100, by = 5), times = 3), 118 | de = 1 + (m - 1)*rho, 119 | n = 190, 120 | clusters = n * de / m, 121 | samp = n * de 122 | ) 123 | 124 | ``` 125 | 126 | ```{r de_plot} 127 | 128 | ggplot(sample, aes(y = samp, x = m, color = clusters, group = rho, 129 | shape = factor(rho))) + 130 | geom_line() + 131 | geom_point(size = 3) + 132 | theme_minimal() + 133 | scale_shape("ICC") + 134 | scale_x_continuous(breaks = seq(5, 100, by = 5)) + 135 | scale_color_viridis("Clusters needed", direction = -1) + 136 | ggtitle("Randomized design n = 190") + 137 | ylab("Cluster RCT n") + 138 | xlab("Average cluster size") + 139 | theme(panel.grid.minor.x = element_blank()) 140 | 141 | ``` 142 | 143 | 144 | -------------------------------------------------------------------------------- /Unit_4_Cluster_randomized_trials/Cluster_randomized_trials.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Cluster randomized trials" 3 | output: 4 | html_document: 5 | keep_md: true 6 | --- 7 | 8 | 9 | 10 | --------------------------------------------------------------------------------
/Unit_4_Cluster_randomized_trials/Cluster_randomized_trials_files/figure-html/dependent-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Unit_4_Cluster_randomized_trials/Cluster_randomized_trials_files/figure-html/dependent-1.png -------------------------------------------------------------------------------- /Unit_4_Cluster_randomized_trials/Cluster_randomized_trials_files/figure-html/independent-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/Unit_4_Cluster_randomized_trials/Cluster_randomized_trials_files/figure-html/independent-1.png -------------------------------------------------------------------------------- /collapsability/collapsability.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Alerts" 3 | author: '' 4 | date: '' 5 | output: 6 | html_document: 7 | code_download: yes 8 | df_print: paged 9 | keep_md: yes 10 | theme: flatly 11 | toc: yes 12 | toc_float: yes 13 | pdf_document: 14 | toc: yes 15 | --- 16 | 17 | ```{r setup, include = FALSE} 18 | 19 | knitr::opts_chunk$set( 20 | echo = FALSE, message = FALSE, warning = FALSE, fig.width = 6 * 1.67, 21 | fig.height = 6 22 | ) 23 | 24 | ``` 25 | 26 | ```{r} 27 | proj <- rprojroot::find_rstudio_root_file() 28 | # knitr::opts_knit$set(root.dir = proj) 29 | ``` 30 | 31 | 32 | ```{r packages} 33 | 34 | library(tidyverse) 35 | # library(mgcv) 36 | # library(lubridate) 37 | # library(readxl) 38 | # library(ggplot2) 39 | # library(ggrepel) 40 | # library(janitor) 41 | # library(viridis) 42 | # library(surveillance) 43 | # library(testthat) 44 | # library(spdep) 45 | # library(qcc) 46 | # library(segmented) 47 | # library(prediction) 48 | # library(MASS); select <- dplyr::select 49 | 50 | ``` 51 | 52 | # Introduction 53 | 54 | - Binary outcomes 55 | 56 | - Effects of interventions on binary outcomes 57 | 58 | - Different ways of representing potential effects 59 | 60 | - Risk difference 61 | 62 | - Risk ratio 63 | 64 | - Odds ratio 65 | 66 | # Set-up 67 | 68 | Y is a binary outcome (0, 1) 69 | 70 | X is a binary indicator for treatment (0, 1) that is randomly allocated with a fair coin (50% of either outcome). 
71 | 72 | C is a covariate uniformly distributed between -10 and 10 73 | 74 | We want to know the probability of Y based on the logistic model of: 75 | 76 | logit(Prob(Y\|X, C)) = log(10)\*X + C 77 | 78 | Prob = Odds / (1 + Odds) `r 1 / (1+1)` Odds = Prob / (1 - Prob) `r 0.5 / (1 - 0.5)` 79 | 80 | ```{r, echo = TRUE} 81 | 82 | to_odds <- function(prob){prob / (1 - prob)} 83 | to_prob <- function(odds){odds / (1 + odds)} 84 | logit <- function(prob){log(prob / (1 - prob))} 85 | invlogit <- function(log_odds){to_prob(exp(log_odds))} 86 | 87 | invlogit(logit(0.99)) == 0.99 88 | 89 | ``` 90 | 91 | ```{r, echo = TRUE} 92 | 93 | cov_1 <- seq(-10, 10, 0.1) 94 | 95 | df <- data_frame( 96 | arm = c(rep(0, length(cov_1)), rep(1, length(cov_1))), 97 | cov_1 = rep(cov_1, 2), 98 | y = log(10) * arm + cov_1 99 | ) %>% 100 | mutate(arm = factor(arm, labels = c("Control", "Active"))) 101 | 102 | ``` 103 | 104 | ```{r} 105 | 106 | ggplot(df, aes(x = cov_1, y = y, group = arm, color = arm)) + 107 | geom_line(size = 1) + 108 | scale_color_brewer("", palette = "Set1") + 109 | ylab("log odds of Y (i.e. logit(Y))") + 110 | xlab("Covariate value") + 111 | geom_segment( 112 | x = -5, y = -5, 113 | xend = -5, yend = -5 + log(10), 114 | color = "black", linetype = "dashed" 115 | ) + 116 | theme_minimal() 117 | 118 | ``` 119 | 120 | The vertical difference between the 2 lines is log(10) or \~ 2.3. 121 | 122 | ```{r} 123 | 124 | ggplot(df, aes(x = cov_1, y = invlogit(y), group = arm, color = arm)) + 125 | geom_line(size = 1) + 126 | scale_color_brewer("", palette = "Set1") + 127 | ylab("P(Y)") + 128 | xlab("Covariate value") + 129 | geom_segment( 130 | x = -5, y = invlogit(-5), # the same contrast, now on the probability scale 131 | xend = -5, yend = invlogit(-5 + log(10)), 132 | color = "black", linetype = "dashed" 133 | ) + 134 | theme_minimal() 135 | 136 | ``` 137 | 138 | ```{r, fig.width = 8} 139 | 140 | # knitr::include_graphics(paste0(proj, "/collapsability/figs/fig_1_a.JPG")) 141 | knitr::include_graphics("figs/fig_1_a.JPG") 142 | 143 | ``` 144 | 145 | 146 | ```{r, echo = TRUE} 147 | 148 | give_or <- function(covariate){ 149 | #odds x = 1 150 | (invlogit(covariate + log(10))/(1-invlogit(covariate + log(10)))) / 151 | #odds x = 0 152 | (invlogit(covariate)/(1-invlogit(covariate))) 153 | } 154 | 155 | give_or(-3) 156 | give_or(2) 157 | 158 | 159 | ``` 160 | 161 | ```{r, fig.width = 8} 162 | 163 | #knitr::include_graphics(paste0(proj, "/collapsability/figs/fig_1_b.JPG")) 164 | knitr::include_graphics("figs/fig_1_b.JPG") 165 | 166 | ``` 167 | 168 | ```{r, echo = TRUE} 169 | 170 | df <- data_frame( 171 | # P(Y|X = 0, C) 172 | x = invlogit(cov_1), 173 | # P(Y|X = 1, C) 174 | y = invlogit(cov_1 + log(10)) 175 | ) 176 | 177 | df %>% 178 | ggplot(aes(x, y)) + 179 | geom_line() + 180 | geom_abline() + 181 | theme_minimal() + 182 | theme(aspect.ratio = 1) + 183 | xlab("P(Y|X = 0, C)") + 184 | ylab("P(Y|X = 1, C)") 185 | 186 | 187 | 188 | ``` 189 | 190 | 191 | ```{r} 192 | 193 | # knitr::include_graphics(paste0(proj, "/collapsability/figs/eq_1.JPG")) 194 | knitr::include_graphics("figs/eq_1.JPG") 195 | 196 | 197 | ``` 198 | 199 | 200 | g(x) is a probability, because it's odds/(1 + odds). 201 | So logit(x) + log(10) must be a log odds, since it is exponentiated to an odds. 202 | So x must be a probability: the probability of Y given C. 203 | So g(x) maps x = P(Y = 1|X = 0, C) to g(x) = P(Y = 1|X = 1, C) 204 | 205 | ```{r, echo = TRUE} 206 | 207 | df <- data_frame( 208 | # P(Y|X = 0, C) 209 | x = invlogit(cov_1), 210 | # P(Y|X = 1, C) 211 | y = invlogit(cov_1 + log(10)), 212 | g_x = 10*x / (1 + 9*x) # Add
g(x) 213 | ) 214 | 215 | df %>% 216 | ggplot(aes(x, g_x)) + 217 | geom_line() + 218 | geom_abline() + 219 | theme_minimal() + 220 | theme(aspect.ratio = 1) + 221 | xlab("P(Y|X = 0, C)") + 222 | ylab("P(Y|X = 1, C)") 223 | 224 | ``` 225 | 226 | ```{r} 227 | 228 | with(df, plot(y, g_x)) # It's the same 229 | 230 | ``` 231 | 232 | -------------------------------------------------------------------------------- /collapsability/collapsability.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Alerts" 3 | author: '' 4 | date: '' 5 | output: 6 | html_document: 7 | code_download: yes 8 | df_print: paged 9 | keep_md: yes 10 | theme: flatly 11 | toc: yes 12 | toc_float: yes 13 | pdf_document: 14 | toc: yes 15 | --- 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | # Introduction 25 | 26 | - Binary outcomes 27 | 28 | - Effects of interventions on binary outcomes 29 | 30 | - Different ways of representing potential effects 31 | 32 | - Risk difference 33 | 34 | - Risk ratio 35 | 36 | - Odds ratio 37 | 38 | # Set-up 39 | 40 | Y is a binary outcome (0, 1) 41 | 42 | X is a binary indicator for treatment (0, 1) that is randomly allocated with a fair coin (50% of either outcome). 43 | 44 | C is a covariate uniformly distributed between -10 and 10 45 | 46 | We want to know the probability of Y based on the logistic model of: 47 | 48 | logit(Prob(Y\|X, C)) = log(10)\*X + C 49 | 50 | Prob = Odds / (1 + Odds) 0.5 Odds = Prob / (1 - Prob) 1 51 | 52 | 53 | ```r 54 | to_odds <- function(prob){prob / (1 - prob)} 55 | to_prob <- function(odds){odds / (1 + odds)} 56 | logit <- function(prob){log(prob / (1 - prob))} 57 | invlogit <- function(log_odds){to_prob(exp(log_odds))} 58 | 59 | invlogit(logit(0.99)) == 0.99 60 | ``` 61 | 62 | ``` 63 | ## [1] TRUE 64 | ``` 65 | 66 | 67 | ```r 68 | cov_1 <- seq(-10, 10, 0.1) 69 | 70 | df <- data_frame( 71 | arm = c(rep(0, length(cov_1)), rep(1, length(cov_1))), 72 | cov_1 = rep(cov_1, 2), 73 | y = log(10) * arm + cov_1 74 | ) %>% 75 | mutate(arm = factor(arm, labels = c("Control", "Active"))) 76 | ``` 77 | 78 | ![](collapsability_files/figure-html/unnamed-chunk-4-1.png) 79 | 80 | The vertical difference between the 2 lines is log(10) or \~ 2.3.
81 | 82 | ![](collapsability_files/figure-html/unnamed-chunk-5-1.png) 83 | 84 | 85 | 86 | 87 | 88 | ```r 89 | give_or <- function(covariate){ 90 | #odds x = 1 91 | (invlogit(covariate + log(10))/(1-invlogit(covariate + log(10)))) / 92 | #odds x = 0 93 | (invlogit(covariate)/(1-invlogit(covariate))) 94 | } 95 | 96 | give_or(-3) 97 | ``` 98 | 99 | ``` 100 | ## [1] 10 101 | ``` 102 | 103 | ```r 104 | give_or(2) 105 | ``` 106 | 107 | ``` 108 | ## [1] 10 109 | ``` 110 | 111 | 112 | 113 | 114 | ```r 115 | df <- data_frame( 116 | # P(Y|X = 0, C) 117 | x = invlogit(cov_1), 118 | # P(Y|X = 1, C) 119 | y = invlogit(cov_1 + log(10)) 120 | ) 121 | 122 | df %>% 123 | ggplot(aes(x, y)) + 124 | geom_line() + 125 | geom_abline() + 126 | theme_minimal() + 127 | theme(aspect.ratio = 1) + 128 | xlab("P(Y|X = 0, C)") + 129 | ylab("P(Y|X = 1, C)") 130 | ``` 131 | 132 | ![](collapsability_files/figure-html/unnamed-chunk-9-1.png) 133 | 134 | 135 | 136 | 137 | 138 | g(x) is a probability, because it's odds/(1 + odds). 139 | So logit(x) + log(10) must be a log odds, since it is exponentiated to an odds. 140 | So x must be a probability: the probability of Y given C. 141 | So g(x) maps x = P(Y = 1|X = 0, C) to g(x) = P(Y = 1|X = 1, C) 142 | 143 | 144 | ```r 145 | df <- data_frame( 146 | # P(Y|X = 0, C) 147 | x = invlogit(cov_1), 148 | # P(Y|X = 1, C) 149 | y = invlogit(cov_1 + log(10)), 150 | g_x = 10*x / (1 + 9*x) # Add g(x) 151 | ) 152 | 153 | df %>% 154 | ggplot(aes(x, g_x)) + 155 | geom_line() + 156 | geom_abline() + 157 | theme_minimal() + 158 | theme(aspect.ratio = 1) + 159 | xlab("P(Y|X = 0, C)") + 160 | ylab("P(Y|X = 1, C)") 161 | ``` 162 | 163 | ![](collapsability_files/figure-html/unnamed-chunk-11-1.png) 164 | 165 | ![](collapsability_files/figure-html/unnamed-chunk-12-1.png) 166 | 167 | -------------------------------------------------------------------------------- /collapsability/collapsability_files/figure-html/unnamed-chunk-11-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/collapsability/collapsability_files/figure-html/unnamed-chunk-11-1.png -------------------------------------------------------------------------------- /collapsability/collapsability_files/figure-html/unnamed-chunk-12-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/collapsability/collapsability_files/figure-html/unnamed-chunk-12-1.png -------------------------------------------------------------------------------- /collapsability/collapsability_files/figure-html/unnamed-chunk-4-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/collapsability/collapsability_files/figure-html/unnamed-chunk-4-1.png -------------------------------------------------------------------------------- /collapsability/collapsability_files/figure-html/unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/collapsability/collapsability_files/figure-html/unnamed-chunk-5-1.png --------------------------------------------------------------------------------
/collapsability/collapsability_files/figure-html/unnamed-chunk-9-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/collapsability/collapsability_files/figure-html/unnamed-chunk-9-1.png -------------------------------------------------------------------------------- /collapsability/figs/eq_1.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/collapsability/figs/eq_1.JPG -------------------------------------------------------------------------------- /collapsability/figs/fig_1_a.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/collapsability/figs/fig_1_a.JPG -------------------------------------------------------------------------------- /collapsability/figs/fig_1_b.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/collapsability/figs/fig_1_b.JPG -------------------------------------------------------------------------------- /testing_testing/.Rproj.user/1503004/pcs/files-pane.pper: -------------------------------------------------------------------------------- 1 | { 2 | "path" : "~/R_Users_Workshop/PG_module/materials/PG6030_materials/Pre-workshop", 3 | "sortOrder" : [ 4 | { 5 | "ascending" : true, 6 | "columnIndex" : 2 7 | } 8 | ] 9 | } -------------------------------------------------------------------------------- /testing_testing/.Rproj.user/1503004/pcs/packages-pane.pper: -------------------------------------------------------------------------------- 1 | { 2 | "installOptions" : { 3 | "installDependencies" : true, 4 | "installFromRepository" : true, 5 | "libraryPath" : "C:/Users/bpalmer/Documents/R/win-library/3.6" 6 | } 7 | } -------------------------------------------------------------------------------- /testing_testing/.Rproj.user/1503004/pcs/source-pane.pper: -------------------------------------------------------------------------------- 1 | { 2 | "activeTab" : -1 3 | } -------------------------------------------------------------------------------- /testing_testing/.Rproj.user/1503004/pcs/windowlayoutstate.pper: -------------------------------------------------------------------------------- 1 | { 2 | "left" : { 3 | "panelheight" : 848, 4 | "splitterpos" : 354, 5 | "topwindowstate" : "HIDE", 6 | "windowheight" : 886 7 | }, 8 | "right" : { 9 | "panelheight" : 848, 10 | "splitterpos" : 531, 11 | "topwindowstate" : "NORMAL", 12 | "windowheight" : 886 13 | } 14 | } -------------------------------------------------------------------------------- /testing_testing/.Rproj.user/1503004/pcs/workbench-pane.pper: -------------------------------------------------------------------------------- 1 | { 2 | "TabSet1" : 0, 3 | "TabSet2" : 1, 4 | "TabZoom" : { 5 | } 6 | } -------------------------------------------------------------------------------- /testing_testing/.Rproj.user/1503004/rmd-outputs: -------------------------------------------------------------------------------- 1 | ~/R_Users_Workshop/PG_module/course_notes/R-A_Hitchhikers_Guide_to_Reproducible_Research/Pre-workshop/README.docx 2 | 3 
| 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /testing_testing/.Rproj.user/1503004/saved_source_markers: -------------------------------------------------------------------------------- 1 | {"active_set":"","sets":[]} -------------------------------------------------------------------------------- /testing_testing/.Rproj.user/1503004/sources/prop/28A47032: -------------------------------------------------------------------------------- 1 | { 2 | "cursorPosition" : "67,0", 3 | "scrollLine" : "50" 4 | } -------------------------------------------------------------------------------- /testing_testing/.Rproj.user/1503004/sources/prop/36475FDC: -------------------------------------------------------------------------------- 1 | { 2 | "cursorPosition" : "0,0", 3 | "scrollLine" : "0" 4 | } -------------------------------------------------------------------------------- /testing_testing/.Rproj.user/1503004/sources/prop/5318C6C7: -------------------------------------------------------------------------------- 1 | { 2 | "cursorPosition" : "31,6", 3 | "scrollLine" : "24" 4 | } -------------------------------------------------------------------------------- /testing_testing/.Rproj.user/1503004/sources/prop/6937C0F1: -------------------------------------------------------------------------------- 1 | { 2 | "cursorPosition" : "67,0", 3 | "scrollLine" : "52" 4 | } -------------------------------------------------------------------------------- /testing_testing/.Rproj.user/1503004/sources/prop/6FDF2087: -------------------------------------------------------------------------------- 1 | { 2 | "cursorPosition" : "6,0", 3 | "scrollLine" : "0" 4 | } -------------------------------------------------------------------------------- /testing_testing/.Rproj.user/1503004/sources/prop/B5ECED0D: -------------------------------------------------------------------------------- 1 | { 2 | "cursorPosition" : "67,0", 3 | "scrollLine" : "53" 4 | } -------------------------------------------------------------------------------- /testing_testing/.Rproj.user/1503004/sources/prop/B65F4002: -------------------------------------------------------------------------------- 1 | { 2 | "cursorPosition" : "48,20", 3 | "scrollLine" : "30" 4 | } -------------------------------------------------------------------------------- /testing_testing/.Rproj.user/1503004/sources/prop/E01D0251: -------------------------------------------------------------------------------- 1 | { 2 | "cursorPosition" : "67,0", 3 | "scrollLine" : "50" 4 | } -------------------------------------------------------------------------------- /testing_testing/.Rproj.user/1503004/sources/prop/INDEX: -------------------------------------------------------------------------------- 1 | ~%2FMSc_Clinical_Trials%2FEH6126_Advanced_Trial_Design_and_Analysis%2FEH6126_data_analysis_tutorials%2Fgetting_started%2Fscripts%2Ftest_script.R="6937C0F1" 2 | ~%2FR_Users_Workshop%2FPG_module%2Fcourse_notes%2FPG6030_materials%2FPre-workshop%2Fscripts%2Ftest_script.R="E01D0251" 3 | ~%2FR_Users_Workshop%2FPG_module%2Fcourse_notes%2FR-A_Hitchhikers_Guide_to_Reproducible_Research%2FDay_1%2Fscripts%2F01_baseR_introduction.R="36475FDC" 4 | ~%2FR_Users_Workshop%2FPG_module%2Fcourse_notes%2FR-A_Hitchhikers_Guide_to_Reproducible_Research%2FDay_2%2Fscripts%2Fpractice_transforming_data_solutions.R="5318C6C7" 5 | ~%2FR_Users_Workshop%2FPG_module%2Fcourse_notes%2FR-A_Hitchhikers_Guide_to_Reproducible_Research%2FPre-workshop%2FREADME.md="6FDF2087" 6 | 
~%2FR_Users_Workshop%2FPG_module%2Fcourse_notes%2FR-A_Hitchhikers_Guide_to_Reproducible_Research%2FPre-workshop%2Fscripts%2Ftest_script.R="28A47032" 7 | ~%2FR_Users_Workshop%2FPG_module%2Fcourse_notes%2FR-A_Hitchhikers_Guide_to_Reproducible_Research%2FPre-workshop%2Ftest_script.R="B65F4002" 8 | ~%2FR_Users_Workshop%2FPG_module%2Fmaterials%2FPG6030_materials%2FPre-workshop%2Fscripts%2Ftest_script.R="B5ECED0D" 9 | -------------------------------------------------------------------------------- /testing_testing/.Rproj.user/shared/notebooks/patch-chunk-names: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/testing_testing/.Rproj.user/shared/notebooks/patch-chunk-names -------------------------------------------------------------------------------- /testing_testing/.Rproj.user/shared/notebooks/paths: -------------------------------------------------------------------------------- 1 | C:/Users/bpalmer/Documents/MSc_Clinical_Trials/EH6126_Advanced_Trial_Design_and_Analysis/EH6126_data_analysis_tutorials/getting_started/scripts/test_script.R="B80A1110" 2 | C:/Users/bpalmer/Documents/R_Users_Workshop/PG_module/course_notes/R-A_Hitchhikers_Guide_to_Reproducible_Research/Day_1/scripts/01_baseR_introduction.R="96CB15CE" 3 | C:/Users/bpalmer/Documents/R_Users_Workshop/PG_module/course_notes/R-A_Hitchhikers_Guide_to_Reproducible_Research/Day_2/scripts/practice_transforming_data_solutions.R="2714BCE2" 4 | C:/Users/bpalmer/Documents/R_Users_Workshop/PG_module/course_notes/R-A_Hitchhikers_Guide_to_Reproducible_Research/Pre-workshop/README.md="EE3555E7" 5 | C:/Users/bpalmer/Documents/R_Users_Workshop/PG_module/course_notes/R-A_Hitchhikers_Guide_to_Reproducible_Research/Pre-workshop/scripts/test_script.R="6267F760" 6 | -------------------------------------------------------------------------------- /testing_testing/README.md: -------------------------------------------------------------------------------- 1 | # EH6126 - Advanced Clinical Trial Design and Analysis 2 | 3 | ## Testing, testing, 1, 2, 3, testing 4 | 5 | Welcome to EH6126. As part of the module we will provide some data analysis examples delivered through the [R programming language](https://cran.r-project.org/) whilst using the [RStudio GUI](https://rstudio.com/). While most of the content will be demonstrated in the tutorial videos, we will also provide you with an opportunity to perform the analysis yourself, if you wish to dip your toes in the warm waters of R. 6 | 7 | You have two main options at this point: 8 | 9 | 1. Download R and RStudio on your own machine 10 | 2. Use the [RStudio Cloud](https://rstudio.cloud/) web interface 11 | 12 | Use the introductory video to help make this decision. At the time of writing, the current R release is 4.0.2 ("Taking Off Again"). 13 | 14 | **Note:** If you already have R installed but need to update it, follow the steps provided in the R_update_presentation.pdf that is located in the `docs` folder. Best to do this via the R GUI, rather than in the RStudio GUI. 15 | 16 | ## Once R and RStudio are installed, check that the following works... 17 | 18 | **In the RStudio GUI, open the testing_testing R-project file (.Rproj)** 19 | 20 | + Go to the `File` tab 21 | + Open the R Project file (`testing_testing.Rproj`) 22 | 23 | **Next, open and run the `test_script.R` R script.** 24 | 25 | + Go to the `File` tab 26 | + `Open File`...
27 | + Open the `scripts/test_script.R` file 28 | 29 | Click on the run button to run each line of code. If your RStudio session window mirrors that in the `successful_test.png` file located in the `plots` folder, GREAT! You're ready to go. -------------------------------------------------------------------------------- /testing_testing/docs/R-RStudio_installation_steps.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/testing_testing/docs/R-RStudio_installation_steps.pdf -------------------------------------------------------------------------------- /testing_testing/docs/R_update_presentation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/testing_testing/docs/R_update_presentation.pdf -------------------------------------------------------------------------------- /testing_testing/plots/successful_test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CRFCSDAU/EH6126_data_analysis_tutorials/40dd88e89d5e1ea02198e6c21ff343df78ad1acc/testing_testing/plots/successful_test.png -------------------------------------------------------------------------------- /testing_testing/scripts/test_script.R: -------------------------------------------------------------------------------- 1 | ########################################################################### 2 | # EH6126 R setup 3 | # Pre-tutorial to-do steps 4 | ########################################################################### 5 | 6 | # Install the packages you'll require for the workshop 7 | 8 | # First we create an object with a list of the packages that we'll need 9 | 10 | list.of.packages <- c('tidyverse', 'cowplot', 'datapasta', 'janitor', 11 | 'igraph', 'installr', 'knitr', 'kableExtra', 'MASS', 12 | 'plotly', 'patchwork', 'reprex', 'summarytools', 13 | 'viridis', 'pwr', 'ggbeeswarm', 'TOSTER', 'lme4', 14 | 'sjPlot') 15 | 16 | # Now we will check to see if any of the packages required are not yet on our system 17 | 18 | new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])] 19 | 20 | # Any package missing will be added to the ‘new.packages’ object 21 | # which can then be used to install any missing ones 22 | 23 | if(length(new.packages)) install.packages(new.packages) 24 | 25 | # Load the tidyverse package 26 | 27 | library(tidyverse) 28 | 29 | # Does this work on your system? 
------------------------------------------ 30 | 31 | nutrient_names <- c(G = "Glucose", L = "Leucine", P = "Phosphate", 32 | S = "Sulfate", N = "Ammonia", U = "Uracil") 33 | 34 | # Create an object that is the weblink to the data 35 | 36 | url <- "http://varianceexplained.org/files/Brauer2008_DataSet1.tds" 37 | 38 | # Some example R code we'll see again during the workshop 39 | # Here we're reading in data from a remote source and cleaning it 40 | 41 | cleaned_genes_tbl <- read_delim(url, 42 | delim = "\t") %>% 43 | 44 | separate(NAME, 45 | c("name", "BP", "MF", "systematic_name", "number"), 46 | sep = "\\|\\|") %>% 47 | 48 | mutate_at(vars(name:systematic_name), list(trimws)) %>% 49 | 50 | select(-number, -GID, -YORF, -GWEIGHT) %>% 51 | 52 | gather(sample, expression, G0.05:U0.3) %>% 53 | 54 | separate(sample, c("nutrient", "rate"), sep = 1, convert = TRUE) %>% 55 | 56 | mutate(nutrient = plyr::revalue(nutrient, nutrient_names)) %>% 57 | 58 | filter(!is.na(expression), systematic_name != "") 59 | 60 | # Plot the clean data 61 | 62 | cleaned_genes_tbl %>% 63 | 64 | filter(BP == "leucine biosynthesis") %>% 65 | 66 | ggplot(mapping = aes(x = rate, y = expression, color = nutrient)) + 67 | geom_point() + 68 | geom_smooth(method = "lm", se = FALSE) + 69 | facet_wrap(~ name) 70 | 71 | -------------------------------------------------------------------------------- /testing_testing/testing_testing.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | --------------------------------------------------------------------------------