├── .gitignore ├── LICENSE ├── README.md ├── assignments ├── 2021_A1.pdf ├── 2021_A1_solutions.pdf ├── 2021_A2.pdf ├── 2021_A2.tex ├── 2024_A1.tex ├── 2024_A1_solutions.pdf ├── 2024_A2.pdf ├── A1.tex ├── A1_solutions.tex ├── a1.bib ├── a1_sol.bib ├── a2.bib ├── bayes_2024.bib ├── discussion_papers.md ├── figures │ ├── Bayes_risks_beta_binomial.pdf │ ├── beta_density_sketch.pdf │ ├── impala.jpeg │ ├── integrated_risks_beta_binomial.pdf │ └── waterbuck.jpeg ├── refs.bib ├── sol1.tex ├── sol2.tex └── sol3.tex ├── code ├── Raftery_1988.r ├── cauchy_bimodal.r ├── cauchy_bimodal_PT.r ├── example_5.2.7.r ├── normal_vs_cauchy_BC_ex326.r ├── stan │ ├── BC_example_326.stan │ ├── cauchy.stan │ ├── raftery.stan │ └── raftery_wrong.stan └── tramcar_problem.r ├── exercises └── BC_exercises.md └── slides ├── bayes.bib ├── bayes_stats.pdf ├── bayes_stats.tex ├── beamercolorthemechameleon.sty ├── beamercolorthemefreewilly.sty ├── beamercolorthemenouvelle.sty ├── beamerinnerthemefancy.sty ├── beamerouterthemedecolines.sty ├── beamerthemeTorino.sty ├── compile.sh ├── figures ├── BC_example_326.pdf ├── HDI.pdf ├── PPC.jpg ├── concentration_measure_volume.pdf ├── conjugate_table.pdf ├── conjugate_table_expectations.pdf ├── galaxies.pdf ├── oranges.pdf ├── pi_MC.png ├── posterior_prob_half.pdf ├── tiger.jpg ├── traceplots.png └── turtles_all_the_way_down.jpeg ├── lecture_0.tex ├── lecture_1.tex ├── lecture_11.tex ├── lecture_2.tex ├── lecture_3.tex ├── lecture_4.tex ├── lecture_5.tex ├── lecture_6.tex ├── lecture_7.tex ├── lecture_8.tex ├── lecture_extra.tex └── logo.jpg /.gitignore: -------------------------------------------------------------------------------- 1 | ## Core latex/pdflatex auxiliary files: 2 | *.aux 3 | *.lof 4 | *.log 5 | *.lot 6 | *.fls 7 | *.out 8 | *.toc 9 | *.fmt 10 | *.fot 11 | *.cb 12 | *.cb2 13 | .*.lb 14 | 15 | ## Intermediate documents: 16 | *.dvi 17 | *.xdv 18 | *-converted-to.* 19 | # these rules might exclude image files for figures etc. 
20 | # *.ps 21 | # *.eps 22 | # *.pdf 23 | 24 | ## Generated if empty string is given at "Please type another file name for output:" 25 | .pdf 26 | 27 | ## Bibliography auxiliary files (bibtex/biblatex/biber): 28 | *.bbl 29 | *.bcf 30 | *.blg 31 | *-blx.aux 32 | *-blx.bib 33 | *.run.xml 34 | 35 | ## Build tool auxiliary files: 36 | *.fdb_latexmk 37 | *.synctex 38 | *.synctex(busy) 39 | *.synctex.gz 40 | *.synctex.gz(busy) 41 | *.pdfsync 42 | 43 | ## Build tool directories for auxiliary files 44 | # latexrun 45 | latex.out/ 46 | 47 | ## Auxiliary and intermediate files from other packages: 48 | # algorithms 49 | *.alg 50 | *.loa 51 | 52 | # achemso 53 | acs-*.bib 54 | 55 | # amsthm 56 | *.thm 57 | 58 | # beamer 59 | *.nav 60 | *.pre 61 | *.snm 62 | *.vrb 63 | 64 | # changes 65 | *.soc 66 | 67 | # comment 68 | *.cut 69 | 70 | # cprotect 71 | *.cpt 72 | 73 | # elsarticle (documentclass of Elsevier journals) 74 | *.spl 75 | 76 | # endnotes 77 | *.ent 78 | 79 | # fixme 80 | *.lox 81 | 82 | # feynmf/feynmp 83 | *.mf 84 | *.mp 85 | *.t[1-9] 86 | *.t[1-9][0-9] 87 | *.tfm 88 | 89 | #(r)(e)ledmac/(r)(e)ledpar 90 | *.end 91 | *.?end 92 | *.[1-9] 93 | *.[1-9][0-9] 94 | *.[1-9][0-9][0-9] 95 | *.[1-9]R 96 | *.[1-9][0-9]R 97 | *.[1-9][0-9][0-9]R 98 | *.eledsec[1-9] 99 | *.eledsec[1-9]R 100 | *.eledsec[1-9][0-9] 101 | *.eledsec[1-9][0-9]R 102 | *.eledsec[1-9][0-9][0-9] 103 | *.eledsec[1-9][0-9][0-9]R 104 | 105 | # glossaries 106 | *.acn 107 | *.acr 108 | *.glg 109 | *.glo 110 | *.gls 111 | *.glsdefs 112 | *.lzo 113 | *.lzs 114 | 115 | # uncomment this for glossaries-extra (will ignore makeindex's style files!) 116 | # *.ist 117 | 118 | # gnuplottex 119 | *-gnuplottex-* 120 | 121 | # gregoriotex 122 | *.gaux 123 | *.gtex 124 | 125 | # htlatex 126 | *.4ct 127 | *.4tc 128 | *.idv 129 | *.lg 130 | *.trc 131 | *.xref 132 | 133 | # hyperref 134 | *.brf 135 | 136 | # knitr 137 | *-concordance.tex 138 | # TODO Comment the next line if you want to keep your tikz graphics files 139 | *.tikz 140 | *-tikzDictionary 141 | 142 | # listings 143 | *.lol 144 | 145 | # luatexja-ruby 146 | *.ltjruby 147 | 148 | # makeidx 149 | *.idx 150 | *.ilg 151 | *.ind 152 | 153 | # minitoc 154 | *.maf 155 | *.mlf 156 | *.mlt 157 | *.mtc[0-9]* 158 | *.slf[0-9]* 159 | *.slt[0-9]* 160 | *.stc[0-9]* 161 | 162 | # minted 163 | _minted* 164 | *.pyg 165 | 166 | # morewrites 167 | *.mw 168 | 169 | # nomencl 170 | *.nlg 171 | *.nlo 172 | *.nls 173 | 174 | # pax 175 | *.pax 176 | 177 | # pdfpcnotes 178 | *.pdfpc 179 | 180 | # sagetex 181 | *.sagetex.sage 182 | *.sagetex.py 183 | *.sagetex.scmd 184 | 185 | # scrwfile 186 | *.wrt 187 | 188 | # sympy 189 | *.sout 190 | *.sympy 191 | sympy-plots-for-*.tex/ 192 | 193 | # pdfcomment 194 | *.upa 195 | *.upb 196 | 197 | # pythontex 198 | *.pytxcode 199 | pythontex-files-*/ 200 | 201 | # tcolorbox 202 | *.listing 203 | 204 | # thmtools 205 | *.loe 206 | 207 | # TikZ & PGF 208 | *.dpth 209 | *.md5 210 | *.auxlock 211 | 212 | # todonotes 213 | *.tdo 214 | 215 | # vhistory 216 | *.hst 217 | *.ver 218 | 219 | # easy-todo 220 | *.lod 221 | 222 | # xcolor 223 | *.xcp 224 | 225 | # xmpincl 226 | *.xmpi 227 | 228 | # xindy 229 | *.xdy 230 | 231 | # xypic precompiled matrices and outlines 232 | *.xyc 233 | *.xyd 234 | 235 | # endfloat 236 | *.ttt 237 | *.fff 238 | 239 | # Latexian 240 | TSWLatexianTemp* 241 | 242 | ## Editors: 243 | # WinEdt 244 | *.bak 245 | *.sav 246 | 247 | # Texpad 248 | .texpadtmp 249 | 250 | # LyX 251 | *.lyx~ 252 | 253 | # Kile 254 | *.backup 255 | 256 | # gummi 257 | .*.swp 258 | 259 | # 
KBibTeX 260 | *~[0-9]* 261 | 262 | # TeXnicCenter 263 | *.tps 264 | 265 | # auto folder when using emacs and auctex 266 | ./auto/* 267 | *.el 268 | 269 | # expex forward references with \gathertags 270 | *-tags.tex 271 | 272 | # standalone packages 273 | *.sta 274 | 275 | # Makeindex log files 276 | *.lpz 277 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Luiz Max F. Carvalho 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Bayesian Statistics 2 | A PhD-level course at [EMAp](https://emap.fgv.br/en). 3 | 4 | To compile the [slides](https://github.com/maxbiostat/BayesianStatisticsCourse/blob/main/slides/bayes_stats.pdf), you'll need to do 5 | 6 | ```bash 7 | pdflatex -interaction=nonstopmode --shell-escape bayes_stats 8 | ``` 9 | a few times to get it right. 10 | 11 | ## Pre-requisites 12 | - Probability theory with measure. [Jeff Rosenthal](http://probability.ca/jeff/)'s book, [A First Look at Rigorous Probability Theory](http://probability.ca/jeff/grprobbook.html), is a good resource. 13 | - Classical Statistics at the same level as [Mathematical Statistics](https://emap.fgv.br/disciplina/doutorado/mathematical-statistics). For a review, I suggest 14 | [Theory of Statistics](https://www.springer.com/gp/book/9780387945460) by [Mark Schervish](http://www.stat.cmu.edu/people/faculty/mark-schervish). 15 | 16 | # Books 17 | - [The Bayesian Choice](https://link.springer.com/book/10.1007/0-387-71599-1) (BC) by [Christian Robert](https://stats.stackexchange.com/users/7224/xian) will be our main source. 18 | - [A first course in Bayesian statistical methods](https://pdhoff.github.io/book/) (FC) by [Peter Hoff](https://stat.duke.edu/research/hoff#:~:text=Hoff,-Professor%20of%20Statistical&text=Peter%20Hoff%20develops%20statistical%20methodology,area%20inference%2C%20and%20multigroup%20analysis.) is a good all-purpose introduction. 19 | - [Theory of Statistics](https://link.springer.com/book/10.1007/978-1-4612-4250-5) (SV) by [Mark Schervish](https://www.cmu.edu/dietrich/statistics-datascience/people/faculty/mark-schervish.html) is a comprehensive reference. 
20 | - [Bayesian Theory](https://onlinelibrary.wiley.com/doi/book/10.1002/9780470316870) (BT) by [José Bernardo](https://www.uv.es/bernardo/) and [Adrian Smith](https://en.wikipedia.org/wiki/Adrian_Smith_(statistician)) is a technical behemoth, suitable for use as a reference guide. 21 | 22 | # Resources 23 | - An overview of computing techniques for Bayesian inference can be found [here](https://arxiv.org/pdf/2004.06425.pdf). 24 | - See Esteves, Stern and Izbicki's [course notes](https://github.com/rbstern/bayesian_inference_book/raw/gh-pages/book.pdf). 25 | - Rafael Stern's excellent [course](https://www.rafaelstern.science/classes/2021_1_bayes/). 26 | - [Principles of Uncertainty](https://www.taylorfrancis.com/books/principles-uncertainty-joseph-kadane/10.1201/9781315167565) by the inimitable [J. Kadane](https://en.wikipedia.org/wiki/Joseph_Born_Kadane) is a book about avoiding being a sure loser. See [this](https://www.ceremade.dauphine.fr/~xian/uncertain.pdf) review by Christian Robert. 27 | - [Bayesian Learning](https://github.com/mattiasvillani/BayesLearnCourse) by [Mattias Villani](https://mattiasvillani.com) is a book for a computation-minded audience. 28 | - Michael Betancourt's website is a treasure trove of rigorous, modern and insightful applied Bayesian statistics. See [this](https://betanalpha.github.io/assets/case_studies/principled_bayesian_workflow.html#1_Questioning_Authority) as a gateway drug. 29 | - [Awesome Bayes](https://github.com/hectormz/awesome-bayes) is a curated list of Bayesian resources, including blog posts and podcasts. 30 | 31 | ### Acknowledgements 32 | [Guido Moreira](http://github.com/GuidoAMoreira/) and [Isaque Pim](https://github.com/isaquepim) suggested topics, exercises and exam questions. 33 | [Lucas Moschen](https://github.com/lucasmoschen) made many good suggestions. 34 | 35 | # Exercises 36 | 37 | We keep a list [here](https://github.com/maxbiostat/BayesianStatisticsCourse/blob/main/exercises/BC_exercises.md). I recommend you check back every so often because this is likely to be updated (if infrequently). 38 | 39 | # News 40 | 41 | - Papers for the assignment are [here](https://github.com/maxbiostat/BayesianStatisticsCourse/blob/main/assignments/discussion_papers.md). A bib [file](https://github.com/maxbiostat/BayesianStatisticsCourse/blob/main/assignments/bayes_2024.bib) is also made available. Turn in your choice by 18h (Brasília time) on 2024-06-19. 42 | - The discussion guide is now [available](https://github.com/maxbiostat/BayesianStatisticsCourse/blob/main/assignments/2024_A2.pdf). The hand-in deadline is 2024-07-05 at 16h Brasília time. 43 | 44 | # Syllabus 45 | ## Lecture 0: Overview 46 | - The LaplacesDemon introductory [vignette](https://cran.r-project.org/web/packages/LaplacesDemon/vignettes/BayesianInference.pdf) gives a very nice overview of Bayesian Statistics. 47 | - [What is a statistical model?](https://projecteuclid.org/journals/annals-of-statistics/volume-30/issue-5/What-is-a-statistical-model/10.1214/aos/1035844977.full) by Peter McCullagh gives a good explanation of what a statistical model is. See also BC Ch1. 48 | - There are a few [Interpretations of Probability](https://plato.stanford.edu/entries/probability-interpret/) and it's important to understand them so the various schools of statistical inference make sense.
49 | - [WHAT IS BAYESIAN/FREQUENTIST INFERENCE?](https://normaldeviate.wordpress.com/2012/11/17/what-is-bayesianfrequentist-inference/) by [Larry Wasserman](https://www.stat.cmu.edu/~larry/) is a must-read in order to understand what makes each inference paradigm tick. 50 | - This [Cross Validated post](https://stats.stackexchange.com/questions/444080/a-measure-theoretic-formulation-of-bayes-theorem) has a very nice, measure-theoretic proof of Bayes's theorem. 51 | 52 | ## Lecture 1: Principled Inference, decision-theoretical foundations 53 | 54 | - Berger and Wolpert's 1988 [monograph](https://errorstatistics.files.wordpress.com/2016/04/berger-wolpert-1988.pdf) is the definitive text on the [Likelihood Principle](https://en.wikipedia.org/wiki/Likelihood_principle) (LP). 55 | - See [this](https://arxiv.org/pdf/1906.10733.pdf) paper by Franklin and Bambirra for a generalised version of the LP. 56 | - As advanced reading, one can consider [Birnbaum (1962)](https://www.tandfonline.com/doi/abs/10.1080/01621459.1962.10480660) and a helpful review [paper](https://link.springer.com/content/pdf/10.1007/978-1-4612-0919-5_31.pdf) published 30 years later by Bjornstad. 57 | - Michael Evans has a few papers on the LP. See [Evans, Fraser & Monette (1986)](https://errorstatistics.files.wordpress.com/2017/12/evans-fraser-monette-1986.pdf) for an argument using a stronger version of CP and [Evans, 2013](https://projecteuclid.org/journals/electronic-journal-of-statistics/volume-7/issue-none/What-does-the-proof-of-Birnbaums-theorem-prove/10.1214/13-EJS857.full) for a flaw with the original 1962 paper by Birnbaum. 58 | - Deborah G. Mayo [challenged](https://projecteuclid.org/journals/statistical-science/volume-29/issue-2/On-the-Birnbaum-Argument-for-the-Strong-Likelihood-Principle/10.1214/13-STS457.full) Birnbaum's argument on the LP. But Mayo implicitly changed the statement of the SP, nullifying her point. This [Cross Validated](https://stats.stackexchange.com/questions/379798/did-deborah-mayo-refute-birnbaums-proof-of-the-likelihood-principle) post adds more details to the story and to the relevance of the LP. 59 | 60 | ## Lecture 2: Belief functions, coherence, exchangeability 61 | 62 | - David Alvarez-Melis and Tamara Broderick were kind enough to provide an [English translation](https://arxiv.org/abs/1512.01229) of De Finetti's seminal 1930 [paper](http://www.brunodefinetti.it/Opere/funzioneCaratteristica.pdf). 63 | - [Heath and Sudderth (1976)](https://www.tandfonline.com/doi/abs/10.1080/00031305.1976.10479175?journalCode=utas20) provide a simpler proof of De Finetti's representation theorem for binary variables. 64 | 65 | ## Lecture 3: Priors I: rationale and construction; conjugate analysis 66 | - The [SHeffield ELicitation Framework (SHELF)](http://tonyohagan.co.uk/shelf/) is a package for rigorous elicitation of probability distributions. 67 | - John Cook hosts a nice [compendium](https://www.johndcook.com/CompendiumOfConjugatePriors.pdf) of conjugate priors compiled by Daniel Fink. 68 | 69 | ## Lecture 4: Priors II: types of priors; implementation 70 | **Required reading** 71 | - [Hidden Dangers of Specifying Noninformative Priors](https://www.tandfonline.com/doi/abs/10.1080/00031305.2012.695938) is a must-read for those who wish to understand the counter-intuitive nature of prior measures and their push-forwards.
72 | - [The Prior Can Often Only Be Understood in the Context of the Likelihood](https://www.mdpi.com/1099-4300/19/10/555) explains that, from a practical perspective, priors can be seen as regularisation devices and should control what the model _does_ rather than what values the parameter takes. 73 | - [Penalising Model Component Complexity: A Principled, Practical Approach to Constructing Priors](https://projecteuclid.org/journals/statistical-science/volume-32/issue-1/Penalising-Model-Component-Complexity--A-Principled-Practical-Approach-to/10.1214/16-STS576.full) shows how to formalise the idea that one should prefer a simpler model by penalising deviations from a base model. 74 | 75 | **Optional reading** 76 | - [The Case for Objective Bayesian Analysis](https://www.ime.usp.br/~abe/lista/pdfTFOW5ADDD0.pdf) is a good read to try and put objective Bayes on solid footing. 77 | 78 | ## Lecture 5: Bayesian point estimation 79 | - The paper [The Federalist Papers As a Case Study](https://link.springer.com/chapter/10.1007/978-1-4612-5256-6_1) by Mosteller and Wallace (1984) is a very nice example of capture-recapture models. It is cited in Sharon McGrayne's book ["The Theory That Would Not Die"](https://www.amazon.com/Theory-That-Would-Not-Die/dp/0300188226) as a triumph of Bayesian inference. It is also a serious contender for coolest paper abstract ever. 80 | - [This](https://statmodeling.stat.columbia.edu/2011/01/31/using_sample_si/) post in the Andrew Gelman blog discusses how to deal with the sample size (`n`) in a Bayesian problem: either write out a full model that specifies a probabilistic model for `n` or write an approximate prior `pi(theta | n)`. 81 | 82 | ## Lecture 6: Bayesian Testing I 83 | **Required reading** 84 | - In their seminal 1995 [paper](https://www.tandfonline.com/doi/abs/10.1080/01621459.1995.10476572), [Robert Kass](http://www.stat.cmu.edu/people/faculty/rob-kass) and [Adrian Raftery](https://sites.stat.washington.edu/raftery/) give a nice overview of, along with recommendations for, Bayes factors. 85 | 86 | **Optional reading** 87 | - [This](https://arxiv.org/pdf/1303.5973.pdf) paper by Christian Robert gives a nice discussion of the Jeffreys-Lindley paradox. 88 | - [This](https://link.springer.com/content/pdf/10.1007/s00407-022-00298-3.pdf) paper by [Wagenmakers](https://www.ejwagenmakers.com/) is an excellent historical account of the paradox, and clears up many misconceptions. 89 | - [Jaynes](https://en.wikipedia.org/wiki/Edwin_Thompson_Jaynes)'s 1976 monograph [Confidence Intervals vs Bayesian Intervals](https://link.springer.com/chapter/10.1007/978-94-009-6581-2_9) is a great source of useful discussion. [PDF](https://link.springer.com/content/pdf/10.1007/978-94-010-1436-6_6.pdf). 90 | 91 | ## Lecture 7: Bayesian Testing II 92 | - [This](https://www.tandfonline.com/doi/pdf/10.1080/00031305.1999.10474443?casa_token=PvYUGVh0CjwAAAAA:B6UnfgSkoeUNQ5g4nh-D0DxaLTLAOOuoa2I37u33xdxIlair84fSzUuKUcsnHlC24BjRlfWWEcgZ3Q) paper by Lavine and Schervish provides a nice "disambiguation" for what Bayes factors can and cannot do inferentially. 93 | - [Yao et al. (2018)](https://projecteuclid.org/journals/bayesian-analysis/volume-13/issue-3/Using-Stacking-to-Average-Bayesian-Predictive-Distributions-with-Discussion/10.1214/17-BA1091.full) along with ensuing discussion is a must-read for an understanding of modern prediction-based Bayesian model comparison.
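
Bayes factors like the ones discussed in Lectures 6 and 7 are easy to compute exactly in conjugate models. The minimal R sketch below is an illustration added to these notes (it is not part of the course's `code/` folder, and the data `y = 3`, `n = 24` and the `Beta(a, b)` prior are arbitrary choices): it computes the Bayes factor for a point null in a Beta-Binomial model in two equivalent ways, as a ratio of marginal likelihoods and via the Savage-Dickey density ratio.

```r
## Illustrative sketch (not course code): BF_01 for H0: theta = theta0 vs
## H1: theta ~ Beta(a, b), with y successes in n Bernoulli trials.
point_null_bf01 <- function(y, n, theta0 = 0.5, a = 1, b = 1) {
  ## marginal likelihood under H1 (Beta-Binomial), on the log scale
  log_m1 <- lchoose(n, y) + lbeta(a + y, b + n - y) - lbeta(a, b)
  ## likelihood of the data under the point null H0
  log_m0 <- dbinom(y, n, theta0, log = TRUE)
  bf_marginal <- exp(log_m0 - log_m1)
  ## Savage-Dickey: posterior density over prior density at theta0, under H1
  bf_savage_dickey <- dbeta(theta0, a + y, b + n - y) / dbeta(theta0, a, b)
  c(marginal = bf_marginal, savage_dickey = bf_savage_dickey)
}
point_null_bf01(y = 3, n = 24) ## both entries agree; values below 1 favour H1
```

Working on the log scale with `lchoose` and `lbeta` keeps the marginal-likelihood route numerically stable for larger `n`; the two routes agree exactly because the point-null model is properly nested within the alternative.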
94 | 95 | ## Lecture 8: Asymptotics 96 | 97 | - The Encyclopedia of Mathematics [entry](https://encyclopediaofmath.org/wiki/Bernstein-von_Mises_theorem) on the Bernstein-von Mises theorem is nicely written. 98 | - The integrated nested Laplace approximation ([INLA](https://www.r-inla.org/)) methodology leverages Laplace approximations to provide accurate approximations to the posterior in latent Gaussian models, which cover a huge class of models used in applied modelling. [This](https://arxiv.org/pdf/1210.0333.pdf) paper by Thiago G. Martins and others, especially Section 2, is a good introduction. 99 | 100 | ## Lecture 9: Applications I 101 | 102 | - Ever wondered what to do when both the number of trials and success probability are unknown in a binomial model? Well, [this](https://pluto.mscc.huji.ac.il/~galelidan/52558/Material/Raftery.pdf) paper by Adrian Raftery has _an_ answer. See also [this](https://stats.stackexchange.com/questions/113851/bayesian-estimation-of-n-of-a-binomial-distribution) discussion with JAGS and Stan implementations. 103 | - [This](https://mc-stan.org/users/documentation/case-studies/golf.html) case study shows how to create a model from first (physical) principles. 104 | 105 | ## Lecture 10: Applications II 106 | - See [Reporting Bayesian Results](https://journals.sagepub.com/doi/abs/10.1177/0193841X20977619?journalCode=erxb) for a guide on which summaries are indispensable in a Bayesian analysis. 107 | - [Visualization in Bayesian workflow](https://rss.onlinelibrary.wiley.com/doi/abs/10.1111/rssa.12378) is a great paper about making useful graphs for a well-calibrated Bayesian analysis. 108 | 109 | ## Lecture 11: Discussion Bayes vs Frequentism 110 | **Disclaimer**: everything in this section needs to be read with care so one does not become a zealot! 111 | 112 | - See Jaynes's monograph above. 113 | - See [Frequentism and Bayesianism: A Practical Introduction](https://jakevdp.github.io/blog/2014/03/11/frequentism-and-bayesianism-a-practical-intro/) for a five-part discussion of Bayesian vs orthodox statistics. 114 | - [Why isn't everyone a Bayesian?](https://www2.stat.duke.edu/courses/Spring10/sta122/Handouts/EfronWhyEveryone.pdf) is a nice discussion of the trade-offs between paradigms by [Bradley Efron](https://statweb.stanford.edu/~ckirby/brad/). 115 | - [Holes in Bayesian statistics](https://iopscience.iop.org/article/10.1088/1361-6471/abc3a5) is a collection of holes in Bayesian data analysis, such as conditional probability in the quantum realm, flat and weak priors, and model checking, written by Andrew Gelman and Yuling Yao. 116 | - [Bayesian Estimation with Informative Priors is Indistinguishable from Data Falsification](https://www.cambridge.org/core/journals/spanish-journal-of-psychology/article/bayesian-estimation-with-informative-priors-is-indistinguishable-from-data-falsification/FFAB96BDC5EE3C64B144ECF8F90F31E9) is a paper that attempts to draw a connection between strong priors and data falsification. Not for the faint of heart. 117 | 118 | # Computational resources 119 | - A few pointers from my [summer course](https://github.com/maxbiostat/computing_applied_bayes). 120 | - Darren Wilkinson's [blog](https://darrenjw.wordpress.com/2013/09/29/parallel-tempering-and-metropolis-coupled-mcmc/) on parallel tempering. I took the code and applied it to our [multimodal Cauchy example](https://github.com/maxbiostat/BayesianStatisticsCourse/blob/main/code/cauchy_bimodal_PT.r).
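
To complement the pointers above, here is a minimal, self-contained R sketch of a random-walk Metropolis sampler. It is an illustration added to these notes (not taken from the course's `code/` folder), and the data, prior and tuning constants are arbitrary choices. It targets a Beta-Binomial posterior on the log-odds scale, so the Monte Carlo output can be checked against the exact conjugate answer.

```r
## Toy example (illustrative only): random-walk Metropolis on the log-odds scale
## for a Beta-Binomial posterior, checked against the exact conjugate answer.
set.seed(42)
y <- 7; n <- 20; a <- 1; b <- 1            # data and Beta(a, b) prior
log_post <- function(phi) {                # phi = log(theta / (1 - theta))
  theta <- plogis(phi)
  dbinom(y, n, theta, log = TRUE) +
    dbeta(theta, a, b, log = TRUE) +
    log(theta) + log(1 - theta)            # Jacobian of the logit transform
}
n_iter <- 50000
phi <- numeric(n_iter)                     # chain starts at phi = 0, i.e. theta = 0.5
for (i in 2:n_iter) {
  prop <- phi[i - 1] + rnorm(1, sd = 0.8)  # random-walk proposal
  if (log(runif(1)) < log_post(prop) - log_post(phi[i - 1])) {
    phi[i] <- prop                         # accept
  } else {
    phi[i] <- phi[i - 1]                   # reject, stay put
  }
}
theta_draws <- plogis(phi[-(1:5000)])      # discard burn-in
c(mcmc = mean(theta_draws), exact = (a + y) / (a + b + n))  # should be close
```

For genuinely multimodal targets such as the bimodal Cauchy example linked above, a single random-walk chain like this one tends to get stuck in one mode, which is precisely what motivates parallel tempering.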
121 | -------------------------------------------------------------------------------- /assignments/2021_A1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxbiostat/BayesianStatisticsCourse/ff673af7144838e7e29fa4aaf6adc628095911fb/assignments/2021_A1.pdf -------------------------------------------------------------------------------- /assignments/2021_A1_solutions.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxbiostat/BayesianStatisticsCourse/ff673af7144838e7e29fa4aaf6adc628095911fb/assignments/2021_A1_solutions.pdf -------------------------------------------------------------------------------- /assignments/2021_A2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxbiostat/BayesianStatisticsCourse/ff673af7144838e7e29fa4aaf6adc628095911fb/assignments/2021_A2.pdf -------------------------------------------------------------------------------- /assignments/2021_A2.tex: -------------------------------------------------------------------------------- 1 | \documentclass[a4paper,10pt, notitlepage]{report} 2 | \usepackage{geometry} 3 | \geometry{verbose,tmargin=30mm,bmargin=25mm,lmargin=25mm,rmargin=25mm} 4 | \usepackage[utf8]{inputenc} 5 | \usepackage[sectionbib]{natbib} 6 | \usepackage{amssymb} 7 | \usepackage{amsmath} 8 | \usepackage{enumitem} 9 | \usepackage{xcolor} 10 | \usepackage{cancel} 11 | \usepackage{mathtools} 12 | \usepackage{caption} 13 | \usepackage{subcaption} 14 | \usepackage{float} 15 | \PassOptionsToPackage{hyphens}{url}\usepackage{hyperref} 16 | \hypersetup{colorlinks=true,citecolor=blue} 17 | 18 | 19 | \newtheorem{thm}{Theorem} 20 | \newtheorem{lemma}[thm]{Lemma} 21 | \newtheorem{proposition}[thm]{Proposition} 22 | \newtheorem{remark}[thm]{Remark} 23 | \newtheorem{defn}[thm]{Definition} 24 | 25 | %%%%%%%%%%%%%%%%%%%% Notation stuff 26 | \newcommand{\pr}{\operatorname{Pr}} %% probability 27 | \newcommand{\vr}{\operatorname{Var}} %% variance 28 | \newcommand{\rs}{X_1, X_2, \ldots, X_n} %% random sample 29 | \newcommand{\irs}{X_1, X_2, \ldots} %% infinite random sample 30 | \newcommand{\rsd}{x_1, x_2, \ldots, x_n} %% random sample, realised 31 | \newcommand{\bX}{\boldsymbol{X}} %% random sample, contracted form (bold) 32 | \newcommand{\bx}{\boldsymbol{x}} %% random sample, realised, contracted form (bold) 33 | \newcommand{\bT}{\boldsymbol{T}} %% Statistic, vector form (bold) 34 | \newcommand{\bt}{\boldsymbol{t}} %% Statistic, realised, vector form (bold) 35 | \newcommand{\emv}{\hat{\theta}} 36 | \DeclarePairedDelimiter\ceil{\lceil}{\rceil} 37 | \DeclarePairedDelimiter\floor{\lfloor}{\rfloor} 38 | 39 | % Title Page 40 | \title{Exam 2 (A2)} 41 | \author{Class: Bayesian Statistics \\ Instructor: Luiz Max Carvalho} 42 | \date{02/06/2021} 43 | 44 | \begin{document} 45 | \maketitle 46 | 47 | \textbf{Turn in date: until 16/06/2021 at 23:59h Brasilia Time.} 48 | 49 | \begin{center} 50 | \fbox{\fbox{\parbox{1.0\textwidth}{\textsf{ 51 | \begin{itemize} 52 | \item Please read through the whole exam before starting to answer; 53 | \item State and prove all non-trivial mathematical results necessary to substantiate your arguments; 54 | \item Do not forget to add appropriate scholarly references~\textit{at the end} of the document; 55 | \item Mathematical expressions also receive punctuation; 56 | \item You can write your answer to a question as a point-by-point response or 
in ``essay'' form, your call; 57 | \item Please hand in a single, \textbf{typeset} (\LaTeX) PDF file as your final main document. 58 | Code appendices are welcome,~\textit{in addition} to the main PDF document. 59 | \item You may consult any sources, provided you cite \textbf{ALL} of your sources (books, papers, blog posts, videos); 60 | \item You may use symbolic algebra programs such as Sympy or Wolfram Alpha to help you get through the hairier calculations, provided you cite the tools you have used. 61 | \item The exam is worth 100 %$\min\left\{\text{your\:score}, 100\right\}$ 62 | marks. 63 | \end{itemize}} 64 | }}} 65 | \end{center} 66 | % \newpage 67 | % \section*{Hints} 68 | % \begin{itemize} 69 | % \item a 70 | % \item b 71 | % \end{itemize} 72 | % 73 | \newpage 74 | 75 | \section*{Background} 76 | 77 | This exam covers applications, namely estimation, prior sensitivity and prediction. 78 | You will need a working knowledge of basic computing tools, and knowledge of MCMC is highly valuable. 79 | Chapter 6 in \cite{Robert2007} gives an overview of computational techniques for Bayesian statistics. 80 | 81 | \section*{Inferring population sizes -- theory} 82 | 83 | Consider the model 84 | \begin{equation*} 85 | x_i \sim \operatorname{Binomial}(N, \theta), 86 | \end{equation*} 87 | with \textbf{both} $N$ and $\theta$ unknown and suppose one observes $\boldsymbol{x} = \{x_1, x_2, \ldots, x_K\}$. 88 | Here, we will write $\xi = (N, \theta)$. 89 | 90 | \begin{enumerate}[label=\alph*)] 91 | \item (10 marks) Formulate a hierarchical prior ($\pi_1$) for $N$, i.e., elicit $F$ such that $N \mid \alpha \sim F(\alpha)$ and $\alpha \sim \Pi_A$. 92 | Justify your choice; 93 | \item (5 marks) Using the prior from the previous item, write out the full joint posterior kernel for all unknown quantities in the model, $p_1(\xi \mid \boldsymbol{x})$. \textit{Hint:} do not forget to include the appropriate indicator functions!; 94 | \item (5 marks) Is your model identifiable? 95 | \item (5 marks) Exhibit the marginal posterior density for $N$, $p_1(N \mid \boldsymbol{x})$; 96 | \item (5 marks) Return to point (a) above and consider an alternative, uninformative prior structure for $\xi$, $\pi_2$. 97 | Then, derive $p_2(N \mid \boldsymbol{x})$; 98 | \item (10 marks) Formulate a third prior structure on $\xi$, $\pi_3$, that allows for the closed-form marginalisation over the hyperparameters $\alpha$ -- see (a) -- and write out $p_3(N \mid \boldsymbol{x})$; 99 | \item (10 marks) Show whether each of the marginal posteriors considered is proper. 100 | Then, derive the posterior predictive distribution, $g_i(\tilde{x} \mid \boldsymbol{x})$, for each of the posteriors considered ($i = 1, 2, 3$). 101 | \item (5 marks) Consider the loss function 102 | \begin{equation} 103 | \label{eq:relative_loss} 104 | L(\delta(\boldsymbol{x}), N) = \left(\frac{\delta(\boldsymbol{x})-N}{N} \right)^2. 105 | \end{equation} 106 | Derive the Bayes estimator under this loss. 107 | \end{enumerate} 108 | 109 | \section*{Inferring population sizes -- practice} 110 | Consider the problem of inferring the population sizes of major herbivores~\citep{Carroll1985}. 111 | In the first case, one is interested in estimating the number of impala (\textit{Aepyceros melampus}) herds in the Kruger National Park, in northeastern South Africa. 112 | An initial survey collected the following numbers of herds: $\boldsymbol{x}_{\text{impala}} = \{15, 20, 21, 23, 26\}$.
113 | Another scientific question is the number of individual waterbuck (\textit{Kobus ellipsiprymnus}) in the same park. 114 | The observed numbers of waterbuck in separate sightings were $\boldsymbol{x}_{\text{waterbuck}} = \{53, 57, 66, 67, 72\}$ and may be regarded (for simplicity) as independent and identically distributed. 115 | 116 | \begin{figure}[H] 117 | \centering 118 | \begin{subfigure}[b]{0.45\textwidth} 119 | \centering 120 | \includegraphics[scale=0.75]{figures/impala.jpeg} 121 | \caption{Impala} 122 | \end{subfigure} 123 | \begin{subfigure}[b]{0.45\textwidth} 124 | \centering 125 | \includegraphics[scale=0.75]{figures/waterbuck.jpeg} 126 | \caption{Waterbuck} 127 | \end{subfigure} 128 | \caption{Two antelope species whose population sizes we want to estimate.} 129 | \label{fig:antelopes} 130 | \end{figure} 131 | 132 | 133 | \begin{enumerate}[label=\alph*)] 134 | \setcounter{enumi}{8} 135 | \item (20 marks) For each data set, sketch the marginal posterior distributions $p_1(N \mid \boldsymbol{x})$, $p_2(N \mid \boldsymbol{x})$ and $p_3(N \mid \boldsymbol{x})$. 136 | Moreover, under each posterior, provide (i) the Bayes estimator under quadratic loss and under the loss in (\ref{eq:relative_loss}) and (ii) a 95\% credibility interval for $N$. 137 | Discuss the differences and similarities between these distributions and estimates: do the prior modelling choices substantially impact the final inferences? If so, how? 138 | \item (25 marks) Let $\bar{x} = K^{-1}\sum_{k =1}^K x_k$ and $s^2 = K^{-1}\sum_{k =1}^K (x_k-\bar{x})^2$. 139 | For this problem, a sample is said to be \textit{stable} if $\bar{x}/s^2 \geq (\sqrt{2} + 1)/\sqrt{2}$ and \textit{unstable} otherwise. 140 | Devise a simple method of moments estimator (MME) for $N$. 141 | Then, using a Monte Carlo simulation, compare the MME to the three Bayes estimators under quadratic loss in terms of relative mean squared error. 142 | How do the Bayes estimators compare to the MME in terms of the stability of the generated samples? 143 | \textit{Hint}: You may want to follow the simulation setup of~\cite{Carroll1985}. 144 | \end{enumerate} 145 | 146 | \bibliographystyle{apalike} 147 | \bibliography{a2} 148 | \end{document} 149 | -------------------------------------------------------------------------------- /assignments/2024_A1.tex: -------------------------------------------------------------------------------- 1 | \documentclass[a4paper,10pt, notitlepage]{report} 2 | \usepackage[utf8]{inputenc} 3 | \usepackage{natbib} 4 | \usepackage{amssymb} 5 | \usepackage{amsmath} 6 | \usepackage{enumitem} 7 | \usepackage{dsfont} 8 | \usepackage{xcolor} 9 | \usepackage{url} 10 | \usepackage{cancel} 11 | \usepackage{mathtools} 12 | \usepackage{newclude} 13 | \usepackage{booktabs} 14 | \usepackage[normalem]{ulem} 15 | 16 | %%%%%%%%%%%%%%%%%%%% Notation stuff 17 | \newcommand{\pr}{\operatorname{Pr}} %% probability 18 | \newcommand{\vr}{\operatorname{Var}} %% variance 19 | \newcommand{\rs}{X_1, X_2, \ldots, X_n} %% random sample 20 | \newcommand{\irs}{X_1, X_2, \ldots} %% infinite random sample 21 | \newcommand{\rsd}{x_1, x_2, \ldots, x_n} %% random sample, realised 22 | \newcommand{\bX}{\boldsymbol{X}} %% random sample, contracted form (bold) 23 | \newcommand{\bx}{\boldsymbol{x}} %% random sample, realised, contracted form (bold) 24 | \newcommand{\bT}{\boldsymbol{T}} %% Statistic, vector form (bold) 25 | \newcommand{\bt}{\boldsymbol{t}} %% Statistic, realised, vector form (bold) 26 | \newcommand{\emv}{\hat{\theta}} 27 | \DeclarePairedDelimiter\ceil{\lceil}{\rceil} 28 | \DeclarePairedDelimiter\floor{\lfloor}{\rfloor} 29 | \DeclareMathOperator*{\argmax}{arg\,max} 30 | \DeclareMathOperator*{\argmin}{arg\,min} 31 | 32 | \DeclareRobustCommand{\bbone}{\text{\usefont{U}{bbold}{m}{n}1}} 33 | \DeclareMathOperator{\EX}{\mathbb{E}} %% Expected Value 34 | 35 | %%%% 36 | \newif\ifanswers 37 | \answerstrue % comment out to hide answers 38 | 39 | % Title Page 40 | \title{First exam (A1)} 41 | \author{Class: Bayesian Statistics \\ Instructor: Luiz Max Carvalho \\ TA: Isaque Pim} 42 | \date{22 May 2024} 43 | 44 | \begin{document} 45 | \maketitle 46 | 47 | \begin{center} 48 | \fbox{\fbox{\parbox{1.0\textwidth}{\textsf{ 49 | \begin{itemize} 50 | \item You have 4 (four) hours to complete the exam; 51 | \item Please read through the whole exam before you start giving your answers; 52 | \item Answer all questions briefly; 53 | \item Clearly mark your final answer with a square, circle or preferred geometric figure; 54 | \item The exam is worth $\min\left\{\text{your\:score}, 100\right\}$ marks. 55 | \item You can bring \textbf{\underline{one} ``cheat sheet''} (A4, both sides), which must be turned in together with your answers. 56 | \end{itemize}} 57 | }}} 58 | \end{center} 59 | 60 | \newpage 61 | 62 | \section*{1. I like 'em short.} 63 | 64 | For a prior distribution $\pi$, a set $C_x$ is said to be an 65 | $\alpha$-credible set if $$P^\pi (\theta \in C_x |x) \geq 1-\alpha.$$ 66 | This region is called an HPD $\alpha$-credible region (for highest posterior density) if it can be written in the form: 67 | \begin{equation*} 68 | \{\theta; \pi(\theta|x) > k_{\alpha}\} \subset C_x^\pi \subset \{\theta; \pi(\theta|x) \geq k_{\alpha}\}, 69 | \end{equation*} 70 | where $k_{\alpha}$ is the largest bound such that 71 | $P^\pi (\theta \in C_x^\alpha |x) \geq 1-\alpha$. 72 | This construction is motivated by the fact that HPD regions minimise the volume among $\alpha$-credible regions. 73 | A special and important case is that of \textit{HPD intervals}, when $C_x$ is an interval $(a, b)$.
74 | 75 | \begin{enumerate}[label=\alph*)] 76 | \item (20 marks) Show that if the posterior density (i) is unimodal and (ii) is not uniform on any interval of $(1 - \alpha)$ probability mass of $\Omega$, then the HPD region is an interval and it is unique. 77 | 78 | \textbf{Hint:} formulate a minimisation problem on two variables $a$ and $b$ with a probability restriction and solve for the Lagrangian. 79 | 80 | \item (20 marks) We can also use decision-theoretical criteria to pick between credible intervals. 81 | A first idea is to balance the volume of the region against coverage guarantees through the loss function $$L(\theta, C) = \operatorname{vol}(C) + \mathds{1}_{C^c}(\theta).$$ 82 | Explain why the above loss is problematic. 83 | \item * (20 bonus marks) Define the new loss function $$L^*(\theta, C) = g\left(\operatorname{vol}(C)\right) + \mathds{1}_{C^c}(\theta),$$ 84 | where $g$ is increasing and $0 \leq g(t) \leq 1$ for all $t$. Show that the Bayes estimator $C^\pi_x$ for $L^*$ is an HPD region. 85 | \end{enumerate} 86 | \ifanswers 87 | \nocite{*} 88 | \include*{sol1} 89 | \fi 90 | 91 | \section*{2. Savage!} 92 | 93 | We will now study point hypothesis testing as a comparison of two nested models. 94 | Let $\theta_0 \in \Omega_0 \subset \Omega$. 95 | We want to compare model $M_0: \theta = \theta_0$ to $M_1: \theta \in \Omega$. 96 | That is, under model $M_1$, $\theta$ can vary freely. 97 | Assume further that the models are \textit{properly nested}, that is, 98 | $$P(x | \theta, M_0) = P(x | \theta = \theta_0, M_1).$$ 99 | 100 | \begin{enumerate}[label=\alph*)] 101 | \item (25 marks) Given observed data $x$, show that the Bayes factor $\operatorname{BF_{01}}$ can be written as 102 | \begin{equation*} 103 | \operatorname{BF_{01}} = \frac{p(\theta_0| x, M_1)}{p(\theta_0|M_1)}, 104 | \end{equation*} 105 | where the numerator is the posterior density at $\theta_0$ under $M_1$ and the denominator is the prior density at $\theta_0$ under $M_1$. 106 | \item (25 marks) Apply the result from part (a) to the problem of testing whether a coin is fair. 107 | Specifically, we want to compare $H_0: \theta = 0.5$ against $H_1: \theta \neq 0.5$, where $\theta$ is the probability of the coin landing heads. 108 | Given $n=24$ trials and $x = 3$ heads and employing a uniform prior on $\theta$, calculate the Bayes factor $\operatorname{BF_{01}}$. 109 | Based on the Bayes factor, would you prefer $H_0$ over $H_1$? How strong should the prior be for a change in this preference? 110 | \end{enumerate} 111 | \textbf{Note}: The ratio above is called the \textit{Savage-Dickey} ratio. It provides a straightforward way to compute Bayes factors, which can be more intuitive and less computationally intensive than other methods. 112 | \ifanswers 113 | \include*{sol2} 114 | \fi 115 | 116 | \section*{3. Hey, you're biased!} 117 | 118 | Let $\bX = (\rs)$ be a random sample from an $\operatorname{Exponential}(\theta)$ distribution with $\theta > 0$ and common density $f(x \mid \theta) = \theta^{-1}\exp(-x/\theta)\mathbb{I}(x > 0)$ w.r.t. the Lebesgue measure on $\mathbb{R}$. 119 | 120 | \begin{enumerate}[label=\alph*)] 121 | \item (10 marks) Find a conjugate prior for $\theta$; 122 | \item (20 marks) Exhibit the Bayes estimator under quadratic loss for $\theta$, $\delta_B(\bX)$; 123 | \item (10 marks) Show that the bias of $\delta_B(\bX)$ is $O(n^{-1})$. 124 | \item $\ast$ (10 bonus marks) Show how to obtain the uniformly minimum variance unbiased estimator (UMVUE) from $\delta_B(\bX)$ by taking limits of the hyperparameters.
125 | \end{enumerate} 126 | 127 | \ifanswers 128 | \include*{sol3} 129 | \fi 130 | 131 | \bibliographystyle{apalike} 132 | \bibliography{refs} 133 | 134 | \end{document} 135 | -------------------------------------------------------------------------------- /assignments/2024_A1_solutions.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxbiostat/BayesianStatisticsCourse/ff673af7144838e7e29fa4aaf6adc628095911fb/assignments/2024_A1_solutions.pdf -------------------------------------------------------------------------------- /assignments/2024_A2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxbiostat/BayesianStatisticsCourse/ff673af7144838e7e29fa4aaf6adc628095911fb/assignments/2024_A2.pdf -------------------------------------------------------------------------------- /assignments/A1.tex: -------------------------------------------------------------------------------- 1 | \documentclass[a4paper,10pt, notitlepage]{report} 2 | \usepackage{geometry} 3 | \geometry{verbose,tmargin=30mm,bmargin=25mm,lmargin=25mm,rmargin=25mm} 4 | \usepackage[utf8]{inputenc} 5 | \usepackage[sectionbib]{natbib} 6 | \usepackage{amssymb} 7 | \usepackage{amsmath} 8 | \usepackage{enumitem} 9 | \usepackage{xcolor} 10 | \usepackage{cancel} 11 | \usepackage{mathtools} 12 | \PassOptionsToPackage{hyphens}{url}\usepackage{hyperref} 13 | \hypersetup{colorlinks=true,citecolor=blue} 14 | 15 | 16 | \newtheorem{thm}{Theorem} 17 | \newtheorem{lemma}[thm]{Lemma} 18 | \newtheorem{proposition}[thm]{Proposition} 19 | \newtheorem{remark}[thm]{Remark} 20 | \newtheorem{defn}[thm]{Definition} 21 | 22 | %%%%%%%%%%%%%%%%%%%% Notation stuff 23 | \newcommand{\pr}{\operatorname{Pr}} %% probability 24 | \newcommand{\vr}{\operatorname{Var}} %% variance 25 | \newcommand{\rs}{X_1, X_2, \ldots, X_n} %% random sample 26 | \newcommand{\irs}{X_1, X_2, \ldots} %% infinite random sample 27 | \newcommand{\rsd}{x_1, x_2, \ldots, x_n} %% random sample, realised 28 | \newcommand{\bX}{\boldsymbol{X}} %% random sample, contracted form (bold) 29 | \newcommand{\bx}{\boldsymbol{x}} %% random sample, realised, contracted form (bold) 30 | \newcommand{\bT}{\boldsymbol{T}} %% Statistic, vector form (bold) 31 | \newcommand{\bt}{\boldsymbol{t}} %% Statistic, realised, vector form (bold) 32 | \newcommand{\emv}{\hat{\theta}} 33 | \DeclarePairedDelimiter\ceil{\lceil}{\rceil} 34 | \DeclarePairedDelimiter\floor{\lfloor}{\rfloor} 35 | 36 | % Title Page 37 | \title{Exam 1 (A1)} 38 | \author{Class: Bayesian Statistics \\ Instructor: Luiz Max Carvalho} 39 | \date{26/04/2021} 40 | 41 | \begin{document} 42 | \maketitle 43 | 44 | \textbf{Turn in date: until 28/04/2021 at 23:59h Brasilia Time.} 45 | 46 | \begin{center} 47 | \fbox{\fbox{\parbox{1.0\textwidth}{\textsf{ 48 | \begin{itemize} 49 | \item Please read through the whole exam before starting to answer; 50 | \item State and prove all non-trivial mathematical results necessary to substantiate your arguments; 51 | \item Do not forget to add appropriate scholarly references~\textit{at the end} of the document; 52 | \item Mathematical expressions also receive punctuation; 53 | \item You can write your answer to a question as a point-by-point response or in ``essay'' form, your call; 54 | \item Please hand in a single, \textbf{typeset} ( \LaTeX) PDF file as your final main document. 55 | Code appendices are welcome,~\textit{in addition} to the main PDF document. 
56 | \item You may consult any sources, provided you cite \textbf{ALL} of your sources (books, papers, blog posts, videos); 57 | \item You may use symbolic algebra programs such as Sympy or Wolfram Alpha to help you get through the hairier calculations, provided you cite the tools you have used. 58 | \item The exam is worth $\min\left\{\text{your\:score}, 100\right\}$ marks. 59 | \end{itemize}} 60 | }}} 61 | \end{center} 62 | % \newpage 63 | % \section*{Hints} 64 | % \begin{itemize} 65 | % \item a 66 | % \item b 67 | % \end{itemize} 68 | % 69 | \newpage 70 | 71 | \section*{Background} 72 | 73 | This exam covers decision theory, prior elicitation and point estimation. 74 | You will need a little measure theory here and there and also be sharp with your knowledge of expectations and conditional expectations. 75 | 76 | \section*{1. Warm up: stretching our Bayesian muscles with a classic.} 77 | 78 | \cite{Bayes1763}\footnote{The paper was published posthumously and read to the Royal Society by Bayes's close friend Richard Price (1723-1791).}: a billiard ball $W$ is rolled on a line of length $L = 1$, with uniform probability of stopping anywhere on $L$. 79 | It stops at $0 < p < 1$. 80 | A second ball, $O$, is then rolled $n$ times under the same assumptions. 81 | Let $X$ denote the number of times (out of $n$) that $O$ stopped to the left of $W$. 82 | Given $X=x$, \textbf{what inference(s) can we make on $p$}? 83 | You may assume $n\geq2$. 84 | 85 | The idea here is to provide a rigorous, principled Bayesian analysis of this $250$-year-old problem. 86 | Here are a few road signs to help you in your analysis: 87 | \begin{enumerate}[label=\alph*)] 88 | \item (10 marks) For the problem at hand, \textbf{carefully} define the parameter space, data-generating mechanism, likelihood function and all of the crucial elements of a Bayesian analysis. What is the dominating measure? 89 | \item (10 marks) Exhibit your posterior measure and sketch its density. 90 | 91 | \textit{Hint:} it might be convenient to prove the following proposition: 92 | \begin{proposition} 93 | \label{prop:joint_bayes} 94 | The joint distribution of $p$ and $X$ is given by 95 | \begin{equation*} 96 | \pr(a < p < b, X = x) = \int_{a}^{b} \binom{n}{x} t^x(1-t)^{n-x}\,dt. 97 | \end{equation*} 98 | \end{proposition} 99 | \item (10 marks) Provide a Bayes estimator under (i) quadratic and (ii) 0-1 loss\footnote{Recall the discussion in class about how to properly define 0-1 loss for continuous parameter spaces.}; 100 | \item (10 marks) Contrast the estimators obtained in the previous item with the maximum likelihood estimator, in terms of (i) posterior expected loss and (ii) integrated risk. 101 | Is any of the estimators preferable according to both risks? 102 | If not, which estimator should be preferred under each risk? 103 | \item (10 marks) Suppose one observes $X = x = 6$ for $n=9$ rolls. 104 | Produce an updated prediction of where the next $m$ balls are going to stop along $L$. 105 | Please provide (i) a point prediction in the form of an expected value and (ii) a full probability distribution. 106 | \end{enumerate} 107 | 108 | 109 | \section*{2. Proper behaviour.} 110 | 111 | In this question we will explore propriety and its implications for inference. 112 | Consider a parameter space $\boldsymbol{\Theta}$ and a sampling model with density $f(x \mid \theta)$.
113 | A density $h : \boldsymbol{\Theta} \to (0, \infty)$ is said to be proper if 114 | \begin{equation*} 115 | \int_{\boldsymbol{\Theta}} h(t)\,d\mu(t) < \infty, 116 | \end{equation*} 117 | where $\mu$ is the dominating measure. 118 | Assuming $\mu$ to be the Lebesgue measure and that the prior $\pi(\theta)$ is proper, show that 119 | \begin{itemize} 120 | \item[a)] (10 marks) The posterior density, 121 | \begin{equation*} 122 | \xi(\theta \mid x) = \frac{f(x\mid \theta)\pi(\theta)}{m(x)}, 123 | \end{equation*} 124 | is proper almost surely. 125 | \item[b)] (10 marks) The Bayes estimator under quadratic loss, $\delta_{\text{S}}(x) = E_\xi[\theta]$, is biased almost surely. 126 | \textit{Hint:} consider what happens to the integrated risk under unbiasedness. 127 | \end{itemize} 128 | Now, 129 | \begin{itemize} 130 | \item[c)] (10 marks) Take 131 | $$f(x \mid \theta) = \frac{2x}{\theta}, x \in [0, \sqrt{\theta}],$$ 132 | and 133 | $$ \pi(\theta) = \frac{2}{\pi (1 +\theta^2)}, \theta > 0.$$ 134 | The posterior density is then 135 | \begin{equation*} 136 | p(\theta \mid x) = \frac{1}{m(x)} \frac{4x}{\pi}\frac{1}{\theta(1 +\theta^2)}, 137 | \end{equation*} 138 | and thus 139 | \begin{equation*} 140 | m(x) = \int_{0}^\infty \frac{4x}{\pi} \frac{1}{t(1 +t^2)}\,dt = \frac{4x}{\pi} \int_{0}^\infty \frac{1}{t(1 +t^2)}\,dt = \infty, 141 | \end{equation*} 142 | i.e., the posterior is apparently improper. 143 | Explain how this ``counter-example'' is wrong. 144 | \end{itemize} 145 | 146 | \section*{3. Rayleigh dispersion\footnote{This question's title is a pun: \url{https://en.wikipedia.org/wiki/Rayleigh_scattering}.}.} 147 | 148 | Let $X_1, \ldots, X_n$ be a random sample from a distribution whose probability density function is 149 | \begin{equation*} 150 | f(x \mid \sigma) = \frac{x}{\sigma^2} \exp\left(-\frac{x^2}{2\sigma^2}\right),\, \sigma>0,\, 0 < x < \infty. 151 | \end{equation*} 152 | This is the Rayleigh\footnote{John William Strutt (1842-1919), the Third Baron Rayleigh, was a British physicist, known for his work on Statistical Mechanics.} distribution, used in reliability modelling, where the $X$ are failure times (time-until-failure). 153 | \begin{itemize} 154 | \item[a)] (5 marks) For $t$ fixed and known, find $\eta = \pr(X > t)$ as a function of $\sigma^2$; 155 | \item[b)] (5 marks) Find the likelihood function for $\eta$ for a sample $\boldsymbol{x} = \{x_1, \ldots, x_n\}$; 156 | \item[c)] (10 marks) Is the Rayleigh distribution a member of the exponential family? Justify your answer; 157 | \item[d)] (10 marks) Find the Jeffreys prior for $\eta$, $\pi_J(\eta)$; 158 | \item[e)] (10 marks) Using $\pi_J(\eta)$, derive the predictive distribution $p(\tilde{x} \mid \boldsymbol{x})$. 159 | \end{itemize} 160 | 161 | \bibliographystyle{apalike} 162 | \bibliography{a1} 163 | \end{document} 164 | -------------------------------------------------------------------------------- /assignments/a1.bib: -------------------------------------------------------------------------------- 1 | @article{Bayes1763, 2 | author = {Bayes, T.}, 3 | journal = {Phil. Trans. of the Royal Soc.
of London}, 4 | pages = {370--418}, 5 | title = {An essay towards solving a problem in the doctrine of chances}, 6 | volume = 53, 7 | year = 1763 8 | } 9 | @article{Koopman1936, 10 | title={On distributions admitting a sufficient statistic}, 11 | author={Koopman, Bernard Osgood}, 12 | journal={Transactions of the American Mathematical society}, 13 | volume={39}, 14 | number={3}, 15 | pages={399--409}, 16 | year={1936}, 17 | publisher={JSTOR} 18 | } 19 | @article{Edwards1978, 20 | title={Commentary on the arguments of {T}homas {B}ayes}, 21 | author={Edwards, AWF}, 22 | journal={Scandinavian Journal of Statistics}, 23 | pages={116--118}, 24 | year={1978}, 25 | publisher={JSTOR} 26 | } 27 | @article{Diaconis1979, 28 | title={Conjugate priors for exponential families}, 29 | author={Diaconis, Persi and Ylvisaker, Donald and others}, 30 | journal={The {A}nnals of {S}tatistics}, 31 | volume={7}, 32 | number={2}, 33 | pages={269--281}, 34 | year={1979}, 35 | publisher={Institute of Mathematical Statistics} 36 | } 37 | @book{Robert2007, 38 | title={The {B}ayesian choice: from decision-theoretic foundations to computational implementation}, 39 | author={Robert, Christian}, 40 | year={2007}, 41 | publisher={Springer Science \& Business Media} 42 | } 43 | -------------------------------------------------------------------------------- /assignments/a1_sol.bib: -------------------------------------------------------------------------------- 1 | @article{Bayes1763, 2 | author = {Bayes, T.}, 3 | journal = {Phil. Trans. of the Royal Soc. of London}, 4 | pages = {370--418}, 5 | title = {An essay towards solving a problem in the doctrine of chances}, 6 | volume = 53, 7 | year = 1763 8 | } 9 | @article{Edwards1978, 10 | title={Commentary on the arguments of {T}homas {B}ayes}, 11 | author={Edwards, AWF}, 12 | journal={Scandinavian Journal of Statistics}, 13 | pages={116--118}, 14 | year={1978}, 15 | publisher={JSTOR} 16 | } 17 | @book{Robert2007, 18 | title={The {B}ayesian choice: from decision-theoretic foundations to computational implementation}, 19 | author={Robert, Christian}, 20 | year={2007}, 21 | publisher={Springer Science \& Business Media} 22 | } 23 | -------------------------------------------------------------------------------- /assignments/a2.bib: -------------------------------------------------------------------------------- 1 | @article{Carroll1985, 2 | title={A note on {N} estimators for the binomial distribution}, 3 | author={Carroll, Raymond J and Lombard, F}, 4 | journal={Journal of the American Statistical Association}, 5 | volume={80}, 6 | number={390}, 7 | pages={423--426}, 8 | year={1985}, 9 | publisher={Taylor \& Francis} 10 | } 11 | @book{Robert2007, 12 | title={The {B}ayesian choice: from decision-theoretic foundations to computational implementation}, 13 | author={Robert, Christian}, 14 | year={2007}, 15 | publisher={Springer Science \& Business Media} 16 | } 17 | -------------------------------------------------------------------------------- /assignments/bayes_2024.bib: -------------------------------------------------------------------------------- 1 | @article{Goldstein1980, 2 | title={The linear Bayes regression estimator under weak prior assumptions}, 3 | author={Goldstein, Michael}, 4 | journal={Biometrika}, 5 | volume={67}, 6 | number={3}, 7 | pages={621--628}, 8 | year={1980}, 9 | publisher={Oxford University Press} 10 | } 11 | @article{Dawid1982, 12 | title={The well-calibrated {B}ayesian}, 13 | author={Dawid, A Philip}, 14 | journal={Journal of the American Statistical 
Association}, 15 | volume={77}, 16 | number={379}, 17 | pages={605--610}, 18 | year={1982}, 19 | publisher={Taylor \& Francis} 20 | } 21 | @article{OHagan1995, 22 | title={Fractional {B}ayes factors for model comparison}, 23 | author={O'Hagan, Anthony}, 24 | journal={Journal of the Royal Statistical Society: Series B (Methodological)}, 25 | volume={57}, 26 | number={1}, 27 | pages={99--118}, 28 | year={1995}, 29 | publisher={Wiley Online Library} 30 | } 31 | @article{West1985, 32 | title={Dynamic generalized linear models and {B}ayesian forecasting}, 33 | author={West, Mike and Harrison, P Jeff and Migon, Helio S}, 34 | journal={Journal of the American Statistical Association}, 35 | volume={80}, 36 | number={389}, 37 | pages={73--83}, 38 | year={1985}, 39 | publisher={Taylor \& Francis} 40 | } 41 | @article{Besag1991, 42 | title={{B}ayesian image restoration, with two applications in spatial statistics}, 43 | author={Besag, Julian and York, Jeremy and Molli{\'e}, Annie}, 44 | journal={Annals of the institute of statistical mathematics}, 45 | volume={43}, 46 | pages={1--20}, 47 | year={1991}, 48 | publisher={Springer} 49 | } 50 | @article{OHagan1998, 51 | title={Eliciting expert beliefs in substantial practical applications}, 52 | author={O'Hagan, Anthony}, 53 | journal={Journal of the Royal Statistical Society Series D: The Statistician}, 54 | volume={47}, 55 | number={1}, 56 | pages={21--35}, 57 | year={1998}, 58 | publisher={Oxford University Press} 59 | } 60 | @article{Tierney1998, 61 | title={A note on {M}etropolis-{H}astings kernels for general state spaces}, 62 | author={Tierney, Luke}, 63 | journal={Annals of Applied Probability}, 64 | pages={1--9}, 65 | year={1998}, 66 | publisher={JSTOR} 67 | } 68 | @article{Kennedy2001, 69 | title={{B}ayesian calibration of computer models}, 70 | author={Kennedy, Marc C and O'Hagan, Anthony}, 71 | journal={Journal of the Royal Statistical Society: Series B (Statistical Methodology)}, 72 | volume={63}, 73 | number={3}, 74 | pages={425--464}, 75 | year={2001}, 76 | publisher={Wiley Online Library} 77 | } 78 | @article{Oakley2004, 79 | title={Probabilistic sensitivity analysis of complex models: a {B}ayesian approach}, 80 | author={Oakley, Jeremy E and O'Hagan, Anthony}, 81 | journal={Journal of the Royal Statistical Society Series B: Statistical Methodology}, 82 | volume={66}, 83 | number={3}, 84 | pages={751--769}, 85 | year={2004}, 86 | publisher={Oxford University Press} 87 | } 88 | @article{Papaspiliopoulos2007, 89 | title={A general framework for the parametrization of hierarchical models}, 90 | author={Papaspiliopoulos, Omiros and Roberts, Gareth O and Sk{\"o}ld, Martin}, 91 | journal={Statistical Science}, 92 | pages={59--73}, 93 | year={2007}, 94 | publisher={JSTOR} 95 | } 96 | @article{Gelman2008, 97 | author = {Andrew Gelman and Aleks Jakulin and Maria Grazia Pittau and Yu-Sung Su}, 98 | title = {{A weakly informative default prior distribution for logistic and other regression models}}, 99 | volume = {2}, 100 | journal = {The Annals of Applied Statistics}, 101 | number = {4}, 102 | publisher = {Institute of Mathematical Statistics}, 103 | pages = {1360 -- 1383}, 104 | keywords = {{B}ayesian inference, generalized linear model, hierarchical model, least squares, Linear regression, logistic regression, multilevel model, noninformative prior distribution, weakly informative prior distribution}, 105 | year = {2008}, 106 | doi = {10.1214/08-AOAS191}, 107 | URL = {https://doi.org/10.1214/08-AOAS191} 108 | } 109 | @article{Park2008, 110 | 
title={The {B}ayesian lasso}, 111 | author={Park, Trevor and Casella, George}, 112 | journal={Journal of the american statistical association}, 113 | volume={103}, 114 | number={482}, 115 | pages={681--686}, 116 | year={2008}, 117 | publisher={Taylor \& Francis} 118 | } 119 | @article{Piironen2017, 120 | author = {Juho Piironen and Aki Vehtari}, 121 | title = {{Sparsity information and regularization in the horseshoe and other shrinkage priors}}, 122 | volume = {11}, 123 | journal = {Electronic Journal of Statistics}, 124 | number = {2}, 125 | publisher = {Institute of Mathematical Statistics and Bernoulli Society}, 126 | pages = {5018 -- 5051}, 127 | keywords = {{B}ayesian inference, horseshoe prior, shrinkage priors, Sparse estimation}, 128 | year = {2017}, 129 | doi = {10.1214/17-EJS1337SI}, 130 | URL = {https://doi.org/10.1214/17-EJS1337SI} 131 | } 132 | @article{Simpson2017, 133 | author = {Daniel Simpson and H{\aa}vard Rue and Andrea Riebler and Thiago G. Martins and Sigrunn H. S{\o}rbye}, 134 | title = {{Penalising Model Component Complexity: A Principled, Practical Approach to Constructing Priors}}, 135 | volume = {32}, 136 | journal = {Statistical Science}, 137 | number = {1}, 138 | publisher = {Institute of Mathematical Statistics}, 139 | pages = {1 -- 28}, 140 | keywords = {{B}ayesian theory, disease mapping, hierarchical models, information geometry, interpretable prior distributions, prior on correlation matrices}, 141 | year = {2017}, 142 | doi = {10.1214/16-STS576}, 143 | URL = {https://doi.org/10.1214/16-STS576} 144 | } 145 | @article{Bai2021, 146 | title={Spike-and-slab meets {LASSO}: A review of the spike-and-slab {LASSO} }, 147 | author={Bai, Ray and Ro{\v{c}}kov{\'a}, Veronika and George, Edward I}, 148 | journal={Handbook of {B}ayesian variable selection}, 149 | pages={81--108}, 150 | year={2021}, 151 | publisher={Chapman and Hall/CRC} 152 | } 153 | 154 | @article{Tibshirani1996, 155 | ISSN = {00359246}, 156 | URL = {http://www.jstor.org/stable/2346178}, 157 | author = {Robert Tibshirani}, 158 | journal = {Journal of the Royal Statistical Society. Series B (Methodological)}, 159 | number = {1}, 160 | pages = {267--288}, 161 | publisher = {[Royal Statistical Society, Wiley]}, 162 | title = {Regression Shrinkage and Selection via the {L}asso}, 163 | urldate = {2024-06-24}, 164 | volume = {58}, 165 | year = {1996} 166 | } 167 | 168 | @book{Lindley1980, 169 | title={The Bayesian approach to statistics}, 170 | author={Lindley, Dennis V}, 171 | year={1980}, 172 | publisher={University of California (Berkeley). 
Operations Research Center} 173 | } 174 | 175 | @article{Barker1965, 176 | title={Monte {C}arlo calculations of the radial distribution functions for a proton-electron plasma}, 177 | author={Barker, Anthony Alfred}, 178 | journal={Australian Journal of Physics}, 179 | volume={18}, 180 | number={2}, 181 | pages={119--134}, 182 | year={1965}, 183 | publisher={CSIRO Publishing} 184 | } 185 | 186 | @article{Green1995, 187 | title={Reversible jump Markov chain Monte Carlo computation and Bayesian model determination}, 188 | author={Green, Peter J}, 189 | journal={Biometrika}, 190 | volume={82}, 191 | number={4}, 192 | pages={711--732}, 193 | year={1995}, 194 | publisher={Oxford University Press} 195 | } 196 | 197 | @article{George1993, 198 | title={Variable selection via Gibbs sampling}, 199 | author={George, Edward I and McCulloch, Robert E}, 200 | journal={Journal of the American Statistical Association}, 201 | volume={88}, 202 | number={423}, 203 | pages={881--889}, 204 | year={1993}, 205 | publisher={Taylor \& Francis} 206 | } 207 | 208 | @article{Carvalho2010, 209 | title={The horseshoe estimator for sparse signals}, 210 | author={Carvalho, Carlos M and Polson, Nicholas G and Scott, James G}, 211 | journal={Biometrika}, 212 | volume={97}, 213 | number={2}, 214 | pages={465--480}, 215 | year={2010}, 216 | publisher={Oxford University Press} 217 | } 218 | 219 | @article{Raftery1996, 220 | title={Approximate {B}ayes factors and accounting for model uncertainty in generalised linear models}, 221 | author={Raftery, Adrian E}, 222 | journal={Biometrika}, 223 | volume={83}, 224 | number={2}, 225 | pages={251--266}, 226 | year={1996}, 227 | publisher={Oxford University Press} 228 | } 229 | 230 | @article{Gabry2019, 231 | title={Visualization in Bayesian workflow}, 232 | author={Gabry, Jonah and Simpson, Daniel and Vehtari, Aki and Betancourt, Michael and Gelman, Andrew}, 233 | journal={Journal of the Royal Statistical Society Series A: Statistics in Society}, 234 | volume={182}, 235 | number={2}, 236 | pages={389--402}, 237 | year={2019}, 238 | publisher={Oxford University Press} 239 | } -------------------------------------------------------------------------------- /assignments/discussion_papers.md: -------------------------------------------------------------------------------- 1 | - Goldstein, M. (1980). [The linear Bayes regression estimator under weak prior assumptions](https://doi.org/10.1093/biomet/67.3.621). Biometrika, 67(3), 621-628. 2 | - Dawid, A. P. (1982). [The well-calibrated Bayesian](https://www.tandfonline.com/doi/abs/10.1080/01621459.1982.10477856). Journal of the American Statistical Association, 77(379), 605-610. 3 | - O'Hagan, A. (1995). [Fractional Bayes factors for model comparison](https://doi.org/10.1111/j.2517-6161.1995.tb02017.x). Journal of the Royal Statistical Society: Series B (Methodological), 57(1), 99-118. 4 | - West, M., Harrison, P. J., & Migon, H. S. (1985). [Dynamic generalized linear models and Bayesian forecasting](https://www.tandfonline.com/doi/abs/10.1080/01621459.1985.10477131). Journal of the American Statistical Association, 80(389), 73-83. 5 | - Besag, J., York, J., & Mollié, A. (1991). [Bayesian image restoration, with two applications in spatial statistics](https://link.springer.com/article/10.1007/bf00116466). Annals of the institute of statistical mathematics, 43, 1-20. 6 | - O'Hagan, A. (1998). [Eliciting expert beliefs in substantial practical applications](https://doi.org/10.1111/1467-9884.00114). 
Journal of the Royal Statistical Society Series D: The Statistician, 47(1), 21-35. 7 | - Tierney, L. (1998). [A note on Metropolis-Hastings kernels for general state spaces](https://www.jstor.org/stable/2667233). Annals of Applied Probability, 1-9. 8 | - Kennedy, M. C., & O'Hagan, A. (2001). [Bayesian calibration of computer models](https://doi.org/10.1111/1467-9868.00294). Journal of the Royal Statistical Society: Series B (Statistical Methodology), 63(3), 425-464. 9 | - Oakley, J. E., & O'Hagan, A. (2004). [Probabilistic sensitivity analysis of complex models: a Bayesian approach](https://doi.org/10.1111/j.1467-9868.2004.05304.x). Journal of the Royal Statistical Society Series B: Statistical Methodology, 66(3), 751-769. 10 | - Papaspiliopoulos, O., Roberts, G. O., & Sköld, M. (2007). [A general framework for the parametrization of hierarchical models](https://www.jstor.org/stable/2764580). Statistical Science, 59-73. 11 | - Gelman, A., Jakulin, A., Pittau, M. G., & Su, Y. S. (2008). [A weakly informative default prior distribution for logistic and other regression models](https://doi.org/10.1214/08-AOAS191). The Annals of Applied Statistics, 2(4), 1360-1383. 12 | - Park, T., & Casella, G. (2008). [The Bayesian LASSO](https://doi.org/10.1198/016214508000000337). Journal of the American Statistical Association, 103(482), 681-686. 13 | - Piironen, J., & Vehtari, A. (2017). [Sparsity information and regularization in the horseshoe and other shrinkage priors](https://doi.org/10.1214/17-EJS1337SI). Electronic Journal of Statistics, 11(2), 5018-5051. 14 | - Simpson, D., Rue, H., Riebler, A., Martins, T. G., & Sørbye, S. H. (2017). [Penalising model component complexity: A principled, practical approach to constructing priors](https://doi.org/10.1214/16-STS576). Statistical Science, 32(1), 1-28. 15 | - Bai, R., Ročková, V., & George, E. I. (2021). [Spike-and-slab meets LASSO: A review of the spike-and-slab LASSO](https://doi.org/10.1201/9781003089018-4). Handbook of Bayesian variable selection, 81-108.
16 | -------------------------------------------------------------------------------- /assignments/figures/Bayes_risks_beta_binomial.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxbiostat/BayesianStatisticsCourse/ff673af7144838e7e29fa4aaf6adc628095911fb/assignments/figures/Bayes_risks_beta_binomial.pdf -------------------------------------------------------------------------------- /assignments/figures/beta_density_sketch.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxbiostat/BayesianStatisticsCourse/ff673af7144838e7e29fa4aaf6adc628095911fb/assignments/figures/beta_density_sketch.pdf -------------------------------------------------------------------------------- /assignments/figures/impala.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxbiostat/BayesianStatisticsCourse/ff673af7144838e7e29fa4aaf6adc628095911fb/assignments/figures/impala.jpeg -------------------------------------------------------------------------------- /assignments/figures/integrated_risks_beta_binomial.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxbiostat/BayesianStatisticsCourse/ff673af7144838e7e29fa4aaf6adc628095911fb/assignments/figures/integrated_risks_beta_binomial.pdf -------------------------------------------------------------------------------- /assignments/figures/waterbuck.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxbiostat/BayesianStatisticsCourse/ff673af7144838e7e29fa4aaf6adc628095911fb/assignments/figures/waterbuck.jpeg -------------------------------------------------------------------------------- /assignments/refs.bib: -------------------------------------------------------------------------------- 1 | @article{Dickey1971, 2 | title={The weighted likelihood ratio, linear hypotheses on normal location parameters}, 3 | author={Dickey, James M}, 4 | journal={The Annals of Mathematical Statistics}, 5 | pages={204--223}, 6 | year={1971}, 7 | publisher={JSTOR} 8 | } 9 | 10 | @book{Shao2003, 11 | title={Mathematical Statistics}, 12 | author={Shao, Jun}, 13 | year={2003}, 14 | publisher={Springer Science \& Business Media} 15 | } 16 | @book{Robert2007, 17 | title={The Bayesian choice: from decision-theoretic foundations to computational implementation}, 18 | author={Robert, Christian P and others}, 19 | volume={2}, 20 | year={2007}, 21 | publisher={Springer} 22 | } 23 | -------------------------------------------------------------------------------- /assignments/sol1.tex: -------------------------------------------------------------------------------- 1 | \textcolor{red}{\textbf{Concepts}: highest posterior density; interval estimation, loss function.} 2 | \textcolor{purple}{\textbf{Difficulty}: intermediate.}\\ 3 | \textcolor{blue}{ 4 | \textbf{Resolution:} 5 | \begin{enumerate}[label = \alph*)] 6 | \item Let $b$ and $a$ be the upper and lower bounds of our interval and let $\pi(\theta|x)$ be the posterior distribution. We seek to minimise the quantity $b - a$. Adding the probability restrictions we get: 7 | \begin{equation*} 8 | \begin{aligned} 9 | \min_{b,a} \quad & b - a\\ 10 | \textrm{s.t.} \quad & \int_a^b \pi(\theta|x) d\theta = 1-\alpha. 
\\ 11 | \end{aligned} 12 | \end{equation*} 13 | The Lagrangian can then be written as 14 | \begin{equation*} 15 | \mathcal{L} = (b-a) + \lambda \left[ \int_a^b \pi(\theta|x) d\theta - (1-\alpha)\right]. 16 | \end{equation*} 17 | Differentiating w.r.t.\ $a$ and $b$ and setting the results to zero, we get 18 | \begin{align*} 19 | \frac{\partial \mathcal{L}}{\partial a} &= -1 - \lambda \pi(a|x) = 0, \\ 20 | \frac{\partial \mathcal{L}}{\partial b} &= 1 + \lambda \pi(b|x) = 0. 21 | \end{align*} 22 | From this we get $\pi(a|x) = \pi(b|x) = -1/\lambda$. As $\pi$ is a probability density, $\lambda < 0$. Note that the density at both ends of our interval must be equal, which makes sense given the definition of an HPD interval. The second-order conditions give us 23 | \begin{equation*} 24 | \frac{\partial^2 \mathcal{L}}{(\partial a)^2} = -\lambda \frac{\partial \pi(a|x)}{\partial a} \quad;\quad \frac{\partial^2 \mathcal{L}}{(\partial b)^2} = \lambda \frac{\partial \pi(b|x)}{\partial b} \quad;\quad \frac{\partial^2 \mathcal{L}}{\partial a\partial b} = 0 25 | \end{equation*} 26 | Since the posterior density is unimodal and non-constant, the derivative of the density is positive at $a$ and negative at $b$. As $\lambda < 0$, both diagonal terms above are positive, so the Hessian matrix of second derivatives is positive definite, which implies that $(a,b)$ is indeed a minimiser. 27 | \item It is problematic because volume and coverage are not on the same scale. 28 | If the volume needs to be large to ensure coverage, it is better to pick a region with null volume. For example, consider the case of finding an HPD for the mean of a normal distribution. Under the Jeffreys prior the HPD will be the classical \textit{t} interval 29 | \begin{equation*} 30 | C(\Bar{x}, \Bar{s}^2) = \left(\Bar{x} - t_{\alpha} \sqrt{\frac{\Bar{s}^2}{n}}, \Bar{x} + t_{\alpha} \sqrt{\frac{\Bar{s}^2}{n}}\right). 31 | \end{equation*} 32 | The volume (length) of the HPD above is $2 t_{\alpha} \sqrt{\Bar{s}^2/n}$. 33 | If this volume is larger than 1, it is better to collapse the interval to a point if we are trying to minimise the loss. So the interval under this loss becomes 34 | \begin{equation*} 35 | C'(\Bar{x}, \Bar{s}^2) = \left\{\begin{split} C(\Bar{x}&, \Bar{s}^2), \quad \sqrt{\Bar{s}^2} \leq \sqrt{n}/(2t_{\alpha}), \\ \{\Bar{x}\}&, \quad \text{otherwise}. 36 | \end{split}\right. 37 | \end{equation*} 38 | This makes little sense, as one deposits essentially infinite certainty on a single point. 39 | See Section 5.5.3 in Robert (2007) for more details. 40 | \item Let $C^\pi$ be the Bayes estimator under the given loss. By definition $C^\pi$ minimises the posterior expected loss 41 | \begin{equation*} 42 | R(C|x) = \EX \left[ L^*(C , \theta | x) \right] = g(\operatorname{vol}(C)) + \int_{C^c} \pi(\theta|x) d\theta, 43 | \end{equation*} 44 | which is equivalent to finding $C$ that minimises 45 | \begin{equation*} 46 | g(\operatorname{vol}(C)) - \int_{C} \pi(\theta|x) d\theta. 47 | \end{equation*} 48 | If the Bayes estimator is not an HPD, there exists $k \geq 0$ such that 49 | \begin{equation*} 50 | C^\pi \cap \{\theta: \pi(\theta|x) < k\} \neq \emptyset \quad \text{and} \quad (C^\pi)^c \cap \{\theta: \pi(\theta|x) \geq k\} \neq \emptyset, 51 | \end{equation*} 52 | both intersections being non-null (we are working with sets defined only up to sets of Lebesgue measure zero). Thus, there exist sets $A$ and $B$ such that 53 | \begin{equation*} 54 | A \subset C^\pi \cap \{\theta: \pi(\theta|x) < k\} \quad \text{and} \quad B \subset (C^\pi)^c \cap \{\theta: \pi(\theta|x) \geq k\}, 55 | \end{equation*} 56 | and $\operatorname{vol}(A) = \operatorname{vol}(B) > 0$. If we now define $C^* = (C^\pi \setminus A) \cup B$, it follows that 57 | \begin{equation*} 58 | R(C^\pi|x) > R(C^*|x), 59 | \end{equation*} 60 | as $\operatorname{vol}(C^\pi) = \operatorname{vol}(C^*)$ and $\int_A \pi(\theta|x) d\theta < \int_B \pi(\theta|x) d\theta$. Therefore we have a contradiction, so $C^\pi$ must be an HPD. 61 | \end{enumerate} 62 | $\blacksquare$\\ 63 | \textbf{Comment:} Here we saw how to frame the problem of interval inference -- from a unimodal posterior -- as an optimisation problem, which under regularity conditions yields a well-behaved solution. 64 | Moreover, we saw that a loss function that makes intuitive sense might lead to strange conclusions. 65 | Finally, we proved a little result that characterises the HPD as the solution of a particular class of problems, where the volume of the resulting estimate (interval) is transformed through an increasing function, generalising the previous finding. 66 | }
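% A quick numerical illustration of part a) in R (a sketch; the Beta(3, 7) posterior and the
% 0.05 level are arbitrary choices, not taken from the assignment). Minimising the interval
% width over admissible lower endpoints recovers an interval whose endpoint densities agree,
% as derived above:
% post_a <- 3; post_b <- 7; alpha <- 0.05
% width <- function(a) qbeta(pbeta(a, post_a, post_b) + 1 - alpha, post_a, post_b) - a
% a_hat <- optimize(width, c(0, qbeta(alpha, post_a, post_b)))$minimum
% b_hat <- qbeta(pbeta(a_hat, post_a, post_b) + 1 - alpha, post_a, post_b)
% c(dbeta(a_hat, post_a, post_b), dbeta(b_hat, post_a, post_b)) ## approximately equal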
-------------------------------------------------------------------------------- /assignments/sol2.tex: -------------------------------------------------------------------------------- 1 | \textcolor{red}{\textbf{Concepts}: Bayes factors, priors for testing, Savage-Dickey.} 2 | \textcolor{purple}{\textbf{Difficulty}: intermediate.}\\ 3 | \textcolor{blue}{ 4 | \textbf{Resolution:} 5 | \begin{enumerate}[label = \alph*)] 6 | \item The Bayes Factor is given by 7 | \begin{equation*} 8 | \operatorname{BF_{01}} = \frac{p(x|M_0)}{p(x|M_1)}. 9 | \end{equation*} 10 | We can expand the numerator and use the nesting condition to get 11 | \begin{align*} 12 | p(x|M_0) &= \int p(x|\theta, M_0)p(\theta|M_0)d\theta \\ 13 | &= \int p(x|\theta = \theta_0, M_1)p(\theta|M_0)d\theta \\ 14 | &= p(x| \theta = \theta_0, M_1). 15 | \end{align*} 16 | Now, using Bayes' theorem, we get 17 | \begin{equation*} 18 | p(x| \theta = \theta_0, M_1) = \frac{p(\theta_0 | x, M_1) p(x|M_1)}{p(\theta_0|M_1)}. 19 | \end{equation*} 20 | Substituting $p(x|M_0)$ back into our first expression, we get 21 | \begin{equation*} 22 | \operatorname{BF_{01}} = \frac{p(x|M_0)}{p(x|M_1)} = \frac{p(\theta_0 | x, M_1) p(x|M_1)}{p(x|M_1)p(\theta_0|M_1)} = \frac{p(\theta_0 | x, M_1)}{p(\theta_0|M_1)}. 23 | \end{equation*} 24 | Now we can test a point null hypothesis by simply evaluating the ratio of the posterior to the prior density under $M_1$ at the point representing the null. 25 | \item The uniform prior is a $\operatorname{Beta}(1,1)$ distribution, conjugate to the binomial. The posterior is then a $\operatorname{Beta}(x+1,n-x+1)$ distribution. Evaluating the posterior/prior ratio at $1/2$ we get 26 | \begin{equation*} 27 | \operatorname{BF_{01}} = \frac{\frac{\Gamma(n+2)}{\Gamma(x+1)\Gamma(n-x+1)}\frac{1}{2^{n}}}{\frac{\Gamma(2)}{\Gamma(1)\Gamma(1)}} = \frac{\Gamma(26)}{\Gamma(5)\Gamma(21)} \frac{1}{2^{24}} = \frac{26 \cdot 23 \cdot 22 \cdot 5}{2^{24}}. 28 | \end{equation*} 29 | That is approximately $0.004$, so it is 255 times more likely for the coin to be biased than not -- which makes perfect sense, since there were only 3 heads out of 24 throws. 30 | If we wanted to change this decision we could put aside the idea of nested models and place a point hypothesis for $H_1$, such as $\theta = 1$. 31 | We could keep the nested model and try to concentrate prior density on a point to the right of $1/2$. 32 | If we use a prior $\operatorname{Beta}(\alpha, \alpha)$ and take $\alpha$ to infinity, it is easy to show that the Bayes Factor converges to 1 -- basically prior and posterior will be a point mass at $1/2$. 33 | Both cases amount to very strong prior choices. 34 | \end{enumerate} 35 | $\blacksquare$\\ 36 | \textbf{Comment:} See Dickey (1971) for more details. 37 | }
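% The Savage-Dickey ratio above is straightforward to check numerically in R (a sketch; x and
% n should be set to the observed number of heads and the total number of throws):
% bf01 <- function(x, n) dbeta(1/2, x + 1, n - x + 1) / dbeta(1/2, 1, 1)
% ## e.g. bf01(x = 3, n = 24), using the counts quoted in b)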
-------------------------------------------------------------------------------- /assignments/sol3.tex: -------------------------------------------------------------------------------- 1 | \textcolor{red}{\textbf{Concepts}: Bayes estimator, conjugacy, connections with frequentist/orthodox theory.} 2 | \textcolor{purple}{\textbf{Difficulty}: easy.}\\ 3 | \textcolor{blue}{ 4 | \textbf{Resolution:} 5 | This is a very straightforward question and we shall proceed accordingly. 6 | First we note that the assumption that the $X_i$ are conditionally i.i.d.\ given $\theta$ leads to 7 | \begin{align*} 8 | f\left(\boldsymbol{X} \mid \theta \right) &= \prod_{i=1}^n \frac{\exp(-X_i/\theta)}{\theta}\mathbb{I}(X_i > 0),\\ 9 | &= \theta^{-n} \exp\left(-S_n/\theta\right) \mathbb{I}\left(\prod_{i=1}^n X_i > 0\right), 10 | \end{align*} 11 | where $S_n := \sum_{i=1}^n X_i$. 12 | From here, there is no point in pretending that we don't know what a good guess for a conjugate family to this likelihood is: an inverse gamma distribution with parameters $\alpha,\beta > 0$ would lead to a posterior 13 | \begin{align*} 14 | p\left(\theta \mid \boldsymbol{X}\right) &\propto f\left(\boldsymbol{X} \mid \theta \right)\pi(\theta \mid \alpha, \beta),\\ 15 | &\propto \theta^{-n - (\alpha+1)}\exp\left(-(S_n + \beta)/\theta \right) \mathbb{I}\left(\prod_{i=1}^n X_i > 0\right), 16 | \end{align*} 17 | which, after re-arranging, can be recognised as the kernel of an inverse gamma distribution with parameters $\alpha_n = n + \alpha$ and $\beta_n = S_n + \beta$. 18 | To answer b), we need to remember that the Bayes estimator under quadratic loss is the posterior mean. 19 | Thus, 20 | \begin{align*} 21 | \delta_B(\boldsymbol{X}_n) &= \frac{\beta_n}{\alpha_n - 1},\\ 22 | &= \frac{n\bar{X}_n + \beta}{n + \alpha - 1}, 23 | \end{align*} 24 | where the last line comes from noticing that we can write $S_n = n\bar{X}_n$, where $\bar{X}_n$ is the sample mean. 25 | To compute the bias, we write 26 | \begin{equation*} 27 | \mathbb{E}_\theta\left[\delta_B(\boldsymbol{X}_n) - \theta \right] = \frac{n\theta + \beta}{n + \alpha - 1} - \theta = \frac{\beta - (\alpha - 1)\theta}{n + \alpha - 1}, 28 | \end{equation*} 29 | which is $O(1/n)$, as requested. 30 | From orthodox\footnote{Frequentist} theory we know\footnote{If you need a refresher, consider: (i) showing that $\bar{X}_n$ is unbiased, computing the Cramér-Rao lower bound for unbiased estimators and showing that its variance matches the bound, or (ii) noticing that $S_n$ is complete sufficient and using Lehmann-Scheffé or, yet, (iii) noticing that the exponential distribution belongs to the exponential family -- in canonical form -- and thus the sample mean is UMVUE.} that the UMVUE for $\theta$ is $\bar{X}_n$. 31 | So the way to get it from $\delta_{B}(\boldsymbol{X}_n)$ is to take $\beta \to 0$ and $\alpha \to 1$, i.e., to ``flatten'' the prior towards the improper density $\pi(\theta) \propto \theta^{-2}$, which corresponds to an (improper) uniform prior on the rate $1/\theta$. 32 | $\blacksquare$\\ 33 | \textbf{Comment:} This is a very straightforward question just to make sure we know our basics.
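% A minimal Monte Carlo check of the bias expression above in R (a sketch; n, alpha, beta and
% theta are arbitrary test values, not part of the exercise):
% n <- 50; alpha <- 2; beta <- 3; theta <- 1.7
% xbar <- replicate(1e5, mean(rexp(n, rate = 1/theta)))
% mean((n * xbar + beta) / (n + alpha - 1)) - theta  ## simulated bias of the Bayes estimator
% (beta - (alpha - 1) * theta) / (n + alpha - 1)     ## closed-form bias above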
34 | There is some interesting discussion about the relationship with frequentist estimation if we consider other estimands. 35 | Consider estimating $\eta_t := \exp(t/\theta)$ for some $t>0$, for instance. 36 | In this case we can show\footnote{Just consider the moment-generating function of an inverse-gamma distribution.} that the Bayes estimator under quadratic loss is 37 | \begin{equation*} 38 | \tilde{\delta}_B(\boldsymbol{X}_n) = \left(1 + \frac{t}{n \bar{X}_n + \beta}\right)^{-(n + \alpha)}, 39 | \end{equation*} 40 | which is biased but consistent. 41 | The UMVUE is , 42 | \begin{equation*} 43 | \tilde{\delta}_{\text{UMVUE}}(\boldsymbol{X}_n) = \left(1 - \frac{t}{n \bar{X}_n}\right)^{-n}, 44 | \end{equation*} 45 | however, so it is not a limit of Bayes estimators of the sort we considered -- or any for that matter. 46 | See example 4.7 (page 242) in Shao (2003). 47 | } -------------------------------------------------------------------------------- /code/Raftery_1988.r: -------------------------------------------------------------------------------- 1 | library(cmdstanr) 2 | library(rstan) 3 | stanfit <- function(fit) rstan::read_stan_csv(fit$output_files()) 4 | 5 | 6 | impala <- c(15, 20, 21, 23, 26) 7 | waterbuck <- c(53, 57, 66, 67, 72) 8 | xobs <- impala[1] 9 | 10 | stan.data <- list(x = c(xobs), n = length(xobs), 11 | kappa_1 = 1, kappa_2 = .1, 12 | M = 50000) 13 | 14 | raftery <- cmdstanr::cmdstan_model("stan/raftery_1.2_single_obs.stan") 15 | 16 | fit <- stanfit(raftery$sample(data = stan.data, 17 | max_treedepth = 10, 18 | adapt_delta = .99, 19 | iter_warmup = 2000, 20 | iter_sampling = 2000, 21 | parallel_chains = 4)) 22 | 23 | fit 24 | 25 | N.samples <- extract(fit, 'N')$N 26 | quantile(N.samples, prob = c(.10, .90)) 27 | -------------------------------------------------------------------------------- /code/cauchy_bimodal.r: -------------------------------------------------------------------------------- 1 | xs <- c(-3, 2) 2 | 3 | ### The likelihood 4 | 5 | c_loglik <- function(theta){ 6 | sum(dcauchy(x = xs, location = theta, log = TRUE)) 7 | } 8 | c_loglik <- Vectorize(c_loglik) 9 | 10 | curve(c_loglik, -abs(2*min(xs)), abs(2*max(xs)), 11 | xlab = expression(theta), 12 | lwd = 3, 13 | ylab = "Log-likelihood", 14 | main = "Cauchy example -- BC exercise 1.28") 15 | 16 | ### Bayesian inference 17 | 18 | #### Quadrature 19 | Mu <- 0 20 | Sigma <- 1 21 | 22 | c_logposterior_kernel <- function(theta){ 23 | dnorm(x = theta, mean = Mu, sd = Sigma) + 24 | c_loglik(theta) 25 | } 26 | c_logposterior_kernel <- Vectorize(c_logposterior_kernel) 27 | 28 | 29 | ## marginal likelihood via quadrature 30 | m_of_x <- integrate(function(t) exp(c_logposterior_kernel(t)), 31 | -Inf, Inf) 32 | 33 | c_posterior <- function(theta){ 34 | exp(c_logposterior_kernel(theta) - log(m_of_x$value)) 35 | } 36 | c_posterior <- Vectorize(c_posterior) 37 | 38 | minT <- -abs(4*min(xs)) 39 | maxT <- abs(4*max(xs)) 40 | 41 | curve(c_posterior, minT, maxT, 42 | lwd = 4, 43 | xlab = expression(theta), 44 | ylab = "Posterior density", 45 | main = "Cauchy example -- BC exercise 1.28") 46 | 47 | 48 | integrand <- function(t){ 49 | t * c_posterior(t) 50 | } 51 | 52 | posterior.mean.quadrature <- integrate(integrand, 53 | -Inf, Inf, 54 | subdivisions = 1E5) 55 | 56 | #### MCMC 57 | 58 | library(cmdstanr) 59 | 60 | c_model <- cmdstan_model("stan/cauchy.stan") 61 | 62 | s.data <- list( 63 | n = length(xs), 64 | x = xs, 65 | prior_loc = Mu, 66 | prior_scale = Sigma 67 | ) 68 | 69 | mcmc.samples <- c_model$sample(data = 
s.data, 70 | max_treedepth = 13, 71 | adapt_delta = .999, 72 | chains = 10, 73 | parallel_chains = 10, 74 | metric = "diag_e") 75 | mcmc.samples 76 | 77 | theta.draws <- mcmc.samples$draws("theta") 78 | 79 | #### Importance sampling 80 | #$$ Example 6.2.2, Eq (6.2.6) 81 | library(matrixStats) 82 | M <- length(theta.draws) 83 | prior.draws <- rnorm(n = M, mean = Mu, sd = Sigma) 84 | logWs <- sapply(prior.draws, function(t){ 85 | sum(dcauchy(x = xs, location = t, log = TRUE)) 86 | }) 87 | logZW <- logSumExp(logWs) 88 | weights <- exp(logWs - logZW) 89 | m_is <- sum(weights * prior.draws) 90 | 91 | IS.draws <- sample(x = prior.draws, size = M, replace = TRUE, 92 | prob = weights) 93 | 94 | hist(theta.draws, probability = TRUE, 95 | xlim = c(minT, maxT), 96 | xlab = expression(theta)) 97 | curve(c_posterior, lwd = 3, add = TRUE) 98 | lines(density(IS.draws), lwd = 3, lty = 2, col = 2) 99 | 100 | 101 | ############ Parallel tempering 102 | 103 | U = function(gam, x) 104 | { 105 | - gam * c_logposterior_kernel(x) 106 | } 107 | 108 | curried = function(gam) 109 | { 110 | message(paste("Returning a function for gamma =", gam)) 111 | function(x) 112 | U(gam, x) 113 | } 114 | U4 = curried(4) 115 | 116 | op = par(mfrow = c(2, 1)) 117 | curve(U4(x), minT, maxT, main = "Potential function, U(x)") 118 | curve(exp(-U4(x)), minT, maxT, main = "Unnormalised density function, exp(-U(x))") 119 | par(op) 120 | 121 | 122 | chain = function(target, tune = 0.1, init = 1) 123 | { 124 | x = init 125 | xvec = numeric(iters) 126 | for (i in 1:iters) { 127 | can = x + rnorm(1, 0, tune) 128 | logA = target(x) - target(can) 129 | if (log(runif(1)) < logA) 130 | x = can 131 | xvec[i] = x 132 | } 133 | xvec 134 | } 135 | 136 | temps = 2 ^ (0:3) 137 | iters = 1e5 138 | 139 | mat = sapply(lapply(temps, curried), chain) 140 | colnames(mat) = paste("gamma=", temps, sep = "") 141 | 142 | require(smfsb) 143 | mcmcSummary(mat, rows = length(temps)) 144 | 145 | 146 | chains_coupled = function(pot = U, 147 | tune = 0.1, 148 | init = 1) 149 | { 150 | x = rep(init, length(temps)) 151 | xmat = matrix(0, iters, length(temps)) 152 | for (i in 1:iters) { 153 | can = x + rnorm(length(temps), 0, tune) 154 | logA = unlist(Map(pot, temps, x)) - unlist(Map(pot, temps, can)) 155 | accept = (log(runif(length(temps))) < logA) 156 | x[accept] = can[accept] 157 | # now the coupling update 158 | swap = sample(1:length(temps), 2) 159 | logA = pot(temps[swap[1]], x[swap[1]]) + pot(temps[swap[2]], x[swap[2]]) - 160 | pot(temps[swap[1]], x[swap[2]]) - pot(temps[swap[2]], x[swap[1]]) 161 | if (log(runif(1)) < logA) 162 | x[swap] = rev(x[swap]) 163 | # end of the coupling update 164 | xmat[i, ] = x 165 | } 166 | colnames(xmat) = paste("gamma=", temps, sep = "") 167 | xmat 168 | } 169 | 170 | mc3 <- chains_coupled(tune = .1) 171 | 172 | mcmcSummary(mc3, rows = length(temps)) 173 | 174 | 175 | par(mfrow = c(2, 2)) 176 | for (k in 1:ncol(mc3)){ 177 | hist(mc3[, k], probability = TRUE, 178 | xlim = c(minT, maxT), 179 | main = paste("gamma =", temps[k]), 180 | xlab = expression(theta)) 181 | curve(c_posterior, lwd = 3, add = TRUE) 182 | } 183 | 184 | ##### Posterior mean estimates 185 | mean(theta.draws) ## MCMC (HMC) 186 | posterior.mean.quadrature ## quadrature 187 | m_is ## Importance sampling 188 | colMeans(mc3) 189 | -------------------------------------------------------------------------------- /code/cauchy_bimodal_PT.r: -------------------------------------------------------------------------------- 1 | xs <- c(-3, 2) 2 | 3 | ### The likelihood 4 | 
5 | c_loglik <- function(theta) { 6 | sum(dcauchy( 7 | x = xs, 8 | location = theta, 9 | log = TRUE 10 | )) 11 | } 12 | c_loglik <- Vectorize(c_loglik) 13 | 14 | Mu <- 0 15 | Sigma <- 1 16 | 17 | c_logposterior_kernel <- function(theta) { 18 | dnorm(x = theta, mean = Mu, sd = Sigma) + 19 | c_loglik(theta) 20 | } 21 | c_logposterior_kernel <- Vectorize(c_logposterior_kernel) 22 | 23 | 24 | minT <- -abs(4 * min(xs)) 25 | maxT <- abs(4 * max(xs)) 26 | 27 | 28 | ## marginal likelihood via quadrature 29 | m_of_x <- integrate(function(t) 30 | exp(c_logposterior_kernel(t)), -Inf, Inf) 31 | 32 | c_posterior <- function(theta) { 33 | exp(c_logposterior_kernel(theta) - log(m_of_x$value)) 34 | } 35 | c_posterior <- Vectorize(c_posterior) 36 | 37 | 38 | curve( 39 | c_posterior, 40 | minT, 41 | maxT, 42 | lwd = 4, 43 | xlab = expression(theta), 44 | ylab = "Posterior density", 45 | main = "Cauchy example -- BC exercise 1.28" 46 | ) 47 | 48 | -------------------------------------------------------------------------------- /code/example_5.2.7.r: -------------------------------------------------------------------------------- 1 | log1pexp <- function(x) 2 | { 3 | ## taken from https://github.com/mfasiolo/qgam/blob/3bff42449b865a12c264c5b61438def5d74fdc70/R/log1pexp.R 4 | indx <- .bincode(x, 5 | c(-Inf, -37, 18, 33.3, Inf), 6 | right = TRUE, 7 | include.lowest = TRUE) 8 | 9 | kk <- which(indx==1) 10 | if( length(kk) ){ x[kk] <- exp(x[kk]) } 11 | 12 | kk <- which(indx==2) 13 | if( length(kk) ){ x[kk] <- log1p( exp(x[kk]) ) } 14 | 15 | kk <- which(indx==3) 16 | if( length(kk) ){ x[kk] <- x[kk] + exp(-x[kk]) } 17 | 18 | return(x) 19 | } 20 | #### 21 | 22 | logPr <- function(x, n, rho0){ 23 | -log1pexp( 24 | log1p(-rho0)-log(rho0) + 25 | n * log(2) + 26 | lbeta(x + 1, n -x +1) 27 | ) 28 | } 29 | 30 | f_rho <- function(rho){ 31 | exp(logPr(x = 5, n = 10, rho0 = rho)) 32 | } 33 | f_rho <- Vectorize(f_rho) 34 | 35 | curve(f_rho, xlab = expression(rho[0]), ylab = "Posterior probability", 36 | main = "x = 5, n = 10") 37 | 38 | f_x <- function(x){ 39 | exp(logPr(x = x, n = 10, rho0 = 1/2)) 40 | } 41 | f_x <- Vectorize(f_x) 42 | 43 | curve(f_x, 0, 10, 44 | xlab = expression(x), ylab = "Posterior probability", 45 | main = "n = 10, rho0 = 1/2") 46 | -------------------------------------------------------------------------------- /code/normal_vs_cauchy_BC_ex326.r: -------------------------------------------------------------------------------- 1 | M <- 1E6 2 | theta.samp.1 <- rnorm(M, mean = 0, sd = sqrt(2.19)) 3 | theta.samp.2 <- rcauchy(M) 4 | 5 | predictive.X1 <- rnorm(M, mean = theta.samp.1, sd = 1) 6 | predictive.X2 <- rnorm(M, mean = theta.samp.2, sd = 1) 7 | 8 | 9 | library(ggplot2) 10 | 11 | forplot <- data.frame(x_pred = c(predictive.X1, predictive.X2), 12 | prior = rep(c("Normal", "Cauchy"), each = M)) 13 | 14 | p0 <- ggplot(forplot, 15 | aes(x = x_pred, colour = prior, fill = prior)) + 16 | geom_density(alpha = .4) + 17 | scale_x_continuous(expression(x[pred]), limits = c(-10, 10)) + 18 | scale_y_continuous("Density", expand = c(0, 0)) + 19 | theme_bw(base_size = 16) + 20 | geom_vline(xintercept = c(-4, 4), linetype = "longdash") + 21 | theme(legend.position = "bottom", 22 | legend.justification = "centre", 23 | legend.title = element_blank(), 24 | strip.background = element_blank(), 25 | strip.text.y = element_blank(), 26 | legend.margin = margin(0, 0, 0, 0), 27 | legend.box.margin = margin(0, 0, 0, 0)) 28 | 29 | p0 30 | 31 | ggsave(plot = p0, filename = "../slides/figures/BC_example_326.pdf") 32 | 33 | #### 
Second part: inference 34 | ## First let's generate some data 35 | 36 | N <- 3 37 | true_theta <- -20 38 | X <- rnorm(n = N, mean = true_theta, sd = 1) 39 | 40 | ## Now let's fit the model under both priors 41 | library(cmdstanr) 42 | library(rstan) 43 | stanfit <- function(fit) rstan::read_stan_csv(fit$output_files()) 44 | 45 | example_326 <- cmdstanr::cmdstan_model("stan/BC_example_326.stan") 46 | 47 | stan.data <- list(N = N, x = X, prior = 1) 48 | fit.normal <- stanfit(example_326$sample(data = stan.data, refresh = 0)) 49 | stan.data$prior <- 2 50 | fit.cauchy <- stanfit(example_326$sample(data = stan.data, refresh = 0)) 51 | 52 | theta.df <- data.frame( 53 | theta = c(extract(fit.normal, 'theta')$theta, 54 | extract(fit.cauchy, 'theta')$theta) 55 | ) 56 | theta.df$prior <- rep(c("Normal", "Cauchy"), each = nrow(theta.df)/2) 57 | 58 | ## Compare posteriors 59 | p1 <- ggplot(theta.df, 60 | aes(x = theta, colour = prior, fill = prior)) + 61 | geom_density(alpha = .4) + 62 | scale_x_continuous(expression(theta)) + 63 | scale_y_continuous("Density", expand = c(0, 0)) + 64 | theme_bw(base_size = 16) + 65 | theme(legend.position = "bottom", 66 | legend.justification = "centre", 67 | legend.title = element_blank(), 68 | strip.background = element_blank(), 69 | strip.text.y = element_blank(), 70 | legend.margin = margin(0, 0, 0, 0), 71 | legend.box.margin = margin(0, 0, 0, 0)) 72 | p1 73 | 74 | ## Compare posterior predictives 75 | pred.df <- data.frame( 76 | x_pred = c(as.vector(extract(fit.normal, 'x_pred')$x_pred), 77 | as.vector(extract(fit.cauchy, 'x_pred')$x_pred)) 78 | ) 79 | pred.df$prior <- rep(c("Normal", "Cauchy"), each = nrow(pred.df)/2) 80 | 81 | 82 | p2 <- ggplot(pred.df, 83 | aes(x = x_pred, colour = prior, fill = prior)) + 84 | geom_density(alpha = .4) + 85 | scale_x_continuous(expression(x[pred])) + 86 | scale_y_continuous("Density", expand = c(0, 0)) + 87 | theme_bw(base_size = 16) + 88 | # geom_vline(xintercept = c(-4, 4), linetype = "longdash") + 89 | theme(legend.position = "bottom", 90 | legend.justification = "centre", 91 | legend.title = element_blank(), 92 | strip.background = element_blank(), 93 | strip.text.y = element_blank(), 94 | legend.margin = margin(0, 0, 0, 0), 95 | legend.box.margin = margin(0, 0, 0, 0)) 96 | 97 | p2 98 | -------------------------------------------------------------------------------- /code/stan/BC_example_326.stan: -------------------------------------------------------------------------------- 1 | data{ 2 | int N; 3 | real x[N]; 4 | int prior; 5 | } 6 | parameters{ 7 | real theta; 8 | } 9 | model{ 10 | target += normal_lpdf(x | theta, 1); 11 | if(prior == 1){ 12 | target += normal_lpdf(theta | 0, sqrt(2.19)); 13 | }else{ 14 | target += cauchy_lpdf(theta | 0, 1); 15 | } 16 | } 17 | generated quantities{ 18 | real x_pred[N]; 19 | for(i in 1:N) x_pred[i] = normal_rng(theta, 1); 20 | } -------------------------------------------------------------------------------- /code/stan/cauchy.stan: -------------------------------------------------------------------------------- 1 | data{ 2 | int n; 3 | vector[n] x; 4 | real prior_loc; 5 | real prior_scale; 6 | } 7 | parameters{ 8 | real theta; 9 | } 10 | model{ 11 | target += cauchy_lpdf(x | theta, 1); 12 | target += normal_lpdf(theta | prior_loc, prior_scale); 13 | } -------------------------------------------------------------------------------- /code/stan/raftery.stan: -------------------------------------------------------------------------------- 1 | data{ 2 | int n; 3 | real x[n]; 4 | real kappa_1; 
5 | real kappa_2; 6 | } 7 | transformed data{ 8 | real S = sum(x); 9 | } 10 | parameters{ 11 | real N; 12 | real theta; 13 | } 14 | transformed parameters{ 15 | real lPbar = 0.0; 16 | for(j in 1:n) lPbar += lchoose(N, x[j]); 17 | } 18 | model{ 19 | target += -lgamma(N + 1) + lgamma(N + kappa_1) + lPbar; 20 | target += (-N + S)*log(theta) + (n*N -S)*log1m(theta) 21 | -(N + kappa_1)*log(1/theta + kappa_2); 22 | } 23 | -------------------------------------------------------------------------------- /code/stan/raftery_wrong.stan: -------------------------------------------------------------------------------- 1 | functions{ 2 | real bin_lpdf(real x, real n, real p){ 3 | real ans = lchoose(n, x) + x*log(p) + (n-x)*log1m(p); 4 | return(ans); 5 | } 6 | real pois_lpdf(real z, real mu){ 7 | real ans = -mu + z*log(mu) - lgamma(z + 1); 8 | return(ans); 9 | } 10 | } 11 | data{ 12 | int n; 13 | real x[n]; 14 | real kappa_1; 15 | real kappa_2; 16 | } 17 | parameters{ 18 | real N; 19 | real theta; 20 | real mu; 21 | } 22 | model{ 23 | for(j in 1:n) target += bin_lpdf(x[j] | N, theta); 24 | target += pois_lpdf(N | mu); 25 | } 26 | -------------------------------------------------------------------------------- /code/tramcar_problem.r: -------------------------------------------------------------------------------- 1 | ## See Bayesian Choice, pg. 181 2 | posterior_ccdf <- function(n0, m){ 3 | if(n0 < 1 || m < 1 || n0 < m) stop("Something is wrong, check your parameters") 4 | require(VGAM) 5 | lnum <- log(VGAM::zeta(x = 2, shift = n0)) 6 | ldenom <- log(VGAM::zeta(x = 2, shift = m)) 7 | return(exp(lnum - ldenom)) 8 | } 9 | posterior_ccdf <- Vectorize(posterior_ccdf) 10 | approximate_posterior_ccdf <- function(n0, m){ 11 | m/n0 12 | } 13 | approximate_posterior_ccdf <- Vectorize(approximate_posterior_ccdf) 14 | 15 | obs.tram.number <- 100 16 | 17 | k <- 10 18 | 19 | N0s <- obs.tram.number:(k*obs.tram.number) 20 | 21 | ps <- posterior_ccdf(n0 = N0s, m = obs.tram.number) 22 | app.ps <- approximate_posterior_ccdf(n0 = N0s, m = obs.tram.number) 23 | 24 | plot(N0s, 1-ps, type = "l", lwd = 3, 25 | main = "CDF of N given T", 26 | xlab = expression(n[0]), ylab = expression(Pr(N >= n[0]))) 27 | lines(N0s, 1-app.ps, col = 2, lwd = 2, lty = 2) 28 | legend(x="bottomright", 29 | legend = c("Exact", "Approximate"), 30 | col = 1:2, lty = 1:2, lwd = 2, bty = 'n') 31 | abline(h = 1/2, lty = 2) 32 | abline(v = 2*obs.tram.number, lwd = 2, col = 2, lty = 2) -------------------------------------------------------------------------------- /exercises/BC_exercises.md: -------------------------------------------------------------------------------- 1 | ## Exercises from [The Bayesian Choice](https://link.springer.com/book/10.1007/0-387-71599-1) 2 | 3 | ### Ch 1 4 | 1.1, 1.2, 1.3, 1.4, 1.5, 1.9, 1.10, 1.15, 1.17, 1.24, 1.25, 1.26, 1.29, 1.32, 1.33, 1.34, 1.35, 1.36, 1.39, 1.40, 1.43, 1.47, 1.48, 1.50, 1.51, 1.53, 1.55, 1.62. 5 | ### Ch 2 6 | 2.2, 2.6, 2.11, 2.16, 2.18, 2.19, 2.20, 2.23, 2.24, 2.28, 2.31, 2.43, 2.49, 2.51. 7 | ### Ch 3 8 | 3.2, 3.9, 3.11, 3.18, 3.21, 3.27, 3.28, 3.30, 3.43, 3.51, 3.53, 3.57, 3.59. 9 | ### Ch 4 10 | 4.2, 4.5, 4.12, 4.14, 4.15, 4.19, 4.25, 4.26, 4.33, 4.51. 11 | ### Ch 5 12 | 5.2, 5.3, 5.9, 5.10, 5.11, 5.15, 5.19, 5.25, 5.26, 5.35, 5.42, 5.47, 5.51. 
13 | ### Ch 6 14 | ### Ch 7 15 | 7.1, 7.10, 7.12, 7.28, 7.31, 7.32, 7.33, 7.34, 7.35, 7.38 16 | 17 | ## Computational stuff 18 | 1.41 (Gibbs sampling) 19 | -------------------------------------------------------------------------------- /slides/bayes.bib: -------------------------------------------------------------------------------- 1 | @book{Keynes1921, 2 | title={A {T}reatise on {P}robability}, 3 | author={Keynes, John Maynard}, 4 | year={1921}, 5 | publisher={Macmillan and Company, limited} 6 | } 7 | @inproceedings{DeFinetti1931, 8 | title={Funzione caratteristica di un fenomeno aleatorio}, 9 | author={De Finetti, Bruno}, 10 | booktitle={Atti della R Academia Nazionale dei Lincei}, 11 | volume={4}, 12 | pages={251--299}, 13 | year={1931} 14 | } 15 | @inproceedings{Haldane1932, 16 | title={A note on inverse probability}, 17 | author={Haldane, John Burdon Sanderson}, 18 | booktitle={Mathematical Proceedings of the Cambridge Philosophical Society}, 19 | volume={28}, 20 | pages={55--61}, 21 | year={1932}, 22 | organization={Cambridge University Press} 23 | } 24 | @article{Jeffreys1946, 25 | title={An invariant form for the prior probability in estimation problems}, 26 | author={Jeffreys, Harold}, 27 | journal={Proceedings of the Royal Society of London. Series A. Mathematical and Physical Sciences}, 28 | volume={186}, 29 | number={1007}, 30 | pages={453--461}, 31 | year={1946}, 32 | publisher={The Royal Society London} 33 | } 34 | @article{Kolmogorov1950, 35 | title={Foundations of the theory of probability}, 36 | author={Kolmogorov, Andreĭ Nikolaevich and others}, 37 | year={1950}, 38 | publisher={Chelsea Pub. Co.} 39 | } 40 | @article{Birnbaum1962, 41 | title={On the foundations of statistical inference}, 42 | author={Birnbaum, Allan}, 43 | journal={Journal of the American Statistical Association}, 44 | volume={57}, 45 | number={298}, 46 | pages={269--306}, 47 | year={1962}, 48 | publisher={Taylor \& Francis} 49 | } 50 | @article{Degroot1973, 51 | title={Doing what comes naturally: Interpreting a tail area as a posterior probability or as a likelihood ratio}, 52 | author={DeGroot, Morris H}, 53 | journal={Journal of the American Statistical Association}, 54 | volume={68}, 55 | number={344}, 56 | pages={966--969}, 57 | year={1973}, 58 | publisher={Taylor \& Francis Group} 59 | } 60 | @incollection{Jaynes1976, 61 | title={Confidence intervals vs Bayesian intervals}, 62 | author={Jaynes, Edwin T}, 63 | editor={Kempthorne, Oscar}, 64 | booktitle={Foundations of probability theory, statistical inference, and statistical theories of science}, 65 | pages={175--257}, 66 | year={1976}, 67 | publisher={Springer} 68 | } 69 | @article{Bernardo1979, 70 | title={Reference posterior distributions for Bayesian inference}, 71 | author={Bernardo, Jose M}, 72 | journal={Journal of the Royal Statistical Society: Series B (Methodological)}, 73 | volume={41}, 74 | number={2}, 75 | pages={113--128}, 76 | year={1979}, 77 | publisher={Wiley Online Library} 78 | } 79 | @article{Diaconis1979, 80 | title={Conjugate priors for exponential families}, 81 | author={Diaconis, Persi and Ylvisaker, Donald}, 82 | journal={The Annals of Statistics}, 83 | pages={269--281}, 84 | year={1979}, 85 | publisher={JSTOR} 86 | } 87 | @article{Diaconis1980, 88 | title={Finite exchangeable sequences}, 89 | author={Diaconis, Persi and Freedman, David}, 90 | journal={The Annals of Probability}, 91 | pages={745--764}, 92 | year={1980}, 93 | publisher={JSTOR} 94 | } 95 | @article{Efron1986, 96 | title={Why isn't everyone a Bayesian?}, 97 
| author={Efron, Bradley}, 98 | journal={The American Statistician}, 99 | volume={40}, 100 | number={1}, 101 | pages={1--5}, 102 | year={1986}, 103 | publisher={Taylor \& Francis Group} 104 | } 105 | @article{Raftery1988, 106 | title={Inference for the binomial N parameter: A hierarchical Bayes approach}, 107 | author={Raftery, Adrian E}, 108 | journal={Biometrika}, 109 | volume={75}, 110 | number={2}, 111 | pages={223--228}, 112 | year={1988}, 113 | publisher={Oxford University Press} 114 | } 115 | @article{Gelman1992, 116 | title={Inference from iterative simulation using multiple sequences}, 117 | author={Gelman, Andrew and Rubin, Donald B}, 118 | journal={Statistical science}, 119 | pages={457--472}, 120 | year={1992}, 121 | publisher={JSTOR} 122 | } 123 | @article{Kass1995, 124 | title={Bayes factors}, 125 | author={Kass, Robert E and Raftery, Adrian E}, 126 | journal={Journal of the american statistical association}, 127 | volume={90}, 128 | number={430}, 129 | pages={773--795}, 130 | year={1995}, 131 | publisher={Taylor \& Francis} 132 | } 133 | @book{Schervish1995, 134 | title={Theory of statistics}, 135 | author={Schervish, Mark J}, 136 | year={1995}, 137 | publisher={Springer Science \& Business Media} 138 | } 139 | @article{Berkhof2000, 140 | title={Posterior predictive checks: Principles and discussion}, 141 | author={Berkhof, Johannes and Van Mechelen, Iven and Hoijtink, Herbert}, 142 | journal={Computational Statistics}, 143 | volume={15}, 144 | number={3}, 145 | pages={337--354}, 146 | year={2000}, 147 | publisher={Springer} 148 | } 149 | @book{Bernardo2000, 150 | title={Bayesian {T}heory}, 151 | author={Bernardo, Jos{\'e} M and Smith, Adrian FM}, 152 | year={2000}, 153 | publisher={John Wiley \& Sons} 154 | } 155 | @article{Gelman2002, 156 | title={A probability model for golf putting}, 157 | author={Gelman, Andrew and Nolan, Deborah}, 158 | journal={Teaching statistics}, 159 | volume={24}, 160 | number={3}, 161 | pages={93--95}, 162 | year={2002}, 163 | publisher={Wiley Online Library} 164 | } 165 | @book{Robert2007, 166 | title={The {B}ayesian choice: from decision-theoretic foundations to computational implementation}, 167 | author={Robert, Christian}, 168 | year={2007}, 169 | publisher={Springer Science \& Business Media} 170 | } 171 | @book{Hoff2009, 172 | title={A first course in {B}ayesian statistical methods}, 173 | author={Hoff, Peter D}, 174 | volume={580}, 175 | year={2009}, 176 | publisher={Springer} 177 | } 178 | @book{DeGroot2012, 179 | title={Probability and {S}tatistics}, 180 | author={DeGroot, Morris H and Schervish, Mark J}, 181 | year={2012}, 182 | publisher={Pearson Education} 183 | } 184 | @book{Schervish2012, 185 | title={Theory of {S}tatistics}, 186 | author={Schervish, Mark J}, 187 | year={2012}, 188 | publisher={Springer Science \& Business Media} 189 | } 190 | @article{Seaman2012, 191 | title={Hidden dangers of specifying noninformative priors}, 192 | author={Seaman III, John W and Seaman Jr, John W and Stamey, James D}, 193 | journal={The American Statistician}, 194 | volume={66}, 195 | number={2}, 196 | pages={77--84}, 197 | year={2012}, 198 | publisher={Taylor \& Francis} 199 | } 200 | @article{Gelman2017, 201 | title={The prior can often only be understood in the context of the likelihood}, 202 | author={Gelman, Andrew and Simpson, Daniel and Betancourt, Michael}, 203 | journal={Entropy}, 204 | volume={19}, 205 | number={10}, 206 | pages={555}, 207 | year={2017}, 208 | publisher={Multidisciplinary Digital Publishing Institute} 209 | } 210 | 
@article{Hennig2017, 211 | title={Beyond subjective and objective in statistics}, 212 | author={Gelman, Andrew and Hennig, Christian}, 213 | journal={Journal of the Royal Statistical Society: Series A (Statistics in Society)}, 214 | volume={180}, 215 | number={4}, 216 | pages={967--1033}, 217 | year={2017}, 218 | publisher={Wiley Online Library} 219 | } 220 | @article{Simpson2017, 221 | title={Penalising model component complexity: A principled, practical approach to constructing priors}, 222 | author={Simpson, Daniel and Rue, H{\aa}vard and Riebler, Andrea and Martins, Thiago G and S{\o}rbye, Sigrunn H}, 223 | journal={Statistical science}, 224 | pages={1--28}, 225 | year={2017}, 226 | publisher={JSTOR} 227 | } 228 | @article{Vehtari2017, 229 | title={Practical Bayesian model evaluation using leave-one-out cross-validation and WAIC}, 230 | author={Vehtari, Aki and Gelman, Andrew and Gabry, Jonah}, 231 | journal={Statistics and computing}, 232 | volume={27}, 233 | number={5}, 234 | pages={1413--1432}, 235 | year={2017}, 236 | publisher={Springer} 237 | } 238 | @article{Yao2018, 239 | title={Using stacking to average Bayesian predictive distributions (with discussion)}, 240 | author={Yao, Yuling and Vehtari, Aki and Simpson, Daniel and Gelman, Andrew and others}, 241 | journal={Bayesian Analysis}, 242 | volume={13}, 243 | number={3}, 244 | pages={917--1007}, 245 | year={2018}, 246 | publisher={International Society for Bayesian Analysis} 247 | } 248 | @article{Gabry2019, 249 | title={Visualization in Bayesian workflow}, 250 | author={Gabry, Jonah and Simpson, Daniel and Vehtari, Aki and Betancourt, Michael and Gelman, Andrew}, 251 | journal={Journal of the Royal Statistical Society: Series A (Statistics in Society)}, 252 | volume={182}, 253 | number={2}, 254 | pages={389--402}, 255 | year={2019}, 256 | publisher={Wiley Online Library} 257 | } 258 | @article{Goncalves2019, 259 | title={On the definition of likelihood function}, 260 | author={Gon{\c{c}}alves, Fl{\'a}vio B and Franklin, Pedro}, 261 | journal={arXiv preprint arXiv:1906.10733}, 262 | year={2019} 263 | } 264 | -------------------------------------------------------------------------------- /slides/bayes_stats.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxbiostat/BayesianStatisticsCourse/ff673af7144838e7e29fa4aaf6adc628095911fb/slides/bayes_stats.pdf -------------------------------------------------------------------------------- /slides/bayes_stats.tex: -------------------------------------------------------------------------------- 1 | \documentclass[9pt]{beamer} 2 | \usepackage{amsmath, amssymb, amsthm, mathtools, graphicx, float, subfigure, booktabs, enumitem} 3 | \usepackage{hyperref} 4 | \urlstyle{same} 5 | \usepackage{minted} 6 | \usepackage{pifont} 7 | \usepackage{xcolor} 8 | \usepackage[utf8]{inputenc} % usually not needed (loaded by default) 9 | \usepackage[T1]{fontenc} 10 | \hypersetup{colorlinks=true,citecolor=blue} 11 | \usepackage{tikz} 12 | \usepackage{fontawesome} 13 | \usepackage{libertine} 14 | \usepackage[libertine]{newtxmath} 15 | \usetikzlibrary{calc,shapes} 16 | \usepackage[normalem]{ulem} 17 | \setbeamertemplate{theorems}[numbered] 18 | \usepackage[authoryear,round]{natbib} 19 | % \usepackage[portuguese]{babel} 20 | \usetheme[pageofpages=of,% String used between the current page and the 21 | % total page count. 22 | bullet=circle,% Use circles instead of squares for bullets. 23 | titleline=true,% Show a line below the frame title. 
24 | alternativetitlepage=true,% Use the fancy title page. 25 | %titlepagelogo=logo-fiocruz,% Logo for the first page. 26 | %watermark=watermark-poli 27 | to,% Watermark used in every page. 28 | %watermarkheight=100px,% Height of the watermark. 29 | %watermarkheightmult=4,% The watermark image is 4 times bigger 30 | % than watermarkheight. 31 | ]{Torino} 32 | \usecolortheme{freewilly} 33 | %%%% Box options 34 | \newcommand{\tikzmark}[1]{\tikz[overlay,remember picture] \node (#1) {};} 35 | %%%% Background settings 36 | % \setbeamercolor{normal text}{fg=white,bg=black!90} 37 | % \setbeamercolor{structure}{fg=white} 38 | % \setbeamercolor{alerted text}{fg=red!85!black} 39 | % \setbeamercolor{item projected}{use=item,fg=black,bg=item.fg!95} 40 | % \setbeamercolor*{palette primary}{use=structure,fg=structure.fg} 41 | % \setbeamercolor*{palette secondary}{use=structure,fg=structure.fg!95!black} 42 | % \setbeamercolor*{palette tertiary}{use=structure,fg=structure.fg!90!black} 43 | % \setbeamercolor*{palette quaternary}{use=structure,fg=structure.fg!95!black,bg=black!80} 44 | % \setbeamercolor{title}{fg=white} 45 | % \setbeamercolor{frametitle}{bg=white} 46 | % \setbeamercolor*{framesubtitle}{fg=white} 47 | % \setbeamercolor*{block title}{parent=structure,bg=black!95} 48 | % \setbeamercolor*{block body}{fg=black,bg=black!10} 49 | % \setbeamercolor*{block title alerted}{parent=alerted text,bg=black!95} 50 | % \setbeamercolor*{block title example}{parent=example text,bg=black!95} 51 | 52 | 53 | %%%% Maths crap 54 | \newtheorem{remark}{Remark}[] 55 | \newtheorem{theo}{Theorem}[] 56 | \newtheorem{exercise}{Exercise}[] 57 | \newtheorem{defn}{Definition}[] 58 | \newtheorem{question}{Question}[] 59 | \newtheorem{idea}{Idea}[] 60 | \newtheorem{property}{Property}[] 61 | %%%% Itemize settings 62 | \setlist[itemize,1]{label=$\bullet$} 63 | \setlist[itemize,2]{label=$\diamond$} 64 | 65 | % \setbeamercolor{block title}{use=structure,fg=white,bg=structure.fg!75!black} 66 | % \setbeamercolor{block body}{parent=normal text,use=block title,bg=block title.bg!10!bg} 67 | 68 | \setbeamercolor{block title}{use=structure,fg=white,bg=black} 69 | \setbeamercolor{block body}{parent=normal text,use=block title,fg=white,bg=gray} 70 | \setbeamercolor{frametitle}{bg=black, fg=white} 71 | 72 | %%%%%%%%%%%%%%%%%%%% Notation stuff 73 | \newcommand{\indep}{\perp \!\!\! 
\perp} %% indepence 74 | \newcommand{\pr}{\operatorname{Pr}} %% probability 75 | \newcommand{\vr}{\operatorname{Var}} %% variance 76 | \newcommand{\rs}{X_1, X_2, \ldots, X_n} %% random sample 77 | \newcommand{\irs}{X_1, X_2, \ldots} %% infinite random sample 78 | \newcommand{\rsd}{x_1, x_2, \ldots, x_n} %% random sample, realised 79 | \newcommand{\Sm}{\bar{X}_n} %% sample mean, random variable 80 | \newcommand{\sm}{\bar{x}_n} %% sample mean, realised 81 | \newcommand{\Sv}{\bar{S}^2_n} %% sample variance, random variable 82 | \newcommand{\sv}{\bar{s}^2_n} %% sample variance, realised 83 | \newcommand{\bX}{\boldsymbol{X}} %% random sample, contracted form (bold) 84 | \newcommand{\bx}{\boldsymbol{x}} %% random sample, realised, contracted form (bold) 85 | \newcommand{\bT}{\boldsymbol{T}} %% Statistic, vector form (bold) 86 | \newcommand{\bt}{\boldsymbol{t}} %% Statistic, realised, vector form (bold) 87 | \newcommand{\mle}{\hat{\theta}_{\text{MLE}}} 88 | \newcommand{\mb}{\hat{\theta}_{\text{B}}} 89 | \newcommand{\map}{\hat{\theta}_{\text{MAP}}} 90 | \newcommand{\be}{\operatorname{Be}} %% probability 91 | \DeclareMathOperator*{\argmin}{arg\,min} 92 | \DeclareMathOperator*{\argmax}{arg\,max} 93 | \DeclareMathOperator\supp{supp} 94 | \usepackage{url} 95 | %%%% Hyperref stuff 96 | \hypersetup{ 97 | colorlinks = true, %Colours links instead of ugly boxes 98 | urlcolor = cyan, %Colour for external hyperlinks 99 | linkcolor = cyan, %Colour of internal links 100 | citecolor = red %Colour of citations 101 | } 102 | %%%% To create without the 'Figure' prefix. Remove if you need'em 103 | \usepackage{caption} 104 | \captionsetup[figure]{labelformat=empty} 105 | %%%% 106 | \author{ 107 | \underline{Luiz Max de Carvalho}[lmax.fgv@gmail.com]\linebreak 108 | } 109 | \title{ 110 | \Huge Bayesian Statistics 111 | } 112 | \institute{ 113 | PhD-level course\\ 114 | School of Applied Mathematics (EMAp/FGV), Rio de Janeiro. 115 | } 116 | \date{\today} 117 | \logo{\includegraphics[scale=.15]{logo.jpg}} 118 | \begin{document} 119 | \include{lecture_0} 120 | \include{lecture_1} 121 | \include{lecture_2} 122 | \include{lecture_3} 123 | \include{lecture_4} 124 | \include{lecture_5} 125 | \include{lecture_6} 126 | \include{lecture_7} 127 | \include{lecture_8} 128 | \include{lecture_11} 129 | \include{lecture_extra} 130 | %%%%%%% 131 | \begin{frame}[t, allowframebreaks] 132 | \frametitle{References} 133 | \bibliographystyle{apalike} 134 | \bibliography{bayes} 135 | \end{frame} 136 | \end{document} 137 | -------------------------------------------------------------------------------- /slides/beamercolorthemechameleon.sty: -------------------------------------------------------------------------------- 1 | % Copyright 2007 by Marco Barisione 2 | % 3 | % This file may be distributed and/or modified 4 | % 5 | % 1. under the LaTeX Project Public License and/or 6 | % 2. under the GNU Public License. 
7 | 8 | \mode 9 | 10 | \definecolor{chameleongreen1}{RGB}{98,189,25} 11 | \definecolor{chameleongreen2}{RGB}{188,225,141} 12 | \definecolor{chameleongreen3}{RGB}{51,149,48} 13 | \definecolor{chameleongreen4}{RGB}{0,98,90} 14 | 15 | \setbeamercolor*{palette primary}{fg=white,bg=chameleongreen2} 16 | \setbeamercolor*{palette secondary}{fg=white,bg=chameleongreen3} 17 | \setbeamercolor*{palette tertiary}{fg=white,bg=chameleongreen4} 18 | \setbeamercolor*{palette quaternary}{fg=white,bg=chameleongreen1} 19 | 20 | \setbeamercolor*{titlelike}{bg=chameleongreen3} 21 | \setbeamercolor*{frametitle}{bg=black,fg=black} 22 | \setbeamercolor*{part title}{bg=black,fg=black} 23 | \setbeamercolor*{item}{fg=chameleongreen3} 24 | 25 | \setbeamercolor*{separation line}{} 26 | \setbeamercolor*{fine separation line}{} 27 | 28 | \mode 29 | 30 | -------------------------------------------------------------------------------- /slides/beamercolorthemefreewilly.sty: -------------------------------------------------------------------------------- 1 | % Copyright 2007 by Marco Barisione 2 | % 3 | % This file may be distributed and/or modified 4 | % 5 | % 1. under the LaTeX Project Public License and/or 6 | % 2. under the GNU Public License. 7 | 8 | \mode 9 | 10 | \setbeamercolor*{palette primary}{use=structure,fg=white,bg=structure.fg!30!black} 11 | \setbeamercolor*{palette secondary}{use=structure,fg=white,bg=structure.fg!90!black} 12 | \setbeamercolor*{palette tertiary}{use=structure,fg=white,bg=structure.fg!90!white} 13 | \setbeamercolor*{palette quaternary}{use=structure,fg=structure.fg!70!black,bg=structure.fg!40!white} 14 | 15 | \setbeamercolor*{sidebar}{use=structure,bg=structure.fg} 16 | 17 | \setbeamercolor*{palette sidebar primary}{use=structure,fg=structure.fg!10} 18 | \setbeamercolor*{palette sidebar secondary}{fg=white} 19 | \setbeamercolor*{palette sidebar tertiary}{use=structure,fg=structure.fg!50} 20 | \setbeamercolor*{palette sidebar quaternary}{fg=white} 21 | 22 | \setbeamercolor*{titlelike}{parent=palette primary} 23 | \setbeamercolor*{item}{use=structure,fg=structure.fg!50!black} 24 | 25 | \setbeamercolor*{separation line}{} 26 | \setbeamercolor*{fine separation line}{} 27 | 28 | \mode 29 | 30 | -------------------------------------------------------------------------------- /slides/beamercolorthemenouvelle.sty: -------------------------------------------------------------------------------- 1 | % Copyright 2007 by Marco Barisione 2 | % 3 | % This file may be distributed and/or modified 4 | % 5 | % 1. under the LaTeX Project Public License and/or 6 | % 2. under the GNU Public License. 7 | 8 | \mode 9 | 10 | \usecolortheme{chameleon} 11 | 12 | \definecolor{nouvellebordeaux}{RGB}{195,2,36} 13 | 14 | \setbeamercolor*{palette primary}{fg=white,bg=chameleongreen2} 15 | \setbeamercolor*{palette secondary}{fg=white,bg=chameleongreen3} 16 | \setbeamercolor*{palette tertiary}{fg=white,bg=chameleongreen4} 17 | \setbeamercolor*{palette quaternary}{fg=white,bg=chameleongreen1} 18 | 19 | \setbeamercolor*{titlelike}{bg=nouvellebordeaux} 20 | \setbeamercolor*{item}{fg=nouvellebordeaux} 21 | 22 | \setbeamercolor*{separation line}{} 23 | \setbeamercolor*{fine separation line}{} 24 | 25 | \mode 26 | 27 | -------------------------------------------------------------------------------- /slides/beamerinnerthemefancy.sty: -------------------------------------------------------------------------------- 1 | % Copyright 2007 by Marco Barisione 2 | % 3 | % This file may be distributed and/or modified 4 | % 5 | % 1. 
under the LaTeX Project Public License and/or 6 | % 2. under the GNU Public License. 7 | 8 | \mode 9 | 10 | % Use alternative title page style. 11 | \DeclareOptionBeamer{alternativetitlepage}[true]{\def\beamer@fancy@alternativetitlepage{#1}} 12 | 13 | % Logo to use in the alternative title page. 14 | \def\beamer@fancy@titlepagelogo{} 15 | \DeclareOptionBeamer{titlepagelogo}{\def\beamer@fancy@titlepagelogo{#1}} 16 | 17 | % Bullet shape. 18 | \DeclareOptionBeamer{bullet}{\def\beamer@fancy@bullet{#1}} 19 | 20 | \ExecuteOptionsBeamer{alternativetitlepage=false,bullet=square} 21 | \ProcessOptionsBeamer 22 | 23 | % Colors. 24 | \setbeamercolor*{lineup}{parent=palette primary} 25 | \setbeamercolor*{linemid}{parent=palette secondary} 26 | \setbeamercolor*{linebottom}{parent=palette tertiary} 27 | \setbeamercolor*{title page header}{parent=palette quaternary} 28 | 29 | % Lengths. 30 | \newlength{\beamer@fancy@lineup} 31 | \setlength{\beamer@fancy@lineup}{.025\paperheight} 32 | \newlength{\beamer@fancy@linemid} 33 | \setlength{\beamer@fancy@linemid}{.015\paperheight} 34 | \newlength{\beamer@fancy@linebottom} 35 | \setlength{\beamer@fancy@linebottom}{.01\paperheight} 36 | 37 | % Margins. 38 | \newlength{\beamer@fancy@normalmargin} 39 | \setlength{\beamer@fancy@normalmargin}{.06\paperwidth} 40 | \setbeamersize{text margin left=\beamer@fancy@normalmargin} 41 | \setbeamersize{text margin right=\beamer@fancy@normalmargin} 42 | \setlength\leftmargini{.6\beamer@fancy@normalmargin} 43 | \setlength\leftmarginii{.6\beamer@fancy@normalmargin} 44 | \setlength\leftmarginiii{.6\beamer@fancy@normalmargin} 45 | 46 | % Normal title page. 47 | \defbeamertemplate*{title page normal}{fancy theme}[1][] 48 | { 49 | \vbox{} 50 | \vfill 51 | \begin{centering} 52 | \begin{beamercolorbox}[wd=\paperwidth,sep=8pt,center,#1]{title page header} 53 | \usebeamerfont{title}\inserttitle\par% 54 | \ifx\insertsubtitle\@empty% 55 | \else% 56 | \vskip0.25em% 57 | {\usebeamerfont{subtitle}\usebeamercolor[fg]{subtitle}\insertsubtitle\par}% 58 | \fi% 59 | \end{beamercolorbox}% 60 | \vskip1em\par 61 | \begin{beamercolorbox}[sep=8pt,center,#1]{author} 62 | \usebeamerfont{author}\insertauthor 63 | \end{beamercolorbox} 64 | \begin{beamercolorbox}[sep=8pt,center,#1]{institute} 65 | \usebeamerfont{institute}\insertinstitute 66 | \end{beamercolorbox} 67 | \begin{beamercolorbox}[sep=8pt,center,#1]{date} 68 | \usebeamerfont{date}\insertdate 69 | \end{beamercolorbox}\vskip0.5em 70 | {\usebeamercolor[fg]{titlegraphic}\inserttitlegraphic\par} 71 | \end{centering} 72 | \vfill 73 | } 74 | 75 | % Alternative title page, you should use this in a frame with the [plain] 76 | % option. 77 | \defbeamertemplate*{title page alternative}{fancy theme}[1][] 78 | { 79 | {\parskip0pt\offinterlineskip% 80 | \hbox{\hskip-\Gm@lmargin\hbox{\vbox{% 81 | \@tempdima=\textwidth\textwidth=\paperwidth\hsize=\textwidth\def\\{,}\vbox{}\vskip-1.5ex% 82 | % Title. 83 | \begin{beamercolorbox}[wd=\paperwidth,ht=.4\paperheight,center,#1]{title page header} 84 | \usebeamerfont{title}\inserttitle\par% 85 | \ifx\insertsubtitle\@empty% 86 | \else% 87 | \vskip0.25em% 88 | {\usebeamerfont{subtitle}\usebeamercolor[fg]{subtitle}\insertsubtitle\par}% 89 | \fi% 90 | \vspace{.125\paperheight}% 91 | \end{beamercolorbox}% 92 | \vbox{}\vskip-\beamer@fancy@lineup% 93 | \vbox{}\vskip-\beamer@fancy@linemid% 94 | % First line. 
95 | \hbox{% 96 | \begin{beamercolorbox}[wd=.2\paperwidth,ht=\beamer@fancy@lineup,dp=0pt]{}% 97 | \end{beamercolorbox}% 98 | \begin{beamercolorbox}[wd=.8\paperwidth,ht=\beamer@fancy@lineup,dp=0pt]{lineup}% 99 | \end{beamercolorbox}% 100 | }% 101 | \vbox{}\vskip0ex% 102 | % Second line. 103 | \hbox{% 104 | \begin{beamercolorbox}[wd=.1\paperwidth,ht=\beamer@fancy@linemid,dp=0pt]{}% 105 | \end{beamercolorbox}% 106 | \begin{beamercolorbox}[wd=.9\paperwidth,ht=\beamer@fancy@linemid,dp=0pt]{linemid}% 107 | \end{beamercolorbox}% 108 | }% 109 | % Third line. 110 | \hbox{% 111 | \begin{beamercolorbox}[wd=.5\paperwidth,ht=\beamer@fancy@linebottom,dp=0pt]{}% 112 | \end{beamercolorbox}% 113 | \begin{beamercolorbox}[wd=.5\paperwidth,ht=\beamer@fancy@linebottom,dp=0pt]{linebottom}% 114 | \end{beamercolorbox}% 115 | }% 116 | \vskip0pt% 117 | }}% 118 | \hskip-\Gm@rmargin% 119 | }}\hfil% 120 | % 121 | \begin{columns} 122 | \ifx\beamer@fancy@titlepagelogo\@empty% 123 | \column{\textwidth} 124 | \else 125 | \column{.5\textwidth} 126 | % Logo. 127 | \begin{centering} 128 | \vbox{}\vfill 129 | \includegraphics[height=.4\paperheight]{\beamer@fancy@titlepagelogo} 130 | \vfill 131 | \end{centering} 132 | \column{.5\textwidth} 133 | \fi 134 | % Authors, institute and date 135 | \vskip1em\par 136 | \begin{beamercolorbox}[sep=8pt,center,#1]{author} 137 | \usebeamerfont{author}\insertauthor 138 | \end{beamercolorbox} 139 | \begin{beamercolorbox}[sep=8pt,center,#1]{institute} 140 | \usebeamerfont{institute}\insertinstitute 141 | \end{beamercolorbox} 142 | \begin{beamercolorbox}[sep=8pt,center,#1]{date} 143 | \usebeamerfont{date}\insertdate 144 | \end{beamercolorbox}\vskip0.5em 145 | {\usebeamercolor[fg]{titlegraphic}\inserttitlegraphic\par} 146 | \end{columns} 147 | } 148 | 149 | \defbeamertemplate*{title page}{fancy}[1][] 150 | { 151 | \def\beamer@fancy@truetext{true}% 152 | \ifx\beamer@fancy@alternativetitlepage\beamer@fancy@truetext% 153 | \usebeamertemplate{title page alternative}% 154 | \else% 155 | \usebeamertemplate{title page normal}% 156 | \fi% 157 | } 158 | 159 | % Items. 160 | \defbeamertemplate{itemize item}{squarealt}% 161 | {\tiny\raise.5ex\hbox{\donotcoloroutermaths$\blacksquare$}} 162 | \defbeamertemplate{itemize subitem}{squarealt}% 163 | {\tiny\raise.4ex\hbox{\donotcoloroutermaths$\square$}} 164 | \defbeamertemplate{itemize subsubitem}{squarealt}% 165 | {\tiny\raise.3ex\hbox{\donotcoloroutermaths$\blacksquare$}} 166 | 167 | \defbeamertemplate{itemize item}{circlealt}% 168 | {\small\raise.2ex\hbox{\donotcoloroutermaths$\bullet$}} 169 | \defbeamertemplate{itemize subitem}{circlealt}% 170 | {\small\raise.1ex\hbox{\donotcoloroutermaths$\circ$}} 171 | \defbeamertemplate{itemize subsubitem}{circlealt}% 172 | {\scriptsize\raise.1ex\hbox{\donotcoloroutermaths$\bullet$}} 173 | 174 | \def\circletext{circle} 175 | \ifx\beamer@fancy@bullet\circletext 176 | \setbeamertemplate{items}[circlealt] 177 | \else 178 | \setbeamertemplate{items}[squarealt] 179 | \fi 180 | 181 | \mode 182 | 183 | -------------------------------------------------------------------------------- /slides/beamerouterthemedecolines.sty: -------------------------------------------------------------------------------- 1 | % Copyright 2007 by Marco Barisione 2 | % 3 | % This file may be distributed and/or modified 4 | % 5 | % 1. under the LaTeX Project Public License and/or 6 | % 2. under the GNU Public License. 7 | 8 | \mode 9 | 10 | % String used between the current page and the total page count. 
11 | \def\beamer@decolines@pageofpages{/} 12 | \DeclareOptionBeamer{pageofpages}{\def\beamer@decolines@pageofpages{#1}} 13 | 14 | % Show a line below the frame title. 15 | \DeclareOptionBeamer{titleline}[true]{\def\beamer@decolines@titleline{#1}} 16 | 17 | % Image used for the watermark. 18 | \def\beamer@decolines@watermarkorig{} 19 | \DeclareOptionBeamer{watermark}{\def\beamer@decolines@watermarkorig{#1}} 20 | 21 | % Height of the watermark. 22 | \def\beamer@decolines@watermarkheight{100px} 23 | \DeclareOptionBeamer{watermarkheight}{\def\beamer@decolines@watermarkheight{#1}} 24 | 25 | % The original image height is watermarkheightmult * watermarkheight. 26 | \def\beamer@decolines@watermarkheightmult{1} 27 | \DeclareOptionBeamer{watermarkheightmult}{\def\beamer@decolines@watermarkheightmult{#1}} 28 | 29 | \ExecuteOptionsBeamer{titleline=false} 30 | \ProcessOptionsBeamer 31 | 32 | % Enable/disable the watermark. 33 | \def\watermarkon{% 34 | \def\beamer@decolines@watermark{\beamer@decolines@watermarkorig}% 35 | } 36 | \def\watermarkoff{\def\beamer@decolines@watermark{}} 37 | 38 | % Initially enable the watermark. 39 | \watermarkon 40 | 41 | % Colors. 42 | \setbeamercolor*{lineup}{parent=palette primary} 43 | \setbeamercolor*{linemid}{parent=palette secondary} 44 | \setbeamercolor*{linebottom}{parent=palette tertiary} 45 | \setbeamercolor*{page header}{parent=titlelike} 46 | 47 | % Lengths 48 | \newlength{\headerheight} 49 | \setlength{\headerheight}{.045\paperheight} 50 | \newlength{\beamer@decolines@lineup} 51 | \setlength{\beamer@decolines@lineup}{.025\paperheight} 52 | \newlength{\beamer@decolines@linemid} 53 | \setlength{\beamer@decolines@linemid}{.015\paperheight} 54 | \newlength{\beamer@decolines@linebottom} 55 | \setlength{\beamer@decolines@linebottom}{.01\paperheight} 56 | 57 | % The height of the watermark part below the 3 bottom lines. 58 | \newlength{\beamer@decolines@watermarkheightbottom} 59 | \addtolength{\beamer@decolines@watermarkheightbottom}{\beamer@decolines@lineup} 60 | \addtolength{\beamer@decolines@watermarkheightbottom}{\beamer@decolines@linemid} 61 | \addtolength{\beamer@decolines@watermarkheightbottom}{\beamer@decolines@linebottom} 62 | 63 | % The height of the watermark part over the 3 bottom lines before shrinking. 64 | \newlength{\beamer@decolines@watermarkheightupperorig} 65 | \setlength{\beamer@decolines@watermarkheightupperorig}{\beamer@decolines@watermarkheight} 66 | \addtolength{\beamer@decolines@watermarkheightupperorig}{-\beamer@decolines@watermarkheightbottom} 67 | \multiply\beamer@decolines@watermarkheightupperorig by \beamer@decolines@watermarkheightmult 68 | 69 | % Footer. 70 | \defbeamertemplate*{footline}{decolines theme} 71 | { 72 | \leavevmode% 73 | % Page number. 74 | \hbox{% 75 | \begin{beamercolorbox}[wd=.2\paperwidth,ht=0ex,dp=0ex,center]{}% 76 | \usebeamerfont{palette primary}\insertframenumber{} \beamer@decolines@pageofpages{} \inserttotalframenumber% 77 | \end{beamercolorbox}% 78 | \begin{beamercolorbox}[wd=.8\paperwidth,ht=0ex,dp=0ex]{}% 79 | \end{beamercolorbox}% 80 | } % 81 | % First line. 82 | \hbox{% 83 | \begin{beamercolorbox}[wd=.2\paperwidth,ht=\beamer@decolines@lineup,dp=0pt]{}% 84 | \end{beamercolorbox}% 85 | \begin{beamercolorbox}[wd=.8\paperwidth,ht=\beamer@decolines@lineup,dp=0pt]{lineup}% 86 | \end{beamercolorbox}% 87 | } % 88 | % Second line. 89 | \hbox{% 90 | \begin{beamercolorbox}[wd=\paperwidth,ht=\beamer@decolines@linemid,dp=0pt]{linemid}% 91 | \end{beamercolorbox}% 92 | } % 93 | % Third line. 
94 | \hbox{% 95 | \begin{beamercolorbox}[wd=.1\paperwidth,ht=\beamer@decolines@linebottom,dp=0pt]{}% 96 | \end{beamercolorbox}% 97 | \begin{beamercolorbox}[wd=.9\paperwidth,ht=\beamer@decolines@linebottom,dp=0pt]{linebottom}% 98 | \end{beamercolorbox}% 99 | }% 100 | % This seems to fix some alignment problems with the watermark. It has to be 101 | % always applied if you do not want to see the footer moving up and down when 102 | % moving from a page with watermark to a page without or vice versa. 103 | \vskip-.5px% 104 | % Watermark. 105 | \if\beamer@decolines@watermark\@empty\else% 106 | \vskip-\beamer@decolines@watermarkheightbottom% 107 | \llap{\includegraphics[height=\beamer@decolines@watermarkheightbottom,clip=true,% 108 | trim=0pt 0pt 0pt \beamer@decolines@watermarkheightupperorig]{\beamer@decolines@watermark}\hskip-\paperwidth}% 109 | \fi% 110 | } 111 | 112 | \defbeamertemplate*{headline}{decolines theme} 113 | { 114 | \leavevmode% 115 | \hbox{% 116 | \begin{beamercolorbox}[wd=\paperwidth,ht=\headerheight,dp=0pt]{page header}% 117 | \end{beamercolorbox}% 118 | } % 119 | \vskip0pt% 120 | } 121 | 122 | \defbeamertemplate*{frametitle}{decolines theme}[1][left] 123 | { 124 | \ifbeamercolorempty[bg]{frametitle}{}{\nointerlineskip}% 125 | \@tempdima=\textwidth% 126 | \advance\@tempdima by\beamer@leftmargin% 127 | \advance\@tempdima by\beamer@rightmargin% 128 | \vbox{}\vskip-.5\beamer@leftmargin% 129 | \begin{beamercolorbox}[sep=\beamer@leftmargin,#1,wd=\the\@tempdima]{} 130 | \usebeamerfont{frametitle}\usebeamercolor[bg]{framesubtitle}% 131 | \vbox{}\vskip0ex% 132 | \if@tempswa\else\csname beamer@fte#1\endcsname\fi% 133 | \strut\insertframetitle\strut\par% 134 | {% 135 | \ifx\insertframesubtitle\@empty% 136 | \else% 137 | {\usebeamerfont{framesubtitle}\usebeamercolor[bg]{framesubtitle}\insertframesubtitle\strut\par}% 138 | \fi 139 | }% 140 | \vskip-1ex% 141 | \if@tempswa\else\vskip-\beamer@leftmargin\fi 142 | \end{beamercolorbox}% 143 | \def\beamer@decolines@truetext{true}% 144 | \ifx\beamer@decolines@titleline\beamer@decolines@truetext% 145 | \vskip-.5\beamer@leftmargin% 146 | \begin{beamercolorbox}[wd=\textwidth,ht=.1ex,dp=0ex]{linemid}% 147 | \end{beamercolorbox}% 148 | \fi 149 | } 150 | 151 | % Frame title continuations, default 152 | \defbeamertemplate*{frametitle continuation}{decolines theme}{(\insertcontinuationcount)} 153 | 154 | \defbeamertemplate*{sidebar right}{decolines theme} 155 | { 156 | \vskip.1\beamer@leftmargin% 157 | \llap{\insertlogo\hskip.5\beamer@leftmargin}% 158 | \vfill% 159 | \if\beamer@decolines@watermark\@empty\else% 160 | \llap{\includegraphics[height=\beamer@decolines@watermarkheight]{\beamer@decolines@watermark}}% 161 | \vskip-\beamer@decolines@watermarkheightbottom% 162 | \fi 163 | } 164 | 165 | \mode 166 | 167 | -------------------------------------------------------------------------------- /slides/beamerthemeTorino.sty: -------------------------------------------------------------------------------- 1 | % Copyright 2007 by Marco Barisione 2 | % 3 | % This file may be distributed and/or modified 4 | % 5 | % 1. under the LaTeX Project Public License and/or 6 | % 2. under the GNU Public License. 
7 | 8 | \mode 9 | 10 | \DeclareOptionBeamer{alternativetitlepage}[true]{\PassOptionsToPackage{alternativetitlepage=#1}{beamerinnerthemefancy}} 11 | \DeclareOptionBeamer{titlepagelogo}{\PassOptionsToPackage{titlepagelogo=#1}{beamerinnerthemefancy}} 12 | \DeclareOptionBeamer{bullet}{\PassOptionsToPackage{bullet=#1}{beamerinnerthemefancy}} 13 | \DeclareOptionBeamer{pageofpages}{\PassOptionsToPackage{pageofpages=#1}{beamerouterthemedecolines}} 14 | \DeclareOptionBeamer{titleline}[true]{\PassOptionsToPackage{titleline=#1}{beamerouterthemedecolines}} 15 | \DeclareOptionBeamer{watermark}{\PassOptionsToPackage{watermark=#1}{beamerouterthemedecolines}} 16 | \DeclareOptionBeamer{watermarkheight}{\PassOptionsToPackage{watermarkheight=#1}{beamerouterthemedecolines}} 17 | \DeclareOptionBeamer{watermarkheightmult}{\PassOptionsToPackage{watermarkheightmult=#1}{beamerouterthemedecolines}} 18 | 19 | \ProcessOptionsBeamer 20 | 21 | \useinnertheme{fancy} 22 | \useoutertheme{decolines} 23 | \usecolortheme{chameleon} 24 | 25 | \setbeamertemplate{navigation symbols}{} 26 | 27 | \mode 28 | 29 | -------------------------------------------------------------------------------- /slides/compile.sh: -------------------------------------------------------------------------------- 1 | bibtex bayes_stats.aux 2 | pdflatex -interaction=nonstopmode --shell-escape bayes_stats.tex 3 | -------------------------------------------------------------------------------- /slides/figures/BC_example_326.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxbiostat/BayesianStatisticsCourse/ff673af7144838e7e29fa4aaf6adc628095911fb/slides/figures/BC_example_326.pdf -------------------------------------------------------------------------------- /slides/figures/HDI.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxbiostat/BayesianStatisticsCourse/ff673af7144838e7e29fa4aaf6adc628095911fb/slides/figures/HDI.pdf -------------------------------------------------------------------------------- /slides/figures/PPC.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxbiostat/BayesianStatisticsCourse/ff673af7144838e7e29fa4aaf6adc628095911fb/slides/figures/PPC.jpg -------------------------------------------------------------------------------- /slides/figures/concentration_measure_volume.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxbiostat/BayesianStatisticsCourse/ff673af7144838e7e29fa4aaf6adc628095911fb/slides/figures/concentration_measure_volume.pdf -------------------------------------------------------------------------------- /slides/figures/conjugate_table.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxbiostat/BayesianStatisticsCourse/ff673af7144838e7e29fa4aaf6adc628095911fb/slides/figures/conjugate_table.pdf -------------------------------------------------------------------------------- /slides/figures/conjugate_table_expectations.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxbiostat/BayesianStatisticsCourse/ff673af7144838e7e29fa4aaf6adc628095911fb/slides/figures/conjugate_table_expectations.pdf -------------------------------------------------------------------------------- /slides/figures/galaxies.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxbiostat/BayesianStatisticsCourse/ff673af7144838e7e29fa4aaf6adc628095911fb/slides/figures/galaxies.pdf -------------------------------------------------------------------------------- /slides/figures/oranges.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxbiostat/BayesianStatisticsCourse/ff673af7144838e7e29fa4aaf6adc628095911fb/slides/figures/oranges.pdf -------------------------------------------------------------------------------- /slides/figures/pi_MC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxbiostat/BayesianStatisticsCourse/ff673af7144838e7e29fa4aaf6adc628095911fb/slides/figures/pi_MC.png -------------------------------------------------------------------------------- /slides/figures/posterior_prob_half.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxbiostat/BayesianStatisticsCourse/ff673af7144838e7e29fa4aaf6adc628095911fb/slides/figures/posterior_prob_half.pdf -------------------------------------------------------------------------------- /slides/figures/tiger.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxbiostat/BayesianStatisticsCourse/ff673af7144838e7e29fa4aaf6adc628095911fb/slides/figures/tiger.jpg -------------------------------------------------------------------------------- /slides/figures/traceplots.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxbiostat/BayesianStatisticsCourse/ff673af7144838e7e29fa4aaf6adc628095911fb/slides/figures/traceplots.png -------------------------------------------------------------------------------- /slides/figures/turtles_all_the_way_down.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxbiostat/BayesianStatisticsCourse/ff673af7144838e7e29fa4aaf6adc628095911fb/slides/figures/turtles_all_the_way_down.jpeg -------------------------------------------------------------------------------- /slides/lecture_0.tex: -------------------------------------------------------------------------------- 1 | \begin{frame} 2 | \titlepage % Print the title page as the first slide 3 | \end{frame} 4 | \section{Part I: Foundations} 5 | \begin{frame}{Welcome!} 6 | \begin{itemize} 7 | \item This is a 60-hour, PhD-level course on Bayesian inference. 8 | \item We have 11 planned weeks. Reading material is posted at~\url{https://github.com/maxbiostat/BayesianStatisticsCourse/} 9 | \item Assessment will be done via a written exam (70\%) and an assignment ($30\%$); 10 | \item Tenets: 11 | \begin{itemize} 12 | \item Respect the instructor and your classmates; 13 | \item Read before class; 14 | \item Engage in the discussion; 15 | \item Don't be afraid to ask/disagree. 16 | \end{itemize} 17 | \item Books are 18 | \begin{itemize} 19 | \item \cite{Robert2007}; 20 | \item \cite{Hoff2009}; 21 | \item \cite{Schervish1995}; 22 | \item \cite{Bernardo2000}. 
23 | \end{itemize} 24 | \end{itemize} 25 | \end{frame} 26 | 27 | \begin{frame}{Bayes's Theorem} 28 | What do 29 | \begin{equation} 30 | \label{eq:BT_1} 31 | \pr(A \mid B) = \frac{\pr(B \mid A)\pr(A)}{\pr(B)}, 32 | \end{equation} 33 | and 34 | \begin{equation} 35 | \label{eq:BT_2} 36 | \pr(A_i \mid B) = \frac{\pr(B \mid A_i)\pr(A_i)}{\sum_{i=1}^n \pr(B \mid A_i)\pr(A_i)}, 37 | \end{equation} 38 | and 39 | \begin{equation} 40 | \label{eq:BT_3} 41 | p(\theta \mid \boldsymbol{y}) = \frac{l(\boldsymbol{y} \mid \theta)\pi(\theta)}{\int_{\boldsymbol{\Theta}} l(\boldsymbol{y} \mid t)\pi(t) \, dt}, 42 | \end{equation} 43 | and 44 | \begin{equation} 45 | \label{eq:BT_4} 46 | p(\theta \mid \boldsymbol{y}) = \frac{l(\boldsymbol{y} \mid \theta)\pi(\theta)}{m(\boldsymbol{y})}, 47 | \end{equation} 48 | all have in common? 49 | In this course, we will find out how to use Bayes's rule in order to draw statistical inferences in a coherent and mathematically sound way. 50 | \end{frame} 51 | \begin{frame}{Bayesian Statistics is a complete approach} 52 | Our whole paradigm revolves around the posterior: 53 | $$ p(\theta \mid \boldsymbol{x}) \propto l(\theta \mid \boldsymbol{x})\pi(\theta).$$ 54 | Within the Bayesian paradigm, you are able to 55 | \begin{itemize} 56 | \item Perform point and interval inference about unknown quantities; 57 | \begin{align*} 58 | \delta(\boldsymbol{x}) &= E_p[\theta] := \int_{\boldsymbol{\Theta}} t p(t \mid \boldsymbol{x} )\,dt,\\ 59 | \pr( a \leq \theta \leq b) &= 0.95 = \int_{a}^{b} p(t \mid \boldsymbol{x} )\,dt; 60 | \end{align*} 61 | \item Compare models: 62 | $$\operatorname{BF}_{12} = \frac{\pr(\boldsymbol{x} \mid M_1)}{\pr(\boldsymbol{x} \mid M_2)} = \frac{\pr(M_1 \mid \boldsymbol{x})/\pr(M_2 \mid \boldsymbol{x})}{\pr(M_1)/\pr(M_2)};$$ 63 | \item Make predictions: $g(\tilde{x} \mid \boldsymbol{x}) := \int_{\boldsymbol{\Theta}} f(\tilde{x} \mid t)p(t\mid \boldsymbol{x})\,dt$; 64 | \item Make decisions: $E_p[U(r)]$. 65 | \end{itemize} 66 | \end{frame} 67 | \begin{frame}{Statistical model: informal definition} 68 | Stuff you say at the bar: 69 | \begin{defn}[Statistical model: informal] 70 | \label{def:statistical_model_informal} 71 | DeGroot, Def.~7.1.1, p.~377. 72 | A statistical model consists of identifying the random variables of interest (observable and potentially observable), specifying the joint distribution of these variables, and identifying the parameters ($\theta$) that index this joint distribution. 73 | Sometimes it is also convenient to assume that the parameters are themselves random variables, but then one needs to specify a joint distribution for $\theta$ also. 74 | \end{defn} 75 | \end{frame} 76 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 77 | \begin{frame}{Statistical model: formal definition} 78 | Stuff you say in a Lecture: 79 | \begin{defn}[Statistical model: formal] 80 | \label{def:statistical_model_formal} 81 | \href{https://projecteuclid.org/download/pdf_1/euclid.aos/1035844977}{McCullagh, 2002}. 82 | Let $\mathcal{X}$ be an arbitrary sample space, $\Theta$ a non-empty set and $\mathcal{P}(\mathcal{X})$ the set of all probability distributions on $\mathcal{X}$, i.e. maps $P : \mathcal{B}(\mathcal{X}) \to [0, 1]$, $P \in \mathcal{P}(\mathcal{X})$. 83 | A \underline{parametric} statistical model is a function $P : \Theta \to \mathcal{P}(\mathcal{X})$ that associates each point $\theta \in \Theta$ to a probability distribution $P_\theta$ over $\mathcal{X}$.
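% A quick numerical illustration of the discrete form of Bayes's rule in (\ref{eq:BT_2}): with a prior over a finite partition and per-hypothesis likelihoods, the posterior is an elementwise product followed by normalisation. A minimal R sketch; the prior and likelihood values below are made up purely for illustration:
% prior <- c(A1 = 0.50, A2 = 0.30, A3 = 0.20)   # Pr(A_i); must sum to one
% lik   <- c(A1 = 0.10, A2 = 0.40, A3 = 0.70)   # Pr(B | A_i)
% posterior <- prior * lik / sum(prior * lik)   # Pr(A_i | B)
% round(posterior, 3)                           # 0.161 0.387 0.452, which sums to one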
84 | \end{defn} 85 | \textbf{Examples}: 86 | \begin{itemize} 87 | \item Put $\mathcal{X} = \mathbb{R}$ and $\Theta = (-\infty, \infty)\times (0, \infty)$. 88 | We say $P$ is a \textit{normal} (or \textit{Gaussian}) statistical model\footnote{Note the abuse of notation: striclty speaking, $P_\theta$ is a probability~\textbf{measure} and not a ~\textit{density} as we have presented it here.} if for every $\theta = \{\mu, \sigma^2\} \in \Theta$, 89 | $$P_{\theta}(x) \equiv \frac{1}{\sqrt{2\pi}\sigma}\exp\left(-\frac{(x-\mu)^2}{2\sigma^2}\right), \: x \in \mathbb{R}.$$ 90 | \item Put $\mathcal{X} = \mathbb{N}\cup \{0\}$ and $\Theta = (0, \infty)$. 91 | $P$ is a Poisson statistical model if, for $\lambda \in \Theta$, 92 | $$P_{\lambda}(k) \equiv \frac{e^{-\lambda}\lambda^k}{k!}, \: k = 0, 1, \ldots$$ 93 | \end{itemize} 94 | \end{frame} 95 | % \begin{frame} 96 | % Theorem 97 | % $$ \int_{\mathcal{X}} f_X(t)\,dt$$ 98 | % \begin{theo}[b] 99 | % a 100 | % \end{theo} 101 | % \end{frame} 102 | % \begin{frame}{Overview} 103 | % \tableofcontents 104 | % \end{frame} 105 | -------------------------------------------------------------------------------- /slides/lecture_1.tex: -------------------------------------------------------------------------------- 1 | \subsection{Principled statistical inference} 2 | \begin{frame}{Principle I: the sufficiency principle} 3 | Sufficiency plays a central role in all of Statistics. 4 | \begin{defn}[Sufficient statistic] 5 | Let $x \sim f(x \mid \theta)$. 6 | We say $T : \mathcal{X} \to \mathbb{R}$ is a \textbf{sufficient statistic} for the parameter $\theta$ if $\pr(X = x \mid T(x), \theta)$ is independent of $\theta$. 7 | \end{defn} 8 | This is the basis for a cornerstone of Statistics, 9 | \begin{theo}[Factorisation theorem] 10 | Under mild regularity conditions, we can write: 11 | $$ f(x \mid \theta) = g(T(x) \mid \theta) h(x \mid T(x)).$$ 12 | \end{theo} 13 | We can now state 14 | \begin{idea}[Sufficiency principle (SP)] 15 | \label{idea:SP} 16 | For $x, y \in \mathcal{X}$, if $T$ is sufficient for $\theta$ and $T(x) = T(y)$, then $x$ and $y$ should lead to the same inferences about $\theta$. 17 | \end{idea} 18 | \end{frame} 19 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 20 | \begin{frame}[allowframebreaks]{Principle II: the Likelihood principle} 21 | The Likelihood Principle (LP) is a key concept in Statistics, of particular Bayesian Statistics. 22 | \begin{idea}[Likelihood Principle] 23 | \label{idea:LP} 24 | The information brought by an observation $x \in \mathcal{X}$ about a parameter $\theta \in \boldsymbol{\Theta}$ is \textbf{completely} contained in the likelihood function $l(\theta \mid x) \propto f(x \mid \theta)$. 25 | \end{idea} 26 | \begin{example}[Uma vez Flamengo...] 27 | Suppose a pollster is interested in estimating the fraction $\theta$ of football fans that cheer for Clube de Regatas do Flamengo (CRF). 28 | They survey $n=12$ people and get $x=9$ supporters and $y=3$ ``antis''. 29 | Consider the following two designs: 30 | \begin{itemize} 31 | \item[i)] Survey $12$ people and record the number of supporters; 32 | \item[ii)] Survey until they get $y=3$. 
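% A concrete instance of the Sufficiency Principle and the Factorisation theorem from the top of this lecture, sketched for the Bernoulli model (a standard textbook case, written out here for illustration): if $x = (x_1, \ldots, x_n)$ with $x_i \sim \operatorname{Bernoulli}(\theta)$ i.i.d. and $T(x) = \sum_{i=1}^n x_i$, then
% $$ f(x \mid \theta) = \theta^{T(x)} (1-\theta)^{n - T(x)} = \underbrace{\theta^{T(x)}(1-\theta)^{n-T(x)}}_{g(T(x) \mid \theta)} \times \underbrace{1}_{h(x \mid T(x))}, $$
% and $\pr(X = x \mid T(x) = t, \theta) = \binom{n}{t}^{-1}$, which is free of $\theta$; any two samples with the same total $t$ therefore lead to the same inferences about $\theta$.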
33 | \end{itemize} 34 | The likelihoods for both surveys are, respectively, 35 | \begin{align*} 36 | x \sim \operatorname{Binomial}(n, \theta) \implies l_1(\theta \mid x, n) &= \binom{n}{x} \theta^{x}(1-\theta)^{n-x},\\ 37 | n \sim \operatorname{Negative~Binomial}(y, 1-\theta) \implies l_2(\theta \mid n, y) &= \binom{n-1}{y-1} (1-\theta)^{y} \theta^{n-y}, 38 | \end{align*} 39 | hence 40 | \begin{equation*} 41 | l_1(\theta) \propto l_2(\theta) \propto \theta^{9}(1-\theta)^3. 42 | \end{equation*} 43 | Therefore, we say that these two experiments bring exactly the same information about $\theta$. 44 | \end{example} 45 | A generalised version of the LP can be stated as follows: 46 | \begin{theorem}[\textbf{Likelihood Proportionality Theorem}~\citep{Goncalves2019}] 47 | Let $\Theta$ be a nonempty set and $\mathcal{P} = \{ P_\theta; \theta \in \Theta \}$ be a family of probability measures on $(\Omega, \mathcal{A})$ and $\nu_1$ and $\nu_2$ be $\sigma$-finite measures on $(\Omega, \mathcal{A})$. 48 | Suppose $P \ll \nu_1$ and $P \ll \nu_2$ for all $P \in \mathcal{P}$. 49 | Then there exists a measurable set $A \in \mathcal{A}$ such that $P_\theta(A) = 1$ for all $\theta \in \Theta$ and there exist $f_{1,\theta} \in \left[ \frac{dP_\theta}{d\nu_1}\right]$ and $f_{2,\theta} \in \left[ \frac{dP_\theta}{d\nu_2}\right]$ and a measurable function $h$ such that 50 | \begin{equation*} 51 | f_{1,\theta}(\omega) = h(\omega)f_{2,\theta}(\omega), \forall\, \theta \in \Theta\, \forall\, \omega \in A. 52 | \end{equation*} 53 | \end{theorem} 54 | \end{frame} 55 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 56 | \begin{frame}{Principle III: stopping rule principle} 57 | A subject of contention between inference paradigms is the role of stopping rules in the inferences drawn. 58 | \begin{idea}[Stopping rule principle (SRP)] 59 | \label{idea:SRP} 60 | Let $\tau$ be a stopping rule directing a series of experiments $\mathcal{E}_1, \mathcal{E}_2, \ldots$, which generates data $\boldsymbol{x} = (x_1, x_2, \ldots)$. 61 | Inferences about $\theta$ should depend on $\tau$ only through $\boldsymbol{x}$. 62 | \end{idea} 63 | \begin{example}[Finite stopping rules] 64 | Suppose experiment $\mathcal{E}_i$ leads to the observation of $x_i \sim f(x_i \mid \theta)$ and let $\mathcal{A}_i \subset \mathcal{X}_1 \times \ldots \times \mathcal{X}_i$ be a sequence of events. 65 | Define 66 | $$ \tau := \inf \left\{ n : (x_1, \ldots, x_n) \in \mathcal{A}_n \right\}.$$ 67 | It can be shown that $\pr(\tau < \infty) = 1$ (exercise 1.20 BC). 68 | \end{example} 69 | \end{frame} 70 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 71 | \begin{frame}{Principle IV: the conditionality principle} 72 | We will now state one of the main ingredients of the derivation of the LP. 73 | The Conditionality Principle (CP) is a statement about the permissible inferences from randomised experiments. 74 | \begin{idea}[Conditionality Principle] 75 | \label{idea:CP} 76 | Let $\mathcal{E}_1$ and $\mathcal{E}_2$ be two experiments about $\theta$. 77 | Let $Z \sim \operatorname{Bernoulli}(p)$ and 78 | \begin{itemize} 79 | \item If $Z=1$, perform $\mathcal{E}_1$ to generate $x_1 \sim f_1(x_1 \mid \theta)$; 80 | \item If $Z=0$, perform $\mathcal{E}_2$ to generate $x_2 \sim f_2(x_2 \mid \theta)$. 81 | \end{itemize} 82 | Inferences about $\theta$ should depend \textbf{only} on the selected experiment, $\mathcal{E}_i$.
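% A quick numerical sanity check of the proportionality claim in the Flamengo example of Principle II, sketched in R with the numbers used there ($n = 12$, $x = 9$ supporters, $y = 3$ ``antis''); dnbinom() counts the number of supporters observed before the 3rd ``anti'':
% theta <- seq(0.01, 0.99, by = 0.01)
% l1 <- dbinom(9, size = 12, prob = theta)       # Binomial design
% l2 <- dnbinom(9, size = 3, prob = 1 - theta)   # Negative Binomial design
% range(l1 / l2)  # constant in theta (choose(12, 9) / choose(11, 9) = 4), so l1 and l2 are proportional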
83 | \end{idea} 84 | \end{frame} 85 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 86 | \begin{frame}{Deriving the Likelihood Principle} 87 | \cite{Birnbaum1962} showed that the simpler and mostly uncontroversial Sufficiency and Conditionality principles lead to the Likelihood Principle. 88 | \begin{theo}[Birnbaum's theorem~\citep{Birnbaum1962}] 89 | \label{thm:Birnbaum} 90 | \begin{equation} 91 | \operatorname{SP} + \operatorname{CP} \implies \operatorname{LP}. 92 | \end{equation} 93 | \end{theo} 94 | \begin{proof} 95 | Sketch: 96 | \begin{itemize} 97 | \item Define a function $\operatorname{EV}(\mathcal{E}, x)$ to quantify the evidence about $\theta$ brought by data $x$ from experiment $\mathcal{E}$ and consider a randomised experiment $\mathcal{E}^*$ in which $\mathcal{E}_1$ and $\mathcal{E}_2$ are performed with probability $p$; 98 | \item Show that CP implies 99 | $\operatorname{EV}(\mathcal{E}^*, (j, x_j)) = \operatorname{EV}(\mathcal{E}_j, x_j), j = 1, 2$; 100 | \item Show that SP implies 101 | $\operatorname{EV}(\mathcal{E}^*, (1, x_1)) = \operatorname{EV}(\mathcal{E}^*, (2, x_2))$ when 102 | $$ l(\theta \mid x_1) = c l(\theta \mid x_2).$$ 103 | \end{itemize} 104 | \end{proof} 105 | See~\cite{Robert2007}, pg.18 for a complete proof. 106 | \end{frame} 107 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 108 | \begin{frame}{Recommended reading} 109 | \begin{itemize} 110 | \item[\faBook] \cite{Robert2007} Ch. 1; 111 | \item[\faForward] Next lecture: \cite{Robert2007} Ch. 2 and $^\ast$ \cite{Schervish2012} Ch.3; 112 | % \item {\large\textbf{Recommended exercises}} 113 | % \begin{itemize} 114 | % \item[\faBookmark] \cite{Robert2007}. 115 | % \begin{itemize} 116 | % \item Sections. 117 | % \item $^\ast$ Sections . 118 | % \end{itemize} 119 | % \end{itemize} 120 | \end{itemize} 121 | \end{frame} 122 | -------------------------------------------------------------------------------- /slides/lecture_11.tex: -------------------------------------------------------------------------------- 1 | \section*{Bayesian rules} 2 | \begin{frame}{Why be Bayesian I: probabilistic representation} 3 | 4 | We already reduce our uncertainty about phenomena to probability distributions for sampling distributions (likelihoods). 5 | 6 | \begin{idea}[Probabilisation of uncertainty] 7 | \label{id:prob_uncertainty} 8 | Our statistical models are \textit{interpretations} of reality, rather than \textit{explanations} of it. 9 | Moreover, 10 | \begin{quote} 11 | `` ...the representation of unknown phenomena by a probabilistic model, at the observational level as well as at the parameter level, does not need to correspond effectively—or physically—to a generation from a probability distribution, nor does it compel us to enter a supradeterministic scheme, fundamentally because of the nonrepeatability of most experiments.'' 12 | \end{quote} 13 | \cite{Robert2007}, pg 508. 14 | \end{idea} 15 | \end{frame} 16 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 17 | \begin{frame}{Why be Bayesian II: conditioning on OBSERVED data} 18 | Remember Idea~\ref{id:soul}: conditioning is the soul of (Bayesian) Statistics. 19 | \begin{idea}[Conditioning on what is actually observed] 20 | \label{id:obs_data} 21 | A quantitative analysis about the parameter(s) $\theta$ conditioning \textit{only} on the observed data, $x$ unavoidably requires a distribution over $\theta$. 22 | To this end, the \textbf{only} coherent way to achieve this goal starting from a distribution $\pi(\theta)$ is to use Bayes's theorem. 
23 | \end{idea} 24 | 25 | Frequentist arguments are, necessarily, about procedures that behave well under a given data-generating process and thus inevitably make reference to unobserved data sets that could, in theory, have been observed. 26 | \end{frame} 27 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 28 | \begin{frame}{Why be Bayesian III: priors as inferential tools} 29 | 30 | A refreshing break from the strictly subjectivist view of Bayesianism can be had if we think about inference functionally. 31 | 32 | \begin{idea}[The prior as a regularisation tool] 33 | \label{id:prior_tool} 34 | If one adopts a mechanistic view of Bayesian inference, the prior can be seen as an additional regularisation or penalty term that enforces certain model behaviours, such as sparsity or parsimony. 35 | A good prior both \textit{summarises} substantive knowledge about the process and rules out unlikely model configurations. 36 | \end{idea} 37 | 38 | In other words, sometimes it pays to use the prior to control what the model \textit{does}, rather than which specific values the parameter takes. 39 | 40 | \end{frame} 41 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 42 | \begin{frame}{Why be Bayesian IV: embracing subjectivity} 43 | The common notion of ``objectivity'' is a ruse. 44 | There is no such thing as a truly objective analysis, and taking objectivity as a premise might hinder our ability to focus on actual discovery and explanation~\citep{Hennig2017}. 45 | \begin{idea}[The subjective basis of knowledge] 46 | \label{id:subjective} 47 | Knowledge arises from a confrontation between \textit{a prioris} and experiments (data). 48 | Let us hear what Poincaré\footnote{Jules Henri Poincaré (1854--1912) was a French mathematician and the quote is from \textit{La Science et l'Hypothèse} (1902).} had to say: 49 | \begin{quote} 50 | ``It is often stated that one should experiment without preconceived ideas. 51 | This is simply impossible; not only would it make every experiment sterile, but even if we were ready to do so, we could not implement this principle. 52 | Everyone stands by [their] own conception of the world, which [they] cannot get rid of so easily.'' 53 | \end{quote} 54 | \end{idea} 55 | \end{frame} 56 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 57 | \begin{frame}{Why be Bayesian V: principled inference} 58 | As we saw in the first lectures of this course, the Bayesian approach is coherent with a few very compelling principles, namely Sufficiency, Conditionality and the Likelihood principle. 59 | \begin{idea}[Bayesian inference follows from strong principles] 60 | Starting from a few desiderata, namely conditioning on the \textbf{observed} data, independence of stopping criteria and respecting the sufficiency, conditionality and likelihood principles, one arrives at a single approach: Bayesian inference using proper priors. 61 | \label{id:principled_inference} 62 | \end{idea} 63 | \end{frame} 64 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 65 | \begin{frame}{Why be Bayesian VI: universal inference} 66 | Bayesian Statistics provides a universal procedure for drawing inference about probabilistic models, something Frequentists can only dream of. 67 | \begin{idea}[Bayesian inference is universal] 68 | Starting from a sampling model, a (proper) prior and a loss (or utility) function, the Bayesian analyst can always derive an estimator. 69 | Moreover, and importantly, many optimal frequentist estimators can be recovered from Bayesian estimators or limits of Bayesian estimators.
70 | Paradoxically, this means that one can be a staunch advocate of Frequentism and still employ Bayesian methods (see, e.g. least favourable priors). 71 | \label{id:universal} 72 | \end{idea} 73 | \end{frame} 74 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 75 | \begin{frame}{How to be Bayesian I: clarity \& openness} 76 | Being subjective does not mean ``anything goes''. 77 | As a scientist, you are still bound by the laws of logic and reason. 78 | 79 | \begin{idea}[State your prior elicitation clearly and openly] 80 | As we have seen, prior information does not always translate exactly into one unique prior choice. 81 | In other words, the same prior information can be represented adequately by two or more probability distributions. 82 | Make sure your exposition \textbf{clearly} separates which features of the prior come from substantitve domain expertise and which ones are arbitrary constraints imposed by a particular choice of parametric family, for instance. 83 | An effort must be made to state all modelling choices \textbf{openly}. 84 | Openly stating limitations is not a bug, it is a feature. 85 | \end{idea} 86 | \end{frame} 87 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 88 | \begin{frame}{How to be Bayesian II: ``noninformativeness'' requires care} 89 | Mathematically, Bayesian Statistics is all-encompassing (see Idea~\ref{id:universal}). 90 | One must be careful\footnote{Personally, I'm not opposed to reference priors and the like, and gladly employ them in my own research work, but I do think one needs to know very well what one is doing in order to employ them properly.} when employing so-called ``objective'' Bayesian methods. 91 | \begin{idea}[Beware of objective priors] 92 | \label{id:careful} 93 | In a functional sense, non-informative priors are a welcome addition to Bayesian Statistics because they provide~\textit{closure}, and confer its universality. 94 | On the other hand, reference priors and the like cannot be justified as summarising prior information. 95 | From a technical standpoint, many noninformative priors are also improper and thus impose the need to check propriety of the resulting posterior distribution. 96 | \end{idea} 97 | \end{frame} 98 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 99 | \begin{frame}{A word of caution} 100 | 101 | A strong defence of the Bayesian paradigm should not cloud our view of the bigger picture. 102 | Statistics is the grammar of Science; whatever grammatical tradition you choose, be sure to employ it properly. 103 | \begin{idea}[Do not become a zealot!] 104 | \label{id:not_zealot} 105 | Statistics is about learning from data and making decisions under uncertainty. 106 | The key to a good statistical analysis is not which ideology underpins it, but how helpful it is at answering the scientific questions at hand. 107 | Ideally, you should know both\footnote{Here we are pretending for a second that there are only two schools of thought in Statistics.} schools well enough to be able to analyse any problem under each approach. 108 | \end{idea} 109 | \end{frame} 110 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 111 | \begin{frame}{So long, and thanks for all the fish!} 112 | Remember, kids: 113 | \begin{center} 114 | {\Huge Bayes rules!} 115 | \end{center} 116 | \end{frame} 117 | 118 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 119 | \begin{frame}{Recommended reading} 120 | \begin{itemize} 121 | \item[\faBook] \cite{Jaynes1976},~\cite{Efron1986} and Ch 11 of~\cite{Robert2007}. 
122 | % \item 123 | \end{itemize} 124 | \end{frame} 125 | -------------------------------------------------------------------------------- /slides/lecture_2.tex: -------------------------------------------------------------------------------- 1 | \subsection{Decision Theory basics} 2 | \begin{frame}{The decision-theoretic foundations of the Bayesian paradigm} 3 | \begin{defn}[Loss function] 4 | \label{def:loss_fn} 5 | \end{defn} 6 | \end{frame} 7 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 8 | \begin{frame}{Utility functions} 9 | Properties: 10 | \end{frame} 11 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 12 | \begin{frame}{} 13 | \begin{theo}[] 14 | \end{theo} 15 | See Proposition 4.3 in \cite{Bernardo2000} for a proof outline. 16 | Here we shall prove the version from~\cite{DeFinetti1931}. 17 | \begin{idea}[] 18 | \end{idea} 19 | \end{frame} 20 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 21 | \begin{frame}{Recommended reading} 22 | \begin{itemize} 23 | \item[\faBook] \cite{Robert2007} Ch. 2. and $^\ast$\cite{Schervish2012} Ch.3; 24 | \item[\faForward] Next lecture: \cite{Hoff2009} Ch. 2 and $^\ast$\cite{Schervish2012} Ch.1; 25 | \end{itemize} 26 | \end{frame} 27 | -------------------------------------------------------------------------------- /slides/lecture_3.tex: -------------------------------------------------------------------------------- 1 | \subsection{Belief functions and exchangeability} 2 | \begin{frame}{Belief functions} 3 | Let $F, G$ and $H \in \mathcal{S}$ be three (possibly overlapping) statements about the world. 4 | For example, consider the following statements about a person: 5 | \begin{itemize} 6 | \item [F] = \{votes for a left-wing candidate\} ; 7 | \item [G] = \{is in the 10\% lower income bracket\} ; 8 | \item [H] = \{lives in a large city\} ; 9 | \end{itemize} 10 | 11 | \begin{defn}[Belief function] 12 | \label{def:belief_function} 13 | For $A, B \in \mathcal{S}$, a belief function $\be : \mathcal{S} \to \mathbb{R}$ assigns numbers to statements such that $\be(A) < \be(B)$ implies one is more confident in $B$ than in $A$. 14 | \end{defn} 15 | \end{frame} 16 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 17 | \begin{frame}{Belief functions: properties} 18 | It is useful to think of $\be$ as~\textbf{preferences over bets}: 19 | \begin{itemize} 20 | \item $\be(F) > \be(G)$ means we would bet on $F$ being true over $G$ being true; 21 | \item $\be(F\mid H) > \be(G \mid H)$ means that, \textbf{conditional} on knowing $H$ to be true, we would bet on $F$ over $G$; 22 | \item $\be(F\mid G) > \be(F \mid H)$ means that if we were forced to bet on $F$, we would be prefer doing so if $G$ were true than $H$. 23 | \end{itemize} 24 | \end{frame} 25 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 26 | \begin{frame}{Belief functions: axioms} 27 | In order for $\be$ to be \textbf{coherent}, it must adhere to a certain set of properties/axioms. 
28 | A sufficient collection of axioms is: 29 | \begin{itemize} 30 | \item [A1] (boundedness of complete [dis]belief): $$\be(\lnot H \mid H) \leq \be(F \mid H) \leq \be(H \mid H),\, \forall\: F \in \mathcal{S};$$ 31 | \item [A2] (monotonicity): 32 | $$\be(F \, \text{or} \, G \mid H) \geq \max \left\{ \be(F \mid H), \be(G \mid H) \right\};$$ 33 | \item [A3] (sequentiality): There exists $f: \mathbb{R}^2 \to \mathbb{R}$ such that 34 | $$ \be(F\, \text{and} \, G \mid H) = f\left(\be(G\mid H), \be(F \mid G\, \text{and} \, H) \right).$$ 35 | \end{itemize} 36 | \end{frame} 37 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 38 | \begin{frame}{Probabilities can be beliefs!} 39 | \begin{exercise}[Probabilities and beliefs] 40 | Show that the axioms of belief functions map one-to-one to the axioms of probability: 41 | \begin{itemize} 42 | \item[P1.] $0 \leq \pr(E), \forall E \in \mathcal{S}$; 43 | \item[P2.] $\pr(\mathcal{S}) = 1$; 44 | \item[P3.] For any countable sequence of disjoint statements $E_1, E_2, \ldots \in \mathcal{S}$ we have 45 | $$ \pr \left(\bigcup_{i=1}^\infty E_i \right) = \sum_{i=1}^\infty \pr(E_i).$$ 46 | \end{itemize} 47 | \end{exercise} 48 | Hint: derive the consequences (e.g. monotonicity) of these axioms and compare them with the axioms of belief functions. 49 | \end{frame} 50 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 51 | \begin{frame}{Useful probability laws} 52 | \begin{defn}[Partition] 53 | \label{def:partition} 54 | If $H = \{H_1, H_2, \ldots, H_K\}$, $H_i \in \mathcal{S}$, such that $H_i \cap H_j = \emptyset$ for all $i \neq j$ and $\bigcup_{k=1}^K H_k = \mathcal{S}$, we say $H$ is a partition of $\mathcal{S}$. 55 | \end{defn} 56 | For any partition $H = \{H_1, \ldots, H_K\}$ of $\mathcal{S}$: 57 | \begin{itemize} 58 | \item \textbf{Total probability}: $\sum_{k=1}^K \pr(H_k) = 1$; 59 | \item \textbf{Marginal probability}: $$\pr(E) = \sum_{k=1}^K \pr(E \cap H_k) = \sum_{k=1}^K \pr(E \mid H_k)\pr(H_k),$$ 60 | for all $E \in \mathcal{S}$; 61 | \item Consequence $\implies$ Bayes's rule: 62 | $$ \pr(H_j \mid E) = \frac{\pr(E \mid H_j)\pr(H_j)}{\sum_{k=1}^K \pr(E \mid H_k)\pr(H_k)}.$$ 63 | \end{itemize} 64 | \end{frame} 65 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 66 | \begin{frame}{Independence} 67 | We will now state a central concept in probability theory and Statistics. 68 | \begin{defn}[ (Conditional) Independence] 69 | For any $F, G \in \mathcal{S}$, we say $F$ and $G$ are~\textbf{conditionally independent} given $A$ if 70 | $$ \pr(F \cap G \mid A) = \pr(F\mid A)\pr(G\mid A).$$ 71 | \end{defn} 72 | \begin{remark} 73 | \label{rmk:conditional_indep} 74 | If $F$ and $G$ are conditionally independent given $A$, then 75 | $$ \pr(F \mid A \cap G) = \pr(F \mid A).$$ 76 | \end{remark} 77 | \begin{proof} 78 | First, notice that the axioms P1-P3 imply $\pr(F \cap G \mid A) = \pr(G\mid A)\pr(F \mid A \cap G)$. 79 | Now use conditional independence to write 80 | \begin{align*} 81 | \pr(G \mid A) \pr(F \mid A \cap G) &= \pr(F \cap G \mid A) = \pr(F\mid A)\pr(G\mid A),\\ 82 | \pr(G\mid A) \pr(F \mid A \cap G) &= \pr(F\mid A) \pr(G \mid A). 83 | \end{align*} 84 | Dividing both sides by $\pr(G \mid A)$ (assumed positive) yields the result. \end{proof} 85 | \end{frame} 86 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 87 | \begin{frame}{Exchangeability} 88 | \begin{defn}[Exchangeable] 89 | \label{def:exchangeable} 90 | We say a sequence of random variables $\boldsymbol{Y} = \{ Y_1, Y_2, \ldots, Y_n \}$ is \textbf{exchangeable} if 91 | $$ \pr(Y_1, Y_2, \ldots Y_n) = \pr(Y_{\xi_1}, Y_{\xi_2}, \ldots Y_{\xi_n}),$$ 92 | for all \textbf{permutations} $\boldsymbol{\xi}$ of the labels of $\boldsymbol{Y}$.
93 | \end{defn} 94 | \begin{example}[Uma vez Flamengo... continued] 95 | Suppose we survey 12 people and record whether they cheer for Flamengo $Y_i = 1$ or not $Y_i = 0$, $i=1, 2,\ldots, 12$. 96 | What value should we assign to: 97 | \begin{itemize} 98 | \item $p_1 := \pr(1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1)$; 99 | \item $p_2 := \pr(1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1)$; 100 | \item $p_3 := \pr(1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0)$? 101 | \end{itemize} 102 | If your answer is $p_1 = p_2 = p_3$ then you are saying the $Y_i$ are (at least partially) exchangeable! 103 | \end{example} 104 | \end{frame} 105 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 106 | \begin{frame}{An application of conditional independence} 107 | For $\theta \in (0, 1)$, consider the following sequence of probability statements: 108 | \begin{align*} 109 | \pr(Y_{12} = 1 \mid \theta) &= \theta,\\ 110 | \pr(Y_{12} = 1 \mid Y_1, \ldots Y_{11}, \theta) & = \theta,\\ 111 | \pr(Y_{11} = 1 \mid Y_1, \ldots Y_{10}, Y_{12}, \theta) &= \theta. 112 | \end{align*} 113 | These imply that the $Y_i$ are conditionally independent and identically distributed (iid), and in particular: 114 | \begin{align*} 115 | \pr(Y_1 = y_1, \ldots, Y_{12} = y_{12} \mid \theta) &= \prod_{i=1}^{12} \theta^{y_i} (1-\theta)^{1-y_i},\\ 116 | &= \theta^{S} (1-\theta)^{12-S}, 117 | \end{align*} 118 | with $S := \sum_{i=1}^{12} y_i$. 119 | Also, under a uniform prior, 120 | $$ \pr(Y_1, \ldots Y_{12}) = \int_{0}^1 t^{S} (1-t)^{12-S} \pi(t)\,dt = \frac{S!\,(12-S)!}{13!} = \left[ 13 \binom{12}{S} \right]^{-1}.$$ 121 | \end{frame} 122 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 123 | \begin{frame}{Relaxing exchangeability (a bit)} 124 | Sometimes total symmetry can be a burden. 125 | We can relax this slightly by introducing the concept of \textbf{partial exchangeability}: 126 | \begin{defn}[Partially exchangeable] 127 | \label{def:partially_exchangeable} 128 | Let $\boldsymbol{X} = \{ X_1, \ldots, X_n\}$ and $\boldsymbol{Y} = \{ Y_1, \ldots, Y_m\}$ be two sets of random variables. 129 | We say $\boldsymbol{X}$ and $\boldsymbol{Y}$ are \textbf{partially} exchangeable if 130 | $$ \pr\left(X_1, \ldots, X_n ; Y_1, \ldots, Y_m\right) = \pr\left(X_{\xi_1}, \ldots, X_{\xi_n} ; Y_{\sigma_1}, \ldots, Y_{\sigma_m}\right),$$ 131 | for any two permutations $\boldsymbol{\xi}$ and $\boldsymbol{\sigma}$ of $1, \ldots, n$ and $1, \ldots, m$, respectively. 132 | \end{defn} 133 | \begin{example}[Uma vez Flamengo...continued] 134 | To see how exchangeability can be relaxed into partial exchangeability, consider $\boldsymbol{X}$ and $\boldsymbol{Y}$ as observations coming from populations from Rio de Janeiro and Ceará, respectively. 135 | If the covariate ``state'' were deemed to not matter, then we would have complete exchangeability. 136 | \end{example} 137 | \end{frame} 138 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 139 | \begin{frame}{A statistically useful remark} 140 | \begin{remark}[Exchangeability from conditional independence] 141 | \label{rmk:pre_deFinetti} 142 | Take $\theta \sim \pi(\theta)$, i.e., represent uncertainty about $\theta$ using a probability distribution. 143 | If $ \pr(Y_1 = y_1, \ldots, Y_{n} = y_n \mid \theta) = \prod_{i=1}^{n} \pr(Y_i = y_i \mid \theta)$, then $Y_1, \ldots, Y_{n}$ are exchangeable. 144 | \end{remark} 145 | \begin{proof} 146 | Sketch: 147 | Use 148 | \begin{itemize} 149 | \item Marginalisation; 150 | \item Conditional independence; 151 | \item Commutativity of products in $\mathbb{R}$; 152 | \item Definition of exchangeability.
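% A quick R check of the uniform-prior marginal in the conditional-independence application above; the probability of a binary sequence depends on the data only through $S$ (here $S = 9$, as in the three sequences of the example), which is exchangeability at work:
% S <- 9
% marg <- integrate(function(t) t^S * (1 - t)^(12 - S), 0, 1)$value
% marg - beta(S + 1, 13 - S)                               # ~0: the integral is a Beta function
% marg - factorial(S) * factorial(12 - S) / factorial(13)  # ~0
% marg - 1 / (13 * choose(12, S))                          # ~0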
153 | \end{itemize} 154 | \end{proof} 155 | \end{frame} 156 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 157 | \begin{frame}{A fabulous theorem!} 158 | \begin{theo}[De Finetti's theorem\footnote{Technically, the theorem stated here is more general than the representation theorem proven by De Finetti in his seminal memoir, which concerned binary variables only.}] 159 | If $\pr\left(Y_1, \ldots, Y_n\right) = \pr\left(Y_{\xi_1}, \ldots, Y_{\xi_n}\right)$ for all permutations $\boldsymbol{\xi}$ of $1, \ldots, n$, then 160 | \begin{equation} 161 | \pr\left(Y_1, \ldots, Y_n\right) = \pr\left(Y_{\xi_1}, \ldots, Y_{\xi_n}\right) = \int_{\boldsymbol{\Theta}} \pr\left(Y_1, \ldots, Y_n \mid t\right) \pi(t)\,dt, 162 | \end{equation} 163 | for some choice of triplet $\{ \theta, \pi(\theta), f(y_i \mid \theta) \}$, i.e., a parameter, a prior and a sampling model. 164 | \end{theo} 165 | See Proposition 4.3 in \cite{Bernardo2000} for a proof outline. 166 | Here we shall prove the version from~\cite{DeFinetti1931}. 167 | \end{frame} 168 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 169 | \begin{frame}{Consequences} 170 | This theorem has a few important implications, namely: 171 | \begin{itemize} 172 | \item $\pi(\theta)$ represents our beliefs about $\lim_{n\to\infty} \sum_i (Y_i \leq c)/n$ for all $c \in \mathcal{Y}$; 173 | \item \{ $Y_1, \ldots, Y_n \mid \theta $ are i.i.d \} + \{ $\theta \sim \pi(\theta)$ \} $\iff$ \{ $Y_1, \ldots, Y_n$ are exchangeable for all $n$ \}; 174 | \item If $Y_i \in \{0, 1\}$, we can also claim that: 175 | \begin{itemize} 176 | \item If the $Y_i$ are assumed to be independent, then they are distributed Bernoulli conditional on a random quantity $\theta$; 177 | \item $\theta$ has a prior measure $\Pi \in \mathcal{P}( (0, 1) )$; 178 | \item By the strong law of large numbers (SLLN), $\theta = \lim_{n \to \infty} (\frac{1}{n}\sum_{i=1}^n Y_i)$, so $\Pi$ can be interpreted as a ``belief about the limiting relative frequency of 1's''. 179 | \end{itemize} 180 | \end{itemize} 181 | \end{frame} 182 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 183 | \begin{frame}{The soul of Statistics} 184 | As the exchangeability results above clearly demonstrate, being able to use conditional independence is a handy tool. 185 | More specifically, knowing on what to condition so as to make things exchangeable is key to statistical analysis. 186 | \begin{idea}[Conditioning is the soul of Statistics\footnote{This idea is due to Joe Blitzstein, who did his PhD under no other than the great Persi Diaconis.}] 187 | \label{idea:conditioning_soul} 188 | Knowing on what to condition can be the difference between an unsolvable problem and a trivial one. 189 | When confronted with a statistical problem, always ask yourself ``What do I know for sure?'' and then ``How can I create a conditional structure to include this information?''. 190 | \end{idea} 191 | \end{frame} 192 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 193 | \begin{frame}{Recommended reading} 194 | \begin{itemize} 195 | \item[\faBook] \cite{Hoff2009} Ch. 2 and $^\ast$\cite{Schervish2012} Ch.1; 196 | \item $^\ast$Paper: \cite{Diaconis1980} explains why if $n$ samples are taken from an exchangeable population of size $N \gg n$ without replacement, then the sample $Y_1, \ldots Y_n$ can be modelled as approximately exchangeable; 197 | \item[\faForward] Next lecture: \cite{Robert2007} Ch. 3. 
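% A small simulation sketch of one consequence of De Finetti's theorem listed earlier, namely reading the prior as a belief about the limiting relative frequency of 1's; the Beta(2, 2) prior is an arbitrary choice for illustration:
% set.seed(42)
% theta <- rbeta(1, 2, 2)                   # draw a parameter value from the prior
% y <- rbinom(1e5, size = 1, prob = theta)  # a long binary sequence, conditionally iid given theta
% c(theta = theta, rel_freq = mean(y))      # the empirical frequency of 1's recovers theta (SLLN)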
198 | \end{itemize} 199 | \end{frame} 200 | -------------------------------------------------------------------------------- /slides/lecture_4.tex: -------------------------------------------------------------------------------- 1 | \subsection{Prior distributions I: the basics} 2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3 | \begin{frame}{Priors: a curse and a blessing} 4 | \begin{itemize} 5 | \item Priors are the main point of contention between Bayesians and non-Bayesians; 6 | \item As we shall see, there is usually no unique way of constructing a prior measure; 7 | \item Moreover, in many situations the choice of prior is not inconsequential. 8 | \item There is always a question of when to stop adding uncertainty... 9 | \end{itemize} 10 | \begin{figure} 11 | \includegraphics[scale=0.6]{figures/turtles_all_the_way_down.jpeg} 12 | \end{figure} 13 | \end{frame} 14 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 15 | \begin{frame}{Determination of priors: existence} 16 | It is usually quite hard to determine a (unique) prior even when substantial knowledge. 17 | Why? 18 | One reason is that a prior measure is guaranteed to exist only when there is a \textbf{coherent ordering} of the Borel sigma-algebra $\mathcal{B}(\boldsymbol{\Theta})$. 19 | This entails that the following axioms hold: 20 | \begin{itemize} 21 | \item [(A1)] Total ordering: For all measurable $A, B \in \mathcal{B}(\boldsymbol{\Theta})$ one and \underline{only one} of these can hold: 22 | $$ A \prec B , B \prec A \:\text{or} \: A \sim B.$$ 23 | \item [(A2)] Transitivity: For measurable $A_1, A_2, B_1, B_2 \in \mathcal{B}(\boldsymbol{\Theta})$ such that $A_1 \cap A_2 = \emptyset = B_1 \cap B_2$ and $A_i \preceq B_i, i = 1, 2$ then the following holds: 24 | \begin{itemize} 25 | \item $A_1 \cup A_2 \preceq B_1 \cup B_2$; 26 | \item If $A_1 \prec B_1$ then $A_1 \cup A_2 \prec B_1 \cup B_2$; 27 | \end{itemize} 28 | \item [(A3)] For any measurable $A$, $\emptyset \preceq A$ and also $\emptyset \prec \boldsymbol{\Theta}$; 29 | \item [(A4)] Continuity: If $E_1 \supset E_2 \ldots$ is a decreasing sequence of measurable sets and $B$ is such that $B \preceq E_i$ for all $i$, then 30 | $$ B \preceq \bigcap_{i=1}^\infty E_i.$$ 31 | \end{itemize} 32 | \end{frame} 33 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 34 | \begin{frame}{Approximation I: marginalisation} 35 | One way to approach the problem of determining a prior measure is to consider the marginal distribution of the data: 36 | \begin{equation} 37 | \label{eq:marginal_mx} 38 | m(x) = \int_{\boldsymbol{\Theta}} f(x\mid \theta)\pi(\theta)\,d\theta. 39 | \end{equation} 40 | In other words we are trying to solve an inverse problem in the form of an integral equation by placing restrictions on $m(x)$ and calibrating $\pi$ to satisfy them. 41 | \end{frame} 42 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 43 | \begin{frame}{Approximation II: moments} 44 | Another variation on the integral-equation-inverse-problem theme is to consider expectations of measurable functions. 45 | Suppose 46 | \begin{equation} 47 | \label{eq:prior_moments} 48 | E_\pi[g_k] := \int_{\boldsymbol{\Theta}} g_k(t)\pi(t)\,dt = w_k. 49 | \end{equation} 50 | For instance, if the analyst knows that $E_\pi[\theta] = \mu$ and $\vr_\pi(\theta) = \sigma^2$, then this restricts the class of functions in $\mathcal{L}_1(\boldsymbol{\Theta})$ that can be considered as prior density\footnote{As we shall see in the coming lectures, $\pi$ needs not be in $\mathcal{L}_1(\boldsymbol{\Theta})$, i.e., needs not be \textbf{proper}. 
But this ``method-of-moments'' approach is then complicated by lack of integrability.}. 51 | One can also consider \textit{order statistics} by taking $g_k(x) = \mathbb{I}_{(-\infty, a_k]}(x)$. 52 | \end{frame} 53 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 54 | \begin{frame}{Maximum entropy priors} 55 | The moments-based approach is not complete in the sense that it does not lead to a unique prior measure $\pi$. 56 | \begin{defn}[Entropy] 57 | The entropy of a probability distribution $P$ is defined as 58 | \begin{equation} 59 | H(P) := E_p[-\log p] = -\int_ 60 | {\mathcal{X}} \log p(x) dP(x). 61 | \end{equation} 62 | \end{defn} 63 | When $\theta$ has finite support, we get the familiar 64 | $$ H(P) = - \sum_i p(\theta_i) \log(p(\theta_i)).$$ 65 | We can leverage this concept in order to pick $\pi$. 66 | \begin{defn}[Maximum entropy prior] 67 | \label{def:maxent_prior} 68 | Let $\mathcal{P}_r$ be a class of probability measures on $\mathcal{B}(\boldsymbol{\Theta})$. 69 | A maximum entropy prior in $\mathcal{P}_r$ is a distribution that satisfies 70 | $$ \argmax_{ P \in \mathcal{P}_r } H(P).$$ 71 | \end{defn} 72 | \end{frame} 73 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 74 | \begin{frame}{Finding a maxent prior} 75 | When $\boldsymbol{\Theta}$ is finite, we can write 76 | \begin{equation*} 77 | \pi^\ast(\theta_i) = \frac{\exp\left\{\sum_{k=1} \lambda_k g_k(\theta_i) \right\}}{\sum_j \exp\left\{\sum_{k=1} \lambda_k g_k(\theta_j) \right\}}, 78 | \end{equation*} 79 | where the $\lambda_k$ are Lagrange multipliers. 80 | In the uncountable case things are significantly more delicate, but under regularity conditions there exists a reference measure $\Pi_0$ such that 81 | \begin{align*} 82 | H_\Pi &= E_{\pi_0}\left[\log \left(\frac{\pi(\theta)}{\pi_0(\theta)}\right)\right],\\ 83 | &= \int_{\boldsymbol{\Theta}} \log \left(\frac{\pi(\theta)}{\pi_0(\theta)}\right)\, \Pi_0(d\theta). 84 | \end{align*} 85 | \end{frame} 86 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 87 | \begin{frame}{Maximum entropy: practice} 88 | \begin{exercise}[Maximum entropy Beta prior] 89 | \label{exerc:maxent_beta} 90 | Find the maximum entropy Beta distribution under the following constraints: 91 | \begin{itemize} 92 | \item $E[\theta] = 1/2$; 93 | \item $E[\theta] = 9/10$. 94 | \end{itemize} 95 | \end{exercise} 96 | \textbf{Hint:} If $P$ is a Beta distribution with parameters $\alpha$ and $\beta$, then 97 | $$H_P = \log B(\alpha, \beta) -(\alpha-1)\psi(\alpha) - (\beta-1)\psi(\beta) + (\alpha + \beta -2)\psi(\alpha + \beta),$$ 98 | where $B(x, y) = \frac{\Gamma(x) \Gamma(y)}{\Gamma(x + y)}$ is the Beta function and $\psi(x) = \frac{d}{dx} \log(\Gamma(x))$ is the digamma function. 99 | \end{frame} 100 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 101 | \begin{frame}{Parametric approximations: easy-peasy} 102 | In some situations, the ``right'' parametric family presents itself naturally. 103 | \begin{example}[Eliciting Beta distributions] 104 | Let $x_i \sim \operatorname{Binomial}(n_i, p_i)$ be the number of Flamengo supporters out of $n_i$ people surveyed. 105 | Over the years, the average of $p_i$ has been $0.70$ with variance $0.1$. 106 | If we assume $p_i \sim \operatorname{Beta}(\alpha, \beta)$ we can elicit an informative distribution based on historical data by solving the system of equations 107 | \begin{align*} 108 | E[\theta] &= \frac{\alpha}{\alpha + \beta} = 0.7,\\ 109 | \vr(\theta) &= \frac{\alpha\beta}{(\alpha + \beta)^2(\alpha +\beta +1)} = 0.1. 
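% The moment-matching system in this example has a closed-form solution; a minimal R sketch (the variance equation pins down $\alpha + \beta$, the mean then splits it):
% m <- 0.7; v <- 0.1
% stopifnot(v < m * (1 - m))       # a Beta can only match these moments if v < m(1 - m)
% s <- m * (1 - m) / v - 1         # alpha + beta
% c(alpha = m * s, beta = (1 - m) * s)  # approximately 0.77 and 0.33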
110 | \end{align*}
111 | \end{example}
112 | \end{frame}
113 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
114 | \begin{frame}{Parametric approximations: difficulties}
115 | Other times we may have a hard time narrowing down the prior to a specific parametric family.
116 | Consider the following example.
117 | \begin{example}[Normal or Cauchy?]
118 | Suppose $x_i \sim \operatorname{Normal}(\theta, 1)$ and we are informed that $\pr(\theta \leq -1) = 1/4$, $\pr(\theta \leq 0) = 1/2$ and $\pr(\theta \leq 1) = 3/4$.
119 | Seems like plenty of information.
120 | It can be shown that
121 | \begin{align*}
122 | \pi_1(\theta) &= \frac{1}{\sqrt{2\pi 2.19}}\exp\left(-\frac{\theta^2}{2\times2.19}\right) \: \text{(Normal)},\\
123 | \pi_2(\theta) &= \frac{1}{\pi (1 + \theta^2)} \: \text{(Cauchy)},
124 | \end{align*}
125 | both satisfy the requirements.
126 | Unfortunately, under quadratic loss we get $\delta_1(4) = 2.75$ and $\delta_2(4) = 3.76$, and the differences are exacerbated for $|x|\geq 4$.
127 | \end{example}
128 | \end{frame}
129 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
130 | \begin{frame}{Why, though?}
131 | Remember the marginal approach?
132 | It is illuminating in this case.
133 | Here's $m(x)$:
134 | \begin{figure}
135 | \includegraphics[scale=0.5]{figures/BC_example_326.pdf}
136 | \caption{Prior predictive distributions of $x$ under Normal and Cauchy priors.}
137 | \end{figure}
138 | \end{frame}
139 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
140 | \begin{frame}{Conjugacy}
141 | Conjugacy is a central concept in Bayesian statistics.
142 | It provides a functional view of the prior-posterior mechanic that emphasises tractability over coherence.
143 | \begin{defn}[Conjugate]
144 | \label{def:conjugate}
145 | A family $\mathcal{F}$ of distributions on $\boldsymbol{\Theta}$ is called \textbf{conjugate} or closed under sampling for a likelihood $f(x \mid \theta)$ if, for every $\pi \in \mathcal{F}$, $p(\theta \mid x) \in \mathcal{F}$.
146 | \end{defn}
147 | \textbf{Arguments for using conjugate priors}
148 | \begin{itemize}
149 | \item ``Form-preservation'': in a limited-information setting it makes sense that $p(\theta \mid x)$ and $\pi(\theta)$ lie in the same family, since the information in $x$ might not be enough to change the structure of the model, just its parameters;
150 | \item Simplicity: when you do not know a whole lot, it makes sense to KISS\footnote{Keep it simple, stupid!};
151 | \item Sequential learning: since $\mathcal{F}$ is closed under sampling, one can update a sequence of posteriors $p_i(\theta \mid x_1, \ldots, x_i)$ as data comes in.
152 | \end{itemize}
153 | \end{frame}
154 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
155 | \begin{frame}{Exponential families}
156 | The exponential family of distributions is a cornerstone of statistical practice, underlying many often-used models.
157 | Here are a few useful definitions.
158 | \begin{defn}[(Natural) Exponential family]
159 | \label{def:expo_family}
160 | Let $\mu$ be a $\sigma$-finite measure on $\mathcal{X}$ and let $\boldsymbol{\Theta}$ be a non-empty set serving as the parameter space.
161 | Let $C : \boldsymbol{\Theta} \to (0, \infty)$ and $h: \mathcal{X} \to (0, \infty)$ and let $R : \boldsymbol{\Theta} \to \mathbb{R}^k$ and $T: \mathcal{X} \to \mathbb{R}^k$.
162 | The family of distributions with density
163 | \begin{equation*}
164 | f(x \mid \theta) = C(\theta)h(x)\exp\left(R(\theta) \cdot T(x) \right)
165 | \end{equation*}
166 | w.r.t. $\mu$ is called an \textbf{exponential family}.
167 | Moreover, if $R(\theta) = \theta$, the family is said to be \textbf{natural}. 168 | \end{defn} 169 | \begin{defn}[Regular exponential family] 170 | We say a natural exponential family $f(x\mid\theta)$ is \textbf{regular} if the natural parameter space 171 | \begin{equation} 172 | N := \left\{ \theta : \int_{\mathcal{X}} \exp(\theta\cdot x) h(x)\,d\mu(x) < \infty \right\}, 173 | \end{equation} 174 | is an open set of the same dimension as the closure of the convex hull of $\supp(\mu)$. 175 | \end{defn} 176 | \end{frame} 177 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 178 | \begin{frame}{Conjugacy and sufficiency} 179 | There is an intimate link between sufficiency (i.e. the existence of sufficient statistics) and conjugacy. 180 | The following is a staple of Bayesian theory. 181 | \begin{theo}[Pitman-Koopman-Darmois] 182 | If a family of distributions $f(\cdot \mid \theta)$ whose support does not depend on $\theta$ is such that, for a sample size large enough, there exists a sufficient statistic of \underline{fixed dimension}, then $f(\cdot \mid \theta)$ is an exponential family. 183 | \end{theo} 184 | The support condition is not a complete deal breaker, however: 185 | \begin{remark}[Quasi-exponential] 186 | The $\operatorname{Uniform}(-\theta, \theta)$ and $\operatorname{Pareto}(\theta, \alpha)$ families are called \textit{quasi-exponential} due to the fact that there do exist sufficient statistics of fixed dimension for these families, even though their supports depend on $\theta$. 187 | \end{remark} 188 | \end{frame} 189 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 190 | \begin{frame}{Conjugacy in the exponential family} 191 | I hope you are convinced of the utility of the exponential family by now. 192 | It would be nice to have an automated way to deduce a conjugate prior for $f(x\mid \theta)$ when it is in the exponential family. 193 | This is exactly what the next result gives us. 194 | \begin{remark}[Conjugate prior for the exponential family] 195 | A conjugate family for $f(x\mid \theta)$ is given by 196 | \begin{equation} 197 | \label{eq:conjugate_exponential_family} 198 | \pi(\theta \mid \mu, \lambda) = K(\mu, \lambda) \exp\left(\theta \cdot \mu - \lambda g(\theta)\right), 199 | \end{equation} 200 | such that the posterior is given by $p(\theta \mid \mu + x, \lambda + 1)$. 201 | \end{remark} 202 | Please do note that (\ref{eq:conjugate_exponential_family}) is only a valid density when $\lambda > 0$ and $\mu/\lambda$ belongs to the interior of the natural space parameter. 203 | Then, it is a $\sigma$-finite measure. 204 | See \cite{Diaconis1979} for more details. 205 | \end{frame} 206 | 207 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 208 | \begin{frame}{Conjugacy: common families} 209 | \begin{figure} 210 | \includegraphics[scale=.5]{figures/conjugate_table.pdf} 211 | \caption{Taken from~\cite{Robert2007}, page 121.} 212 | \end{figure} 213 | \end{frame} 214 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 215 | \begin{frame}{Conjugacy: drawbacks} 216 | Conjugate modelling is certainly useful, but has its fair share of pitfalls. 
217 | 
218 | \textbf{Arguments against using conjugate priors}
219 | \begin{itemize}
220 | \item Conjugate priors are restrictive \textit{a priori}: in many settings, especially in high dimensions, the set of conjugate priors that retain tractability is so limited that it cannot encode all of the prior information available;
221 | \item Conjugate priors are not truly subjective: they limit the analyst's input to picking values for the hyperparameters;
222 | \item Conjugate priors are restrictive \textit{a posteriori}: you are stuck with a given structure forever, no matter how much data you run into.
223 | \end{itemize}
224 | \end{frame}
225 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
226 | \begin{frame}{The principle of insufficient reason}
227 | Also called the principle of indifference by Keynes\footnote{John Maynard Keynes (1883--1946) was an English economist.}.
228 | \begin{quote}
229 | ``...if there is no known reason for predicating of our subject one rather than another of several alternatives, then relatively to such knowledge the assertions of each of these alternatives have an equal probability.'' \cite[Ch4 pg. 52-53]{Keynes1921}.
230 | \end{quote}
231 | The idea dates back to Laplace and even Bayes himself and usually leads to
232 | $$ \pi(\theta) \propto 1.$$
233 | \end{frame}
234 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
235 | \begin{frame}{Invariance}
236 | In many applications we might want some sort of~\textit{invariance} in our prior model.
237 | \begin{defn}[Invariant model]
238 | \label{def:invariant_model}
239 | A statistical model is said to be~\textbf{invariant} (or closed) under the action of a group $\mathcal{G}$ if $\forall g \in \mathcal{G} \: \exists \theta^\star \in \boldsymbol{\Theta}$ such that $y = g(x)$ is distributed with density $f(y\mid \theta^\star)$, denoting $\theta^\star = \bar{g}(\theta)$.
240 | \end{defn}
241 | Consider two types of invariance:
242 | \begin{itemize}
243 | \item \textit{Translation} invariance:
244 | A model $f(x-\theta)$ such that $x-x_0$ has a distribution in the same family for every $x_0$ leads to
245 | $$\pi(\theta) = \pi(\theta-\theta_0),\: \forall\: \theta_0 \in \boldsymbol{\Theta}.$$
246 | \item \textit{Scale} invariance:
247 | Similarly, a model of the form $\sigma^{-1} f(x/\sigma)$, $\sigma >0$, is \textit{scale-invariant} and leads to
248 | $$ \pi(A/c) = \pi(A),$$
249 | for any measurable $A$.
250 | \end{itemize}
251 | \end{frame}
252 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
253 | \begin{frame}{Jeffreys's prior}
254 | One can try to build a prior that captures only the essential structural information about the problem by deriving an invariant distribution from the Fisher information:
255 | $$ I(\theta) = E\left[\left(\frac{\partial \log f(X \mid \theta)}{\partial \theta}\right)^2\right].$$
256 | Under regularity conditions, we can usually also write
257 | $$ I(\theta) = - E\left[\frac{\partial^2 \log f(X \mid \theta)}{\partial \theta^2}\right].$$
258 | Jeffreys showed that
259 | $$\pi_J(\theta) \propto \sqrt{I(\theta)},$$
260 | is invariant.
261 | There are straightforward generalisations when $\theta$ is multidimensional.
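A quick numerical sanity check can make this construction concrete. The following minimal R sketch -- the sample size, grid and Monte Carlo size are arbitrary choices assumed for illustration -- estimates the Fisher information of a Binomial model by Monte Carlo and compares the normalised $\sqrt{I(\theta)}$ with the Beta(1/2, 1/2) density, anticipating the Binomial item on the next slide.

## Monte Carlo check that the Jeffreys prior for the Binomial is Beta(1/2, 1/2).
## Illustrative sketch only; n, the grid and M are arbitrary choices.
set.seed(42)
n <- 10                                        # Binomial sample size
theta_grid <- seq(0.01, 0.99, length.out = 99)

## Estimate I(theta) = E[(d/dtheta log f(X | theta))^2] by simulation
fisher_info_mc <- function(theta, n, M = 2e4) {
  x <- rbinom(M, size = n, prob = theta)
  score <- x / theta - (n - x) / (1 - theta)   # d/dtheta log f(x | theta)
  mean(score^2)
}
I_hat <- sapply(theta_grid, fisher_info_mc, n = n)

## Exact information for comparison: I(theta) = n / (theta * (1 - theta))
I_exact <- n / (theta_grid * (1 - theta_grid))

## Normalise sqrt(I) so it integrates to ~1 and compare with dbeta(., 1/2, 1/2)
pi_J <- sqrt(I_hat)
pi_J <- pi_J / sum(pi_J * diff(theta_grid)[1])

plot(theta_grid, pi_J, type = "l", xlab = expression(theta),
     ylab = "density", main = "Jeffreys prior, Binomial model")
lines(theta_grid, dbeta(theta_grid, 1/2, 1/2), lty = 2)
legend("top", legend = c("sqrt(Fisher info), normalised", "Beta(1/2, 1/2)"),
       lty = 1:2, bty = "n")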
262 | \end{frame}
263 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
264 | \begin{frame}{Jeffreys's priors: examples}
265 | A good exercise is to show that
266 | \begin{itemize}
267 | \item If $x \sim \operatorname{Normal}(0, \theta)$, $\pi_J(\theta) \propto 1/\theta^2$;
268 | \item If $x \sim \operatorname{Normal}_d(\theta, \boldsymbol{I}_d)$, $\pi_J(\theta) \propto 1$;
269 | \item If $x \sim \operatorname{Binomial}(n, \theta)$, $\pi_J(\theta) \equiv \operatorname{Beta}(1/2, 1/2)$;
270 | \item If $f(x\mid \theta) = h(x) \exp\left(\theta \cdot x - \psi(\theta)\right)$, then
271 | $$ \pi_J(\theta) \propto \sqrt{\prod_{i=1}^k\psi^{\prime\prime}(\theta)}.$$
272 | \end{itemize}
273 | \end{frame}
274 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
275 | \begin{frame}{Beware!}
276 | One important caveat of Jeffreys's priors is that they violate the Likelihood Principle.
277 | To see why, consider the following exercise.
278 | \begin{exercise}[Poisson process]
279 | Suppose one is interested in estimating the rate, $\theta$, of a Poisson process:
280 | $$ Y(t) \sim \operatorname{Poisson}(t\theta).$$
281 | There are two possible experimental designs:
282 | \begin{itemize}
283 | \item[a)] Fix a number $n$ of events to be observed and record the time $X$ to observe them, or;
284 | \item[b)] Wait a fixed amount of time, $t$, and count the number $Y$ of occurrences of the event of interest.
285 | \end{itemize}
286 | Show that
287 | \begin{align*}
288 | \label{eq:poisson_process_informationMatrix}
289 | \text{a)}&\: I_X(\theta) = \frac{n}{\theta^2},\\
290 | \text{b)}&\: I_Y(\theta) = \frac{t}{\theta}.
291 | \end{align*}
292 | Which conclusions can we draw from this example?
293 | \end{exercise}
294 | See also example 3.5.7 in~\cite{Robert2007}.
295 | \end{frame}
296 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
297 | \begin{frame}{Reference priors}
298 | Jeffreys's approach can sometimes lead to marginalisation paradoxes and calibration issues (see exercise 4.47 in \cite{Robert2007}).
299 | \cite{Bernardo1979} proposes a modification that avoids these difficulties by explicitly separating the parameters into~\textit{nuisance} and~\textit{interest} components.
300 | It works like this: take $f(x\mid \theta)$, with $\theta = (\theta_1, \theta_2)$ and let $\theta_1$ be the parameter of interest.
301 | We must first compute\footnote{Notice that this need not be well-defined. One common way of dealing with difficulties is to integrate on a sequence of measurable compact sets and take the limit.}
302 | $$ \tilde{f}(x\mid \theta_1) = \int_{\boldsymbol{\Theta_2}} f(x \mid \theta_1, t_2)\pi(t_2\mid \theta_1)\, dt_2, $$
303 | and then compute the Jeffreys's prior associated with this marginalised likelihood.
304 | Notice that this entails first deriving $\pi(\theta_2\mid \theta_1)$.
305 | \end{frame}
306 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
307 | \begin{frame}{Reference priors: example}
308 | Suppose we have $x_{ij} \sim \operatorname{Normal}(\mu_i, \sigma^2)$, $i = 1, \ldots, n$, $j=1,2$ and consider making inferences about $\boldsymbol{\theta} = (\sigma^2, \boldsymbol{\mu})$.
309 | Here the parameter of interest is $\theta_1 = \sigma^2$, while the locations $\boldsymbol{\theta}_2 = \boldsymbol{\mu}$ are nuisance parameters.
310 | The Jeffreys's prior is
311 | $$\pi_J(\boldsymbol{\theta}) \propto 1/\sigma^{n+1},$$
312 | leading to a Bayes estimator under quadratic loss:
313 | $$ \hat{\sigma}_J := E[\sigma^2 \mid \boldsymbol{x}] = \frac{\sum_{i=1}^n (x_{i1}-x_{i2})^2}{4n-4},$$
314 | which is not consistent.
315 | The reference approach gives $\pi(\boldsymbol{\theta_2} \mid \theta_1)$ as a flat prior -- because $\boldsymbol{\theta_2}$ is a location parameter.
316 | Marginalising the likelihood against this flat density over $\mathbb{R}^n$ gives $\pi_R(\sigma^2) \propto 1/\sigma^2$, leading to
317 | $$ \hat{\sigma}_R =\frac{\sum_{i=1}^n (x_{i1}-x_{i2})^2}{2n-4},$$
318 | which is consistent.
319 | Phew!
320 | \end{frame}
321 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
322 | \begin{frame}{Frequentist considerations}
323 | If you are a bit greedy and want to please Greeks and Trojans, you might also try to construct your prior so that it attains good frequentist properties.
324 | One such way is to construct~\textbf{matching priors}:
325 | \begin{defn}[Matching prior]
326 | We say $\pi(\theta)$ is a \textbf{matching prior} for a confidence level $\alpha$ if it is constructed in such a way that
327 | \begin{equation*}
328 | \pr(g(\theta) \in C_x \mid x) = \frac{1}{m(x)}\int_{C_x} f(x\mid t) \pi(t) \, dt = 1-\alpha,
329 | \end{equation*}
330 | holds for a given confidence set $C_x(\alpha)$ for $g(\theta)$.
331 | \end{defn}
332 | In other words, the posterior probability of the confidence set matches its nominal frequentist level.
333 | It can be shown that, in unidimensional families, the Jeffreys's prior gives
334 | $$ \pr(\theta \leq k_\alpha(x)) = 1-\alpha + O(n^{-1}),$$
335 | where $C_x = (-\infty, k_\alpha(x))$ is a one-sided confidence interval.
336 | \end{frame}
337 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
338 | \begin{frame}{Prior classes}
339 | \cite{Robert2007} gives a classification of priors into classes:
340 | \begin{itemize}
341 | \item[i)] Conjugate classes:
342 | $$ \Gamma_C = \{\pi \in \mathcal{F} : p \in \mathcal{F} \}, $$
343 | \item[ii)] Determined moment(s) classes:
344 | $$ \Gamma_M = \{\pi : a_i \leq E_\pi[\theta] \leq b_i, i = 1, \ldots, k\}, $$
345 | \item[iii)] Neighbourhood (or $\epsilon$-contamination) classes:
346 | $$ \Gamma_{\epsilon, \mathcal{Q}} = \{\pi = (1-\epsilon)\pi_0 + q, q \in \mathcal{Q}\}, $$
347 | \item[iv)] Underspecified classes:
348 | $$ \Gamma_{U} = \{\pi : \int_{I_i} \pi(t)\,dt \leq \mu_i, i = 1, \ldots, k\}, $$
349 | \item[v)] Ratio of density classes:
350 | $$ \Gamma_{R} = \{\pi : L(\theta) \leq \pi(\theta) \leq U(\theta)\}. $$
351 | \end{itemize}
352 | \end{frame}
353 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
354 | \begin{frame}{Prior sensitivity analysis}
355 | General recommendations about building priors:
356 | \begin{itemize}
357 | \item Check the \textbf{observable consequences} of your priors: what kinds of data do they produce?
358 | \item Check the inferential consequences of your priors: how do my estimators change under different priors?
359 | \item Make sure you know what your restrictions do to the tail of your prior;
360 | \item It is usually a good idea to understand what the prior \textbf{does} to the model, as opposed to only which values $\theta$ can plausibly take;
361 | \item Sometimes it may be useful to think of priors as \textit{penalisations} that \textbf{regularise} inference.
362 | \end{itemize}
363 | \end{frame}
364 | 
365 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
366 | \begin{frame}{Recommended reading}
367 | \begin{itemize}
368 | \item[\faBook] \cite{Robert2007} Ch. 3;
369 | \item[\faForward] Next lecture: \cite{Robert2007} Ch. 3.6, \cite{Seaman2012}, \cite{Gelman2017} and \cite{Simpson2017}.
370 | \end{itemize} 371 | \end{frame} 372 | -------------------------------------------------------------------------------- /slides/lecture_5.tex: -------------------------------------------------------------------------------- 1 | \section*{Bayesian point estimation} 2 | \begin{frame}{The maximum~\textit{a posteriori} (MAP) estimator} 3 | \begin{defn}[Maximum~\textit{a posteriori}] 4 | The posterior mode or maximum~\textit{a posteriori} (MAP) estimator of a parameter $\theta$ is given by 5 | \begin{equation} 6 | \label{eq:MAP} 7 | \delta_{\pi}^{\text{MAP}}(x) := \argmax_{\theta \in \boldsymbol{\Theta}} p(\theta \mid x). 8 | \end{equation} 9 | \end{defn} 10 | \begin{example}[MAP for the binomial case] 11 | Suppose $x \sim \operatorname{Binomial}(n, p)$. 12 | Now consider the following three priors for $p$: 13 | \begin{itemize} 14 | \item $\pi_0(p) = \frac{\sqrt{p(1-p)}}{B(1/2, 1/2)} $ [Jeffreys]; 15 | \item $\pi_1(p) = 1$ [Beta(1,1)/Uniform]; 16 | \item $\pi_2(p) = \left(p(1-p)\right)^{-1}$ [\cite{Haldane1932}]. 17 | \end{itemize} 18 | These lead to 19 | \begin{itemize} 20 | \item $\delta_0^{\text{MAP}}(x) = \max \{(x-1/2)/(n-1), 0 \}$; 21 | \item $\delta_1^{\text{MAP}}(x) = x/n$; 22 | \item $\delta_2^{\text{MAP}}(x) = \max \{ (x-1)/(n-2), 0 \}$. 23 | \end{itemize} 24 | \end{example} 25 | \end{frame} 26 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 27 | \begin{frame}{A word of caution} 28 | We end this discussion with the following warning: 29 | \begin{idea}[Marginalise, not maximise] 30 | \label{idea:always_marginalise}~ 31 | Bayesian approaches to estimation and prediction \underline{usually} focus on \textit{marginalisation} rather than \textit{optmisation}. 32 | This is because, following the Likelihood Principle, all of the information available about the unknowns is contained in the posterior distribution, and thus all inferences must be made using this probability measure, usually by finding suitable expectations of functionals of interest. 33 | \end{idea} 34 | In particular, for higher dimensions, \textbf{concentration of measure}\footnote{See these excellent notes by Terence Tao:\url{https://terrytao.wordpress.com/2010/01/03/254a-notes-1-concentration-of-measure/} .} ensures that the posterior mode has less and less relevance as a summary, at least so far as the barycentre of the distribution is concerned. 35 | \end{frame} 36 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 37 | \begin{frame}{Precision of Bayes estimators} 38 | A central quantity in the evaluation of Bayesian estimators is 39 | \begin{equation} 40 | \label{eq:bayes_risk} 41 | E_p\left[\left(\delta_\pi - h(\theta)\right)^2\right] = E_{\pi}\left[\left(\delta_\pi - h(\theta)\right)^2 \mid x \right] 42 | \end{equation} 43 | for measurable $h$. 44 | \begin{example}[Bayes versus frequentist risk] 45 | Take $x \sim \operatorname{Binomial}(n, \theta)$ with $n$ known and place a Jeffreys's prior on $\theta$. 46 | Consider the MLE: $\delta_1(x) = x/n$. 
47 | It can be shown that: 48 | $$ E_\pi\left[\left(\delta_1 - \theta \right)^2 \mid x \right] = \left(\frac{x - n/2}{n(n+1)}\right)^2 + \frac{(x + 1/2)(n - x + 1/2)}{(n + 1)^2(n+2)}.$$ 49 | Moreover, 50 | $$\max_{\theta \in (0, 1)} E_\pi\left[\left(\delta_1 - \theta\right)^2 \mid x \right] = [4(n+2)]^{-1},$$ 51 | and 52 | $$ \max_{\theta \in (0, 1)} E_\theta \left[\left(\delta_1 - \theta\right)^2\right] = [4n]^{-1}.$$ 53 | \end{example} 54 | \end{frame} 55 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 56 | \begin{frame}{A brief aside about prediction} 57 | Prediction is an important inferential task and is somewhat related to the previous discussion on precision. 58 | Consider predicting a quantity $z$ \textbf{conditional} on data $x$. 59 | For that we need $g(z \mid x, \theta)$, $f(x\mid \theta)$ and $\pi(\theta)$. 60 | Then, 61 | \begin{equation} 62 | \label{eq:general_predictive} 63 | g_\pi(z\mid x) = \int_{\boldsymbol{\Theta}} g(z\mid x, t) p(t \mid x)\,dt 64 | \end{equation} 65 | encodes all of the information brought by the posterior about $z$. 66 | A special case is i.i.d prediction: 67 | \begin{equation} 68 | \label{eq:posterior_predictive_data} 69 | g(\tilde{x} \mid x) = \int_{\boldsymbol{\Theta}} f(\tilde{x} \mid t) p(t \mid x)\,dt 70 | \end{equation} 71 | is the posterior predictive of the new data $\tilde{x}$. 72 | \begin{idea}[Calibrated priors for prediction] 73 | \label{idea:prediction_calibrated_priors} 74 | The prior, $\pi$, can be constructed so as to minimise error in a prediction task. 75 | \end{idea} 76 | \end{frame} 77 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 78 | \begin{frame}{A neat trick} 79 | Computing expectations all the time means we have to become familiar with a few tricks to facilitate obtaining approximate answers. 80 | \begin{example}[Mixture representation of the Student-t] 81 | Take $x \sim \operatorname{Normal}_p(\theta, \boldsymbol{I}_p)$ and put $\theta \sim \operatorname{Student-t}_p(\alpha, 0, \tau^2\boldsymbol{I}_p)$. 82 | Then $p(\theta \mid x)$ does not have a closed-form normalising constant and computing the Bayes estimator under quadratic loss is a chore. 83 | However, we can use the representation 84 | \begin{align*} 85 | \theta \mid z &\sim \operatorname{Normal}_p(0, \tau^2z\boldsymbol{I}_p),\\ 86 | z & \sim \operatorname{InverseGamma}(\alpha/2, \alpha/2), 87 | \end{align*} 88 | to get 89 | $$ \theta \mid x, z \sim \operatorname{Normal}_p\left(\frac{x}{ 1+ \tau^2z}, \frac{\tau^2z}{ 1+ \tau^2z}\boldsymbol{I}_p\right) $$ 90 | Thus, the Bayes estimator $\delta_\pi(x) = \int_0^\infty E_\pi[\theta \mid x, z]p(z \mid x)\,dz$ can be computed with a single integral for any dimension $p$. 91 | \end{example} 92 | \end{frame} 93 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 94 | \begin{frame}{Conjugacy is handy!\footnote{Taken from~\cite{Robert2007}.}} 95 | \begin{center} 96 | \includegraphics[scale=0.5]{figures/conjugate_table_expectations.pdf} 97 | \end{center} 98 | \end{frame} 99 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 100 | \begin{frame}{A worked example} 101 | We will stretch our Bayesian muscles with the next problem. 102 | \begin{exercise}[Inference for the rate of a Gamma] 103 | \label{exercise:rate_gamma_different_losses} 104 | Let $x \sim \operatorname{Gamma}(\nu, \theta)$ with $\nu >0$ known. 105 | A natural choice of prior is $\theta \sim \operatorname{Gamma}(\alpha, \beta)$. 
106 | Find the Bayes estimator under
107 | $$ L_1(\delta, \theta) = \left(\delta - \frac{1}{\theta}\right)^2, $$
108 | and the scale-invariant loss
109 | $$ L_2(\delta, \theta) = \theta^2 \left(\delta - \frac{1}{\theta}\right)^2.$$
110 | \textit{Hint:} If $X \sim \operatorname{Gamma}(\alpha, \beta)$, $Y = 1/X \sim \operatorname{InverseGamma}(\alpha, \beta)$ and $E[Y^k] = \frac{\beta^k}{(\alpha-1)\cdots(\alpha-k)}$.
111 | \end{exercise}
112 | \end{frame}
113 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
114 | \begin{frame}{A quick note on quadratic loss}
115 | Exercise~\ref{exercise:rate_gamma_different_losses} is a special case of the general situation where
116 | $$ L(\delta, \theta) = w(\theta) ||\delta-\theta||_{\boldsymbol{G}}^2, $$
117 | for $\boldsymbol{G}$ a $p \times p$ non-negative symmetric matrix.
118 | In this case, we get
119 | $$ \delta_\pi = \frac{E_p[w(\theta)\theta]}{E_p[w(\theta)]}. $$
120 | Please \textbf{note} that there is no universal justification for quadratic loss other than (sometimes leading to increased) mathematical tractability.
121 | \end{frame}
122 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
123 | \begin{frame}{Loss estimation}
124 | Since the loss function, $L(\delta(x), \theta)$, is usually measurable w.r.t. the posterior, it can be estimated much the same way as other functionals.
125 | In particular, if you are feeling particularly eclectic, you can always construct $\pi$ such that
126 | $$ E\left[E_p[L(\delta_\pi(x), \theta)]\right] \geq R(\delta_\pi(x), \theta), \theta \in \boldsymbol{\Theta}, $$
127 | i.e. that the estimated loss never underestimates the error resulting from the use of $\delta_\pi$, at least in the long run.
128 | This is called \textbf{frequentist validity}.
129 | \end{frame}
130 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
131 | \begin{frame}{A nice little problem by Neyman}
132 | The following problem is described by Jeffreys as originating with Jerzy Neyman\footnote{Jerzy Neyman (1894-1981) was a Polish-American statistician, known for his work with Egon Pearson (1895-1980) on the foundations of the null hypothesis significance testing (NHST) framework.}.
133 | \begin{exercise}[The tramcar problem]
134 | \label{exercise:tramcar}
135 | A person travelling in a foreign country has to change trains at a junction, and goes into the town, the existence of which they have only just heard.
136 | They have no idea of its size.
137 | The first thing they see is a tramcar numbered $100$.
138 | Assuming tramcars are numbered consecutively from $1$ onwards, what could one \textit{infer} about the number $N$ of tramcars in this town?
139 | \end{exercise}
140 | \end{frame}
141 | 
142 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
143 | \begin{frame}{Recommended reading}
144 | \begin{itemize}
145 | \item[\faBook] \cite{Robert2007}, Ch. 4.
146 | % \item
147 | \item[\faForward] Next lecture: \cite{Robert2007} Ch. 5.
148 | \end{itemize}
149 | \end{frame}
150 | 
-------------------------------------------------------------------------------- /slides/lecture_6.tex: --------------------------------------------------------------------------------
1 | \section*{Bayesian testing and ``interval'' estimation}
2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
3 | \begin{frame}{The duality between estimation and testing}
4 | Similarly to the frequentist case, in Bayesian inference there is an intimate relationship between testing hypotheses and estimating measurable functions of the parameters.
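Looking back at the tramcar problem above (the repository also ships code/tramcar_problem.r), here is a minimal numerical sketch of one possible analysis. It assumes the uniform sampling model $p(x \mid N) = 1/N$ for $x \leq N$ and the scale prior $\pi(N) \propto 1/N$, truncated at a large value for computation; both the prior and the truncation point are assumptions made purely for illustration, and other choices are possible.

## Tramcar problem: minimal numerical sketch (one possible analysis).
## Assumptions: uniform sampling model p(x | N) = 1/N for x <= N, and the
## improper prior pi(N) proportional to 1/N, truncated at N_max for computation.
x_obs <- 100
N_max <- 1e6
N <- x_obs:N_max

log_post_unnorm <- -2 * log(N)            # log[ (1/N) * (1/N) ] for N >= x_obs
post <- exp(log_post_unnorm - max(log_post_unnorm))
post <- post / sum(post)                  # normalise on the truncated grid

## Posterior summaries
post_median <- N[which(cumsum(post) >= 0.5)[1]]
post_mean <- sum(N * post)                # heavy-tailed: sensitive to N_max

post_median   # roughly 2 * x_obs = 200
post_mean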
5 | \begin{defn}[Test]
6 | Consider a statistical model $f(x \mid \theta)$ with $\theta \in \boldsymbol{\Theta}$.
7 | Given $\boldsymbol{\Theta}_0 \subset \boldsymbol{\Theta}$, a \textit{test} consists in answering the question of whether
8 | $$ H_0 : \theta \in \boldsymbol{\Theta}_0 $$
9 | is true.
10 | We call $H_0$ the \textit{null hypothesis} and $\boldsymbol{\Theta}_0$ can often be a point, i.e. $\boldsymbol{\Theta}_0 = \{\theta_0 \}$.
11 | \end{defn}
12 | Notice that $\mathbb{I}_{\boldsymbol{\Theta}_0}(\theta)$ is measurable and thus we can define, for instance, the zero--one loss
13 | \begin{equation*}
14 | L_1(\theta, \varphi) = \begin{cases}
15 | 0, \varphi = \mathbb{I}_{\boldsymbol{\Theta}_0}(\theta),\\
16 | 1, \: \text{otherwise},
17 | \end{cases}
18 | \end{equation*}
19 | which in turn leads to
20 | \begin{equation*}
21 | \varphi_1 = \begin{cases}
22 | 1, \pr(\theta \in \boldsymbol{\Theta}_0 \mid x) > \pr(\theta \in \boldsymbol{\Theta}_0^c \mid x),\\
23 | 0, \: \text{otherwise}.
24 | \end{cases}
25 | \end{equation*}
26 | \end{frame}
27 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
28 | \begin{frame}{A refinement}
29 | The loss function just seen can be refined to
30 | \begin{equation*}
31 | L_2(\theta, \varphi) = \begin{cases}
32 | 0, \varphi = \mathbb{I}_{\boldsymbol{\Theta}_0}(\theta),\\
33 | a_0, \theta \in \boldsymbol{\Theta}_0, \varphi = 0,\\
34 | a_1, \theta \in \boldsymbol{\Theta}_0^c, \varphi = 1.
35 | \end{cases}
36 | \end{equation*}
37 | Under this loss, we have
38 | \begin{equation*}
39 | \varphi_2 = \begin{cases}
40 | 1, \pr(\theta \in \boldsymbol{\Theta}_0 \mid x) > a_1/(a_0 + a_1),\\
41 | 0, \: \text{otherwise}.
42 | \end{cases}
43 | \end{equation*}
44 | \end{frame}
45 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
46 | \begin{frame}{Example}
47 | \begin{example}[\textit{One} Normal test]
48 | Take, for example, $x \sim \operatorname{Normal}(\theta, \sigma^2)$, with $\theta \sim \operatorname{Normal}(\mu_0, \tau^2)$.
49 | This implies $\theta \mid x \sim \operatorname{Normal}(\mu(x), \omega^2)$, where
50 | \begin{align*}
51 | \mu(x) &= \frac{\sigma^2\mu_0 + \tau^2x}{\sigma^2 + \tau^2},\\
52 | \omega^2 &= \frac{\sigma^2\tau^2}{\sigma^2 + \tau^2}.
53 | \end{align*}
54 | To test $H_0: \theta < 0$, we can compute
55 | \begin{align*}
56 | \pr(\theta < 0 \mid x) &= \pr \left(\frac{\theta-\mu(x)}{\omega} < \frac{-\mu(x)}{\omega}\right), \\
57 | &= \Phi\left(\frac{-\mu(x)}{\omega}\right).
58 | \end{align*}
59 | This means that if $z_{a_0, a_1}$ is such that $\Phi(z_{a_0, a_1}) = a_1/(a_0 + a_1)$, we can accept $H_0$ if
60 | $$\mu(x) < -z_{a_0, a_1}\omega. $$
61 | \end{example}
62 | \end{frame}
63 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
64 | \begin{frame}{Bayes factors}
65 | A central tool in Bayesian testing is the \textbf{Bayes factor} -- see \cite{Kass1995} for a review and a guide to interpretation.
66 | \begin{defn}[Bayes factor]
67 | \label{def:Bayes_factor}
68 | The Bayes factor is the ratio of the posterior odds to the prior odds of the null versus the alternative:
69 | \begin{align*}
70 | B^\pi_{01}(x) &= \frac{\pr(\theta \in\boldsymbol{\Theta}_0 \mid x)}{\pr(\theta \in\boldsymbol{\Theta}_1 \mid x)}\bigg/\frac{\pr(\theta \in\boldsymbol{\Theta}_0)}{\pr(\theta \in\boldsymbol{\Theta}_1)},\\
71 | &= \frac{\pr(\theta \in\boldsymbol{\Theta}_0 \mid x)\cdot\pr(\theta \in\boldsymbol{\Theta}_1)}{\pr(\theta \in\boldsymbol{\Theta}_1 \mid x)\cdot\pr(\theta \in\boldsymbol{\Theta}_0)}.
72 | \end{align*} 73 | \begin{remark} 74 | When $\boldsymbol{\Theta}_0 = \{\theta_0\}$ and $\boldsymbol{\Theta}_1 = \{\theta_1\}$ the Bayes factor simplifies to 75 | \begin{equation*} 76 | r_{01}(x) = \frac{f(x\mid\theta_0)}{f(x\mid\theta_1)}, 77 | \end{equation*} 78 | also known as the \textbf{likelihood ratio}. 79 | \end{remark} 80 | \end{defn} 81 | \end{frame} 82 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 83 | \begin{frame}{A few more considerations on the Bayes factor} 84 | The Bayes factor can also be written as 85 | \begin{align*} 86 | B^\pi_{01}(x) &= \frac{\int_{\boldsymbol{\Theta}_0} f(x\mid t)\pi_0(t)\,dt }{\int_{\boldsymbol{\Theta}_1} f(x\mid t)\pi_1(t)\,dt} = \frac{m_0(x)}{m_1(x)}, 87 | \end{align*} 88 | where $\pi_0$ and $\pi_1$ are the prior distributions under each hypothesis. 89 | Also, if $\hat{\theta}_0$ and $\hat{\theta}_1$ are the MLE under each hypothesis, by making $\pi_0$ and $\pi_1$ Dirac masses at $\hat{\theta}_0$ and $\hat{\theta}_1$, respectively, we recover 90 | \begin{equation} 91 | \label{eq:bayes_lrt} 92 | R(x) = \frac{\sup_{\theta \in \boldsymbol{\Theta}_0}f(x \mid \theta)}{\sup_{\theta \in \boldsymbol{\Theta}_1}f(x \mid \theta)} 93 | \end{equation} 94 | \begin{exercise}[Bayesian justifcation of LRT] 95 | Does (\ref{eq:bayes_lrt}) offer a Bayesian justifcation for likelihood ratios? 96 | \end{exercise} 97 | \end{frame} 98 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 99 | \begin{frame}{Testing point-null hypotheses} 100 | Hypotheses of the form $H_i : \theta \in \{ \theta_i \}$, called point-null hypotheses, are hard to deal with from a probabilistic point of view. 101 | \begin{remark}[Point-null hypotheses under continuous priors] 102 | Point-null cannot be tested under continuous prior distributions. 103 | More generally, if either $H_0$ or $H_1$ are \textbf{impossible} \textit{a priori}, then no amount of data can change that belief. 104 | \end{remark} 105 | \begin{idea}[Cromwell's law\footnote{This idea is attributed to British statistician Dennis Lindley (1923-2013), one of the founders of modern Bayesian theory.}] 106 | In general, one not assign probability zero to events that are not logically or physically demonstrably impossible. 107 | Or, more eloquently, as Oliver Cromwell writes to the General Assembly of the Church of Scotland on 3 August 1650: 108 | \begin{quotation} 109 | I beseech you, in the bowels of Christ, think it possible that you may be mistaken. 110 | \end{quotation} 111 | \end{idea} 112 | \end{frame} 113 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 114 | \begin{frame}{Point-null hypotheses: modification of the prior} 115 | Testing point-null hypotheses involves a \textbf{modification of the prior} 116 | If $H_0: \theta \in \{\theta_0\}$ we can write $\rho_0 = \pr(\theta = \theta_0)$ and then 117 | \begin{equation*} 118 | \tilde{\pi}(\theta) = \rho_0 \mathbb{I}_{\boldsymbol{\Theta}_0}(\theta) + (1-\rho_0)\pi_1(\theta), 119 | \end{equation*} 120 | is our new prior, where $\pi_1$ is the distribution with density $g_1(\theta) \propto \pi(\theta)\mathbb{I}_{\boldsymbol{\Theta}_1}(\theta)$ with respect to the dominating measure on $\boldsymbol{\Theta}_1$. 121 | This gives a posterior probability 122 | \begin{equation*} 123 | \tilde{\pi}(\boldsymbol{\Theta}_0 \mid x) = \frac{f(x \mid \theta_0)\rho_0}{f(x \mid \theta_0)\rho_0 + (1-\rho_0)m_1(x)}. 124 | \end{equation*} 125 | where $m_1(x) = \int_{\boldsymbol{\Theta}_1} f(x \mid t)g_1(t)\,dt$. 
126 | It can be shown that 127 | \begin{equation*} 128 | \tilde{\pi}(\boldsymbol{\Theta}_0 \mid x) = \left[1 + \frac{1-\rho_0}{\rho_0}\frac{1}{B^\pi_{01}(x)}\right]^{-1}, 129 | \end{equation*} 130 | which makes clear the relationship between posterior probabilities and Bayes factors. 131 | \end{frame} 132 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 133 | \begin{frame}{Example} 134 | Consider $x \sim \operatorname{Binomial}(n, p)$ and consider testing $H_0: p = 1/2$ against $H_1: p \neq 1/2$. 135 | Taking $g_1(p) = 1$, we have 136 | \begin{equation*} 137 | \tilde{\pi}(\boldsymbol{\Theta}_0 \mid x) = \left[1 + \frac{1-\rho_0}{\rho_0}2^n B(x+1, n-x+1)\right]^{-1}. 138 | \end{equation*} 139 | \begin{center} 140 | \includegraphics[scale=0.45]{figures/posterior_prob_half.pdf} 141 | \end{center} 142 | \end{frame} 143 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 144 | \begin{frame}{Testing with improper priors} 145 | \begin{idea}[Bayesian hypothesis testing with improper priors] 146 | No. Just... No. 147 | \end{idea} 148 | See~\cite{Degroot1973} for the many reasons why this is just a bad idea. 149 | If you insist, please see Section 5.2.5 in \cite{Robert2007} and references therein. 150 | \end{frame} 151 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 152 | \begin{frame}{An interesting little paradox} 153 | \begin{idea}[The Jeffreys-Lindley paradox] 154 | Consider $x \sim \operatorname{Normal}(\theta, \sigma^2)$ with $\sigma^2$ known and suppose we are interested in testing $H_0: \theta = \theta_0$ against $H_1: \theta \neq \theta_0$. 155 | We can summarise the data using the sample mean $\bar{x}$ and then compute $t_n = \sqrt{n}(\bar{x}-\theta_0)/\sigma$. 156 | Employing a conjugate prior $\theta \sim \operatorname{Normal}(\mu_0, \sigma^2)$, 157 | the Bayes factor is 158 | \begin{equation*} 159 | B_{01}(\boldsymbol{x}) = \sqrt{1 + n}\exp\left(-\frac{nt_n^2}{2(1+n)}\right), 160 | \end{equation*} 161 | which goes to infinity with $n$, while the p-value: 162 | \begin{equation*} 163 | p(t_n) = 1-2\Phi(|t_n|), 164 | \end{equation*} 165 | is constant in $n$. 166 | In practice this means that, for instance $t_n = 1.96$ and $n = 16, 818$, we have 95\% frequentist confidence that $\theta \neq \theta_0$ whilst \textbf{at the same time} having 95\% belief that $\theta = \theta_0$. 167 | \end{idea} 168 | \end{frame} 169 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 170 | \begin{frame}{Another look at principled Bayesian testing} 171 | Before we were doing 172 | \begin{equation*} 173 | L_3(\theta, \varphi) = |\varphi - \mathbb{I}_{\boldsymbol{\Theta}_0}(\theta)|. 174 | \end{equation*} 175 | But considering a strictly convex loss such as the quadratic loss 176 | \begin{equation*} 177 | L_4(\theta, \varphi) = \left(\varphi - \mathbb{I}_{\boldsymbol{\Theta}_0}(\theta)\right)^2, 178 | \end{equation*} 179 | leads to better (more adaptable) estimators in general. 180 | For instance, the Bayes estimator under $L_4$ is 181 | \begin{equation*} 182 | \varphi_\pi(x) = \pr(\theta \in \boldsymbol{\Theta}_0 \mid x). 183 | \end{equation*} 184 | \end{frame} 185 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 186 | \begin{frame}{Credibility regions} 187 | After all of this work, we are finally ready to define credibility regions, the main object in Bayesian interval estimation. 188 | \begin{defn}[Credibility region] 189 | For a prior $\pi$, a set $C_x$ is called an $\alpha$-credible set if 190 | \begin{equation*} 191 | \pr(\theta \in C_x \mid x) \geq 1-\alpha. 
192 | \end{equation*} 193 | We call $C_x$ a highest posterior density (HPD) $\alpha$-credible region if 194 | \begin{equation*} 195 | \left\{\theta : p(\theta \mid x) > k_\alpha \right\} \subset C_x \subset \left\{\theta : p(\theta \mid x) \geq k_\alpha \right\}, 196 | \end{equation*} 197 | subject to the restriction that 198 | \begin{equation*} 199 | \pr(\theta \in C_x^\alpha) \geq 1-\alpha. 200 | \end{equation*} 201 | \end{defn} 202 | \end{frame} 203 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 204 | \begin{frame}{A couple remarks} 205 | Credibility regions have a few desirable properties that make them quite attractive as ``interval'' estimates. 206 | \begin{remark}[No randomisation] 207 | One nice feature of credibility regions for discrete distributions is that, contrary to the frequentist approach, no randomisation is needed to attain a certain level $\alpha$. 208 | \end{remark} 209 | Also, 210 | \begin{remark}[Improper priors and credibility regions] 211 | In principle, the use of improper priors poses no problem for the derivation of credibility regions. 212 | \end{remark} 213 | \end{frame} 214 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 215 | \begin{frame}{Credibility regions: Example I} 216 | Sometimes we will be able to provide Bayesian justifcation for frequentist confidence regions/intervals. 217 | \begin{example}[Credibility intervals for the variance in the Normal] 218 | \label{ex:cred_var_normal_Jeffreys} 219 | Consider $\boldsymbol{x} = \{ x_1, \ldots, x_n \}$, $x_i \sim \operatorname{Normal}(\theta, \sigma^2)$, with both parameters unknown. 220 | Consider 221 | $$ \pi(\theta, \sigma^2) \propto \frac{1}{\sigma^2}. $$ 222 | Make $s^2 = \sum_{i=1}^n (x-\bar{x})^2$. 223 | It can be shown that $p(\sigma^2 \mid s^2) \equiv \operatorname{Gamma}(\sigma^2; (n-1)/2, s^2/2)$. 224 | In particular, this implies 225 | \begin{equation*} 226 | \frac{s^2}{\sigma^2} \mid \bar{x} \sim \operatorname{Chi-square}(n-1), 227 | \end{equation*} 228 | which the attentive student will notice leads to the same solution as the classical confidence approach. 229 | \end{example} 230 | \end{frame} 231 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 232 | \begin{frame}{Credibility regions: Example II} 233 | \begin{example}[HPD for the normal mean] 234 | \label{ex:cred_mean_normal_Jeffreys} 235 | Consider again the setting of example~\ref{ex:cred_var_normal_Jeffreys}. 236 | Define $\bar{s}^2 = s^2/(n-1)$ and take $t = F_{\text{Student}}^{-1}(\alpha; n-1)$. 237 | The classical ``T'' interval, 238 | \begin{equation*} 239 | C_t(\bar{x}, \bar{s}^2) = \left(\bar{x} - t\sqrt{\frac{\bar{s}^2}{n}}, \bar{x} + t\sqrt{\frac{\bar{s}^2}{n}}\right), 240 | \end{equation*} 241 | is a HPD region under the Jeffreys's prior. 242 | Again, we can show that 243 | \begin{equation*} 244 | \sqrt{n}\frac{\theta -\bar{x}}{\sqrt{\bar{s}^2}} \mid \bar{x}, \sqrt{\bar{s}^2} \sim \operatorname{Student-t}(n-1). 245 | \end{equation*} 246 | \end{example} 247 | 248 | \end{frame} 249 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 250 | \begin{frame}{A little decision theory can't hurt... Or can it?} 251 | Consider the loss 252 | \begin{equation*} 253 | L_1(C, \theta) = \operatorname{vol}(C) + (1-\mathbb{I}_{C}(\theta))a, 254 | \end{equation*} 255 | which leads to the risk 256 | \begin{equation*} 257 | R(C_x, \theta) = E[\operatorname{vol}(C_x)] + \pr(\theta \notin C_x). 
258 | \end{equation*} 259 | Under this loss, the interval in Example~\ref{ex:cred_mean_normal_Jeffreys} is dominated by 260 | \begin{equation*} 261 | C_t^\prime(\bar{x}, \bar{s}^2) = \begin{cases} 262 | C_t(\bar{x}, \bar{s}^2), \sqrt{\bar{s}^2} < \sqrt{n}c/(2t),\\ 263 | \{\bar{x}\}, \: \text{otherwise}, 264 | \end{cases} 265 | \end{equation*} 266 | which is a bit weird -- why? 267 | 268 | Now, consider what happens under a \textit{rational loss} 269 | \begin{equation*} 270 | L_k(C, \theta) = \frac{\operatorname{vol}(C)}{\operatorname{vol}(C) + k} + (1-\mathbb{I}_{C}(\theta)), k >0. 271 | \end{equation*} 272 | \end{frame} 273 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 274 | \begin{frame}{HPD (or HDI in one dimension)} 275 | \begin{center} 276 | \includegraphics[scale=0.5]{figures/HDI.pdf} 277 | \end{center} 278 | \end{frame} 279 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 280 | \begin{frame}{Recommended reading} 281 | \begin{itemize} 282 | \item[\faBook] \cite{Robert2007}, Ch. 5. 283 | % \item 284 | \item[\faForward] Next lecture: \cite{Robert2007} Ch. 7. 285 | \end{itemize} 286 | \end{frame} 287 | -------------------------------------------------------------------------------- /slides/lecture_7.tex: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | \section*{Bayesian model choice} 3 | \begin{frame}{Bayesian model selection: testing all over again} 4 | Model choice (or selection) is a \textbf{major} topic within any school of inference: it is how scientists make decisions about competing theories/hypotheses in light of data. 5 | One can associate a set of models $\boldsymbol{\mathcal{M}} = \{ \mathcal{M}_1, \ldots \mathcal{M}_n \}$ with a set of indices $I$ such that $\mu \in I$ we want to estimate the posterior distribution of the indicator function $\mathbb{I}_{\boldsymbol{\Theta}_\mu}(\theta)$. 6 | 7 | Recall that estimating indicator functions over $\boldsymbol{\Theta}$ was the fundamental mechanic of Bayesian testing. 8 | In the setting of Bayesian model selection (BMS), we have something of the form 9 | \begin{equation*} 10 | \mathcal{M}_i : x \sim f_i(x \mid \theta_i), \theta_i \in \boldsymbol{\Theta}_i, i \in I. 11 | \end{equation*} 12 | \end{frame} 13 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 14 | \begin{frame}{M-completeness} 15 | A key step in model selection is to identify in which regime the analyst finds themselves in. 16 | \begin{defn}[M-open, M-closed, M-complete] 17 | \label{def:m-open} 18 | Model selection can be categorised in three settings: 19 | \begin{itemize} 20 | \item \textbf{M-closed}: a situation where the true data-generating model is one of $\mathcal{M}_i \in \boldsymbol{\mathcal{M}}$, even though it is most often unknown to the analyst; 21 | \item \textbf{M-complete}: a situation where the true model exists and is out of the model set $\boldsymbol{\mathcal{M}}$. 22 | We nevertheless want to select one of the models in the set due to computational or mathematical tractability reasons. 23 | \item \textbf{M-open}: a situation in which we know the true data-generating model is not in $\boldsymbol{\mathcal{M}}$ and we have no idea what it looks like. 24 | \end{itemize} 25 | \end{defn} 26 | See~\cite{Bernardo2000} and \cite{Yao2018}. 27 | \end{frame} 28 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 29 | \begin{frame}{BMS: example I} 30 | Suppose one has $x \in \mathbb{N}\cup \{0\}$, which measures, say, the number of eggs Balerion The Black Dread has laid in five consecutive breeding seasons. 
31 | One can conjure up 32 | \begin{equation*} 33 | \mathcal{M}_1 : x \sim \operatorname{Poisson}(\lambda), \lambda > 0, 34 | \end{equation*} 35 | or, if feeling fancy, 36 | \begin{equation*} 37 | \mathcal{M}_2 : x \sim \operatorname{Negative-binomial}(\lambda, \phi), \lambda, \phi > 0. 38 | \end{equation*} 39 | Notice that, under $\mathcal{M}_2$, $E[X] = \lambda$ and $\vr(X) = \lambda ( 1 + \lambda/\phi)$. 40 | What happens as $\phi \to \infty$? 41 | \end{frame} 42 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 43 | \begin{frame}{BMS: example II} 44 | Take the famous Galaxy data set: 45 | \begin{center} 46 | \includegraphics[scale=0.3]{figures/galaxies.pdf} 47 | \end{center} 48 | A now classical model is a Gaussian mixture: 49 | \begin{equation*} 50 | \mathcal{M}_i : v_j \sim \sum_{l=1}^i p_{il} \cdot \operatorname{Normal}(v_j; \mu_{li},\sigma^2_{li}). 51 | \end{equation*} 52 | \end{frame} 53 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 54 | \begin{frame}{BMS: example III} 55 | Consider the data: 56 | \begin{center} 57 | \includegraphics[scale=0.3]{figures/oranges.pdf} 58 | \end{center} 59 | Amongst the models we can consider, 60 | \begin{align*} 61 | \mathcal{M}_1 :& y_{it} \sim \operatorname{Normal}(\beta_{10} + b_{1i}, \sigma_1^2), \\ 62 | \mathcal{M}_2 :& y_{it} \sim \operatorname{Normal}(\beta_{20} + \beta_{21}T_t + b_{2i}, \sigma_2^2) , \\ 63 | \mathcal{M}_3 :& y_{it} \sim \operatorname{Normal}\left(\frac{\beta_{30}}{1 + \beta_{31}\exp\left(\beta_{32} T_t\right)}, \sigma_3^2\right), \\ 64 | \mathcal{M}_4 :& y_{it} \sim \operatorname{Normal}\left(\frac{\beta_{40} + b_{4i}}{1 + \beta_{41}\exp\left(\beta_{42} T_t\right)}, \sigma_4^2\right). 65 | \end{align*} 66 | \end{frame} 67 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 68 | \begin{frame}{Step 0: priors} 69 | First, let us look at a convenient representation of model space: 70 | \begin{equation*} 71 | \boldsymbol{\Theta} = \bigcup_{i \in I} \{i\} \times \boldsymbol{\Theta}_i. 72 | \end{equation*} 73 | Now, to each $\mathcal{M}_i$, we associate a prior $\pi_i(\theta_i)$ on each subspace and, by Bayes' theorem we get 74 | \begin{align*} 75 | \pr(\mathcal{M}_i \mid x) & = \pr( \mu = i \mid x), \\ 76 | &= \frac{w_i \int_{\boldsymbol{\Theta}_i} f_i(x\mid t_i)\pi_i(t_i)\,dt_i}{\sum_{j} w_j \int_{\boldsymbol{\Theta}_j} f_j(x\mid t_j)\pi_j(t_j)\,dt_j }, 77 | \end{align*} 78 | where the $w_i$ are the \textbf{prior probabilities} for each model. 79 | \end{frame} 80 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 81 | % \begin{frame}{An intuitive predictive} 82 | % A nice consequence of the formulation we just saw is that the predictive distribution looks quite intuitive: 83 | % \begin{align} 84 | % \nonumber 85 | % p(\tilde{x} \mid \boldsymbol{x}) &= \sum_{j} w_j \frac{1}{m_j(\boldsymbol{x})}\int_{\boldsymbol{\Theta}_j} f_j(\tilde{x} \mid t_j) f_j(\boldsymbol{x}\mid t_j)\pi_j(t_j)\,dt_j,\\ 86 | % \label{eq:predictive_1} 87 | % &= \sum_{j} \pr(\mathcal{M}_j \mid \boldsymbol{x}) p_j(\tilde{x} \mid \boldsymbol{x}). 88 | % \end{align} 89 | % \end{frame} 90 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 91 | \begin{frame}{Hello, my old friend} 92 | Here, Bayes factors also play a central role: 93 | \begin{align*} 94 | \operatorname{BF}_{12} &= \frac{\pr(\mathcal{M}_1 \mid x)}{\pr(\mathcal{M}_2 \mid x)}\bigg/\frac{\pr(\mathcal{M}_1)}{\pr(\mathcal{M}_2)},\\ 95 | &= \frac{w_1^\prime \cdot w_2}{w_2^\prime \cdot w_1}, 96 | \end{align*} 97 | with $w_i^\prime := \pr(\mathcal{M}_1 \mid x)$. 
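To tie this back to example I (Poisson vs. negative binomial), here is a rough R sketch that approximates each marginal likelihood by naive Monte Carlo averaging of the likelihood over prior draws, and converts the results into posterior model probabilities and a Bayes factor. The data, the priors and the Monte Carlo size are all assumptions made for illustration, and this naive estimator can be very noisy for larger data sets.

## Posterior model probabilities via (naive) Monte Carlo marginal likelihoods.
## Illustrative only: data, priors and simulation sizes are arbitrary choices.
set.seed(666)
x <- c(0, 2, 1, 5, 3)                     # hypothetical "egg counts" in five seasons

M <- 1e5
## M1: Poisson(lambda), lambda ~ Gamma(2, 1)  [assumed prior]
lambda1 <- rgamma(M, shape = 2, rate = 1)
lik1 <- sapply(lambda1, function(l) prod(dpois(x, l)))
m1 <- mean(lik1)                          # MC estimate of m_1(x)

## M2: Negative binomial(mu = lambda, size = phi),
##     lambda ~ Gamma(2, 1), phi ~ Gamma(2, 0.1)  [assumed priors]
lambda2 <- rgamma(M, shape = 2, rate = 1)
phi2 <- rgamma(M, shape = 2, rate = 0.1)
lik2 <- mapply(function(l, p) prod(dnbinom(x, mu = l, size = p)), lambda2, phi2)
m2 <- mean(lik2)                          # MC estimate of m_2(x)

w <- c(0.5, 0.5)                          # prior model probabilities
post_w <- w * c(m1, m2) / sum(w * c(m1, m2))
BF_12 <- m1 / m2

post_w
BF_12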
98 | \end{frame} 99 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 100 | \begin{frame}{Model averaging} 101 | What if we simply \textbf{refuse} to select one model? 102 | We can write 103 | \begin{align} 104 | \nonumber 105 | p(\tilde{x} \mid \boldsymbol{x}) &= \int_{\boldsymbol{\Theta}} f(\tilde{x} \mid t) f(\boldsymbol{x}\mid t)\pi(t)\,dt,\\ 106 | \nonumber 107 | &= \sum_{j} \int_{\boldsymbol{\Theta}_j} f_j(\tilde{x} \mid t_j) g(j, t_j \mid \boldsymbol{x})\,dt_j,\\ 108 | \nonumber 109 | &= \sum_j p (\mathcal{M}_j \mid \boldsymbol{x}) \int_{\boldsymbol{\Theta}_j} f_j(\tilde{x} \mid t_j) p(t_j \mid \boldsymbol{x})\,dt_j,\\ 110 | \label{eq:predictive_2} 111 | &= \sum_j w_j^\prime \int_{\boldsymbol{\Theta}_j} f_j(\tilde{x} \mid t_j) p(t_j \mid \boldsymbol{x})\,dt_j. 112 | \end{align} 113 | % which is another version of the expression in (\ref{eq:predictive_1}). 114 | \end{frame} 115 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 116 | \begin{frame}{Model checking} 117 | Modern Bayesian inference not only allows for, but actively encourages model interrogation and checking. 118 | \begin{itemize} 119 | \item The central idea of \textbf{Leave-one-out cross-validation (LOO)} is to estimate the \textit{expected log pointwise predictive density for a new dataset}, elpd: 120 | \begin{equation*} 121 | \operatorname{elpd} = \sum_{i=1}^n \int m(\tilde{x}_i)\log p(\tilde{x}_i \mid \boldsymbol{x})\,d\tilde{x}_i. 122 | \end{equation*} 123 | See \cite{Vehtari2017}. 124 | \item With \textbf{Posterior predictive checks (PPCs)} we wish to compare functions of the observed data, $f(\boldsymbol{x})$ with functions of the predictive distribution, $f(\boldsymbol{\tilde{x}})$. 125 | \begin{center} 126 | \includegraphics[scale=0.5]{figures/PPC.jpg} 127 | \end{center} 128 | See \cite{Berkhof2000} and~\cite{Gabry2019}. 129 | \end{itemize} 130 | \end{frame} 131 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 132 | \begin{frame}{Recommended reading} 133 | \begin{itemize} 134 | \item[\faBook] \cite{Robert2007}, Ch. 7. 135 | % \item 136 | \item[\faForward] Next lecture: \cite{Schervish1995} Ch. 7.4. 137 | \end{itemize} 138 | \end{frame} 139 | -------------------------------------------------------------------------------- /slides/lecture_8.tex: -------------------------------------------------------------------------------- 1 | \section*{Bayesian asymptotics} 2 | \begin{frame}{Asymptotics} 3 | A major part of a statistical approach is understanding what happens in the limit of many many observations. 4 | Consider the joint conditional density of the data, $f_n(\boldsymbol{x} \mid \theta)$ and a prior $\pi(\theta)$. 5 | What happens to $p_n(\theta \mid \boldsymbol{x}) = f_n(\boldsymbol{x} \mid \theta)\pi(\theta)/m_n(\boldsymbol{x})$ as $n \to \infty$ ? 6 | \begin{idea}[Asymptotics is about understanding] 7 | Infinity is a big ``number''. 8 | Considering what happens as $n \to \infty$ is less a statement about a real world situation than about the structure and regularity of a model. 9 | Doing asymptotics is about understanding what makes a model tick rather than getting useful results for a regime seldom achieved in practice. 10 | \end{idea} 11 | Another important aspect to consider is the \textbf{rate} at which things converge asymptotically. 12 | Studying rates provides complementary information about the structure of the model and gives hints as to the accuracy of asymptotic approximations. 
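As a toy illustration of the consistency and rate ideas above -- a sketch with an arbitrary true value and a conjugate Beta(1, 1) prior, both assumed purely for illustration -- one can watch the Beta-Binomial posterior concentrate around $\theta_0$ as $n$ grows, with the posterior standard deviation shrinking at the familiar $1/\sqrt{n}$ rate.

## Posterior concentration in the Beta-Binomial model (illustrative sketch).
## theta0 and the Beta(1, 1) prior are arbitrary choices.
set.seed(1)
theta0 <- 0.3
ns <- c(10, 100, 1000, 10000)

res <- t(sapply(ns, function(n) {
  x <- rbinom(1, size = n, prob = theta0)   # one data set of size n
  a <- 1 + x                                # Beta(1, 1) prior -> Beta(a, b) posterior
  b <- 1 + n - x
  post_sd <- sqrt(a * b / ((a + b)^2 * (a + b + 1)))
  c(n = n,
    post_mean = a / (a + b),
    post_sd = post_sd,
    sqrt_n_times_sd = sqrt(n) * post_sd)
}))

res  # post_mean -> theta0, post_sd -> 0, sqrt(n) * post_sd roughly constant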
13 | \end{frame} 14 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 15 | \begin{frame}{Bayesian asymptotics I: consistency} 16 | \begin{theo}[The posterior concentrates around the ``true'' value] 17 | Let $(S, \mathcal{A}, \mu)$ be a probability space and let $(\Omega, \tau)$ be a finite-dimensional parameter space equipped with a Borel $\sigma$-field. 18 | Suppose there exist measurable $h_n: \mathcal{X}^n \to \Omega$ such that $h_n(\boldsymbol{X}_{n})$ converges in probability to $\Theta$. 19 | Writing $\mu_{\boldsymbol{\Theta} \mid \boldsymbol{X}_{n}}(\cdot \mid \boldsymbol{x}_{n})$ for the posterior measure, we have 20 | \begin{equation*} 21 | \lim_{n \to \infty} \mu_{\boldsymbol{\Theta} \mid \boldsymbol{X}_{n}}(A \mid \boldsymbol{X}_{n}) = I_A(\Theta), \: \mu-\textrm{a.s.} 22 | \end{equation*} 23 | \end{theo} 24 | \textbf{Please} see Theorem 7.78 in \cite{Schervish1995} (pg 429) for all of the \textit{many} details. 25 | 26 | \textbf{Discussion:} what we are essentially saying here is that if there exists a consistent (sequence of) estimator(s) for $\theta$ -- understood here as a random variable --, then the posterior will concentrate around the true generating distribution of the parameter asymptotically. 27 | \end{frame} 28 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 29 | \begin{frame}{Remember Cromwell's law?} 30 | Here is another neat little theorem with a cumbersome proof. 31 | \begin{theo}[A ``nice'' prior ensures posterior consistency] 32 | Define $\operatorname{KL}(\theta, \theta^\prime)$ as the Kullback-Leibler divergence between $P_{\theta}$ and $P_{\theta^\prime}$. 33 | Let $\theta_0$ be the true data-generating parameter and define $C_\epsilon = \{\theta : \operatorname{KL}(\theta_0, \theta) < \epsilon\}$, $\epsilon > 0$. 34 | Let $\Pi$ be a prior measure such that $\Pi(C_\epsilon) > 0$ for every $\epsilon > 0$. 35 | Take $N_0$ open such that $C_\epsilon \subset N_0$. 36 | Then 37 | \begin{equation*} 38 | \lim_{n \to \infty} \mu_{\boldsymbol{\Theta} \mid \boldsymbol{X}_{n}}(N_0 \mid \boldsymbol{X}_{n}) = 1, \: P_{\theta_0}-\textrm{a.s.} 39 | \end{equation*} 40 | \end{theo} 41 | Again, \textbf{please} see Theorem 7.80 in \cite{Schervish1995} (pg 430) for the details. 42 | \end{frame} 43 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 44 | \begin{frame}[allowframebreaks]{Interlude: regularity conditions} 45 | Before we proceed, we will need to make things nice. 46 | Consider the following regularity conditions 47 | \begin{itemize} 48 | \item[1] The parameter space is $\boldsymbol{\Theta} \subset \mathbb{R}^d$ for some finite $d$; 49 | \item[2] We have $\theta_0$ an an interior point of $\boldsymbol{\Theta}$; 50 | \item[3] The prior distribution has a density w.r.t. Lebesgue which is positive and continuous at $\theta_0$; 51 | \item[4] There exists $N_0 \subseteq \boldsymbol{\Theta}$ with $\theta_0 \in N_0$ such that the log-likelihood, $l_n(\theta)$, is twice-differentiable with respect to all coordinates of $\theta$, $P_{\theta}$-a.s. 52 | \item[5] The largest eigenvalue of the inverse observed Fisher information, $\Sigma_n$, vanishes in probability. 53 | \item[6] The MLE is consistent; 54 | \item[7] The Fisher information is a smooth function of $\theta$. 55 | \end{itemize} 56 | 57 | \end{frame} 58 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 59 | \begin{frame}{Bayesian asymptotics II: asymptotic normality} 60 | We can now state a nice result which characterises the asymptotic form of the posterior. 
61 | \begin{theo}[Bernstein von-Mises\footnote{Named after Austrian mathematician Richard Edler von Mises (1883--1953) and Russian mathematician Sergei Natanovich Bernstein (1880--1968).}] 62 | Under the regularity conditions we have discussed, take $\hat{\theta}$ to be the MLE. 63 | Put $\boldsymbol{\Psi}_n = \left(\Sigma_n\right)^{-1/2}(\theta- \hat{\theta})$. 64 | Then the posterior distribution of $\boldsymbol{\Psi}_n$ conditional on $\boldsymbol{X}$ converges in probability \textbf{uniformly} on compact sets to the multivariate normal distribution $\operatorname{Normal}_d\left(\boldsymbol{0}, \boldsymbol{I}_d\right)$ with density $\phi_d$. 65 | More precisely, 66 | \begin{equation*} 67 | \lim_{n \to \infty} P_{\theta_0}\left(\sup_{\psi \in B} \bigg\rvert f_{\boldsymbol{\Psi}_n \mid \boldsymbol{X}}(\psi) - \phi_d(\psi) \bigg\lvert > \epsilon \right) = 0, 68 | \end{equation*} 69 | for all $B \subset \mathbb{R}^d$ compact and $\epsilon > 0$. 70 | \end{theo} 71 | See Theorem 7.89 in \cite{Schervish1995} (page 437). 72 | \end{frame} 73 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 74 | \begin{frame}{Dabbling with normal approximations} 75 | \begin{exercise}[Cauchy location posterior] 76 | Take $X_i \sim \operatorname{Cauchy}(\theta, 1)$, $i = 1, 2,\ldots, 10$. 77 | In particular, suppose $\boldsymbol{x} = \{-5, -3, 0, 2, 4, 5, 7, 9, 11, 14\}$. 78 | \begin{itemize} 79 | \item[i)] Compute the MLE and $l^{\prime\prime}$; 80 | \item[ii)] Deduce the parameters of the normal approximation to $p(\theta \mid \boldsymbol{x})$; 81 | \item[iii)] Use an MCMC\footnote{The instructor can assist with this step.} routine to sample from $p(\theta \mid \boldsymbol{x})$, obtain a posterior approximation to its density and compare it to the normal approximation; 82 | \item[iv)] Simulate data sets of sizes $n=20, 50, 100, 500, 1000$ and $10, 000$ and repeat iii. 83 | \item[v)] See if you can reduce/increase the discrepancy between the posterior and its approximation by fiddling with the prior (without breaking the regularity assumptions!). 84 | \end{itemize} 85 | \end{exercise} 86 | See example 7.104 in \cite{Schervish1995} (page 444). 87 | \end{frame} 88 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 89 | \begin{frame}{Recommended reading} 90 | \begin{itemize} 91 | \item[\faBook] \cite{Schervish1995} Ch. 7.4. 92 | % \item 93 | \item[\faForward] Next lecture: \cite{Raftery1988} and~\cite{Gelman2002}. 94 | \end{itemize} 95 | \end{frame} 96 | -------------------------------------------------------------------------------- /slides/lecture_extra.tex: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | \section*{Markov chain Monte Carlo} 3 | \begin{frame}{MCMC: The best bad method you have ever seen} 4 | Markov chain Monte Carlo (MCMC) methods are a broad class of stochastic algorithms to compute integrals. 5 | 6 | Suppose you are confronted with the following question: what is the ratio between the circumference of inscribed circle and its diameter? 7 | You are \textbf{not} allowed to use any Geometry. 8 | \begin{figure} 9 | \includegraphics[scale=0.85]{figures/pi_MC.png} 10 | \end{figure} 11 | \end{frame} 12 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 13 | \begin{frame}{First, a warning} 14 | \begin{quote} 15 | ``Monte Carlo is an extremely bad method; it should be used only when all alternative methods are worse.'' 16 | \end{quote} 17 | Alan Sokal (1955-) in \textit{Monte Carlo Methods in Statistical Mechanics: Foundations and New Algorithms} (1996, pg. 1). 
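As a concrete version of the inscribed-circle question above (cf. figures/pi_MC.png), here is a minimal Monte Carlo sketch; the sample size and seed are arbitrary choices. It estimates $\pi$ by throwing uniform points at the unit square and reports the Monte Carlo standard error alongside the estimate.

## Monte Carlo estimate of pi with its standard error (illustrative sketch).
set.seed(2718)
N <- 1e6
u <- runif(N)
v <- runif(N)
inside <- as.numeric(u^2 + v^2 <= 1)  # does the point fall inside the quarter circle?

pi_hat <- 4 * mean(inside)
mc_se <- 4 * sd(inside) / sqrt(N)     # MC-SE of the estimator

c(estimate = pi_hat, mc_se = mc_se, error = abs(pi_hat - pi))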
18 | \begin{figure} 19 | \includegraphics[scale=0.25]{figures/tiger.jpg} 20 | \caption{MCMC is, in a way, like a captive tiger...} 21 | \end{figure} 22 | \end{frame} 23 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 24 | \begin{frame}{Also...} 25 | Repeat after me: 26 | \begin{idea}[Bayesian MCMC is not a thing] 27 | \begin{center} 28 | {\Huge There is no such thing as ``Bayesian'' MCMC.} 29 | \end{center} 30 | 31 | MCMC is a numerical method for computing integrals. 32 | It does not care whether you are a Bayesian, frequentist, \textit{flamenguista} or \textit{corintiana}. 33 | \end{idea} 34 | \end{frame} 35 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 36 | \begin{frame}{Computing integrals} 37 | Technically, given a probability space $(X, \mathcal{F}, P)$ and a measurable function $f : X \to \mathbb{R}$, we want to compute 38 | $$ 39 | \mu_f = E_P[f] = \int_{X} f\,dP. 40 | $$ 41 | When $P$ is absolutely continuous with respect to the Lebesgue measure, we have 42 | $$ 43 | \mu_f = \int_{X} f(x)p(x)\,dx, 44 | $$ 45 | as is usually written in introductory textbooks. 46 | 47 | A ``natural'' approach to obtain an estimator of $\mu_f$ is 48 | $$ 49 | \hat{\mu}_{f, N}^{\text{MC}} = \frac{1}{N} \sum_{n = 1}^{N} f(x_{n}), 50 | $$ 51 | with $x_1, \ldots, x_N \sim P$. 52 | \end{frame} 53 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 54 | \begin{frame}{A central (limit) theorem} 55 | Define 56 | $$ 57 | \text{MC-SE}_{N}[f] 58 | = \sqrt{ \frac{ \text{Var}_{P}[f]}{N} }. 59 | $$ 60 | Then, as $N \rightarrow \infty$, 61 | $$ 62 | \frac{ \hat{\mu}_{f,N}^{\text{MC}} - \mathbb{E}_{P}[f] } 63 | { \text{MC-SE}_{N}[f] } 64 | \xrightarrow{d} 65 | \text{Normal}(0, 1). 66 | $$ 67 | \begin{idea}[MCMC-CLT needs to hold] 68 | A key insight is that MCMC is only trustworthy when a central limit theorem holds. 69 | In practice, this means $f$ needs to be $(2+\epsilon)$-integrable with respect to $P$ for some $\epsilon > 0$. 70 | Look out for $\text{MC-SE}$, too. 71 | It is important to quantify ``the probable error of the mean''\footnote{A ``pun'' with William Gosset's (1876--1937) paper: Student. (1908). The probable error of a mean. Biometrika, 1-25.}, as it were. 72 | \end{idea} 73 | \end{frame} 74 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 75 | \begin{frame}{Diagnostics} 76 | \begin{idea}[Diagnose your MCMC!] 77 | Perhaps as important as learning how to run an MCMC is to learn to \textbf{diagnose} it. 78 | This means detecting failure to converge to $P$ and/or poor statistical performance. 79 | \end{idea} 80 | When running $K$ chains of $N$ draws each, the between-chain variance can be written as 81 | \begin{equation*} 82 | \label{eq:Between} 83 | B = \frac{N}{K-1} \sum_{k = 1}^K \left(\bar{x}_k - \bar{\bar{x}}\right)^2, 84 | \end{equation*} 85 | where $\bar{x}_k = N^{-1}\sum_{n = 1}^N x_k^{(n)}$ and $\bar{\bar{x}} = K^{-1}\sum_{k=1}^K\bar{x}_k$. 86 | Now we can define the within-chain variance as 87 | \begin{equation*} 88 | W = K^{-1}\sum_{k = 1}^K s_k^2, \: \text{where} \: s_k^2 = (N-1)^{-1} \sum_{n = 1}^N \left(x_k^{(n)} - \bar{x}_k\right)^2. 89 | \end{equation*} 90 | Finally, we can define the~\textbf{potential scale reduction factor} (PSRF)~\citep{Gelman1992}: 91 | \begin{equation*} 92 | \label{eq:PRSF} 93 | \hat{R} = \sqrt{\frac{ (N-1)W + B }{NW}}. 94 | \end{equation*} 95 | At convergence, $\hat{R} \approx 1$; a common rule of thumb is to require $\hat{R} < 1.1$. This gives a univariate measure of convergence across chains (for a given parameter). 96 | \end{frame} 97 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 98 | \begin{frame}{More diagnostics} 99 | One of the things we are interested in is \textit{statistical} performance, i.e., how precise the estimator $\hat{\mu}_{f,N}^{\text{MC}}$ is.
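% A minimal R sketch of the PSRF formula on the previous slide (the classic, non-split version
% shown there; modern implementations, e.g. the posterior R package, use split chains and
% rank-normalisation). The helper name psrf() is illustrative, not from any package.
% psrf <- function(chains) {                  # chains: N x K matrix, one chain per column
%   N <- nrow(chains); K <- ncol(chains)
%   chain_means <- colMeans(chains)
%   B <- N / (K - 1) * sum((chain_means - mean(chain_means))^2)  # between-chain variance
%   W <- mean(apply(chains, 2, var))                             # within-chain variance
%   sqrt(((N - 1) * W + B) / (N * W))
% }
% ## toy check: two independent chains of iid Normal(0, 1) draws; psrf() should be close to 1
% psrf(matrix(rnorm(2000), ncol = 2))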
100 | To measure that, we can compute the \textbf{effective sample size}: 101 | \begin{equation*} 102 | \text{ESS} = \frac{N}{1 + 2\sum_{t=1}^\infty \rho_t}, 103 | \end{equation*} 104 | where $\rho_t$ is the \textbf{autocorrelation} at lag $t$, $t=1, 2, \ldots$. 105 | A good rule of thumb\footnote{Assuming approximate normality. Calculation stolen from~\url{https://www.biorxiv.org/content/10.1101/2021.05.04.442586v1.full.pdf}} is that if one wants a standard error which is 1\% of the width of the 95\% interval of the true distribution, one needs $\text{ESS} \geq 625$: 106 | \begin{align*} 107 | \frac{\sigma}{\sqrt{\text{ESS}}} &\leq 0.01 \times 4 \times \sigma,\\ 108 | \sqrt{\text{ESS}} &\geq \frac{1}{0.04} = 25,\\ 109 | &\implies\\ 110 | \text{ESS} &\geq 625, 111 | \end{align*} 112 | where $\sigma = \sqrt{\text{Var}_{P}[f]}$ and $4\sigma$ is (approximately) the width of the 95\% interval. 113 | \end{frame} 114 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 115 | \begin{frame}{Even more diagnostics} 116 | \begin{figure} 117 | \includegraphics[scale=0.25]{figures/traceplots.png} 118 | \end{figure} 119 | \begin{idea}[No one diagnostic is enough] 120 | Use multiple diagnostic metrics, always. 121 | Every MCMC diagnostic out there has blind spots; using multiple simultaneously increases the chances those blind spots are covered. 122 | \end{idea} 123 | \end{frame} 124 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 125 | \begin{frame}{Scaling with dimension} 126 | \begin{figure} 127 | \includegraphics[scale=0.35]{figures/concentration_measure_volume.pdf} 128 | \end{figure} 129 | Taken from~\url{https://mc-stan.org/users/documentation/case-studies/curse-dims.html}. 130 | \begin{idea}[The higher the dimension, the more structure you need] 131 | As dimension increases, things start to get pretty lonely pretty fast for a particle. 132 | The only way to counteract this ``thinning'' is to introduce more structure. 133 | This is the intuitive basis for the success of gradient-based methods such as MALA\footnote{Metropolis-adjusted Langevin algorithm.} and HMC\footnote{Hamiltonian (or Hybrid) Monte Carlo.}. 134 | \end{idea} 135 | \end{frame} 136 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 137 | \begin{frame}{Take home} 138 | \begin{itemize} 139 | \item MCMC allows us to make inferences about huge models in Science and Engineering; 140 | \item MCMC is a terrible method, which is nevertheless our best shot at computing high-dimensional integrals; 141 | \item One has to make sure a CLT holds; 142 | \item One has to verify diagnostics to ensure no convergence/performance problems are present; 143 | \item No one diagnostic is enough. 144 | \end{itemize} 145 | \end{frame} 146 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 147 | \begin{frame}{Recommended reading} 148 | \begin{itemize} 149 | \item[\faBook] \cite{Robert2007}, Ch. 6\footnote{The Bayesian Choice by Christian Robert (2007, 2nd edition).}. 150 | \item[\faBook] \url{https://betanalpha.github.io/assets/case_studies/markov_chain_monte_carlo.html} 151 | \end{itemize} 152 | \end{frame} 153 | -------------------------------------------------------------------------------- /slides/logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxbiostat/BayesianStatisticsCourse/ff673af7144838e7e29fa4aaf6adc628095911fb/slides/logo.jpg --------------------------------------------------------------------------------