├── images ├── seq.png ├── .DS_Store ├── Table.png ├── bimodal.png ├── McGrayne.png ├── R4DSfig1.png ├── RobertFig2.4.png ├── pointwiseELPD.png ├── RethinkingFig1.2.png ├── SobolIntegration.png ├── chp11_KL_heatmap.png ├── EssaysOnProbability.png ├── MacroscopicEntropy.png ├── PierreSimonLaPlace.png ├── StatisticalEntropy.png └── compareModelsbyLOO.png ├── LICENSE ├── meeting2b.Rmd ├── meeting1.Rmd ├── README.md ├── meeting3b.Rmd └── SectionD10.ipynb /images/seq.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooersLab/jointprob1D/main/images/seq.png -------------------------------------------------------------------------------- /images/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooersLab/jointprob1D/main/images/.DS_Store -------------------------------------------------------------------------------- /images/Table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooersLab/jointprob1D/main/images/Table.png -------------------------------------------------------------------------------- /images/bimodal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooersLab/jointprob1D/main/images/bimodal.png -------------------------------------------------------------------------------- /images/McGrayne.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooersLab/jointprob1D/main/images/McGrayne.png -------------------------------------------------------------------------------- /images/R4DSfig1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooersLab/jointprob1D/main/images/R4DSfig1.png 
-------------------------------------------------------------------------------- /images/RobertFig2.4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooersLab/jointprob1D/main/images/RobertFig2.4.png -------------------------------------------------------------------------------- /images/pointwiseELPD.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooersLab/jointprob1D/main/images/pointwiseELPD.png -------------------------------------------------------------------------------- /images/RethinkingFig1.2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooersLab/jointprob1D/main/images/RethinkingFig1.2.png -------------------------------------------------------------------------------- /images/SobolIntegration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooersLab/jointprob1D/main/images/SobolIntegration.png -------------------------------------------------------------------------------- /images/chp11_KL_heatmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooersLab/jointprob1D/main/images/chp11_KL_heatmap.png -------------------------------------------------------------------------------- /images/EssaysOnProbability.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooersLab/jointprob1D/main/images/EssaysOnProbability.png -------------------------------------------------------------------------------- /images/MacroscopicEntropy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooersLab/jointprob1D/main/images/MacroscopicEntropy.png 
-------------------------------------------------------------------------------- /images/PierreSimonLaPlace.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooersLab/jointprob1D/main/images/PierreSimonLaPlace.png -------------------------------------------------------------------------------- /images/StatisticalEntropy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooersLab/jointprob1D/main/images/StatisticalEntropy.png -------------------------------------------------------------------------------- /images/compareModelsbyLOO.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooersLab/jointprob1D/main/images/compareModelsbyLOO.png -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Blaine Mooers and the University of Oklahoma Health Board of Regents 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /meeting2b.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "3 September 2022 jointprob meeting" 3 | output: html_notebook 4 | editor_options: 5 | markdown: 6 | wrap: 72 7 | --- 8 | 9 | # Grid Approximations 10 | 11 | A. Five-step protocol in one dimension. 12 | 13 | 1. Define a grid 14 | 15 | ```{r} 16 | p_grid <-seq(from=0,to=1,length.out=100) 17 | ``` 18 | 19 | 2. Define the prior 20 | 21 | ```{r} 22 | prior <-rep(1,100)/sum(rep(1,100)) 23 | sum(prior) 24 | ``` 25 | 26 | 3. Compute the likelihood at each value in grid 27 | 28 | ```{r} 29 | likelihood <- dbinom(6,size=9,prob=p_grid) 30 | sum(likelihood) 31 | ``` 32 | 33 | 4. Compute product of likelihood and prior 34 | 35 | ```{r} 36 | unstd.posterior <-likelihood*prior 37 | ``` 38 | 39 | 5. Standardize the posterior, so it sums to 1. 
40 | 41 | ```{r} 42 | posterior <-unstd.posterior/sum(unstd.posterior) 43 | sum(posterior) # sanity check: the standardized posterior sums to 1 44 | ``` 45 | 46 | ```{r} 47 | plot( p_grid, posterior, 48 | type="b", 49 | xlab="Probability of water", 50 | ylab="Posterior probability") 51 | mtext( "100 points") 52 | ``` 53 | ```{r} 54 | p_grid <-seq(from=0,to=1,length.out=100) # the five steps above, gathered into one chunk 55 | prior <-rep(1,100)/sum(rep(1,100)) # flat prior, normalized over the grid 56 | sum(prior) # sanity check: the normalized prior sums to 1 57 | likelihood <-dbinom(6,size=9,prob=p_grid) # binomial likelihood of 6 successes in 9 trials at each grid value 58 | sum(likelihood) 59 | unstd.posterior <-likelihood*prior 60 | posterior <-unstd.posterior/sum(unstd.posterior) 61 | sum(posterior) # sanity check: sums to 1 62 | plot( p_grid, posterior, 63 | type="b", 64 | xlab="Probability of water", 65 | ylab="Posterior probability") 66 | mtext( "100 points") 67 | ``` 68 | 69 | ## Repeat with Step prior 70 | 71 | ```{r} 72 | sprior <- ifelse(p_grid < 0.5, 0, 1)/sum(ifelse(p_grid < 0.5, 0, 1)) # step prior: zero below 0.5, constant from 0.5 upward 73 | sum(sprior) 74 | slikelihood <-dbinom(6,size=9,prob=p_grid) 75 | unstd.sposterior <-slikelihood*sprior 76 | sposterior <-unstd.sposterior/sum(unstd.sposterior) 77 | sum(sposterior) 78 | 79 | plot( p_grid, sposterior, 80 | type="b", 81 | xlab="Probability of water", 82 | ylab="Posterior probability") 83 | points(p_grid, sprior, col='magenta') 84 | points(p_grid, posterior, col='cyan') # posterior from the flat prior, overlaid for comparison 85 | mtext( "Step prior, 100 grid points") 86 | 87 | legend(x = "topright", 88 | c("sposterior", "sprior", "posterior"), 89 | cex=.8, 90 | col=c("black","magenta","cyan"), 91 | lwd = 2, 92 | bty = "n" 93 | ) 94 | ``` 95 | 96 | ## Repeat with Peaked prior 97 | 98 | ```{r} 99 | pkprior <- exp( -5*abs(p_grid -0.5))/sum(exp( -5*abs(p_grid -0.5))) # double-exponential prior peaked at 0.5 100 | sum(pkprior) 101 | pklikelihood <-dbinom(6,size=9,prob=p_grid) 102 | unstd.pkposterior <-pklikelihood*pkprior 103 | pkposterior <-unstd.pkposterior/sum(unstd.pkposterior) 104 | sum(pkposterior) 105 | 106 | plot( p_grid, pkposterior, 107 | type="b", 108 | xlab="Probability of water", 109 | ylab="Posterior probability") 110 | points(p_grid, pkprior, col='magenta') 111 | points(p_grid, posterior, col='cyan') 112 | mtext(
"Peaked prior, 100 grid points") 113 | 114 | legend(x = "topright", 115 | c("pkposterior", "pkprior", "posterior"), 116 | cex=.8, 117 | col=c("black","magenta","cyan"), 118 | lwd = 2, 119 | bty = "n" 120 | ) 121 | ``` 122 | 123 | ## Triangular distriution 124 | 125 | ```{r} 126 | #install.packages(extraDistr) 127 | library(extraDistr) 128 | ??extraDistr 129 | x <- rtriang(1e5, 0, 1, 0.1) 130 | hist(x, 100, freq = FALSE) 131 | #curve(dtriang(x, 5, 7, 6), 3, 10, n = 500, col = "red", add = TRUE) 132 | #hist(ptriang(x, 5, 7, 6)) 133 | #plot(ecdf(x)) 134 | #curve(ptriang(x, 5, 7, 6), 3, 10, n = 500, col = "red", lwd = 2, add = TRUE) 135 | ``` 136 | 137 | ## Repeat with triangle prior 138 | 139 | ```{r} 140 | tprior <- dtriang(p_grid, 0, 1, 0.1)/sum(dtriang(p_grid, 0, 1, 0.1)) 141 | sum(tprior) 142 | 143 | plot( p_grid, tprior, 144 | type="b", 145 | xlab="Probability of water", 146 | ylab="Prior probability") 147 | ``` 148 | 149 | ```{r} 150 | tlikelihood <-dbinom(6,size=9,prob=p_grid) 151 | unstd.tposterior <-tlikelihood*tprior 152 | tposterior <-unstd.tposterior/sum(unstd.tposterior) 153 | tposterior 154 | plot( p_grid, tposterior, 155 | type="b", 156 | xlab="Probability of water", 157 | ylab="Prior and posterior probabilities") 158 | points(p_grid, tprior, col='magenta') 159 | points(p_grid, posterior, col='cyan') 160 | legend(x = "topright", 161 | c("tposterior", "tprior", "posterior"), 162 | cex=.8, 163 | col=c("black","magenta","cyan"), 164 | lwd = 2, 165 | bty = "n" 166 | ) 167 | ``` 168 | 169 | 170 | -------------------------------------------------------------------------------- /meeting1.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "jointprob Section D meeting 1" 3 | author: "Blaine Mooers" 4 | date: "`r Sys.Date()`" 5 | output: pdf_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ```{r functions, include=FALSE} 13 | # A function for 
captioning and referencing images 14 | fig <- local({ 15 | i <- 0 16 | ref <- list() 17 | list( 18 | cap=function(refName, text) { 19 | i <<- i + 1 20 | ref[[refName]] <<- i 21 | paste("Figure ", i, ": ", text, sep="") 22 | }, 23 | ref=function(refName) { 24 | ref[[refName]] 25 | }) 26 | }) 27 | ``` 28 | 29 | # Notes from Statistical Rethinking 30 | 31 | ## The golem of Prague 32 | 33 | ## 1.1 Statistical golems 34 | 35 | ### Statistics as a branch of engineering 36 | 37 | - A set of recipes (statistical tests) is handed out in service courses. 38 | - There are many causes for this situation. 39 | 40 | ## 1.2 Statistical Rethinking 41 | 42 | ### Rethinking statistical inference 43 | 44 | - Develop a set of strategies rather than a set of recipes 45 | 46 | ### Null Hypothesis Significance Testing 47 | 48 | - Falsifying the null hypothesis, not the hypothesis 49 | - In my opinion, an artifact of the frequentist approach 50 | - In my opinion, this does not reflect how science is really done. 51 | 52 | ### Uniqueness problem in mapping of Hypotheses to Statistical Models 53 | 54 | ![Source: (McElreath 2021) Fig. 1.2](./images/RethinkingFig1.2.png){width="80%"} 55 | 56 | ### Falsification is consensual 57 | 58 | ## 1.3 Tools for golem engineering 59 | 60 | 1. Bayesian statistical analysis 61 | - The frequentist approach defines probability in terms of the frequency of a very large number of imaginary events. 62 | - It is a major challenge not to equate probability with frequency in Bayesian Data Analysis 63 | 64 | 2. Model comparison and prediction 65 | 66 | 3.
Multilevel models 67 | 68 | - hierarchical, random effects, varying effects, or mixed effects models 69 | - The levels are layers of parameters 70 | - Helpful in dealing with overfitting 71 | - **Partial pooling** of data into sub-units to produce better estimates for all sub-units 72 | 73 | - Applications of **partial pooling**: 74 | 75 | - Adjust parameter estimates for repeated sampling of the same sample 76 | - Adjust parameter estimates for the imbalances in sampling 77 | - Model variation between groups of samples. 78 | - Preserve the uncertainty in pre-averaged values (premature averaging can degrade the downstream analyses) 79 | 80 | 4. Graphical Causal Models 81 | 82 | - The Identification Problem: the problem of identifying causes in cause and effect relationships 83 | - Need causal models in addition to statistical models to do causal inference. 84 | - DAG: Directed Acyclic Graph 85 | 86 | ## Book Outline 87 | 88 | 1. Chapters 2-3: Probability theory behind Bayesian inference. 89 | 2. Chapters 4-8: multiple linear regression as a Bayesian tool 90 | 3. Chapters 9-12: generalized linear models (MCMC, maximum entropy, GLMs) 91 | 4. Chapters 13-16: Multilevel models, including measurement error and covariation. 92 | 93 | 94 | 95 | 96 | # R for Data Science 97 | 98 | ## Introduction 99 | 100 | - [R for Data Science](https://r4ds.had.co.nz/introduction.html) could have been titled the *Introduction to the Tidyverse*. The tidyverse could be considered as a fork of R. It is a toolkit to ease doing data science. 101 | 102 | ![Fig. 1 from Wickham and Grolemund](./images/R4DSfig1.png){width="80%"} 103 | 104 | - **tidy** When data are tidy, there is one row per observation and one column per variable. 105 | - **transform** narrowing to observations of interest, calculating derived variables, and calculating summary statistics 106 | - **visualization and modeling** Two tools for generating new knowledge and understanding.
107 | - **communication** essential step 108 | - **programming** Provides the tools for doing all parts. 109 | 110 | The combination of data tidying and transforming is known as *data wrangling*. The author makes an analogy to the global definition of wrangling, which refers to politicians haggling or struggling over issues. I prefer the analogy to the North American definition of rounding up scattered horses because my data are often in a scattered and disorganized state initially. 111 | 112 | ### Plan of book 113 | 114 | 1. data visualization 115 | 2. data wrangling\ 116 | 3. programming 117 | 4. model 118 | 5. Communication (Rmarkdown) 119 | 120 | ### Prerequisites 121 | 122 | - R 123 | - Rstudio (or your favorite text editor with an R REPL). 124 | 125 | + In Emacs, you can use org documents with jupyter-R via org-babel and emacs-jupyter packages. 126 | + Alternatively, you can edit Rmarkdown directly in Emacs and evaluate the code blocks. See this [post](https://plantarum.ca/2021/10/03/emacs-tutorial-rmarkdown) for an introduction. 127 | + In Jupyter, you can install and use the R kernel (you need to install R first). Jupytext or pandoc can be used to convert the ipynb file to other formats. 128 | + [radian](https://github.com/randy3k/radian) The IPython-like console for R. 129 | + [quarto](https://quarto.org/) The org-babel for Rstudio. Write in markdown. Set up for R (via knitr), Julia (via IJulia i.e., Jupyter), and Python via Jupyter. For the latter, you specify the kernel, so in principle quarto for Python could handle every programming language for which there is a Jupyter kernel. This would solve the snippet problem in Jupyter. 130 | 131 | ### Installation of the tidyverse 132 | 133 | ```{r} 134 | install.packages("tidyverse") 135 | ``` 136 | 137 | ```{r} 138 | library("tidyverse") 139 | ``` 140 | 141 | ```{r} 142 | tidyverse_update() 143 | ``` 144 | 145 | ```{r} 146 | install.packages(c("nycflights13", "gapminder", "Lahman")) 147 | ``` 148 | 149 | ## Chapter 3.
Data Visualization with **ggplot2** 150 | 151 | ### mpg dataframe 152 | 153 | ```{r} 154 | mpg 155 | ``` 156 | 157 | To get documentation on mpg, enter: 158 | 159 | ```{r} 160 | ?mpg 161 | ``` 162 | 163 | ### ggplot() creates the coordinate system onto which the plot elements are added as layers. 164 | 165 | ```{r} 166 | ggplot(data=mpg) 167 | ``` 168 | 169 | ### geom_point() adds a layer of scatterplot points. 170 | 171 | You must specify the x and y variables. The use of color is an example of aesthetic mapping. 172 | 173 | ```{r, fig.cap="Scatterplot of mpg vs. engine size."} 174 | ggplot(data = mpg) + 175 | geom_point(mapping = aes(x = displ, y = hwy, color=class)) 176 | ``` 177 | 178 | ### geom\_ will trigger intellisense feature 179 | 180 | ```{r, eval=FALSE} 181 | geom_ 182 | ``` 183 | 184 | ### Histogram of vehicle class 185 | 186 | ```{r} 187 | ggplot(mpg, aes(class)) + 188 | geom_bar() # class is discrete, so count rows per class with geom_bar() (its default stat is "count") 189 | ``` 190 | 191 | 192 | 193 | 194 | ### Facets == subplots 195 | 196 | ```{r} 197 | ggplot(data = mpg) + 198 | geom_point(mapping = aes(x = displ, y = hwy)) + 199 | facet_wrap(~ class, nrow = 2) 200 | ``` 201 | 202 | ### Geometric Objects 203 | A plot uses a **geom** or geometrical object to represent data. 204 | 205 | Every geom function in ggplot2 takes a mapping argument. However, not every aesthetic works with every geom.
206 | 207 | ```{r} 208 | # left 209 | ggplot(data = mpg) + 210 | geom_point(mapping = aes(x = displ, y = hwy)) 211 | 212 | # right 213 | ggplot(data = mpg) + 214 | geom_smooth(mapping = aes(x = displ, y = hwy)) 215 | ``` 216 | Can display multiple geoms in one plot: 217 | 218 | ```{r} 219 | ggplot(data = mpg) + 220 | geom_point(mapping = aes(x = displ, y = hwy)) + 221 | geom_smooth(mapping = aes(x = displ, y = hwy)) 222 | ``` 223 | 224 | 225 | 226 | 227 | 228 | ### Statistical Transformations 229 | 230 | ```{r} 231 | ggplot(data = diamonds) + 232 | geom_bar(mapping = aes(x = cut)) 233 | ``` 234 | Every **geom** has a default **stat**; and every **stat** has a default **geom**. 235 | 236 | ```{r} 237 | ggplot(data = diamonds) + 238 | stat_count(mapping = aes(x = cut)) 239 | ``` 240 | 241 | 242 | ### Position Adjustments 243 | 244 | position = "dodge" places overlapping objects directly beside one another. 245 | 246 | ```{r} 247 | ggplot(data = diamonds) + 248 | geom_bar(mapping = aes(x = cut, fill = clarity), position = "dodge") 249 | ``` 250 | ### Overplotting addressed with jitter 251 | 252 | position = "jitter" adds a small amount of random noise to each point. 253 | 254 | 255 | 256 | ```{r} 257 | ggplot(data = mpg) + 258 | geom_point(mapping = aes(x = displ, y = hwy), position = "jitter") 259 | ``` 260 | 261 | ## Coordinate systems 262 | 263 | ### Flipping X and Y to a horizontal box plot 264 | 265 | ```{r} 266 | ggplot(data = mpg, mapping = aes(x = class, y = hwy)) + 267 | geom_boxplot() 268 | ggplot(data = mpg, mapping = aes(x = class, y = hwy)) + 269 | geom_boxplot() + 270 | coord_flip() 271 | ``` 272 | 273 | 274 | ## Chapter 4. Workflow 275 | 276 | ### Assignment operator 277 | 278 | ```{r, eval=FALSE} 279 | object_name <- value 280 | ``` 281 | 282 | 283 | ### Naming conventions 284 | 285 | ```{r, eval=FALSE} 286 | snakecase wickham_favors_snakecase 287 | pascalCase iFavorPascalCase 288 | ``` 289 | 290 | Just be consistent. Develop a style guide for your group.
291 | 292 | Case matters. 293 | 294 | ### Calling Functions 295 | 296 | ![](./images/seq.png){width="140%"} 297 | ```{r} 298 | A <- seq(1,12) 299 | A 300 | ``` 301 | 302 | 303 | ```{r} 304 | (seq(1,12)) 305 | ``` 306 | 307 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![version](https://img.shields.io/static/v1?label=jointProb1D&message=0.3&color=brightgreen) 2 | [![license: mit](https://img.shields.io/badge/license-mit-blue.svg)](https://opensource.org/licenses/mit) 3 | 4 | 5 | # jointprob 6 | 7 | The [jointprob community](https://scicloj.github.io/docs/community/groups/jointprob/) is a Bayesian data analysis (BDA) study group. 8 | We aimed to work through Richard McElreath's *Statistical Rethinking, 2nd Ed*. 9 | We met for two hours once every two weeks. 10 | We made it through chapter 7. 11 | 12 | There were initially about 30 people interested in jointprob last summer. 13 | Four parallel sections were held to accommodate different schedules and time zones. 14 | I participated in Section D, which met on Saturdays. 15 | Section D was the last section left standing as of December. 16 | We were down to 4-6 participants. 17 | It was time for a refresh. 18 | 19 | The consensus was to shift our focus to the book *Bayesian Modeling and Computation in Python (BMCP)* by Osvaldo Martin, Ravin Kumar, and Junpeng Lao. 20 | We had an organizing meeting on January 7th. 21 | 24 people attended. 22 | 23 | Ravin Kumar also attended and spoke for about an hour. 24 | He provided invaluable insights about how best to read his book. 25 | He stressed that his book emphasizes the practice side before the theory. 26 | He recommended McElreath's book for the theory. 27 | The book uses three probabilistic programming languages: PyMC, TensorFlow Probability, and NumPyro.
28 | 29 | Ravin is a mechanical engineer who quit SpaceX to spend time teaching himself BDA. 30 | He started contributing to the [PyMC3](https://www.pymc.io/welcome.html) project on GitHub. 31 | He then got hired at Google to apply BDA to various problems. 32 | His enthusiasm for BDA is contagious, as seen in the [video](https://www.youtube.com/watch?v=foSPfzYs4yY) that he made about Bayesian vs. Frequentist approaches. 33 | 34 | He and others are offering a paid [course](https://www.intuitivebayes.com/introductorycourse) for professionals. 35 | The introductory course is available and two more are in preparation. 36 | 37 | The [book](https://bayesiancomputationbook.com/welcome.html) is available online for free. 38 | The [code](https://github.com/BayesianModelingandComputationInPython/BookCode_Edition1) for the book is located on GitHub. 39 | The code in the book uses PyMC3, but the current version of PyMC is version 5. 40 | The code has not been translated to PyMC5. 41 | The PyMC community has a [discourse channel](https://discourse.pymc.io/) and an upcoming [webinar](https://pymcon.com/about) series. 42 | They had a similar webinar [series](https://www.youtube.com/watch?v=UznM_-_760Y&list=PLD1x-BW9UdeHN2vwR6kIApJATd2jZzeya&index=1) in 2020. 43 | 44 | The first four chapters are the heart of the book. 45 | They are all that you need to start practicing BDA. 46 | Chapters 5-8 are specialized topics that cover areas that most people will use. 47 | Chapters 9, 10, and 11 contain appendix material. 48 | 49 | - 1 & 2 Basics 50 | - 3 & 4 Linear models and their applications 51 | - 5 Splines 52 | - 6 Time series 53 | - 7 BART 54 | - 8 ABC 55 | - 9 Bayesian workflow 56 | - 10 PPLs 57 | - 11 Appendical Topics (many theory topics are nicely summarized here) 58 | 59 | Daniel Slutsky leads the meetings. 60 | Ryan Orsinger is the community organizer. 61 | The SciCloj community sponsors the jointprob events.
62 | This community is developing scientific computing tools in Clojure. 63 | 64 | 65 | These are the guiding principles for the group (Thanks to Ryan Orsinger!): 66 | 67 | - *No experts.* We do not assume that anybody is an expert in the field. We come to learn together with a student mindset. 68 | 69 | - *A clear path.* We will be very thoughtful about the agenda and where we wish to go. We will continually rethink and adapt our pathway going there. 70 | 71 | - *Confused together.* It is just fine to be confused. We will be there together and seek clarity together. 72 | 73 | - *Being active.* We encourage members to learn independently and take on projects. In a sense, the group's purpose is (also) to support those individual journeys. 74 | 75 | - *Mutual curiosity.* We make serious efforts to be inclusive of participants of various backgrounds. The different perspectives of our friends are part of what we wish to learn. 76 | 77 | You do not need to do the reading in advance, but you will get more out of the meetings if you do so. 78 | You will also get more out of the meetings by presenting a portion of the reading: the best way to learn is to try to teach the material. 79 | This takes preparatory time. 80 | I found that 6-8 hours were required to assemble a 30-40-minute talk. 81 | 82 | 83 | ## meeting1.Rmd 84 | 85 | This is the Rmarkdown file that I presented in the first meeting of section D on Saturday, August 20, 2022. 86 | I covered Chapter 1 of McElreath and Chapters 1-4 of Grolemund and Wickham [*R for Data Science*](https://bookdown.org/roy_schumacher/r4ds/). 87 | 88 | ## meeting2b.Rmd 89 | 90 | I edited this Rmarkdown file that I presented in the second meeting of section D on Saturday, September 3, 2022. 91 | Daniel Slutsky gave an excellent 80-minute presentation about computing the posterior distribution, which prepared me well to present how to use grid approximation to estimate the posterior distribution in R.
92 | 93 | I included the suggested exercises from Chapter 2 of McElreath's *Statistical Rethinking, 2nd Ed*. 94 | Next, I ventured off and tried a triangular distribution from the `extraDistr` package as a prior. 95 | 96 | I added some embellishments, such as normalizing the prior, summing the prior, and estimating the likelihood, as sanity checks. 97 | These embellishments were not required to compute the correct posterior, but they deepened the understanding of what was happening. 98 | 99 | ## meeting3b.Rmd 100 | 101 | I presented this Rmarkdown file in the third meeting of section D on Saturday, September 17, 2022. 102 | It recaps the grid approximation presentation and then covers the two other *motors* of the Bayesian data analysis engine: quadratic approximation and Markov Chain Monte Carlo. 103 | 104 | 105 | ## SectionD10.ipynb 106 | 107 | I presented this Jupyter notebook on information theory at the Christmas Eve meeting of jointprob. 108 | I included material from chapter 11 of BMCP. 109 | One code cell does not work. 110 | 111 | 112 | ## Appendix of Useful Links 113 | 114 | 115 | ### Programs 116 | 117 | #### Stan 118 | 119 | [Stan](https://mc-stan.org/) implements Hamiltonian Monte Carlo (HMC) with the No-U-Turn Sampler (NUTS), which explores parameter space much faster than random-walk MCMC samplers. 120 | HMC cannot handle models with discrete parameters. These parameters have to be marginalized out via algebra. See the [Stan User's Guide](https://mc-stan.org/docs/stan-users-guide/latent-discrete.html). 121 | 122 | ##### cmdstan 123 | 124 | [cmdstan](https://mc-stan.org/users/interfaces/cmdstan) is probably the best way to access the current version of Stan. 125 | PyStan and RStan lag behind by several versions. 126 | There are cmdstanpy and cmdstanR to interface with cmdstan from Python or R.
127 | 128 | ##### BridgeStan 129 | 130 | [BridgeStan](https://github.com/roualdes/bridgestan) is a new way to interact with Stan model objects from R, Python, Julia, Rust, or C. 131 | They talk to each other via their C interfaces. 132 | BridgeStan allows you to access the methods of Stan model objects from a language other than C++, which Stan is written in. 133 | You can also use [ArviZ](https://www.arviz.org/en/latest/) to make plots from the sampled posterior for a Stan object. 134 | 135 | ##### nutpie 136 | 137 | [nutpie](https://github.com/pymc-devs/nutpie) is a Rust-based interface to both Stan and PyMC. 138 | It is on version 0.1 and very underdeveloped. 139 | There is only a working example for modeling the mean of a sample from a Stan model. 140 | 141 | 142 | #### RStan (C++ wrapped in R) 143 | 144 | 145 | #### PyMC (Python) 146 | 147 | ##### Quick tutorial in PyMC4 148 | 149 | PyMC3 from the earlier PeerJ paper was translated to [PyMC4](https://www.pymc.io/projects/docs/en/stable/learn/core_notebooks/pymc_overview.html#pymc-overview). 150 | 151 | Note that [PyMC](https://www.pymc.io/welcome.html) is now in the version 5 series. 152 | The appending of a number has been dropped. 153 | 154 | #### Turing (Julia) 155 | 156 | 157 | 158 | #### Anglican (Clojure) 159 | 160 | ### Books 161 | 162 | Many of the popular books on BDA have associated computer code. 163 | Often, this computer code has been translated into other programming languages by kind people. 164 | 165 | ### Bayesian Analysis in Python (BAP) 166 | 167 | The second edition of BAP's code in PyMC3.11 is available on [GitHub](https://github.com/aloctavodia/BAP). 168 | 169 | 170 | ### Statistical Rethinking 171 | 172 | #### PyMC variation 173 | Note that McElreath's book has been fully translated into [PyMC3](https://github.com/pymc-devs/pymc-resources/tree/main/Rethinking_2) and largely translated into PyMC4, so the Statistical Rethinking book is ahead of the BMCP book in this regard.
174 | 175 | ### Rethinking has been translated into Julia 176 | McElreath's book has been translated into [Julia](https://github.com/StatisticalRethinkingJulia). 177 | 178 | ### BMCP has been translated into Julia 179 | 180 | 181 | #### Julia and the Turing Package 182 | 183 | [Fun introduction](https://storopoli.github.io/Bayesian-Julia/) 184 | 185 | ### John Kruschke's Doing Bayesian Data Analysis (Puppydog book) in PyMC3 186 | 187 | https://github.com/JWarmenhoven/DBDA-python 188 | 189 | 190 | ### Bayesian Data Analysis Edition 3 in PyMC3 191 | 192 | This is a more advanced (aka harder to read) book that was published with a minimal amount of code for Stan. 193 | The translation of the book is still a work in [progress](https://github.com/pymc-devs/pymc-resources/tree/main/BDA3). 194 | 195 | 196 | ### Regression and Other Stories 197 | 198 | 199 | #### R code 200 | This is a more accessible book. It is an update of an earlier book by Gelman and Hill. It is free and [on-line](https://statmodeling.stat.columbia.edu/2022/01/27/regression-and-other-stories-free-pdf/). 201 | 202 | #### PyMC3 203 | 204 | It is being translated into the bambi wrapper for [PyMC](https://github.com/bambinos/educational-resources). Nothing has happened in two years. 205 | 206 | 207 | ## Update History 208 | 209 | |Version | Changes | Date | 210 | |:-----------:|:-----------------------------------------------:|:---------------:| 211 | | Version 0.2 | Fixed typos in README.md | 2024 April 10 | 212 | | Version 0.3 | Added more links. | 2024 May 28 | 213 | 214 | 215 | ## Sources of funding 216 | 217 | - NIH: R01 CA242845 218 | - NIH: R01 AI088011 219 | - NIH: P30 CA225520 (PI: R. Mannel) 220 | - NIH: P20 GM103640 and P30 GM145423 (PI: A.
West) 221 | -------------------------------------------------------------------------------- /meeting3b.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "17 September 2022 jointprob meeting" 3 | output: 4 | html_document: 5 | df_print: paged 6 | pdf_document: default 7 | editor_options: 8 | markdown: 9 | wrap: 72 10 | --- 11 | 12 | # Motors for Bayesian Models 13 | 14 | - Grid approximation 15 | - Quadratic approximation (Laplace's method) 16 | - Markov Chain Monte Carlo (MCMC) 17 | 18 | ### Features of the grid approximation 19 | 20 | - compute posterior at each grid point 21 | - grid spacing determines the precision of the approximation 22 | - simple to compute 23 | - limited to 5-D by computational costs 24 | - robust method in the presence of multiple modes 25 | 26 | ### Recap of the Grid Approximation 27 | 28 | A. Five-step protocol in one dimension. 29 | 30 | 1. Define a grid 31 | 32 | ```{r} 33 | p_grid <-seq(from=0,to=1,length.out=100) 34 | ``` 35 | 36 | 2. Define the prior 37 | 38 | ```{r} 39 | prior <-rep(1,100)/sum(rep(1,100)) 40 | sum(prior) 41 | ``` 42 | 43 | 3. Compute the likelihood at each value in grid 44 | 45 | ```{r} 46 | likelihood <- dbinom(6,size=9,prob=p_grid) 47 | sum(likelihood) 48 | ``` 49 | 50 | 4. Compute product of likelihood and prior 51 | 52 | ```{r} 53 | unstd.posterior <-likelihood*prior 54 | ``` 55 | 56 | 5. Standardize the posterior, so it sums to 1. 
57 | 58 | ```{r} 59 | posterior <-unstd.posterior/sum(unstd.posterior) 60 | sum(posterior) 61 | ``` 62 | 63 | ```{r} 64 | plot( p_grid, posterior, 65 | type="b", 66 | xlab="Probability of water", 67 | ylab="Posterior probability") 68 | mtext( "100 points") 69 | ``` 70 | ```{r} 71 | p_grid <-seq(from=0,to=1,length.out=100) 72 | prior <-rep(1,100)/sum(rep(1,100)) 73 | sum(prior) 74 | likelihood <-dbinom(6,size=9,prob=p_grid) 75 | sum(likelihood) 76 | unstd.posterior <-likelihood*prior 77 | posterior <-unstd.posterior/sum(unstd.posterior) 78 | sum(posterior) 79 | plot( p_grid, posterior, 80 | type="b", 81 | xlab="Probability of water", 82 | ylab="Posterior probability") 83 | mtext( "100 points") 84 | ``` 85 | 86 | ### Repeat with Step prior 87 | 88 | ```{r} 89 | sprior <- ifelse(p_grid < 0.5, 0, 1)/sum(ifelse(p_grid < 0.5, 0, 1)) 90 | sum(sprior) 91 | slikelihood <-dbinom(6,size=9,prob=p_grid) 92 | unstd.sposterior <-slikelihood*sprior 93 | sposterior <-unstd.sposterior/sum(unstd.sposterior) 94 | sum(sposterior) 95 | 96 | plot( p_grid, sposterior, 97 | type="b", 98 | xlab="Probability of water", 99 | ylab="Posterior probability") 100 | points(p_grid, sprior, col='magenta') 101 | points(p_grid, posterior, col='cyan') 102 | mtext( "Step prior, 100 grid points") 103 | 104 | legend(x = "topright", 105 | c("sposterior", "sprior", "posterior"), 106 | cex=.8, 107 | col=c("black","magenta","cyan"), 108 | lwd = 2, 109 | bty = "n" 110 | ) 111 | ``` 112 | 113 | 114 | 115 | ### Example of a multimodal distribution 116 | 117 | ![bimodal](./images/bimodal.png){width="70%"} 118 | 119 | ### Quadratic approximation: Pierre Simon Laplace 120 | 121 | - France's Isaac Newton. 122 | - Proved the general form of the Central Limit Theorem. 123 | - Proved a Bayesian interpretation of linear least-squares. 124 | - Developed the Laplace transform. 125 | - Developed spherical harmonics. 
126 | 127 | 128 | ![Pierre Simon LaPlace](./images/PierreSimonLaPlace.png){width="30%"} 129 | 130 | ### A Philosophical Essay on Probability 131 | 132 | ![Essay on Probability](./images/EssaysOnProbability.png){width="20%"} 133 | 134 | [Source: Internet Archive](https://archive.org/details/philosophicaless00lapliala/page/n5/mode/2up) 135 | 136 | 137 | There is a newer translation by Andrew I. Dale in 1998 and 2012 published by Springer. 138 | It is supposed to be easier to read than the 1902 version and there is commentary on the mathematical notation. 139 | 140 | 141 | Andrew Dale also wrote 142 | 143 | Dale, A.I. (2012) *A history of inverse probability: From Thomas Bayes to Karl Pearson* Springer Science & Business Media. 144 | 145 | and 146 | 147 | Dale, A.I. (2003) *Most Honourable Remembrance: The Life and Work of Thomas Bayes*. Springer. 148 | 149 | 150 | 151 | ### McGrayne's book 152 | 153 | ![McGrayne's book](./images/McGrayne.png){width="40%"} 154 | 155 | ### Compute the quadratic approximation using the quap function from the rethinking library 156 | 157 | ```{r} 158 | library(rethinking) 159 | globe.qa <- quap( 160 | alist( 161 | W ~ dbinom(W+L, p), # binomial likelihood 162 | p ~ dunif(0,1) # uniform prior 163 | ), 164 | data=list(W=6, L=3) ) 165 | 166 | # display summary of quadratic approximation 167 | precis(globe.qa) 168 | ``` 169 | 170 | ```{r} 171 | # analytical calculation 172 | W <-6 173 | L <-3 174 | curve( dbeta(x,W+1,L+1),from=0,to=1) 175 | # quadraticapproximation 176 | curve( dnorm(x,0.67,0.16),lty=2,add=TRUE) 177 | ``` 178 | 179 | 180 | #### Now double the amount of data 181 | 182 | 183 | ```{r} 184 | globe.qa <- quap( 185 | alist( 186 | W ~ dbinom(W+L, p), # binomial likelihood 187 | p ~ dunif(0,1) # uniform prior 188 | ), 189 | data=list(W=12, L=6) ) 190 | 191 | # display summary of quadratic approximation 192 | precis(globe.qa) 193 | ``` 194 | 195 | ```{r} 196 | # analytical calculation 197 | W <-12 198 | L <-6 199 | curve( 
dbeta(x,W+1,L+1),from=0,to=1) 200 | # quadraticapproximation 201 | curve( dnorm(x,0.67,0.11),lty=2,add=TRUE) 202 | ``` 203 | 204 | #### Now double the data again: 205 | 206 | 207 | 208 | ```{r} 209 | globe.qa <- quap( 210 | alist( 211 | W ~ dbinom(W+L, p), # binomial likelihood 212 | p ~ dunif(0,1) # uniform prior 213 | ), 214 | data=list(W=24, L=12) ) 215 | 216 | # display summary of quadratic approximation 217 | precis(globe.qa) 218 | ``` 219 | Plug the new sd into the quadratic approximation formula. 220 | We finally get a more reasonable approximation. 221 | 222 | 223 | ```{r} 224 | # analytical calculation 225 | W <-24 226 | L <-12 227 | curve( dbeta(x,W+1,L+1),from=0,to=1) 228 | # quadraticapproximation 229 | curve( dnorm(x,0.67,0.08),lty=2,add=TRUE) 230 | ``` 231 | 232 | #### Now double the data again: 233 | 234 | 235 | 236 | ```{r} 237 | globe.qa <- quap( 238 | alist( 239 | W ~ dbinom(W+L, p), # binomial likelihood 240 | p ~ dunif(0,1) # uniform prior 241 | ), 242 | data=list(W=48, L=24) ) 243 | 244 | # display summary of quadratic approximation 245 | precis(globe.qa) 246 | ``` 247 | 248 | ```{r} 249 | # analytical calculation 250 | W <-48 251 | L <-24 252 | curve( dbeta(x,W+1,L+1),from=0,to=1) 253 | # quadraticapproximation 254 | curve( dnorm(x,0.67,0.06),lty=2,add=TRUE) 255 | ``` 256 | 257 | 258 | ### Caveats about the Quadratic Approximation 259 | 260 | - The fit generally improves with more data. 261 | - Sometimes, it can remain a poor approximation even with 1000s of data points. 262 | - With a lot of data or a uniform prior, this approach is equivalent to a Maximum Likelihood Estimate (MLE). 263 | - When equivalent to MLE, we can reinterpret the model fits in terms of a Bayesian analysis. 264 | - Shares the drawbacks of MLEs. 265 | - Computing a Hessian is required to get the standard deviation. This computation can go wrong. 
266 | 267 | 268 | ## Markov Chain Monte Carlo (MCMC) 269 | 270 | 271 | Multi-level or hierarchical models are common and have large numbers of parameters. 272 | Grid approximation and Quadratic approximation are often not adequate for large models. 273 | 274 | MCMC merely draws samples from the posterior rather than computing the posterior directly. 275 | The frequencies of the parameter values of these samples are proportional to their posterior plausibility. 276 | The histogram of these samples gives a picture of the posterior. 277 | So with MCMC, we work with samples rather than with an estimate of the posterior distribution. 278 | 279 | ### MCMC originated with the need to estimate high-dimensional integrals 280 | 281 | #### 2-D example 282 | 283 | Source: Sobol, I. M. (1973). *Numerical Monte Carlo Methods*. Nauka, Moscow. 284 | 285 | The ratio of the number of points inside the boundary to the total number of points gives the proportion of the square occupied by the irregular area. 286 | 287 | ![Determine area of irregular shape](./images/SobolIntegration.png) 288 | #### The bounding area could be another curve 289 | 290 | Source: Robert, C. P., and Casella, G. (2010). *Introducing Monte Carlo methods with R*. New York: Springer. 291 | 292 | ![Determine area under arbitrary curve](./images/RobertFig2.4.png) 293 | 294 | 295 | 296 | ### Originated at Los Alamos in the late 1940s during work on the H-bomb 297 | 298 | \begin{equation} 299 | P(\Delta E)=e^{-\Delta E / k T} 300 | \end{equation} 301 | 302 | 1. Make a move (e.g., select a sample, change a torsion angle, translate the ligand, rotate the ligand) 303 | 2. Calculate the energy ($E$) 304 | 3. Compare $E$ to the prior energy $E^{\circ}$ 305 | 306 | - If $\Delta E < 0$, accept the move 307 | - If $\Delta E > 0$, accept the move if a randomly selected number between 0 and 1 is less than $P$. Otherwise, reject the move. 
308 | 309 | 310 | 311 | 312 | Source: Metropolis, Nicholas; Rosenbluth, Arianna W.; Rosenbluth, Marshall N.; Teller, Augusta H.; and 313 | Teller, Edward (1953) Equation of state Calculations by Fast Computing Machines. The Journal of Chemical Physics. 21(6):1087. 314 | 315 | 316 | 317 | 318 | ### 319 | 320 | 321 | ### MCMC applied to generating samples from the posterior for the globe tossing problem 322 | 323 | #### Example using the Metropolis algorithm with 1000 samples. 324 | 325 | ```{r} 326 | n_samples <-1000 327 | p <-rep(NA,n_samples) 328 | p[1] <- 0.5 329 | W <-6 330 | L <-3 331 | for (i in 2:n_samples){ 332 | p_new <-rnorm(1,p[i-1],0.1) 333 | if (p_new<0) p_new <- abs(p_new) 334 | if (p_new>1) p_new <- 2 - p_new 335 | q0 <- dbinom(W,W+L,p[i-1]) 336 | q1 <- dbinom(W,W+L,p_new) 337 | p[i] <- ifelse(runif(1)1) p_new <- 2 - p_new 355 | q0 <- dbinom(W,W+L,p[i-1]) 356 | q1 <- dbinom(W,W+L,p_new) 357 | p[i] <- ifelse(runif(1)1) p_new <- 2 - p_new 375 | q0 <- dbinom(W,W+L,p[i-1]) 376 | q1 <- dbinom(W,W+L,p_new) 377 | p[i] <- ifelse(runif(1)1) p_new <- 2 - p_new 396 | q0 <- dbinom(W,W+L,p[i-1]) 397 | q1 <- dbinom(W,W+L,p_new) 398 | p[i] <- ifelse(runif(1)" 79 | ], 80 | "text/plain": [ 81 | "" 82 | ] 83 | }, 84 | "execution_count": 13, 85 | "metadata": {}, 86 | "output_type": "execute_result" 87 | } 88 | ], 89 | "source": [ 90 | "from IPython.display import Image\n", 91 | "Image(url=\"./SectionD10/images/MacroscopicEntropy.png\", width=900, height=900)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 8, 97 | "id": "e6ab74d8", 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "data": { 102 | "text/html": [ 103 | "" 104 | ], 105 | "text/plain": [ 106 | "" 107 | ] 108 | }, 109 | "execution_count": 8, 110 | "metadata": {}, 111 | "output_type": "execute_result" 112 | } 113 | ], 114 | "source": [ 115 | "from IPython.display import Image\n", 116 | "Image(url=\"./SectionD10/images/StatisticalEntropy.png\", width=900, height=900)" 117 | ] 118 | 
}, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 6, 122 | "id": "3227159c", 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "data": { 127 | "text/html": [ 128 | "" 129 | ], 130 | "text/plain": [ 131 | "" 132 | ] 133 | }, 134 | "execution_count": 6, 135 | "metadata": {}, 136 | "output_type": "execute_result" 137 | } 138 | ], 139 | "source": [ 140 | "from IPython.display import Image\n", 141 | "Image(url=\"./SectionD10/images/entropy.png\", width=900, height=900)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "id": "ccb255b4", 147 | "metadata": {}, 148 | "source": [ 149 | "### Entropy quantifies the possible arrangement of objects\n", 150 | "\n", 151 | "- molecules\n", 152 | "- characters in text\n", 153 | "- pixels\n", 154 | "- bubbles in sourdough bread\n", 155 | "- books in a bookcase" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "id": "6f9bbb63", 161 | "metadata": {}, 162 | "source": [ 163 | "## Rescaled, dimensionless Entropy == Information Entropy\n", 164 | "\n", 165 | "Set $k=1$:\n", 166 | "$$\n", 167 | "S^{\\prime}=\\ln W, \\quad \\Delta S^{\\prime}=\\int \\frac{\\mathrm{d} Q}{k T}\n", 168 | "$$" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "id": "8625e3d1", 174 | "metadata": {}, 175 | "source": [ 176 | "$$S' = H(p)=-\\mathbb{E}[\\log p]=-\\sum_i^n p_i \\log p_i$$" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "id": "9751a67c", 182 | "metadata": {}, 183 | "source": [ 184 | "For a probability distribution $p$ with $n$ possible distinct outcomes, each event $i$ has a probability of $p_{i}$." 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "id": "0d5ada05", 190 | "metadata": {}, 191 | "source": [ 192 | "### Multiplicity, W\n", 193 | "\n", 194 | "\n", 195 | "$$\n", 196 | "W=\\frac{N !}{n_{1} ! n_{2} ! 
\\cdots n_{t} !}\n", 197 | "$$\n", 198 | "\n", 199 | "\n", 200 | "- $N$: total number of events\n", 201 | "- $n_{i}$: number of times distinct event $i$ occurs\n", 202 | "- $t$: number of distinct events\n", 203 | "\n", 204 | "\n", 205 | "\n", 206 | "With large $N$, we can apply Stirling's approximation:\n", 207 | "$$x ! \\approx\\left(\\frac{x}{e}\\right)^x$$\n", 208 | "\n", 209 | "\n", 210 | "Because $p_i=\\frac{n_i}{N}$, we can rewrite the above as\n", 211 | "\n", 212 | "$$\n", 213 | "W=\\frac{1}{p_1^{n_1} p_2^{n_2} \\cdots p_t^{n_t}}\n", 214 | "$$\n", 215 | "\n", 216 | "Take the logarithm and divide by $N$ (recall that $n_i = N p_i$) to obtain the entropy per event:\n", 217 | "\n", 218 | "$$\n", 219 | "\\frac{1}{N} \\log W=-\\sum_i^t p_i \\log p_i\n", 220 | "$$" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 32, 226 | "id": "75edd9bc", 227 | "metadata": {}, 228 | "outputs": [ 229 | { 230 | "name": "stdout", 231 | "output_type": "stream", 232 | "text": [ 233 | "3.10.8 | packaged by conda-forge | (main, Nov 22 2022, 08:27:35) [Clang 14.0.6 ]\n" 234 | ] 235 | } 236 | ], 237 | "source": [ 238 | "import sys\n", 239 | "print(sys.version)" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 15, 245 | "id": "38d6cda8", 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "name": "stderr", 250 | "output_type": "stream", 251 | "text": [ 252 | "/Users/blaine/opt/anaconda3/envs/stat-rethink2-pymc_v4/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5\n", 253 | " warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\n", 254 | "2022-12-24 06:04:54.801900: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", 255 | "To enable them in other operations, rebuild TensorFlow with the appropriate compiler 
flags.\n" 256 | ] 257 | } 258 | ], 259 | "source": [ 260 | "# Basic\n", 261 | "import numpy as np\n", 262 | "from scipy import stats\n", 263 | "import pandas as pd\n", 264 | "from patsy import bs, dmatrix\n", 265 | "import matplotlib.pyplot as plt\n", 266 | "\n", 267 | "# Exploratory Analysis of Bayesian Models\n", 268 | "import arviz as az\n", 269 | "\n", 270 | "# Probabilistic programming languages\n", 271 | "import bambi as bmb\n", 272 | "import pymc as pm\n", 273 | "import tensorflow_probability as tfp\n", 274 | "\n", 275 | "tfd =tfp.distributions\n", 276 | "\n", 277 | "# Computational Backend\n", 278 | "import theano\n", 279 | "import theano.tensor as tt\n", 280 | "import tensorflow as tf" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 16, 286 | "id": "1d0d6181", 287 | "metadata": {}, 288 | "outputs": [ 289 | { 290 | "data": { 291 | "image/png": "\n", 292 | "text/plain": [ 293 | "
" 294 | ] 295 | }, 296 | "metadata": {}, 297 | "output_type": "display_data" 298 | } 299 | ], 300 | "source": [ 301 | "x = range(0, 26)\n", 302 | "q_pmf =stats.binom(10,0.75).pmf(x)\n", 303 | "qu_pmf =stats.randint(0,np.max(np.nonzero(q_pmf))+1).pmf(x)\n", 304 | "r_pmf =(q_pmf+np.roll(q_pmf,12))/2\n", 305 | "ru_pmf =stats.randint(0,np.max(np.nonzero(r_pmf))+1).pmf(x)\n", 306 | "s_pmf =(q_pmf+np.roll(q_pmf,15))/2\n", 307 | "su_pmf =(qu_pmf+np.roll(qu_pmf,15))/2\n", 308 | "\n", 309 | "_, ax=plt.subplots(3,2,figsize=(12,5),sharex=True,sharey=True,\n", 310 | " constrained_layout=True)\n", 311 | "ax = np.ravel(ax)\n", 312 | "\n", 313 | "zipped = zip([q_pmf, qu_pmf,r_pmf,ru_pmf,s_pmf,su_pmf],\n", 314 | " [\"q\", \"qu\", \"r\", \"ru\", \"s\", \"su\"])\n", 315 | "\n", 316 | "for idx, (dist,label) in enumerate(zipped):\n", 317 | " ax[idx].vlines(x, 0,dist,label=f\"H ={stats.entropy(dist):.2f}\")\n", 318 | " ax[idx].set_title(label)\n", 319 | " ax[idx].legend(loc=1, handlelength=0)" 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "id": "a7732bc8", 325 | "metadata": {}, 326 | "source": [ 327 | "26 possible events.\n", 328 | "\n", 329 | "- q: $q \\sim \\operatorname{binom}(n=10, p=0.75)$ (has 11 events and the lowest entropy)\n", 330 | "- qu: uniform distribution with 11 events (Need to increase n 3 times for $q$ to get a higher entropy than qu).\n", 331 | " \n", 332 | "- r: generated by duplicating $q$ and shifting to the right and then normalizing as the probabilities sum to 1.\n", 333 | "- ru: uniform distribution with 23 events.\n", 334 | "\n", 335 | "- s: like $r$ but with a larger valley\n", 336 | "- su: uniform distribution with 22 events. H of su is less than H of ru." 
337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 54, 342 | "id": "def241fc", 343 | "metadata": {}, 344 | "outputs": [ 345 | { 346 | "data": { 347 | "text/plain": [ 348 | "array([8.67361738e-19, 7.80625564e-17, 3.39572120e-15, 9.50801937e-14,\n", 349 | " 1.92537392e-12, 3.00358332e-11, 3.75447915e-10, 3.86174998e-09,\n", 350 | " 3.33075936e-08, 2.44255686e-07, 1.53881082e-06, 8.39351359e-06,\n", 351 | " 3.98691895e-05, 1.65610480e-04, 6.03295319e-04, 1.93054502e-03,\n", 352 | " 5.42965787e-03, 1.34144488e-02, 2.90646392e-02, 5.50698426e-02,\n", 353 | " 9.08652404e-02, 1.29807486e-01, 1.59309188e-01, 1.66235674e-01,\n", 354 | " 1.45456215e-01, 1.04728475e-01])" 355 | ] 356 | }, 357 | "execution_count": 54, 358 | "metadata": {}, 359 | "output_type": "execute_result" 360 | } 361 | ], 362 | "source": [ 363 | "q_pmf =stats.binom(30,0.75).pmf(x)\n", 364 | "q_pmf" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 55, 370 | "id": "33ac5109", 371 | "metadata": {}, 372 | "outputs": [ 373 | { 374 | "data": { 375 | "text/plain": [ 376 | "2.0671606564787637" 377 | ] 378 | }, 379 | "execution_count": 55, 380 | "metadata": {}, 381 | "output_type": "execute_result" 382 | } 383 | ], 384 | "source": [ 385 | "H = stats.entropy(q_pmf)\n", 386 | "H" 387 | ] 388 | }, 389 | { 390 | "cell_type": "markdown", 391 | "id": "5d7e50b1", 392 | "metadata": {}, 393 | "source": [ 394 | "# Kullback-Leibler Divergence" 395 | ] 396 | }, 397 | { 398 | "cell_type": "markdown", 399 | "id": "6cca0147", 400 | "metadata": {}, 401 | "source": [ 402 | "- When we do not know distribution $p$ or when $p$ is complex, we may want to approximate it with distribution $q$.\n", 403 | "- How much information do we lose by using $q$ to represent $p$?\n", 404 | "- This is equivalent to how much extra uncertainty do we introduce when using $q$ to represent $p$.\n", 405 | "- KL divergence gives us 0 when $p=q$ and positive values otherwise.\n", 406 | "- KL divergence 
is the average value of the difference of $\\textrm{log} p$ and $\\textrm{log} q$." 407 | ] 408 | }, 409 | { 410 | "cell_type": "markdown", 411 | "id": "f221d1a1", 412 | "metadata": {}, 413 | "source": [ 414 | "Events appear according to $p$ so compute the expectation (mean value) with respect to $p$.\n", 415 | "\n", 416 | "$$\n", 417 | "\\mathbb{K} \\mathbb{L}(p \\| q)=\\mathbb{E}_p[\\log p-\\log q]\n", 418 | "$$" 419 | ] 420 | }, 421 | { 422 | "cell_type": "markdown", 423 | "id": "935a453d", 424 | "metadata": {}, 425 | "source": [ 426 | "For a discrete distribution, the above equation can be expressed as follows:" 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "id": "0eae4378", 432 | "metadata": {}, 433 | "source": [ 434 | "$$\\mathbb{K} \\mathbb{L}(p \\| q)=\\sum_i^n p_i\\left(\\log p_i-\\log q_i\\right)$$" 435 | ] 436 | }, 437 | { 438 | "cell_type": "markdown", 439 | "id": "c392f823", 440 | "metadata": {}, 441 | "source": [ 442 | "Using the properties of logarithms, rewrite in this common representation of $\\mathbb{K} \\mathbb{L}(p \\| q)$\n", 443 | "\n", 444 | "$$\\mathbb{K} \\mathbb{L}(p \\| q)=\\sum_i^n p_i \\log \\frac{p_i}{q_i}$$" 445 | ] 446 | }, 447 | { 448 | "cell_type": "markdown", 449 | "id": "027e578d", 450 | "metadata": {}, 451 | "source": [ 452 | "## Relate the KL divergence, $D_{\\mathrm{KL}}(p \\| q)$, to information entropy, $H(p, q)$\n", 453 | "\n", 454 | "Using the properties of logarithms, rearrange and then expand. Note the negative sign." 
455 | ] 456 | }, 457 | { 458 | "cell_type": "markdown", 459 | "id": "5852014a", 460 | "metadata": {}, 461 | "source": [ 462 | "$$\n", 463 | "\\mathbb{K} \\mathbb{L}(p \\| q)=-\\sum_i^n p_i\\left(\\log q_i-\\log p_i\\right)\n", 464 | "$$\n", 465 | "\n", 466 | "$$\n", 467 | "\\mathbb{K} \\mathbb{L}(p \\| q)=\\overbrace{-\\sum_i^n p_i \\log q_i}^{H(p, q)}-\\overbrace{\\left(-\\sum_i^n p_i \\log p_i\\right)}^{H(p)}\n", 468 | "$$" 469 | ] 470 | }, 471 | { 472 | "cell_type": "markdown", 473 | "id": "c962d79a", 474 | "metadata": {}, 475 | "source": [ 476 | "Note that $H(p, q)$ has the same form as the entropy of $q$, but the expectation is taken with respect to $p$. It is the *cross entropy*." 477 | ] 478 | }, 479 | { 480 | "cell_type": "markdown", 481 | "id": "ac9a0039", 482 | "metadata": {}, 483 | "source": [ 484 | "$$\n", 485 | "H(p, q)=-\\mathbb{E}_p[\\log q]\n", 486 | "$$" 487 | ] 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "id": "1c314ca7", 492 | "metadata": {}, 493 | "source": [ 494 | "Note that when $p=q$" 495 | ] 496 | }, 497 | { 498 | "cell_type": "markdown", 499 | "id": "c375d189", 500 | "metadata": {}, 501 | "source": [ 502 | "$$\\mathbb{K} \\mathbb{L}(p \\| q)=\\mathbb{K} \\mathbb{L}(p \\| p)=\\sum_i p_i\\left(\\log \\left(p_i\\right)-\\log \\left(p_i\\right)\\right)=0$$" 503 | ] 504 | }, 505 | { 506 | "cell_type": "markdown", 507 | "id": "dbc72ca7", 508 | "metadata": {}, 509 | "source": [ 510 | "Now replace $\\mathbb{K} \\mathbb{L}(p \\| q)$ with $D_{\\mathrm{KL}}(p \\| q)$ and rearrange. " 511 | ] 512 | }, 513 | { 514 | "cell_type": "markdown", 515 | "id": "4904f26b", 516 | "metadata": {}, 517 | "source": [ 518 | "$$\n", 519 | "H(p, q)=H(p)+D_{\\mathrm{KL}}(p \\| q)\n", 520 | "$$" 521 | ] 522 | }, 523 | { 524 | "cell_type": "markdown", 525 | "id": "7da71b42", 526 | "metadata": {}, 527 | "source": [ 528 | "So the KL divergence can be interpreted as the extra entropy when using $q$ to represent $p$." 
529 | ] 530 | }, 531 | { 532 | "cell_type": "markdown", 533 | "id": "067172ee", 534 | "metadata": {}, 535 | "source": [ 536 | "## Compute $D_{\\mathrm{KL}}(p \\| q)$\n", 537 | "\n", 538 | "- using all pairwise combinations.\n", 539 | "- white represents infinite values $D_{\\mathrm{KL}}(p \\| q) = \\infty$\n", 540 | "- black represents $D_{\\mathrm{KL}}(p \\| q) = 0$\n", 541 | "- Note that the plot is not symmetric because usually $D_{\\mathrm{KL}}(p \\| q) \\neq D_{\\mathrm{KL}}(q \\| p)$ " 542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": 46, 547 | "id": "611005a7", 548 | "metadata": {}, 549 | "outputs": [ 550 | { 551 | "data": { 552 | "image/png": "\n", 553 | "text/plain": [ 554 | "
" 555 | ] 556 | }, 557 | "metadata": {}, 558 | "output_type": "display_data" 559 | } 560 | ], 561 | "source": [ 562 | "dists = [q_pmf, qu_pmf, r_pmf, ru_pmf, s_pmf, su_pmf]\n", 563 | "names = [\"q\", \"qu\", \"r\", \"ru\", \"s\", \"su\"]\n", 564 | "\n", 565 | "fig, ax = plt.subplots()\n", 566 | "KL_matrix = np.zeros((6, 6))\n", 567 | "for i, dist_i in enumerate(dists):\n", 568 | " for j, dist_j in enumerate(dists):\n", 569 | " KL_matrix[i, j] = stats.entropy(dist_i, dist_j)\n", 570 | "\n", 571 | "ax.set_xticks(np.arange(len(names)))\n", 572 | "ax.set_yticks(np.arange(len(names)))\n", 573 | "ax.set_xticklabels(names)\n", 574 | "ax.set_yticklabels(names)\n", 575 | "plt.set_cmap(\"viridis\")\n", 576 | "cmap = plt.cm.get_cmap()\n", 577 | "cmap.set_bad('w', 0.3)\n", 578 | "im = ax.imshow(KL_matrix)\n", 579 | "fig.colorbar(im, extend=\"max\");\n", 580 | "\n", 581 | "plt.savefig(\"./SectionD10/images/chp11_KL_heatmap.png\")" 582 | ] 583 | }, 584 | { 585 | "cell_type": "markdown", 586 | "id": "777f98b4", 587 | "metadata": {}, 588 | "source": [ 589 | "- The above is the KL divergence for all pairwise combinations of the distributions q, qu, r, ru, s, and su.\n", 590 | "\n", 591 | "- The dark blue corresponds to KL divergence of 0.\n", 592 | "\n", 593 | "- The white corresponds to an infinite KL divergence.\n", 594 | "\n", 595 | "- Note that the matrix is not symmetric.\n" 596 | ] 597 | }, 598 | { 599 | "cell_type": "markdown", 600 | "id": "77156d4e", 601 | "metadata": {}, 602 | "source": [ 603 | "### Note that the computation of $D_{\\mathrm{KL}}(p \\| q)$ uses the following conventions:\n", 604 | "\n", 605 | "$$\n", 606 | "0 \\log \\frac{0}{0}=0, \\quad 0 \\log \\frac{0}{q(x)}=0, \\quad p(x) \\log \\frac{p(x)}{0}=\\infty\n", 607 | "$$" 608 | ] 609 | }, 610 | { 611 | "cell_type": "markdown", 612 | "id": "7fffab56", 613 | "metadata": {}, 614 | "source": [ 615 | "### Ranking of models by KL-divergence is equivalent to ranking on the log-score\n", 616 | "\n", 617 | "\n", 618 | "#### 
Section 2.5 model comparison\n", 619 | "\n", 620 | "- do not want models that are so simple that they miss important information\n", 621 | "- do not want models that are so complex that they fit noise\n", 622 | "- The sweet spot is hard to find\n", 623 | " - no single criterion defines an optimal solution \n", 624 | " - the optimal solution may not exist\n", 625 | " - must choose from a limited set of models evaluated over a finite dataset\n", 626 | "\n", 627 | "\n", 628 | "- One approach is to compute the generalization error (i.e., out-of-sample predictive accuracy). Compute by cross validation.\n", 629 | "\n", 630 | "- logarithmic scoring rules have nice [theoretical properties](https://doi.org/10.1198/016214506000001437)\n", 631 | "\n", 632 | "\n", 633 | "$$\\mathrm{ELPD}=\\sum_{i=1}^n \\int p_t\\left(\\tilde{y}_i\\right) \\log p\\left(\\tilde{y}_i \\mid y_i\\right) d \\tilde{y}_i$$\n", 634 | "\n", 635 | "- ELPD: expected log *pointwise* predictive density\n", 636 | "- \"expected\" because we integrate over all possible datasets that can be generated by the process\n", 637 | "- \"pointwise\" because the computations are performed per observation $y_i$ over $n$ observations.\n", 638 | "\n", 639 | "- $p_t\\left(\\tilde{y}_i\\right)$: the distribution of the true data-generating process for $\\tilde{y}_i$ (the future data given the current posterior, which in turn depends on the model (prior and likelihood) and the current data)\n", 640 | "- $p\\left(\\tilde{y}_i \\mid y_i\\right)$: posterior predictive distribution $p(\\tilde{{y}} \\mid {y})=\\int_{{\\Theta}} p(\\tilde{{y}} \\mid {\\theta}) p({\\theta} \\mid {y}) d {\\theta}$\n", 641 | "\n", 642 | "Use the *deviance* instead:\n", 643 | "\n", 644 | "\n", 645 | "$$\\sum_{i=1}^n \\log \\int p\\left(y_i \\mid \\boldsymbol{\\theta}\\right) p(\\boldsymbol{\\theta} \\mid y) d \\boldsymbol{\\theta}$$\n", 646 | "\n", 647 | "- Note that this quantity is proportional to the quadratic mean error when the likelihood is 
Gaussian.\n", 648 | "- Note that we are using all of the data to fit the model, so we will over estimate ELPD." 649 | ] 650 | }, 651 | { 652 | "cell_type": "markdown", 653 | "id": "1a4cc6e7", 654 | "metadata": {}, 655 | "source": [ 656 | "##### Leave-One-Out Cross Validation\n", 657 | "\n", 658 | "\n", 659 | "$$\\mathrm{ELPD}_{\\text {LOO-CV }}=\\sum_{i=1}^n \\log \\int p\\left(y_i \\mid \\boldsymbol{\\theta}\\right) p\\left(\\boldsymbol{\\theta} \\mid y_{-i}\\right) d \\boldsymbol{\\theta}$$" 660 | ] 661 | }, 662 | { 663 | "cell_type": "code", 664 | "execution_count": 49, 665 | "id": "748bf9d0", 666 | "metadata": {}, 667 | "outputs": [ 668 | { 669 | "name": "stderr", 670 | "output_type": "stream", 671 | "text": [ 672 | "Auto-assigning NUTS sampler...\n", 673 | "Initializing NUTS using jitter+adapt_diag...\n", 674 | "Multiprocess sampling (4 chains in 4 jobs)\n", 675 | "NUTS: [σ]\n", 676 | "/Users/blaine/opt/anaconda3/envs/stat-rethink2-pymc_v4/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5\n", 677 | " warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\n", 678 | "/Users/blaine/opt/anaconda3/envs/stat-rethink2-pymc_v4/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5\n", 679 | " warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\n", 680 | "/Users/blaine/opt/anaconda3/envs/stat-rethink2-pymc_v4/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5\n", 681 | " warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\n", 682 | "/Users/blaine/opt/anaconda3/envs/stat-rethink2-pymc_v4/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 
and <1.23.0 is required for this version of SciPy (detected version 1.23.5\n", 683 | " warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\n" 684 | ] 685 | }, 686 | { 687 | "data": { 688 | "text/html": [ 689 | "\n", 690 | "\n" 705 | ], 706 | "text/plain": [ 707 | "" 708 | ] 709 | }, 710 | "metadata": {}, 711 | "output_type": "display_data" 712 | }, 713 | { 714 | "data": { 715 | "text/html": [ 716 | "\n", 717 | "
\n", 718 | " \n", 719 | " 100.00% [8000/8000 00:01<00:00 Sampling 4 chains, 0 divergences]\n", 720 | "
\n", 721 | " " 722 | ], 723 | "text/plain": [ 724 | "" 725 | ] 726 | }, 727 | "metadata": {}, 728 | "output_type": "display_data" 729 | }, 730 | { 731 | "name": "stderr", 732 | "output_type": "stream", 733 | "text": [ 734 | "Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 17 seconds.\n", 735 | "Sampling: [y]\n" 736 | ] 737 | }, 738 | { 739 | "data": { 740 | "text/html": [ 741 | "\n", 742 | "\n" 757 | ], 758 | "text/plain": [ 759 | "" 760 | ] 761 | }, 762 | "metadata": {}, 763 | "output_type": "display_data" 764 | }, 765 | { 766 | "data": { 767 | "text/html": [ 768 | "\n", 769 | "
\n", 770 | " \n", 771 | " 100.00% [4000/4000 00:00<00:00]\n", 772 | "
\n", 773 | " " 774 | ], 775 | "text/plain": [ 776 | "" 777 | ] 778 | }, 779 | "metadata": {}, 780 | "output_type": "display_data" 781 | }, 782 | { 783 | "ename": "KeyError", 784 | "evalue": "'y'", 785 | "output_type": "error", 786 | "traceback": [ 787 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 788 | "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", 789 | "Cell \u001b[0;32mIn[49], line 22\u001b[0m\n\u001b[1;32m 20\u001b[0m y \u001b[38;5;241m=\u001b[39m pm\u001b[38;5;241m.\u001b[39mNormal(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124my\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;241m0\u001b[39m, σ, observed\u001b[38;5;241m=\u001b[39my_obs)\n\u001b[1;32m 21\u001b[0m idataB \u001b[38;5;241m=\u001b[39m pm\u001b[38;5;241m.\u001b[39msample(return_inferencedata\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m---> 22\u001b[0m idataB\u001b[38;5;241m.\u001b[39madd_groups({\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mposterior_predictive\u001b[39m\u001b[38;5;124m\"\u001b[39m: {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124my\u001b[39m\u001b[38;5;124m\"\u001b[39m:\u001b[43mpm\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msample_posterior_predictive\u001b[49m\u001b[43m(\u001b[49m\u001b[43midataB\u001b[49m\u001b[43m)\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43my\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m[\u001b[38;5;28;01mNone\u001b[39;00m,:]}})\n\u001b[1;32m 23\u001b[0m idatas_cmp[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmB\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m idataB\n\u001b[1;32m 25\u001b[0m \u001b[38;5;66;03m# Generate data from Normal likelihood model\u001b[39;00m\n\u001b[1;32m 26\u001b[0m \u001b[38;5;66;03m# with random mean and random standard deviation\u001b[39;00m\n", 790 | "File 
\u001b[0;32m~/opt/anaconda3/envs/stat-rethink2-pymc_v4/lib/python3.10/site-packages/arviz/data/inference_data.py:259\u001b[0m, in \u001b[0;36mInferenceData.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 257\u001b[0m \u001b[38;5;124;03m\"\"\"Get item by key.\"\"\"\u001b[39;00m\n\u001b[1;32m 258\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m key \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_groups_all:\n\u001b[0;32m--> 259\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key)\n\u001b[1;32m 260\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, key)\n", 791 | "\u001b[0;31mKeyError\u001b[0m: 'y'" 792 | ] 793 | } 794 | ], 795 | "source": [ 796 | "np.random.seed(90210)\n", 797 | "\n", 798 | "y_obs = np.random.normal(0, 1, size=100)\n", 799 | "idatas_cmp = {}\n", 800 | "\n", 801 | "# # Generate data from Skewnormal likelihood model\n", 802 | "# # with fixed mean and skewness and random standard deviation\n", 803 | "# with pm.Model() as mA:\n", 804 | "# σ = pm.HalfNormal(\"σ\", 1)\n", 805 | "# y = pm.SkewNormal(\"y\", 0, σ, alpha=1, observed=y_obs)\n", 806 | "# idataA = pm.sample(return_inferencedata=True)\n", 807 | "# # add_groups modifies an existing az. 
InferenceData \n", 808 | "# idataA.add_groups({\"posterior_predictive\": {\"y\":pm.sample_posterior_predictive(idataA)[\"y\"][None,:]}})\n", 809 | "# idatas_cmp[\"mA\"] = idataA\n", 810 | "\n", 811 | "# Generate data from Normal likelihood model\n", 812 | "# with fixed mean and random standard deviation\n", 813 | "with pm.Model() as mB:\n", 814 | " σ = pm.HalfNormal(\"σ\", 1)\n", 815 | " y = pm.Normal(\"y\", 0, σ, observed=y_obs)\n", 816 | " idataB = pm.sample(return_inferencedata=True)\n", 817 | " idataB.add_groups({\"posterior_predictive\": {\"y\":pm.sample_posterior_predictive(idataB)[\"y\"][None,:]}})\n", 818 | " idatas_cmp[\"mB\"] = idataB\n", 819 | "\n", 820 | "# Generate data from Normal likelihood model\n", 821 | "# with random mean and random standard deviation\n", 822 | "with pm.Model() as mC:\n", 823 | " μ = pm.Normal(\"μ\", 0, 1)\n", 824 | " σ = pm.HalfNormal(\"σ\", 1)\n", 825 | " y = pm.Normal(\"y\", μ, σ, observed=y_obs)\n", 826 | " idataC = pm.sample(return_inferencedata=True)\n", 827 | " idataC.add_groups({\"posterior_predictive\": {\"y\":pm.sample_posterior_predictive(idataC)[\"y\"][None,:]}})\n", 828 | " idatas_cmp[\"mC\"] = idataC" 829 | ] 830 | }, 831 | { 832 | "cell_type": "code", 833 | "execution_count": null, 834 | "id": "5a074b94", 835 | "metadata": {}, 836 | "outputs": [], 837 | "source": [ 838 | "cmp = az.compare(idatas_cmp)\n", 839 | "cmp.round(2)" 840 | ] 841 | }, 842 | { 843 | "cell_type": "code", 844 | "execution_count": 50, 845 | "id": "b05437dc", 846 | "metadata": {}, 847 | "outputs": [ 848 | { 849 | "data": { 850 | "text/html": [ 851 | "" 852 | ], 853 | "text/plain": [ 854 | "" 855 | ] 856 | }, 857 | "execution_count": 50, 858 | "metadata": {}, 859 | "output_type": "execute_result" 860 | } 861 | ], 862 | "source": [ 863 | "from IPython.display import Image\n", 864 | "Image(url=\"./SectionD10/images/table.png\", width=900, height=900)" 865 | ] 866 | }, 867 | { 868 | "cell_type": "markdown", 869 | "id": "7aa4744e", 870 | "metadata": 
{}, 871 | "source": [ 872 | "- rank: 0 is the model with the best predictive accuracy\n", 873 | "- loo: the ELPD value estimated by LOO cross-validation\n", 874 | "- p_loo: values of the penalization term. Roughly the effective number of parameters.\n", 875 | "- d_loo: difference between the LOO value of the top-ranked model and the LOO value of each model\n", 876 | "- weight: probability of each model given the data\n", 877 | "- se: the standard error of the ELPD\n", 878 | "- dse: the standard error of the difference in ELPD between each model and the top-ranked model. Always 0 for the top-ranked model\n", 879 | "- warning: If True, the LOO approximation may not be reliable\n", 880 | "- loo_scale: log, deviance, and negative-log are the options" 881 | ] 882 | }, 883 | { 884 | "cell_type": "code", 885 | "execution_count": null, 886 | "id": "883eceea", 887 | "metadata": {}, 888 | "outputs": [], 889 | "source": [ 890 | "az.plot_compare(cmp, figsize=(9, 3))\n", 891 | "plt.savefig(\"img/chp02/compare_dummy.png\")" 892 | ] 893 | }, 894 | { 895 | "cell_type": "code", 896 | "execution_count": 53, 897 | "id": "6667f1d4", 898 | "metadata": {}, 899 | "outputs": [ 900 | { 901 | "data": { 902 | "text/html": [ 903 | "" 904 | ], 905 | "text/plain": [ 906 | "" 907 | ] 908 | }, 909 | "execution_count": 53, 910 | "metadata": {}, 911 | "output_type": "execute_result" 912 | } 913 | ], 914 | "source": [ 915 | "from IPython.display import Image\n", 916 | "Image(url=\"./SectionD10/images/compareModelsbyLOO.png\", width=900, height=900)" 917 | ] 918 | }, 919 | { 920 | "cell_type": "markdown", 921 | "id": "bc65c3e7", 922 | "metadata": {}, 923 | "source": [ 924 | "#### Pointwise ELPD" 925 | ] 926 | }, 927 | { 928 | "cell_type": "code", 929 | "execution_count": 52, 930 | "id": "d2c81ab1", 931 | "metadata": {}, 932 | "outputs": [ 933 | { 934 | "data": { 935 | "text/html": [ 936 | "" 937 | ], 938 | "text/plain": [ 939 | "" 940 | ] 941 | }, 942 | "execution_count": 52, 943 | "metadata": {}, 944 | "output_type": "execute_result" 945 | } 946 | ], 947 | "source": [ 
948 | "from IPython.display import Image\n", 949 | "Image(url=\"./SectionD10/images/pointwiseELPD.png\", width=900, height=900)" 950 | ] 951 | }, 952 | { 953 | "cell_type": "code", 954 | "execution_count": null, 955 | "id": "6e46ad9f", 956 | "metadata": {}, 957 | "outputs": [], 958 | "source": [ 959 | "az.plot_elpd(idatas_cmp, figsize=(10, 5), plot_kwargs={\"marker\":\".\"}, threshold=2);\n", 960 | "plt.savefig(\"img/chp02/elpd_dummy.png\")\n" 961 | ] 962 | }, 963 | { 964 | "cell_type": "markdown", 965 | "id": "3df09d87", 966 | "metadata": {}, 967 | "source": [ 968 | "#### Back to section 11.3\n", 969 | "\n", 970 | "- Assume that the true model is $M_0$\n", 971 | "- Assume $k$ model posteriors $\\left\\{q_{M_1}, q_{M_2}, \\cdots, q_{M_k}\\right\\}$\n", 972 | "- Note that $p_{M_0}$ is the same in every comparison, so the term $\\mathbb{E}\\left[\\log p_{M_0}\\right]$ is a constant that cancels out when the divergences below are compared with each other.\n", 973 | "\n", 974 | "\n", 975 | "\n", 976 | "$$\n", 977 | "\\begin{aligned}\n", 978 | "\\mathbb{K} \\mathbb{L}\\left(p_{M_0} \\| q_{M_1}\\right)= & \\mathbb{E}\\left[\\log p_{M_0}\\right]-\\mathbb{E}\\left[\\log q_{M_1}\\right] \\\\\n", 979 | "\\mathbb{K} \\mathbb{L}\\left(p_{M_0} \\| q_{M_2}\\right)= & \\mathbb{E}\\left[\\log p_{M_0}\\right]-\\mathbb{E}\\left[\\log q_{M_2}\\right] \\\\\n", 980 | "& \\cdots \\\\\n", 981 | "\\mathbb{K} \\mathbb{L}\\left(p_{M_0} \\| q_{M_k}\\right)= & \\mathbb{E}\\left[\\log p_{M_0}\\right]-\\mathbb{E}\\left[\\log q_{M_k}\\right]\n", 982 | "\\end{aligned}\n", 983 | "$$" 984 | ] 985 | }, 986 | { 987 | "cell_type": "markdown", 988 | "id": "54ed5319", 989 | "metadata": {}, 990 | "source": [ 991 | "$\\mathrm{ELPD}=\\sum_{i=1}^n \\int p_t\\left(\\tilde{y}_i\\right) \\log p\\left(\\tilde{y}_i \\mid y_i\\right) d \\tilde{y}_i$" 992 | ] 993 | }, 994 | { 995 | "cell_type": "code", 996 | "execution_count": null, 997 | "id": "89d774fb", 998 | "metadata": {}, 999 | "outputs": [], 1000 | "source": [] 1001 | }, 1002 | { 1003 | "cell_type": "code", 1004 | "execution_count": null, 1005 | "id": "ac5f4487", 1006 | 
"metadata": {}, 1007 | "outputs": [], 1008 | "source": [] 1009 | }, 1010 | { 1011 | "cell_type": "code", 1012 | "execution_count": null, 1013 | "id": "faa7aa7e", 1014 | "metadata": {}, 1015 | "outputs": [], 1016 | "source": [] 1017 | }, 1018 | { 1019 | "cell_type": "markdown", 1020 | "id": "5442e51e", 1021 | "metadata": {}, 1022 | "source": [ 1023 | "## A measure of uncertainty needs the following properties:\n", 1024 | "\n", 1025 | "1. It is continuous\n", 1026 | "2. It increases with the number of possible events\n", 1027 | "3. It is additive" 1028 | ] 1029 | } 1030 | ], 1031 | "metadata": { 1032 | "kernelspec": { 1033 | "display_name": "stat-rethink-pymc4", 1034 | "language": "python", 1035 | "name": "stat-rethink-pymc4" 1036 | }, 1037 | "language_info": { 1038 | "codemirror_mode": { 1039 | "name": "ipython", 1040 | "version": 3 1041 | }, 1042 | "file_extension": ".py", 1043 | "mimetype": "text/x-python", 1044 | "name": "python", 1045 | "nbconvert_exporter": "python", 1046 | "pygments_lexer": "ipython3", 1047 | "version": "3.10.8" 1048 | }, 1049 | "toc": { 1050 | "base_numbering": 1, 1051 | "nav_menu": {}, 1052 | "number_sections": true, 1053 | "sideBar": true, 1054 | "skip_h1_title": false, 1055 | "title_cell": "Table of Contents", 1056 | "title_sidebar": "Contents", 1057 | "toc_cell": false, 1058 | "toc_position": {}, 1059 | "toc_section_display": true, 1060 | "toc_window_display": false 1061 | } 1062 | }, 1063 | "nbformat": 4, 1064 | "nbformat_minor": 5 1065 | } 1066 | --------------------------------------------------------------------------------