├── README.md ├── data ├── conversion_data.csv └── fatality_data.csv ├── driving_fatalities ├── .ipynb_checkpoints │ └── self_driving_fatalities-checkpoint.ipynb ├── README.md ├── fatality_data.csv └── self_driving_fatalities.ipynb ├── environment.yml ├── notebooks ├── .DS_Store ├── .ipynb_checkpoints │ ├── 1. Introduction to PyMC3-checkpoint.ipynb │ ├── 2. Markov Chain Monte Carlo-checkpoint.ipynb │ ├── 3. Theano-checkpoint.ipynb │ ├── 6. Model Checking-checkpoint.ipynb │ ├── Bayesian_AB_Testing-checkpoint.ipynb │ ├── Bayesian_Workflow-checkpoint.ipynb │ ├── Case Study 1- Bayesian_Changepoint_Detection-checkpoint.ipynb │ ├── Model Building with PyMC3-checkpoint.ipynb │ ├── b. Multilevel Modeling-checkpoint.ipynb │ ├── hierachical_bayesian_model_advertising-checkpoint.ipynb │ ├── nuts_and_metropolis-checkpoint.ipynb │ └── self_driving_fatalities-checkpoint.ipynb ├── 1. Introduction to PyMC3.ipynb ├── 2. Markov Chain Monte Carlo.ipynb ├── 3. Theano.ipynb ├── 6. Model Checking.ipynb ├── Bayesian_AB_Testing.ipynb ├── Bayesian_Workflow.ipynb ├── Case Study 1- Bayesian_Changepoint_Detection.ipynb ├── Introduction_to_AB_Testing.ipynb ├── Model Building with PyMC3.ipynb ├── a. BEST.ipynb ├── b. Multilevel Modeling.ipynb ├── c. Bayesian Neural Network.ipynb ├── fatality.csv ├── hierachical_bayesian_model_advertising.ipynb ├── images │ ├── 95_ci_driving_cars.png │ ├── Beta_distribution_pdf.svg │ ├── Gamma_distribution_pdf.svg │ ├── bayes_formula.png │ ├── binary_doubling.png │ ├── boxloop.png │ ├── f.png │ ├── hmc.png │ ├── nuts.png │ └── rejection_sampling.png ├── mcmc-animate.gif ├── mcmc.sqlite ├── nuts_and_metropolis.ipynb ├── poisson_dag.png ├── puppy_steps.ipynb ├── pymc38 │ ├── .ipynb_checkpoints │ │ ├── shared_variables-checkpoint.ipynb │ │ ├── supply_chain-checkpoint.ipynb │ │ └── supply_chain_example_plastic-checkpoint.ipynb │ ├── 1. Introduction to PyMC3.ipynb │ ├── 3. 
Theano.ipynb │ ├── Bayesian_AB_Testing.ipynb │ ├── Case Study 1- Bayesian_Changepoint_Detection.ipynb │ ├── Model Building with PyMC3.ipynb │ ├── rugby_analytics.ipynb │ ├── self_driving_fatalities.ipynb │ ├── shared_variables.ipynb │ ├── supply_chain.ipynb │ └── supply_chain_example_plastic.ipynb ├── rugby_analytics.ipynb ├── self_driving_fatalities.ipynb ├── supply_chain.ipynb ├── trace.sqlite └── untitled.txt └── src ├── .vscode └── settings.json ├── pip-delete-this-directory.txt ├── pyro_example.py └── pyro_example_blank.py /README.md: -------------------------------------------------------------------------------- 1 | # Syllabus 2 | [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/springcoil/probabilisticprogrammingprimer/master) 3 | 4 | - Introduction to PyMC3 5 | - What is MCMC and why should I care 6 | 1. Why MCMC is needed 7 | 2. A quick discussion of the difference between NUTS and M-H 8 | 3. What is Metropolis-Hastings? 9 | 4. What is NUTS 10 | 11 | 12 | [!Link to course](https://www.probabilisticprogrammingprimer.com/) 13 | - Note: Data is fake data however it's inspired by some work done on a PPC campaign in the past. 14 | 15 | # Following the course on PyMC v3.8 16 | The latest version of PyMC is v3.8. There are significant API changes from previous versions so some of these notebooks may be slightly different to the course videos. 17 | 18 | See notebooks provided by Mark Farragher to follow the course on PyMC v3.8 under the `notebooks/pymc38` directory. 19 | 20 | ## Environment setup 21 | Theano only supports [installation](http://deeplearning.net/software/theano/install.html) of requirements through `conda`. 
22 | 23 | Create an environment through these conda commands, using the dependencies in `requirements.txt`: 24 | 25 | ```sh 26 | conda create -n ppp python=3.6 27 | conda activate ppp 28 | conda install --file requirements.txt 29 | ``` 30 | 31 | If you want to use an earlier version of PyMC, specify the version in the requirements file - e.g. `pymc3==3.6` for v3.6. 32 | -------------------------------------------------------------------------------- /data/fatality_data.csv: -------------------------------------------------------------------------------- 1 | State,Resident Population,Driving Age Population,Highway Motor Fuel Use (000),Total Lane Miles,Total Road and Street Mileage,Annual VMT (Millions),Total Highway Fatalities,Fatalities (per 100 million VMT),State Motor Fuel Taxes and Other Related Receipts,Total Highway Capital Outlay (000),Total Disburse-ments for Highways (000),Payments the HTF (000),Apportion-ments from the Federal HTF (000) 2 | AL,4447,3451,3148522,195298,94311,56534,995,1.76,579812,719722,1246223,638977,589698 3 | AK,626,458,338750,25991,12823,4613,103,2.23,27817,321612,501359,65940,378674 4 | AZ,5130,3908,2999157,118437,55195,49768,1036,2.08,565982,960137,2040266,583068,494747 5 | AR,2673,2073,1959484,198161,97600,29167,652,2.24,398717,468053,817387,415571,397312 6 | CA,33871,25599,17017620,371689,168076,306649,3753,1.22,2945156,2721334,6750225,3025732,2795250 7 | CO,4301,3322,2450177,176993,85409,41771,681,1.63,521721,730129,1391910,423763,367548 8 | CT,3405,2651,1697878,44474,20845,30756,342,1.11,545671,568931,1304378,312507,439532 9 | DE,783,610,429413,12558,5779,8240,123,1.49,103965,297648,594641,79594,128749 10 | DC,572,469,192440,3774,1425,3498,49,1.40,31727,164529,244216,33728,117381 11 | FL,15982,12742,8648333,253349,116649,152136,2999,1.97,1612070,2448336,4207948,1554162,1390224 12 | GA,8186,6251,6030954,241087,114727,105010,1541,1.47,431243,1106272,1567212,1189533,1023963 13 | 
HI,1211,949,417929,9255,4281,8543,131,1.53,68872,148304,272268,69351,154425 14 | ID,1293,969,847974,95178,46456,13534,276,2.04,202874,260689,491604,178492,253889 15 | IL,12419,9530,6293151,288879,138372,102866,1418,1.38,1231728,1836253,3446580,1053743,986434 16 | IN,6080,4682,4371604,193637,93608,70862,875,1.23,746424,1035129,1932198,767408,688839 17 | IA,2926,2281,1993887,232920,113377,29433,445,1.51,394458,696081,1493639,353281,345026 18 | KS,2688,2058,1676445,274014,134582,28130,461,1.64,358989,697463,1206470,346783,338426 19 | KY,4041,3161,2850498,164231,79267,46803,820,1.75,439785,1078252,1650763,577037,525325 20 | LA,4468,3395,2742677,127883,60900,40849,937,2.29,544329,767993,1300553,527753,464400 21 | ME,1274,1010,847317,46346,22670,14190,169,1.19,174259,224728,487571,162787,153306 22 | MD,5296,4085,2889534,67017,30494,50174,588,1.17,643009,594511,1599413,541915,476674 23 | MA,6349,5008,3122005,74505,35311,52796,433,0.82,644389,2238138,3524344,545690,536063 24 | MI,9938,7628,5822391,256155,121979,97792,1382,1.41,1047898,2136479,2747958,1074219,961800 25 | MN,4919,3783,3154032,271176,132250,52601,625,1.19,595997,697358,1692476,403760,439011 26 | MS,2844,2160,2035655,151701,73498,35536,949,2.67,397597,697252,1039192,428679,365747 27 | MO,5595,4292,3977442,251209,123039,67083,1157,1.72,674002,1006426,1818178,754241,719347 28 | MT,902,701,660133,141978,69567,9882,237,2.40,195390,300018,473807,140430,301755 29 | NE,1711,1315,1188911,188273,92791,18081,276,1.53,307043,383934,744905,241167,224419 30 | NV,1998,1538,1188724,79050,37854,17639,323,1.83,305124,424280,650984,215455,228039 31 | NH,1235,961,759891,31366,15211,12021,126,1.05,136478,189689,387468,137452,148580 32 | NJ,8414,6545,4748655,78163,36022,67446,731,1.08,525253,1994253,4502639,865079,781862 33 | NM,1819,1370,1285461,124841,59927,22760,430,1.89,238882,463011,1162422,269496,307801 34 | NY,18976,14797,6516320,239035,112783,129057,1458,1.13,1406054,2582541,5306825,1249954,1485648 35 | 
NC,8049,6291,5088090,209335,99813,89504,1472,1.64,1054849,1464209,2621330,918638,825844 36 | ND,642,502,483722,175349,86609,7217,86,1.19,102201,180072,384538,101377,194296 37 | OH,11353,8790,6570881,248722,116964,105898,1351,1.28,1484302,1650422,3350560,1158013,1006181 38 | OK,3450,2666,2478132,232710,112634,43355,652,1.50,414272,809152,1417329,500974,446540 39 | OR,3421,2673,1919249,136866,66902,35010,451,1.29,385359,357751,1010377,381740,384990 40 | PA,12281,9694,6323548,249169,119642,102337,1520,1.49,1698159,2323646,4516621,1238907,1449850 41 | RI,1048,827,450802,12812,6052,8359,80,0.96,134571,129527,255637,82095,180896 42 | SC,4012,3115,2831976,136123,64921,45538,1065,2.34,467948,502049,970218,554376,483066 43 | SD,754,577,562591,169060,83471,8432,173,2.05,116489,346269,465690,101194,211222 44 | TN,5689,4446,3759136,183640,87419,65732,1306,1.99,777581,836144,1439811,759820,685545 45 | TX,20851,15618,13252841,639853,301035,220064,3769,1.71,2700214,3421427,5664524,2573239,2199108 46 | UT,2233,1599,1333773,87435,41852,22597,373,1.65,314163,691200,1072340,249715,283695 47 | VT,608,479,403551,29359,14273,6811,79,1.16,87255,138578,287124,70411,133812 48 | VA,7078,5529,4575296,152328,70393,74801,930,1.24,774161,1270665,2678129,867264,775292 49 | WA,5894,4553,3180398,167211,80209,53330,632,1.19,725356,704342,1871259,588415,544878 50 | WV,1808,1455,1091359,76671,37277,19242,410,2.13,295148,673882,1170434,220408,329354 51 | WI,5363,4157,3061051,231340,112359,57266,799,1.40,795105,886798,1663266,602560,572783 52 | WY,493,382,590437,56780,27326,8090,152,1.88,100435,270786,395725,151317,228408 53 | -------------------------------------------------------------------------------- /driving_fatalities/README.md: -------------------------------------------------------------------------------- 1 | # Driving Fatalities 2 | This is the code to go along with the article published [here](https://colindcarroll.com/2017/12/07/does-this-convince-you-that-self-driving-cars-are-safe/) 3 | 
4 | Using `conda`, and python 3.6, installation should work with `conda install pymc3 pandas seaborn jupyter`. 5 | -------------------------------------------------------------------------------- /driving_fatalities/fatality_data.csv: -------------------------------------------------------------------------------- 1 | State,Resident Population,Driving Age Population,Highway Motor Fuel Use (000),Total Lane Miles,Total Road and Street Mileage,Annual VMT (Millions),Total Highway Fatalities,Fatalities (per 100 million VMT),State Motor Fuel Taxes and Other Related Receipts,Total Highway Capital Outlay (000),Total Disburse-ments for Highways (000),Payments the HTF (000),Apportion-ments from the Federal HTF (000) 2 | AL,4447,3451,3148522,195298,94311,56534,995,1.76,579812,719722,1246223,638977,589698 3 | AK,626,458,338750,25991,12823,4613,103,2.23,27817,321612,501359,65940,378674 4 | AZ,5130,3908,2999157,118437,55195,49768,1036,2.08,565982,960137,2040266,583068,494747 5 | AR,2673,2073,1959484,198161,97600,29167,652,2.24,398717,468053,817387,415571,397312 6 | CA,33871,25599,17017620,371689,168076,306649,3753,1.22,2945156,2721334,6750225,3025732,2795250 7 | CO,4301,3322,2450177,176993,85409,41771,681,1.63,521721,730129,1391910,423763,367548 8 | CT,3405,2651,1697878,44474,20845,30756,342,1.11,545671,568931,1304378,312507,439532 9 | DE,783,610,429413,12558,5779,8240,123,1.49,103965,297648,594641,79594,128749 10 | DC,572,469,192440,3774,1425,3498,49,1.40,31727,164529,244216,33728,117381 11 | FL,15982,12742,8648333,253349,116649,152136,2999,1.97,1612070,2448336,4207948,1554162,1390224 12 | GA,8186,6251,6030954,241087,114727,105010,1541,1.47,431243,1106272,1567212,1189533,1023963 13 | HI,1211,949,417929,9255,4281,8543,131,1.53,68872,148304,272268,69351,154425 14 | ID,1293,969,847974,95178,46456,13534,276,2.04,202874,260689,491604,178492,253889 15 | IL,12419,9530,6293151,288879,138372,102866,1418,1.38,1231728,1836253,3446580,1053743,986434 16 | 
IN,6080,4682,4371604,193637,93608,70862,875,1.23,746424,1035129,1932198,767408,688839 17 | IA,2926,2281,1993887,232920,113377,29433,445,1.51,394458,696081,1493639,353281,345026 18 | KS,2688,2058,1676445,274014,134582,28130,461,1.64,358989,697463,1206470,346783,338426 19 | KY,4041,3161,2850498,164231,79267,46803,820,1.75,439785,1078252,1650763,577037,525325 20 | LA,4468,3395,2742677,127883,60900,40849,937,2.29,544329,767993,1300553,527753,464400 21 | ME,1274,1010,847317,46346,22670,14190,169,1.19,174259,224728,487571,162787,153306 22 | MD,5296,4085,2889534,67017,30494,50174,588,1.17,643009,594511,1599413,541915,476674 23 | MA,6349,5008,3122005,74505,35311,52796,433,0.82,644389,2238138,3524344,545690,536063 24 | MI,9938,7628,5822391,256155,121979,97792,1382,1.41,1047898,2136479,2747958,1074219,961800 25 | MN,4919,3783,3154032,271176,132250,52601,625,1.19,595997,697358,1692476,403760,439011 26 | MS,2844,2160,2035655,151701,73498,35536,949,2.67,397597,697252,1039192,428679,365747 27 | MO,5595,4292,3977442,251209,123039,67083,1157,1.72,674002,1006426,1818178,754241,719347 28 | MT,902,701,660133,141978,69567,9882,237,2.40,195390,300018,473807,140430,301755 29 | NE,1711,1315,1188911,188273,92791,18081,276,1.53,307043,383934,744905,241167,224419 30 | NV,1998,1538,1188724,79050,37854,17639,323,1.83,305124,424280,650984,215455,228039 31 | NH,1235,961,759891,31366,15211,12021,126,1.05,136478,189689,387468,137452,148580 32 | NJ,8414,6545,4748655,78163,36022,67446,731,1.08,525253,1994253,4502639,865079,781862 33 | NM,1819,1370,1285461,124841,59927,22760,430,1.89,238882,463011,1162422,269496,307801 34 | NY,18976,14797,6516320,239035,112783,129057,1458,1.13,1406054,2582541,5306825,1249954,1485648 35 | NC,8049,6291,5088090,209335,99813,89504,1472,1.64,1054849,1464209,2621330,918638,825844 36 | ND,642,502,483722,175349,86609,7217,86,1.19,102201,180072,384538,101377,194296 37 | OH,11353,8790,6570881,248722,116964,105898,1351,1.28,1484302,1650422,3350560,1158013,1006181 38 | 
OK,3450,2666,2478132,232710,112634,43355,652,1.50,414272,809152,1417329,500974,446540 39 | OR,3421,2673,1919249,136866,66902,35010,451,1.29,385359,357751,1010377,381740,384990 40 | PA,12281,9694,6323548,249169,119642,102337,1520,1.49,1698159,2323646,4516621,1238907,1449850 41 | RI,1048,827,450802,12812,6052,8359,80,0.96,134571,129527,255637,82095,180896 42 | SC,4012,3115,2831976,136123,64921,45538,1065,2.34,467948,502049,970218,554376,483066 43 | SD,754,577,562591,169060,83471,8432,173,2.05,116489,346269,465690,101194,211222 44 | TN,5689,4446,3759136,183640,87419,65732,1306,1.99,777581,836144,1439811,759820,685545 45 | TX,20851,15618,13252841,639853,301035,220064,3769,1.71,2700214,3421427,5664524,2573239,2199108 46 | UT,2233,1599,1333773,87435,41852,22597,373,1.65,314163,691200,1072340,249715,283695 47 | VT,608,479,403551,29359,14273,6811,79,1.16,87255,138578,287124,70411,133812 48 | VA,7078,5529,4575296,152328,70393,74801,930,1.24,774161,1270665,2678129,867264,775292 49 | WA,5894,4553,3180398,167211,80209,53330,632,1.19,725356,704342,1871259,588415,544878 50 | WV,1808,1455,1091359,76671,37277,19242,410,2.13,295148,673882,1170434,220408,329354 51 | WI,5363,4157,3061051,231340,112359,57266,799,1.40,795105,886798,1663266,602560,572783 52 | WY,493,382,590437,56780,27326,8090,152,1.88,100435,270786,395725,151317,228408 53 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: ppp 2 | channels: 3 | - defaults 4 | - conda-forge 5 | dependencies: 6 | - python=3.8 7 | - notebook 8 | - mkl 9 | - graphviz 10 | - seaborn 11 | - pydot 12 | - matplotlib 13 | - arviz 14 | - pymc3 15 | - theano-pymc 16 | - scipy 17 | - pandas 18 | - numpy 19 | - mkl-service 20 | - watermark 21 | - jupyter 22 | -------------------------------------------------------------------------------- /notebooks/.DS_Store: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/springcoil/probabilisticprogrammingprimer/5a50472efd95220f2c6eed07c8c1b909b6f958f9/notebooks/.DS_Store -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/1. Introduction to PyMC3-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction to PyMC3\n", 8 | "\n", 9 | "\n", 10 | "Probabilistic programming (PP) allows flexible specification of Bayesian statistical models in code. PyMC3 is a new, open-source PP framework with an intuitive and readable, yet powerful, syntax that is close to the natural syntax statisticians use to describe models. It features next-generation Markov chain Monte Carlo (MCMC) sampling algorithms such as the No-U-Turn Sampler (NUTS; Hoffman, 2014), a self-tuning variant of Hamiltonian Monte Carlo (HMC; Duane, 1987). This class of samplers works well on high dimensional and complex posterior distributions and allows many complex models to be fit without specialized knowledge about fitting algorithms. HMC and NUTS take advantage of gradient information from the likelihood to achieve much faster convergence than traditional sampling methods, especially for larger models. NUTS also has several self-tuning strategies for adaptively setting the tunable parameters of Hamiltonian Monte Carlo, which means you usually don't need to have specialized knowledge about how the algorithms work. 
PyMC3, Stan (Stan Development Team, 2014), and the LaplacesDemon package for R are currently the only PP packages to offer HMC.\n", 11 | "\n", 12 | "### PyMC3 Features\n", 13 | "\n", 14 | "Probabilistic programming in Python confers a number of advantages including multi-platform compatibility, an expressive yet clean and readable syntax, easy integration with other scientific libraries, and extensibility via C, C++, Fortran or Cython. These features make it relatively straightforward to write and use custom statistical distributions, samplers and transformation functions, as required by Bayesian analysis.\n", 15 | "\n", 16 | "PyMC3's feature set helps to make Bayesian analysis as painless as possible. Here is a short list of some of its features:\n", 17 | "\n", 18 | "- Fits Bayesian statistical models with Markov chain Monte Carlo, variational inference and\n", 19 | " other algorithms.\n", 20 | "- Includes a large suite of well-documented statistical distributions.\n", 21 | "- Creates summaries including tables and plots.\n", 22 | "- Traces can be saved to the disk as plain text, SQLite or pandas dataframes.\n", 23 | "- Several convergence diagnostics and model checking methods are available.\n", 24 | "- Extensible: easily incorporates custom step methods and unusual probability distributions.\n", 25 | "- MCMC loops can be embedded in larger programs, and results can be analyzed with the full power of Python.\n", 26 | "\n", 27 | "Here, we present a primer on the use of PyMC3 for solving general Bayesian statistical inference and prediction problems. We will first see the basics of how to use PyMC3, motivated by a simple example: installation, data creation, model definition, model fitting and posterior analysis. Then we will cover two case studies and use them to show how to define and fit more sophisticated models. 
Finally we will show how to extend PyMC3 and discuss other useful features: the Generalized Linear Models subpackage, custom distributions, custom transformations and alternative storage backends." 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "collapsed": true 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "%load ../data/melanoma_data.py" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "%matplotlib inline\n", 48 | "import seaborn as sns; sns.set_context('notebook')\n", 49 | "from pymc3 import Normal, Model, DensityDist, sample, log, exp\n", 50 | "\n", 51 | "with Model() as melanoma_survival:\n", 52 | "\n", 53 | " # Convert censoring indicators to indicators for failure event\n", 54 | " failure = (censored==0).astype(int)\n", 55 | "\n", 56 | " # Parameters (intercept and treatment effect) for survival rate\n", 57 | " beta = Normal('beta', mu=0.0, sd=1e5, shape=2)\n", 58 | "\n", 59 | " # Survival rates, as a function of treatment\n", 60 | " lam = exp(beta[0] + beta[1]*treat)\n", 61 | " \n", 62 | " # Survival likelihood, accounting for censoring\n", 63 | " def logp(failure, value):\n", 64 | " return (failure * log(lam) - lam * value).sum()\n", 65 | "\n", 66 | " x = DensityDist('x', logp, observed={'failure':failure, 'value':t})\n" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "This example will generate 1000 posterior samples." 
74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "with melanoma_survival:\n", 83 | " trace = sample(1000)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "from pymc3 import traceplot\n", 93 | "\n", 94 | "traceplot(trace[500:], varnames=['beta']);" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "## Motivating Example: Coal mining disasters\n", 102 | "\n", 103 | "Consider the following time series of recorded coal mining disasters in the UK from 1851 to 1962 (Jarrett, 1979). The number of disasters is thought to have been affected by changes in safety regulations during this period.\n", 104 | "\n", 105 | "Let's build a model for this series and attempt to estimate when the change occurred." 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "import numpy as np\n", 115 | "import matplotlib.pyplot as plt\n", 116 | "\n", 117 | "disasters_data = np.array([4, 5, 4, 0, 1, 4, 3, 4, 0, 6, 3, 3, 4, 0, 2, 6,\n", 118 | " 3, 3, 5, 4, 5, 3, 1, 4, 4, 1, 5, 5, 3, 4, 2, 5,\n", 119 | " 2, 2, 3, 4, 2, 1, 3, 2, 2, 1, 1, 1, 1, 3, 0, 0,\n", 120 | " 1, 0, 1, 1, 0, 0, 3, 1, 0, 3, 2, 2, 0, 1, 1, 1,\n", 121 | " 0, 1, 0, 1, 0, 0, 0, 2, 1, 0, 0, 0, 1, 1, 0, 2,\n", 122 | " 3, 3, 1, 1, 2, 1, 1, 1, 1, 2, 4, 2, 0, 0, 1, 4,\n", 123 | " 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1])\n", 124 | "\n", 125 | "n_years = len(disasters_data)\n", 126 | "\n", 127 | "plt.figure(figsize=(12.5, 3.5))\n", 128 | "plt.bar(np.arange(1851, 1962), disasters_data, color=\"#348ABD\")\n", 129 | "plt.xlabel(\"Year\")\n", 130 | "plt.ylabel(\"Disasters\")\n", 131 | "plt.title(\"UK coal mining disasters, 1851-1962\")\n", 132 | "plt.xlim(1851, 1962);" 133 | ] 134 | }, 135 | { 136 | "cell_type": 
"markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "We represent our conceptual model formally as a statistical model:\n", 140 | "\n", 141 | "$$\\begin{array}{ccc} \n", 142 | "(y_t | \\tau, \\lambda_1, \\lambda_2) \\sim\\text{Poisson}\\left(r_t\\right), & r_t=\\left\\{\n", 143 | "\\begin{array}{lll} \n", 144 | "\\lambda_1 &\\text{if}& t< \\tau\\\\ \n", 145 | "\\lambda_2 &\\text{if}& t\\ge \\tau \n", 146 | "\\end{array}\\right.,&t\\in[t_l,t_h]\\\\ \n", 147 | "\\tau \\sim \\text{DiscreteUniform}(t_l, t_h)\\\\ \n", 148 | "\\lambda_1\\sim \\text{Exponential}(a)\\\\ \n", 149 | "\\lambda_2\\sim \\text{Exponential}(b) \n", 150 | "\\end{array}$$\n", 151 | "\n", 152 | "Because we have defined $y$ by its dependence on $\\tau$, $\\lambda_1$ and $\\lambda_2$, the latter three are known as the *parents* of $y$ and $D$ is called their *child*. Similarly, the parents of $\\tau$ are $t_l$ and $t_h$, and $\\tau$ is the child of $t_l$ and $t_h$.\n", 153 | "\n", 154 | "## Implementing a PyMC Model\n", 155 | "\n", 156 | "At the model-specification stage (before the data are observed), $y$, $\\tau$, $\\lambda_1$, and $\\lambda_2$ are all random variables. Bayesian \"random\" variables have not necessarily arisen from a physical random process. The Bayesian interpretation of probability is **epistemic**, meaning random variable $x$'s probability distribution $p(x)$ represents our knowledge and uncertainty about $x$'s value. Candidate values of $x$ for which $p(x)$ is high are relatively more probable, given what we know. \n", 157 | "\n", 158 | "We can generally divide the variables in a Bayesian model into two types: **stochastic** and **deterministic**. The only deterministic variable in this model is $r$. If we knew the values of $r$'s parents, we could compute the value of $r$ exactly. A deterministic like $r$ is defined by a mathematical function that returns its value given values for its parents. Deterministic variables are sometimes called the *systemic* part of the model. 
The nomenclature is a bit confusing, because these objects usually represent random variables; since the parents of $r$ are random, $r$ is random also.\n", 159 | "\n", 160 | "On the other hand, even if the values of the parents of variables `switchpoint`, `disasters` (before observing the data), `early_mean` or `late_mean` were known, we would still be uncertain of their values. These variables are stochastic, characterized by probability distributions that express how plausible their candidate values are, given values for their parents.\n", 161 | "\n", 162 | "Let's begin by defining the unknown switchpoint as a discrete uniform random variable:" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "from pymc3 import DiscreteUniform\n", 172 | "\n", 173 | "with Model() as disaster_model:\n", 174 | "\n", 175 | " switchpoint = DiscreteUniform('switchpoint', lower=0, upper=n_years)" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "We have done two things here. First, we have created a `Model` object; a `Model` is a Python object that encapsulates all of the variables that comprise our theoretical model, keeping them in a single container so that they may be used as a unit. After a `Model` is created, we can populate it with all of the model components that we specified when we wrote the model down. \n", 183 | "\n", 184 | "Notice that the `Model` above was declared using a `with` statement. This expression is used to define a Python idiom known as a **context manager**. Context managers, in general, are used to manage resources of some kind within a program. In this case, our resource is a `Model`, and we would like to add variables to it so that we can fit our statistical model. 
The key characteristic of the context manager is that the resources it manages are only defined within the indented block corresponding to the `with` statement. PyMC uses this idiom to automatically add defined variables to a model. Thus, any variable we define is automatically added to the `Model`, without having to explicitly add it. This avoids the repetitive syntax of `add` methods/functions that you see in some machine learning packages:\n", 185 | "\n", 186 | "```python\n", 187 | "model.add(a_variable)\n", 188 | "model.add(another_variable)\n", 189 | "model.add(yet_another_variable)\n", 190 | "model.add(and_again)\n", 191 | "model.add(please_kill_me_now)\n", 192 | "...\n", 193 | "```\n", 194 | "\n", 195 | "In fact, PyMC variables cannot be defined without a corresponding `Model`:" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "foo = DiscreteUniform('foo', lower=0, upper=10)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "However, variables can be explicitly added to models without the use of a context manager, via the variable's optional `model` argument.\n", 212 | "\n", 213 | "```python\n", 214 | "disaster_model = Model()\n", 215 | "switchpoint = DiscreteUniform('switchpoint', lower=0, upper=110, model=disaster_model)\n", 216 | "```" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "Or, if we just want a discrete uniform distribution, and do not need to use it in a PyMC3 model necessarily, we can create one using the `dist` classmethod." 
224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": { 230 | "collapsed": true 231 | }, 232 | "outputs": [], 233 | "source": [ 234 | "x = DiscreteUniform.dist(lower=0, upper=100)" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "x" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "`DiscreteUniform` is an object that represents uniformly-distributed discrete variables. Use of this distribution\n", 251 | "suggests that we have no preference *a priori* regarding the location of the switchpoint; all values are equally likely. \n", 252 | "\n", 253 | "PyMC3 includes most of the common random variable **distributions** used for statistical modeling. For example, the following discrete random variables are available." 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [ 262 | "from pymc3 import discrete\n", 263 | "discrete.__all__" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": {}, 269 | "source": [ 270 | "By having a library of variables that represent statistical distributions, users are relieved of having to code distrbutions themselves. 
" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": [ 277 | "Similarly, we can create the exponentially-distributed variables `early_mean` and `late_mean` for the early and late Poisson rates, respectively (also in the context of the model `distater_model`):" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [ 286 | "from pymc3 import Exponential\n", 287 | "\n", 288 | "with disaster_model:\n", 289 | " \n", 290 | " early_mean = Exponential('early_mean', 1)\n", 291 | " late_mean = Exponential('late_mean', 1)" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": {}, 297 | "source": [ 298 | "In this instance, we are told that the variables are being **transformed**. In PyMC3, variables with purely positive priors like `Exponential` are transformed with a log function. This makes sampling more robust. Behind the scenes, a variable in the unconstrained space (named `_log`) is added to the model for sampling. In this model this happens behind the scenes. Variables with priors that constrain them on two sides, like `Beta` or `Uniform` (continuous), are also transformed to be unconstrained but with a log odds transform." 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": {}, 304 | "source": [ 305 | "Next, we define the variable `rate`, which selects the early rate `early_mean` for times before `switchpoint` and the late rate `late_mean` for times after `switchpoint`. We create `rate` using the `switch` function, which returns `early_mean` when the switchpoint is larger than (or equal to) a particular year, and `late_mean` otherwise." 
306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "from pymc3 import switch\n", 315 | "\n", 316 | "with disaster_model:\n", 317 | " \n", 318 | " rate = switch(switchpoint >= np.arange(n_years), early_mean, late_mean)" 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "metadata": {}, 324 | "source": [ 325 | "The last step is to define the **data likelihood**, or sampling distribution. In this case, our measured outcome is the number of disasters in each year, `disasters`. This is a stochastic variable but unlike `early_mean` and `late_mean` we have *observed* its value. To express this, we set the argument `observed` to the observed sequence of disasters. This tells PyMC that this distribution's value is fixed, and should not be changed:" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "from pymc3 import Poisson\n", 335 | "\n", 336 | "with disaster_model:\n", 337 | " \n", 338 | " disasters = Poisson('disasters', mu=rate, observed=disasters_data)" 339 | ] 340 | }, 341 | { 342 | "cell_type": "markdown", 343 | "metadata": {}, 344 | "source": [ 345 | "The model that we specified at the top of the page has now been fully implemented in PyMC3. Let's have a look at the model's attributes to see what we have.\n", 346 | "\n", 347 | "The stochastic nodes in the model are identified in the `vars` (*i.e.* variables) attribute:" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": null, 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [ 356 | "disaster_model.vars" 357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": {}, 362 | "source": [ 363 | "The last two variables are the log-transformed versions of the early and late rate parameters. 
The original variables have become deterministic nodes in the model, since they only represent values that have been back-transformed from the transformed variable, which has been subject to fitting or sampling." 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "metadata": {}, 370 | "outputs": [], 371 | "source": [ 372 | "disaster_model.deterministics" 373 | ] 374 | }, 375 | { 376 | "cell_type": "markdown", 377 | "metadata": {}, 378 | "source": [ 379 | "You might wonder why `rate`, which is a deterministic component of the model, is not in this list. This is because, unlike the other components of the model, `rate` has not been given a name and given a formal PyMC data structure. It is essentially an **intermediate calculation** in the model, implying that we are not interested in its value when it comes to summarizing the output from the model. Most PyMC objects have a name assigned; these names are used for storage and post-processing:\n", 380 | "\n", 381 | "- as keys in on-disk databases,\n", 382 | "- as axis labels in plots of traces,\n", 383 | "- as table labels in summary statistics.\n", 384 | "\n", 385 | "If we wish to include `rate` in our output, we need to make it a `Deterministic` object, and give it a name:" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": null, 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [ 394 | "from pymc3 import Deterministic\n", 395 | "\n", 396 | "with disaster_model:\n", 397 | " \n", 398 | " rate = Deterministic('rate', switch(switchpoint >= np.arange(n_years), early_mean, late_mean))" 399 | ] 400 | }, 401 | { 402 | "cell_type": "markdown", 403 | "metadata": {}, 404 | "source": [ 405 | "Now, `rate` is included in the `Model`'s deterministics list, and the model will retain its samples during MCMC sampling, for example." 
406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": null, 411 | "metadata": {}, 412 | "outputs": [], 413 | "source": [ 414 | "disaster_model.deterministics" 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "metadata": {}, 420 | "source": [ 421 | "> ### Why are data and unknown variables represented by the same object?\n", 422 | "\n", 423 | ">Since its represented by PyMC random variable object, `disasters` is defined by its dependence on its parent `rate` even though its value is **fixed**. This isn't just a quirk of PyMC's syntax; Bayesian hierarchical notation itself makes no distinction between random variables and data. The reason is simple: to use Bayes' theorem to compute the posterior, we require the likelihood. Even though `disasters`'s value is known and fixed, we need to formally assign it a *probability distribution* as if it were a random variable. Remember, the likelihood and the probability function are essentially the same, except that the former is regarded as a function of the parameters and the latter as a function of the data. This point can be counterintuitive at first, as many peoples' instinct is to regard data as fixed a priori and unknown variables as dependent on the data. \n", 424 | "\n", 425 | "> One way to understand this is to think of statistical models as predictive models for data, or as models of the processes that gave rise to data. 
Before observing the value of `disasters`, we could have sampled from its prior predictive distribution $p(y)$ (*i.e.* the marginal distribution of the data) as follows:\n", 426 | "\n", 427 | "> - Sample `early_mean`, `switchpoint` and `late_mean` from their\n", 428 | "> priors.\n", 429 | "> - Sample `disasters` conditional on these values.\n", 430 | "\n", 431 | "> Even after we observe the value of `disasters`, we need to use this process model to make inferences about `early_mean` , `switchpoint` and `late_mean` because its the only information we have about how the variables are related.\n", 432 | "\n", 433 | "> We will see later that we can sample from this fixed stochastic random variable, to obtain predictions after having observed our data." 434 | ] 435 | }, 436 | { 437 | "cell_type": "markdown", 438 | "metadata": {}, 439 | "source": [ 440 | "## PyMC3 Variables\n", 441 | "\n", 442 | "Each of the built-in statistical variables are subclasses of the generic `Distribution` class in PyMC3. The `Distribution` carries relevant **attributes** about the probability distribution, such as the data type (called `dtype`), any relevant transformations (`transform`, see below), and initial values (`init_value`)." 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": null, 448 | "metadata": {}, 449 | "outputs": [], 450 | "source": [ 451 | "disasters.dtype" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": null, 457 | "metadata": {}, 458 | "outputs": [], 459 | "source": [ 460 | "early_mean.init_value" 461 | ] 462 | }, 463 | { 464 | "cell_type": "markdown", 465 | "metadata": {}, 466 | "source": [ 467 | "PyMC's built-in distribution variables can also be used to generate **random values** from that variable. 
For example, the `switchpoint`, which is a discrete uniform random variable, can generate random draws:" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": null, 473 | "metadata": {}, 474 | "outputs": [], 475 | "source": [ 476 | "plt.hist(switchpoint.random(size=1000))" 477 | ] 478 | }, 479 | { 480 | "cell_type": "markdown", 481 | "metadata": {}, 482 | "source": [ 483 | "As we noted earlier, some variables have undergone **transformations** prior to sampling. Such variables will have `transformed` attributes that points to the variable that it has been transformed to." 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": null, 489 | "metadata": {}, 490 | "outputs": [], 491 | "source": [ 492 | "early_mean.transformed" 493 | ] 494 | }, 495 | { 496 | "cell_type": "markdown", 497 | "metadata": {}, 498 | "source": [ 499 | "Variables will usually have an associated distribution, as determined by the constructor used to create it. For example, the `switchpoint` variable was created by calling `DiscreteUniform()`. 
Hence, its distribution is `DiscreteUniform`:" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [ 508 | "switchpoint.distribution" 509 | ] 510 | }, 511 | { 512 | "cell_type": "markdown", 513 | "metadata": {}, 514 | "source": [ 515 | "As with all Python objects, the underlying type of a variable can be exposed with the `type()` function:" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": null, 521 | "metadata": {}, 522 | "outputs": [], 523 | "source": [ 524 | "type(switchpoint)" 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": null, 530 | "metadata": {}, 531 | "outputs": [], 532 | "source": [ 533 | "type(disasters)" 534 | ] 535 | }, 536 | { 537 | "cell_type": "markdown", 538 | "metadata": {}, 539 | "source": [ 540 | "We will learn more about these types in an upcoming section." 541 | ] 542 | }, 543 | { 544 | "cell_type": "markdown", 545 | "metadata": {}, 546 | "source": [ 547 | "## Variable log-probabilities\n", 548 | "\n", 549 | "All PyMC3 stochastic variables can evaluate their probability mass or density functions at a particular value, given the values of their parents. The **logarithm** of a stochastic object's probability mass or density can be\n", 550 | "accessed via the `logp` method. " 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": null, 556 | "metadata": { 557 | "scrolled": true 558 | }, 559 | "outputs": [], 560 | "source": [ 561 | "switchpoint.logp({'switchpoint':55, 'early_mean_log':1, 'late_mean_log':1})" 562 | ] 563 | }, 564 | { 565 | "cell_type": "markdown", 566 | "metadata": {}, 567 | "source": [ 568 | "For **vector-valued** variables like `disasters`, the `logp` attribute returns the **sum** of the logarithms of\n", 569 | "the joint probability or density of all elements of the value." 
570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": null, 575 | "metadata": {}, 576 | "outputs": [], 577 | "source": [ 578 | "disasters.logp({'switchpoint':55, 'early_mean_log':1, 'late_mean_log':1})" 579 | ] 580 | }, 581 | { 582 | "cell_type": "markdown", 583 | "metadata": {}, 584 | "source": [ 585 | "### Custom variables\n", 586 | "\n", 587 | "Though we created the variables in `disaster_model` using well-known probability distributions that are available in PyMC3, its possible to create custom distributions by **wrapping** functions that compute an arbitrary log-probability using the `DensityDist` function. For example, our initial example showed an exponential survival function, which accounts for censored data. If we pass this function as the `logp` argument for `DensityDist`, we can use it as the data likelihood in a survival model:\n", 588 | "\n", 589 | "```python\n", 590 | "def logp(failure, value):\n", 591 | " return (failure * log(lam) - lam * value).sum()\n", 592 | "\n", 593 | "x = DensityDist('x', logp, observed={'failure':failure, 'value':t})\n", 594 | "```\n", 595 | "\n", 596 | "Users are thus not\n", 597 | "limited to the set of of statistical distributions provided by PyMC." 598 | ] 599 | }, 600 | { 601 | "cell_type": "markdown", 602 | "metadata": {}, 603 | "source": [ 604 | "## Fitting the model with MCMC\n", 605 | "\n", 606 | "PyMC3's `sample` function will fit probability models (linked collections of variables) like ours using Markov chain Monte Carlo (MCMC) sampling. 
Unless we manually assign particular algorithms to variables in our model, PyMC will assign algorithms that it deems appropriate (it usually does a decent job of this):" 607 | ] 608 | }, 609 | { 610 | "cell_type": "code", 611 | "execution_count": null, 612 | "metadata": {}, 613 | "outputs": [], 614 | "source": [ 615 | "with disaster_model:\n", 616 | " trace = sample(2000)" 617 | ] 618 | }, 619 | { 620 | "cell_type": "markdown", 621 | "metadata": {}, 622 | "source": [ 623 | "This returns the Markov chain of draws from the model in a data structure called a **trace**." 624 | ] 625 | }, 626 | { 627 | "cell_type": "code", 628 | "execution_count": null, 629 | "metadata": {}, 630 | "outputs": [], 631 | "source": [ 632 | "trace" 633 | ] 634 | }, 635 | { 636 | "cell_type": "markdown", 637 | "metadata": {}, 638 | "source": [ 639 | "The `sample()` function always takes at least one argument, `draws`, which specifies how many samples to draw. However, there are a number of additional optional arguments that are worth knowing about:" 640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": null, 645 | "metadata": {}, 646 | "outputs": [], 647 | "source": [ 648 | "help(sample)" 649 | ] 650 | }, 651 | { 652 | "cell_type": "markdown", 653 | "metadata": {}, 654 | "source": [ 655 | "The `step` argument is what allows users to manually override the sampling algorithms used to fit the model. 
For example, if we wanted to use a **slice sampler** to sample the `early_mean` and `late_mean` variables, we could specify it:" 656 | ] 657 | }, 658 | { 659 | "cell_type": "code", 660 | "execution_count": null, 661 | "metadata": {}, 662 | "outputs": [], 663 | "source": [ 664 | "from pymc3 import Slice\n", 665 | "\n", 666 | "with disaster_model:\n", 667 | " trace = sample(1000, step=Slice(vars=[early_mean, late_mean]))" 668 | ] 669 | }, 670 | { 671 | "cell_type": "markdown", 672 | "metadata": {}, 673 | "source": [ 674 | "### Accessing the samples\n", 675 | "\n", 676 | "The output of the `sample` function is a `MultiTrace` object, which stores the sequence of samples for each variable in the model. These traces can be accessed using dict-style indexing:\n" 677 | ] 678 | }, 679 | { 680 | "cell_type": "code", 681 | "execution_count": null, 682 | "metadata": { 683 | "scrolled": true 684 | }, 685 | "outputs": [], 686 | "source": [ 687 | "trace['late_mean']" 688 | ] 689 | }, 690 | { 691 | "cell_type": "markdown", 692 | "metadata": {}, 693 | "source": [ 694 | "The trace can also be sliced using the NumPy array slice `[start:stop:step]`. " 695 | ] 696 | }, 697 | { 698 | "cell_type": "code", 699 | "execution_count": null, 700 | "metadata": {}, 701 | "outputs": [], 702 | "source": [ 703 | "trace['late_mean', -5:]" 704 | ] 705 | }, 706 | { 707 | "cell_type": "markdown", 708 | "metadata": {}, 709 | "source": [ 710 | "### Sampling output\n", 711 | "\n", 712 | "You can examine the marginal posterior of any variable by plotting a\n", 713 | "histogram of its trace:" 714 | ] 715 | }, 716 | { 717 | "cell_type": "code", 718 | "execution_count": null, 719 | "metadata": {}, 720 | "outputs": [], 721 | "source": [ 722 | "plt.hist(trace['late_mean']);" 723 | ] 724 | }, 725 | { 726 | "cell_type": "markdown", 727 | "metadata": {}, 728 | "source": [ 729 | "PyMC has its own plotting functionality dedicated to plotting MCMC output. 
For example, we can obtain a time series plot of the trace and a histogram using `traceplot`:" 730 | ] 731 | }, 732 | { 733 | "cell_type": "code", 734 | "execution_count": null, 735 | "metadata": {}, 736 | "outputs": [], 737 | "source": [ 738 | "from pymc3 import traceplot\n", 739 | "\n", 740 | "traceplot(trace[500:], varnames=['early_mean', 'late_mean', 'switchpoint']);" 741 | ] 742 | }, 743 | { 744 | "cell_type": "markdown", 745 | "metadata": {}, 746 | "source": [ 747 | "The upper left-hand pane of each figure shows the temporal series of the\n", 748 | "samples from each parameter, while below is an autocorrelation plot of\n", 749 | "the samples. The right-hand pane shows a histogram of the trace. The\n", 750 | "trace is useful for evaluating and diagnosing the algorithm's\n", 751 | "performance, while the histogram is useful for\n", 752 | "visualizing the posterior.\n", 753 | "\n", 754 | "For a non-graphical summary of the posterior, simply call the `stats` method." 755 | ] 756 | }, 757 | { 758 | "cell_type": "code", 759 | "execution_count": null, 760 | "metadata": {}, 761 | "outputs": [], 762 | "source": [ 763 | "from pymc3 import summary\n", 764 | "\n", 765 | "summary(trace[500:], varnames=['early_mean', 'late_mean'])" 766 | ] 767 | } 768 | ], 769 | "metadata": { 770 | "kernelspec": { 771 | "display_name": "Python [default]", 772 | "language": "python", 773 | "name": "python3" 774 | }, 775 | "language_info": { 776 | "codemirror_mode": { 777 | "name": "ipython", 778 | "version": 3 779 | }, 780 | "file_extension": ".py", 781 | "mimetype": "text/x-python", 782 | "name": "python", 783 | "nbconvert_exporter": "python", 784 | "pygments_lexer": "ipython3", 785 | "version": "3.6.4" 786 | } 787 | }, 788 | "nbformat": 4, 789 | "nbformat_minor": 1 790 | } 791 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/2. 
Markov Chain Monte Carlo-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Markov chain Monte Carlo\n", 8 | "\n", 9 | "Let's briefly cover some theory regarding Bayesian analysis using Markov chain Monte Carlo (MCMC) methods. You might wonder why a numerical simulation method like MCMC is the standard approach for fitting Bayesian models. \n", 10 | "\n", 11 | "Gelman et al. (2013) break down the business of Bayesian analysis into three primary steps:\n", 12 | "\n", 13 | "1. Specify a full probability model, including all parameters, data, transformations, missing values and predictions that are of interest.\n", 14 | "2. Calculate the posterior distribution of the unknown quantities in the model, conditional on the data.\n", 15 | "3. Perform model checking to evaluate the quality and suitablility of the model.\n", 16 | "\n", 17 | "While each of these steps is challenging, it is the second step that is the most difficult for non-trivial models, and was a bottleneck for the adoption of Bayesian methods for decades. \n", 18 | "\n", 19 | "We can consider this in terms of what Blei et al. 
(2014) call the Box loop \n", 20 | "![box loop](images/boxloop.png)\n", 21 | "\n", 22 | "### Bayesian Inference\n", 23 | "\n", 24 | "At this point, we should all be familiar with **Bayes Formula**:\n", 25 | "\n", 26 | "![bayes formula](images/bayes_formula.png)\n", 27 | "\n", 28 | "The equation expresses how our belief about the value of \\\\(\\theta\\\\), as expressed by the **prior distribution** \\\\(P(\\theta)\\\\) is reallocated following the observation of the data \\\\(y\\\\), as expressed by the posterior distribution the posterior distribution.\n", 29 | "\n", 30 | "Computing the posterior distribution is called the **inference problem**, and is usually the goal of Bayesian analysis.\n", 31 | "\n", 32 | "The innocuous denominator \\\\(P(y)\\\\) (the model **evidence**, or **marginal likelihood**) cannot be calculated directly, and is actually the expression in the numerator, integrated over all \\\\(\\theta\\\\):\n", 33 | "\n", 34 | "
\n", 35 | "\\\\[Pr(\\theta|y) = \\frac{Pr(y|\\theta)Pr(\\theta)}{\\int Pr(y|\\theta)Pr(\\theta) d\\theta}\\\\]\n", 36 | "
\n", 37 | "\n", 38 | "Computing this integral, which may involve many variables, is generally intractible with analytic methods. This is the major compuational hurdle for Bayesian analysis.\n", 39 | "\n", 40 | "### Simulation Approaches for Bayesian Computation\n", 41 | "\n", 42 | "Since analysis is off the table, a reasonable alternative is to attempt to estimate the integral using numerical methods. For example, consider the expected value of a random variable $\\mathbf{x}$:\n", 43 | "\n", 44 | "$$\\begin{gathered}\n", 45 | "\\begin{split}E[{\\bf x}] = \\int {\\bf x} f({\\bf x}) d{\\bf x}, \\qquad\n", 46 | "{\\bf x} = \\{x_1,...,x_k\\}\\end{split}\\notag\\\\\\begin{split}\\end{split}\\notag\\end{gathered}$$\n", 47 | "\n", 48 | "where $k$ (the dimension of vector $x$) is perhaps very large. If we can produce a reasonable number of random vectors $\\{{\\bf x_i}\\}$, we can use these values to approximate the unknown integral. This process is known as *Monte Carlo integration*. In general, MC integration allows integrals against probability density functions:\n", 49 | "\n", 50 | "$$\\begin{gathered}\n", 51 | "\\begin{split}I = \\int h(\\mathbf{x}) f(\\mathbf{x}) \\mathbf{dx}\\end{split}\\notag\\\\\\begin{split}\\end{split}\\notag\\end{gathered}$$\n", 52 | "\n", 53 | "to be estimated by finite sums:\n", 54 | "\n", 55 | "$$\\begin{gathered}\n", 56 | "\\begin{split}\\hat{I} = \\frac{1}{n}\\sum_{i=1}^n h(\\mathbf{x}_i),\\end{split}\\notag\\\\\\begin{split}\\end{split}\\notag\\end{gathered}$$\n", 57 | "\n", 58 | "where $\\mathbf{x}_i$ is a sample from $f$. 
This estimate is valid and useful because:\n", 59 | "\n", 60 | "- By the strong law of large numbers:\n", 61 | "\n", 62 | "$$\\begin{gathered}\n", 63 | "\\begin{split}\\hat{I} \\rightarrow I \\mbox{ with probability 1}\\end{split}\\notag\\\\\\begin{split}\\end{split}\\notag\\end{gathered}$$\n", 64 | "\n", 65 | "- Simulation error can be measured and controlled:\n", 66 | "\n", 67 | "$$Var(\\hat{I}) = \\frac{1}{n(n-1)}\\sum_{i=1}^n\n", 68 | " (h(\\mathbf{x}_i)-\\hat{I})^2$$" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "### How is this relevant to Bayesian analysis? \n", 76 | "\n", 77 | "When we observe data $y$ that we hypothesize as being obtained from a sampling model $f(y|\\theta)$, where $\\theta$ is a vector of (unknown) model parameters, a Bayesian places a *prior* distribution $p(\\theta)$ on the parameters to describe the uncertainty in the true values of the parameters. Bayesian inference, then, is obtained by calculating the *posterior* distribution, which is proportional to the product of these quantities:\n", 78 | "\n", 79 | "$$p(\\theta | y) \\propto f(y|\\theta) p(\\theta)$$\n", 80 | "\n", 81 | "unfortunately, for most problems of interest, the normalizing constant cannot be calculated because it involves mutli-dimensional integration over $\\theta$.\n", 82 | "\n", 83 | "Returning to our integral for MC sampling, if we replace $f(\\mathbf{x})$\n", 84 | "with a posterior, $p(\\theta|y)$ and make $h(\\theta)$ an interesting function of the unknown parameter, the resulting expectation is that of the posterior of $h(\\theta)$:\n", 85 | "\n", 86 | "$$E[h(\\theta)|y] = \\int h(\\theta) p(\\theta|y) d\\theta \\approx \\frac{1}{n}\\sum_{i=1}^n h(\\theta)$$\n", 87 | "\n", 88 | "We also require integrals to obtain marginal estimates from a joint model. 
If $\\theta$ is of length $K$, then inference about any particular parameter is obtained by:\n", 89 | "\n", 90 | "$$p(\\theta_i|y) \\propto \\int p(\\theta|y) d\\theta_{-i}$$\n", 91 | "\n", 92 | "where the `-i` subscript indicates all elements except the $i^{th}$." 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "### Sampling Markov Chains\n", 100 | "\n", 101 | "The expectation above assumes that the draws of $\\theta$ are **independent**. The limitation in using Monte Carlo sampling for Bayesian inference is that it is not usually feasible to make independent draws from the posterior distribution. \n", 102 | "\n", 103 | "The first \"MC\" in MCMC stands for **Markov chain**. A Markov chain is a **stochastic process**, an indexed set of random variables, where the value of a particular random variable in the set is dependent only on the random variable corresponding to the prevous index. This is a Markovian dependence structure:\n", 104 | "\n", 105 | "$$Pr(X_{t+1}=x_{t+1} | X_t=x_t, X_{t-1}=x_{t-1},\\ldots,X_0=x_0) = Pr(X_{t+1}=x_{t+1} | X_t=x_t)$$\n", 106 | "\n", 107 | "This conditioning specifies that the future depends on the current state, but not past states. Thus, the Markov chain wanders about the state space, remembering only where it has just been in the last time step. The collection of transition probabilities is sometimes called a *transition matrix* when dealing with discrete states, or more generally, a *transition kernel*.\n", 108 | "\n", 109 | "MCMC allows us to generate samples from a particular posterior distribution as a Markov chain. The magic is that the resulting sample, even though it is dependent in this way, is indistinguishable from an independent sample from the true posterior." 
110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "## Why MCMC Works: Reversible Markov Chains\n", 117 | "\n", 118 | "Markov chain Monte Carlo simulates a Markov chain for which some function of interest\n", 119 | "(*e.g.* the joint distribution of the parameters of some model) is the unique, invariant limiting distribution. An invariant distribution with respect to some Markov chain with transition kernel $Pr(y \\mid x)$ implies that:\n", 120 | "\n", 121 | "$$\\begin{gathered}\n", 122 | "\\begin{split}\\int_x Pr(y \\mid x) \\pi(x) dx = \\pi(y).\\end{split}\\notag\\\\\\begin{split}\\end{split}\\notag\n", 123 | "\\end{gathered}$$\n", 124 | "\n", 125 | "Invariance is guaranteed for any *reversible* Markov chain. Consider a Markov chain in reverse sequence:\n", 126 | "$\\{\\theta^{(n)},\\theta^{(n-1)},...,\\theta^{(0)}\\}$. This sequence is still Markovian, because:\n", 127 | "\n", 128 | "$$\\begin{gathered}\n", 129 | "\\begin{split}Pr(\\theta^{(k)}=y \\mid \\theta^{(k+1)}=x,\\theta^{(k+2)}=x_1,\\ldots ) = Pr(\\theta^{(k)}=y \\mid \\theta^{(k+1)}=x)\\end{split}\\notag\\\\\\begin{split}\\end{split}\\notag\\end{gathered}$$\n", 130 | "\n", 131 | "Forward and reverse transition probabilities may be related through Bayes theorem:\n", 132 | "\n", 133 | "$$\\begin{gathered}\n", 134 | "\\begin{split}\\end{split}\\notag\\end{gathered}$$\n", 135 | "\n", 136 | "$$\\begin{gathered}\n", 137 | "\\begin{split}\\frac{Pr(\\theta^{(k+1)}=x \\mid \\theta^{(k)}=y) \\pi^{(k)}(y)}{\\pi^{(k+1)}(x)}\\end{split}\\notag\\\\\\begin{split}\\end{split}\\notag\\end{gathered}$$\n", 138 | "\n", 139 | "Though not homogeneous in general, $\\pi$ becomes homogeneous if:\n", 140 | "\n", 141 | "- $n \\rightarrow \\infty$\n", 142 | "\n", 143 | "- $\\pi^{(i)}=\\pi$ for some $i < k$\n", 144 | "\n", 145 | "If this chain is homogeneous it is called reversible, because it satisfies the ***detailed balance equation***:\n", 146 | "\n", 147 | 
"$$\\begin{gathered}\n", 148 | "\\begin{split}\\pi(x)Pr(y \\mid x) = \\pi(y) Pr(x \\mid y)\\end{split}\\notag\\\\\\begin{split}\\end{split}\\notag\\end{gathered}$$\n", 149 | "\n", 150 | "Reversibility is important because it has the effect of balancing movement through the entire state space. When a Markov chain is reversible, $\\pi$ is the unique, invariant, stationary distribution of that chain. Hence, if $\\pi$ is of interest, we need only find the reversible Markov chain for which $\\pi$ is the limiting distribution.\n", 151 | "This is what MCMC does!" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "## The Metropolis-Hastings Algorithm\n", 159 | "\n", 160 | "One of the simplest and most flexible MCMC algorithms is the Metropolis-Hastings sampler. This algorithm generates candidate state transitions from an auxilliary distribution, and accepts or rejects each candidate probabilistically, according to the posterior distribution of the model.\n", 161 | "\n", 162 | "Let us first consider a simple Metropolis-Hastings algorithm for a single parameter, $\\theta$. We will use a standard sampling distribution, referred to as the *proposal distribution*, to produce candidate variables $q_t(\\theta^{\\prime} | \\theta)$. That is, the generated value, $\\theta^{\\prime}$, is a *possible* next value for\n", 163 | "$\\theta$ at step $t+1$. We also need to be able to calculate the probability of moving back to the original value from the candidate, or\n", 164 | "$q_t(\\theta | \\theta^{\\prime})$. 
These probabilistic ingredients are used to define an *acceptance ratio*:\n", 165 | "\n", 166 | "$$\\begin{gathered}\n", 167 | "\\begin{split}a(\\theta^{\\prime},\\theta) = \\frac{q_t(\\theta^{\\prime} | \\theta) \\pi(\\theta^{\\prime})}{q_t(\\theta | \\theta^{\\prime}) \\pi(\\theta)}\\end{split}\\notag\\\\\\begin{split}\\end{split}\\notag\\end{gathered}$$\n", 168 | "\n", 169 | "The value of $\\theta^{(t+1)}$ is then determined by:\n", 170 | "\n", 171 | "$$\\begin{gathered}\n", 172 | "\\begin{split}\\theta^{(t+1)} = \\left\\{\\begin{array}{l@{\\quad \\mbox{with prob.} \\quad}l}\\theta^{\\prime} & \\min(a(\\theta^{\\prime},\\theta^{(t)}),1) \\\\ \\theta^{(t)} & 1 - \\min(a(\\theta^{\\prime},\\theta^{(t)}),1) \\end{array}\\right.\\end{split}\\notag\\\\\\begin{split}\\end{split}\\notag\\end{gathered}$$\n", 173 | "\n", 174 | "This transition kernel implies that movement is not guaranteed at every step. It only occurs if the suggested transition is likely based on the acceptance ratio.\n", 175 | "\n", 176 | "A single iteration of the Metropolis-Hastings algorithm proceeds as follows:\n", 177 | "\n", 178 | "The original form of the algorithm specified by Metropolis required that\n", 179 | "$q_t(\\theta^{\\prime} | \\theta) = q_t(\\theta | \\theta^{\\prime})$, which reduces $a(\\theta^{\\prime},\\theta)$ to\n", 180 | "$\\pi(\\theta^{\\prime})/\\pi(\\theta)$, but this is not necessary. In either case, the state moves to high-density points in the distribution with high probability, and to low-density points with low probability. After convergence, the Metropolis-Hastings algorithm describes the full target posterior density, so all points are recurrent.\n", 181 | "\n", 182 | "1. Sample $\\theta^{\\prime}$ from $q(\\theta^{\\prime} | \\theta^{(t)})$.\n", 183 | "\n", 184 | "2. Generate a Uniform[0,1] random variate $u$.\n", 185 | "\n", 186 | "3. 
If $a(\\theta^{\\prime},\\theta) > u$ then\n", 187 | " $\\theta^{(t+1)} = \\theta^{\\prime}$, otherwise\n", 188 | " $\\theta^{(t+1)} = \\theta^{(t)}$.\n", 189 | "\n", 190 | "### Random-walk Metropolis-Hastings\n", 191 | "\n", 192 | "A practical implementation of the Metropolis-Hastings algorithm makes use of a random-walk proposal.\n", 193 | "Recall that a random walk is a Markov chain that evolves according to:\n", 194 | "\n", 195 | "$$\n", 196 | "\\theta^{(t+1)} = \\theta^{(t)} + \\epsilon_t \\\\\n", 197 | "\\epsilon_t \\sim f(\\phi)\n", 198 | "$$\n", 199 | "\n", 200 | "As applied to the MCMC sampling, the random walk is used as a proposal distribution, whereby dependent proposals are generated according to:\n", 201 | "\n", 202 | "$$\\begin{gathered}\n", 203 | "\\begin{split}q(\\theta^{\\prime} | \\theta^{(t)}) = f(\\theta^{\\prime} - \\theta^{(t)}) = \\theta^{(t)} + \\epsilon_t\\end{split}\\notag\\\\\\begin{split}\\end{split}\\notag\\end{gathered}$$\n", 204 | "\n", 205 | "Generally, the density generating $\\epsilon_t$ is symmetric about zero,\n", 206 | "resulting in a symmetric chain. Chain symmetry implies that\n", 207 | "$q(\\theta^{\\prime} | \\theta^{(t)}) = q(\\theta^{(t)} | \\theta^{\\prime})$,\n", 208 | "which reduces the Metropolis-Hastings acceptance ratio to:\n", 209 | "\n", 210 | "$$\\begin{gathered}\n", 211 | "\\begin{split}a(\\theta^{\\prime},\\theta) = \\frac{\\pi(\\theta^{\\prime})}{\\pi(\\theta)}\\end{split}\\notag\\\\\\begin{split}\\end{split}\\notag\\end{gathered}$$\n", 212 | "\n", 213 | "The choice of the random walk distribution for $\\epsilon_t$ is frequently a normal or Student’s $t$ density, but it may be any distribution that generates an irreducible proposal chain.\n", 214 | "\n", 215 | "An important consideration is the specification of the scale parameter for the random walk error distribution. 
Large values produce random walk steps that are highly exploratory, but tend to produce proposal values in the tails of the target distribution, potentially resulting in very small acceptance rates. Conversely, small values tend to be accepted more frequently, since they tend to produce proposals close to the current parameter value, but may result in chains that mix very slowly.\n", 216 | "Some simulation studies suggest optimal acceptance rates in the range of 20-50%. It is often worthwhile to optimize the proposal variance by iteratively adjusting its value, according to observed acceptance rates early in the MCMC simulation ." 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "# Hamiltonian Monte Carlo" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "While flexible and easy to implement, Metropolis-Hastings sampling is a random walk\n", 231 | "sampler that might not be statistically efficient for many models. In\n", 232 | "this context, and when sampling from continuous variables, Hamiltonian (or Hybrid) Monte\n", 233 | "Carlo (HMC) can prove to be a powerful tool. It avoids\n", 234 | "random walk behavior by simulating a physical system governed by\n", 235 | "Hamiltonian dynamics, potentially avoiding tricky conditional\n", 236 | "distributions in the process.\n", 237 | "\n", 238 | "![hmc comparison](images/hmc.png)\n", 239 | "\n", 240 | "In HMC, model samples are obtained by simulating a physical system,\n", 241 | "where particles move about a high-dimensional landscape, subject to\n", 242 | "potential and kinetic energies. Adapting the notation from [Neal (1993)](http://www.cs.toronto.edu/~radford/review.abstract.html),\n", 243 | "particles are characterized by a position vector or state\n", 244 | "$s \\in \\mathcal{R}^D$ and velocity vector $\\phi \\in \\mathcal{R}^D$. The\n", 245 | "combined state of a particle is denoted as $\\chi=(s,\\phi)$. 
The\n", 246 | "Hamiltonian is then defined as the sum of potential energy $E(s)$ and kinetic energy\n", 247 | "$K(\\phi)$, as follows:\n", 248 | "\n", 249 | "$$\\mathcal{H}(s,\\phi) = E(s) + K(\\phi)\n", 250 | "= E(s) + \\frac{1}{2} \\sum_i \\phi_i^2$$\n", 251 | "\n", 252 | "Instead of sampling $p(s)$ directly, HMC operates by sampling from the\n", 253 | "canonical distribution\n", 254 | "$p(s,\\phi) = \\frac{1}{Z} \\exp(-\\mathcal{H}(s,\\phi))=p(s)p(\\phi)$.\n", 255 | "Because the two variables are independent, marginalizing over $\\phi$ is\n", 256 | "trivial and recovers the original distribution of interest.\n", 257 | "\n", 258 | "**Hamiltonian Dynamics**\n", 259 | "\n", 260 | "State $s$ and velocity $\\phi$ are modified such that\n", 261 | "$\\mathcal{H}(s,\\phi)$ remains constant throughout the simulation. The\n", 262 | "differential equations are given by:\n", 263 | "\n", 264 | "$$\\begin{aligned}\\frac{ds_i}{dt} &= \\frac{\\partial \\mathcal{H}}{\\partial \\phi_i} = \\phi_i \\\\\n", 265 | "\\frac{d\\phi_i}{dt} &= - \\frac{\\partial \\mathcal{H}}{\\partial s_i}\n", 266 | "= - \\frac{\\partial E}{\\partial s_i}\n", 267 | "\\end{aligned}$$\n", 268 | "\n", 269 | "As shown in [Neal (1993)](http://www.cs.toronto.edu/~radford/review.abstract.html), \n", 270 | "the above transformation preserves volume and is\n", 271 | "reversible. The above dynamics can thus be used as transition operators\n", 272 | "of a Markov chain and will leave $p(s,\\phi)$ invariant. That chain by\n", 273 | "itself is not ergodic however, since simulating the dynamics maintains a\n", 274 | "fixed Hamiltonian $\\mathcal{H}(s,\\phi)$. HMC thus alternates Hamiltonian\n", 275 | "dynamic steps, with Gibbs sampling of the velocity. 
Because $p(s)$ and\n", 276 | "$p(\\phi)$ are independent, sampling $\\phi_{new} \\sim p(\\phi|s)$ is\n", 277 | "trivial since $p(\\phi|s)=p(\\phi)$, where $p(\\phi)$ is often taken to be\n", 278 | "the univariate Gaussian.\n", 279 | "\n", 280 | "**The Leap-Frog Algorithm**\n", 281 | "\n", 282 | "In practice, we cannot simulate Hamiltonian dynamics exactly because of\n", 283 | "the problem of time discretization. There are several ways one can do\n", 284 | "this. To maintain invariance of the Markov chain however, care must be\n", 285 | "taken to preserve the properties of *volume conservation* and *time\n", 286 | "reversibility*. The **leap-frog algorithm** maintains these properties\n", 287 | "and operates in 3 steps:\n", 288 | "\n", 289 | "$$\\begin{aligned}\n", 290 | "\\phi_i(t + \\epsilon/2) &= \\phi_i(t) - \\frac{\\epsilon}{2} \\frac{\\partial{}}{\\partial s_i} E(s(t)) \\\\\n", 291 | "s_i(t + \\epsilon) &= s_i(t) + \\epsilon \\phi_i(t + \\epsilon/2) \\\\\n", 292 | "\\phi_i(t + \\epsilon) &= \\phi_i(t + \\epsilon/2) - \\frac{\\epsilon}{2} \\frac{\\partial{}}{\\partial s_i} E(s(t + \\epsilon)) \n", 293 | "\\end{aligned}$$\n", 294 | "\n", 295 | "We thus perform a half-step update of the velocity at time\n", 296 | "$t+\\epsilon/2$, which is then used to compute $s(t + \\epsilon)$ and\n", 297 | "$\\phi(t + \\epsilon)$.\n", 298 | "\n", 299 | "**Accept / Reject**\n", 300 | "\n", 301 | "In practice, using finite stepsizes $\\epsilon$ will not preserve\n", 302 | "$\\mathcal{H}(s,\\phi)$ exactly and will introduce bias in the simulation.\n", 303 | "Also, rounding errors due to the use of floating point numbers means\n", 304 | "that the above transformation will not be perfectly reversible.\n", 305 | "\n", 306 | "HMC cancels these effects **exactly** by adding a Metropolis\n", 307 | "accept/reject stage, after $n$ leapfrog steps. 
The new state\n", 308 | "$\chi' = (s',\phi')$ is accepted with probability $p_{acc}(\chi,\chi')$,\n", 309 | "defined as:\n", 310 | "\n", 311 | "$$p_{acc}(\chi,\chi') = \min \left( 1, \frac{\exp(-\mathcal{H}(s',\phi'))}{\exp(-\mathcal{H}(s,\phi))} \right)$$\n", 312 | "\n", 313 | "**HMC Algorithm**\n", 314 | "\n", 315 | "We obtain a new HMC sample as follows:\n", 316 | "\n", 317 | "1. sample a new velocity from a univariate Gaussian distribution\n", 318 | "2. perform $n$ leapfrog steps to obtain the new state $\chi'$\n", 319 | "3. perform accept/reject move of $\chi'$" 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": {}, 325 | "source": [ 326 | "## No U-Turn Sampling\n", 327 | "\n", 328 | "The major drawback of the HMC algorithm is the extensive tuning required to make it sample efficiently. There are a handful of parameters that require specification by the user:\n", 329 | "\n", 330 | "- the scaling of the momentum distribution\n", 331 | "- the step size for the leapfrog algorithm\n", 332 | "- the number of steps to be taken for the leapfrog algorithm\n", 333 | "\n", 334 | "When these parameters are poorly-chosen, the HMC algorithm can suffer severe losses in efficiency. For example, if we take steps that are too short, the simulation becomes a random walk, while steps that are too long end up retracing paths already taken.\n", 335 | "\n", 336 | "An efficient MCMC algorithm seeks to optimize mixing, while maintaining detailed balance. While HMC can be tuned on-the-fly, it requires costly burn-in runs to do so.\n", 337 | "\n", 338 | "![nuts](images/nuts.png)\n", 339 | "\n", 340 | "The No U-turn Sampling (NUTS) algorithm automatically tunes the step size and step number parameters, without any intervention from the user. To do so, NUTS constructs a binary tree of leapfrog steps by repeated doubling. 
When the trajectory of steps creates an angle of more than 90 degrees (*i.e.* a u-turn), the doubling stops, and a point is proposed.\n", 341 | "\n", 342 | "![binary doubling](images/binary_doubling.png)\n", 343 | "\n", 344 | "NUTS provides the efficiency of gradient-based MCMC sampling without extensive user intervention required to tune Hamiltonian Monte Carlo. As a result, NUTS is the default sampling algorithm for continuous variables in PyMC3." 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": {}, 350 | "source": [ 351 | "## References\n", 352 | "\n", 353 | "1. [Gelman, A., Carlin, J. B., Stern, H. S., Dunson, D. B., Vehtari, A., and Rubin, D. B. (2013)](http://www.stat.columbia.edu/~gelman/book/). Bayesian Data Analysis. Chapman & Hall/CRC Press, London, third edition.\n", 354 | "2. [Geyer, C. (2013)](http://www.mcmchandbook.net/HandbookChapter1.pdf) Introduction to Markov Chain Monte Carlo. In *Handbook of Markov Chain Monte Carlo*, S. Brooks, A. Gelman, G. Jones, X.L. Meng, eds. CRC Press.\n", 355 | "3. [Neal, R.M. (1993)](http://www.cs.toronto.edu/~radford/review.abstract.html) Probabilistic Inference Using Markov Chain Monte Carlo Methods, Technical Report CRG-TR-93-1, Dept. of Computer Science, University of Toronto, 144 pages.\n", 356 | "4. [Blei, David M. (2014)](https://www.annualreviews.org/doi/full/10.1146/annurev-statistics-022513-115657) Build, compute, critique, repeat: Data analysis with latent variable models. 
*Annual Review of Statistics and Its Application 1*: 203-232.\n" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": null, 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [] 365 | } 366 | ], 367 | "metadata": { 368 | "kernelspec": { 369 | "display_name": "Python [default]", 370 | "language": "python", 371 | "name": "python3" 372 | }, 373 | "language_info": { 374 | "codemirror_mode": { 375 | "name": "ipython", 376 | "version": 3 377 | }, 378 | "file_extension": ".py", 379 | "mimetype": "text/x-python", 380 | "name": "python", 381 | "nbconvert_exporter": "python", 382 | "pygments_lexer": "ipython3", 383 | "version": "3.6.4" 384 | } 385 | }, 386 | "nbformat": 4, 387 | "nbformat_minor": 1 388 | } 389 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/3. Theano-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Theano\n", 8 | "\n", 9 | "While most of PyMC3's user-facing features are written in pure Python, it leverages Theano (Bergstra et al., 2010) to transparently transcode models to C and compile them to machine code, thereby boosting performance. Theano is a library that allows expressions to be defined using generalized vector data structures called **tensors**, which are tightly integrated with the popular NumPy `ndarray` data structure, and similarly allow for broadcasting and advanced indexing, just as NumPy arrays do. Theano also automatically optimizes the likelihood's computational graph for speed and provides simple GPU integration." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "Theano is a Python library that allows you to define, optimize, and evaluate mathematical expressions involving multi-dimensional arrays efficiently. 
Theano features:\n", 17 | "\n", 18 | "* __tight integration with numpy__ – Use numpy.ndarray in Theano-compiled functions.\n", 19 | "* __transparent use of a GPU__ – Perform data-intensive calculations up to 140x faster than with CPU.(float32 only)\n", 20 | "* __efficient symbolic differentiation__ – Theano does your derivatives for function with one or many inputs.\n", 21 | "* __speed and stability optimizations__ – Get the right answer for log(1+x) even when x is really tiny.\n", 22 | "* __dynamic C code generation__ – Evaluate expressions faster.\n", 23 | "* __extensive unit-testing and self-verification__ – Detect and diagnose errors.\n", 24 | "\n", 25 | "Theano is part programming language, part compiler. It is often used to build machine learning, just as packages like TensorFlow are, though it is not in itself a machine learning toolkit; think of it as a **mathematical toolkit**.\n", 26 | "\n", 27 | "### Installing Theano\n", 28 | "\n", 29 | "The easiest way to install Theano is to build it from source, using **pip**:\n", 30 | "\n", 31 | "```bash\n", 32 | "pip install --upgrade --no-deps git+git://github.com/Theano/Theano.git\n", 33 | "```\n", 34 | "\n", 35 | "however, if you have PyMC3 installed, then Theano will already be available." 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## Adding Two Scalars\n", 43 | "\n", 44 | "To get us started with Theano and get a feel of what we're working with, let's make a simple function: *add two numbers together*. 
Here is how you do it:\n", 45 | "\n", 46 | "### Step 1 - Declaring Variables" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "from theano import function, shared\n", 56 | "from theano import tensor as T\n", 57 | "import theano\n", 58 | "\n", 59 | "x = T.dscalar('x')\n", 60 | "y = T.dscalar('y')" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "In Theano, all symbols must be typed. In particular, `T.dscalar`\n", 68 | "is the type we assign to \"0-dimensional arrays (`scalar`) of doubles\n", 69 | "(`d`)\". It is a Theano `type`." 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "type(x)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "x.type" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "T.dscalar" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "### Step 2 - Symbolic Expressions\n", 104 | "\n", 105 | "The second step is to combine *x* and *y* into their sum *z*:" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "z = x + y" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "*z* is yet another *Variable* which represents the addition of\n", 122 | "*x* and *y*. 
You can use the `pp` function to *pretty-print* out the computation associated to *z*.\n" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "from theano.printing import pp\n", 132 | "\n", 133 | "print(pp(z))" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "### Step 3 - Compiling a Function\n", 141 | "\n", 142 | "The last step is to create a function taking `x` and `y` as **inputs** and returning `z` as **output**:" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "f = function([x, y], z)" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "The first argument to `function()` is a list of `Variable`s that will be provided as inputs to the function. The second argument is a single `Variable` *or* a list of `Variable`s. For either case, the second argument is what we want to see as output when we apply the function. `f` may then be used like a normal Python function." 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "Now we can call the function:" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "f(2, 3)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "f(16.4, 12.1)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "If you are following along and typing into an interpreter, you may have\n", 191 | "noticed that there was a slight delay in executing the ``function``\n", 192 | "instruction. Behind the scenes, *f* was being compiled into C code." 
193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "Internally, Theano builds a graph structure composed of interconnected `Variable` nodes, `op` nodes and `apply` nodes. \n", 200 | "\n", 201 | "An `apply` node represents the application of an `op` to some variables. It is important to draw the difference between the definition of a computation represented by an `op` and its application to some actual data which is represented by the apply node. \n", 202 | "\n", 203 | "Here is the expression graph corresponding to the addition of `x` and `y`:" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "from theano import printing\n", 213 | "\n", 214 | "printing.pydotprint(f, 'images/f.png')" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "from IPython.display import Image\n", 224 | "Image('images/f.png', width='80%')" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "A `Variable` is the main data structure you work with when using Theano. By calling `T.dscalar` with a string argument, you create a `Variable` representing a floating-point scalar quantity with the given name. If you provide no argument, the symbol will be unnamed. Names are not required, but they can help debugging." 
232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "## Adding Two Matrices\n", 239 | "\n", 240 | "If we want to work with matrices instead of scalars, the only change\n", 241 | "from the previous example is that you need to instantiate *x* and\n", 242 | "*y* using the matrix Types:" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": {}, 249 | "outputs": [], 250 | "source": [ 251 | "x = T.dmatrix('x')\n", 252 | "y = T.dmatrix('y')\n", 253 | "z = x + y\n", 254 | "f = function([x, y], z)" 255 | ] 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "metadata": {}, 260 | "source": [ 261 | "``dmatrix`` is the Type for matrices of doubles. Then we can use\n", 262 | "our new function on 2D arrays:\n" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "f([[1, 2], [3, 4]], [[10, 20], [30, 40]])" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "The following types are available:\n", 279 | "\n", 280 | "* **byte**: ``bscalar, bvector, bmatrix, brow, bcol, btensor3, btensor4``\n", 281 | "* **16-bit integers**: ``wscalar, wvector, wmatrix, wrow, wcol, wtensor3, wtensor4``\n", 282 | "* **32-bit integers**: ``iscalar, ivector, imatrix, irow, icol, itensor3, itensor4``\n", 283 | "* **64-bit integers**: ``lscalar, lvector, lmatrix, lrow, lcol, ltensor3, ltensor4``\n", 284 | "* **float**: ``fscalar, fvector, fmatrix, frow, fcol, ftensor3, ftensor4``\n", 285 | "* **double**: ``dscalar, dvector, dmatrix, drow, dcol, dtensor3, dtensor4``\n", 286 | "* **complex**: ``cscalar, cvector, cmatrix, crow, ccol, ctensor3, ctensor4``" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "An example of a slightly more interesting function is the **logistic curve**. 
Let's create a matrix, and apply the logistic transformation to it:" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "x = T.dmatrix('x')" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "The logistic transformation:" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [ 318 | "s = 1 / (1 + T.exp(-x))" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "logistic = function([x], s)\n", 328 | "logistic([[0, 1], [-1, -2]])" 329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": {}, 334 | "source": [ 335 | "Theano supports functions with multiple outputs. For example, we can compute the **elementwise difference**, **absolute difference**, and **squared difference** between two matrices `a` and `b` at the same time." 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": null, 341 | "metadata": {}, 342 | "outputs": [], 343 | "source": [ 344 | "a, b = T.dmatrices('a', 'b')\n", 345 | "\n", 346 | "# Operations\n", 347 | "diff = a - b\n", 348 | "abs_diff = abs(diff)\n", 349 | "diff_squared = diff ** 2" 350 | ] 351 | }, 352 | { 353 | "cell_type": "markdown", 354 | "metadata": {}, 355 | "source": [ 356 | "When we use the function `f`, it returns the three computed results as a list." 
357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": null, 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "f = function([a, b], [diff, abs_diff, diff_squared])\n", 366 | "\n", 367 | "f([[1, 1], [1, 1]], [[0, 1], [2, 3]])" 368 | ] 369 | }, 370 | { 371 | "cell_type": "markdown", 372 | "metadata": {}, 373 | "source": [ 374 | "## Setting a Default Value for an Argument\n", 375 | " \n", 376 | "Let's say you want to define a function that adds two numbers, except that if you only provide one number, the other input is assumed to be one. In Python, the **default value** for parameters achieves this effect.\n", 377 | "\n", 378 | "In Theano we make use of the `In` class, which allows you to specify properties of your function's parameters with greater detail. Here we give a default value of 1 for `y` by creating an ``In`` instance with its ``value`` field set to 1. Inputs with default values must **follow** inputs without default values (like Python's functions). There can be multiple inputs with default values. These parameters can be set positionally or by name, as in standard Python." 
379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "metadata": {}, 385 | "outputs": [], 386 | "source": [ 387 | "from theano import In\n", 388 | "\n", 389 | "x, y, w = T.dscalars('x', 'y', 'w')\n", 390 | "z = (x + y) * w\n", 391 | "g = function([x, In(y, value=1), In(w, value=2, name='w_by_name')], z)" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": null, 397 | "metadata": {}, 398 | "outputs": [], 399 | "source": [ 400 | "print('g(33) = {}'.format(g(33)))" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": null, 406 | "metadata": {}, 407 | "outputs": [], 408 | "source": [ 409 | "print('g(33, 0, 1) = {}'.format(g(33, 0, 1)))" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "print('g(33, w_by_name=1) = {}'.format(g(33, w_by_name=1)))" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "metadata": {}, 425 | "outputs": [], 426 | "source": [ 427 | "print('g(33, w_by_name=1, y=0) = {}'.format(g(33, w_by_name=1, y=0)))" 428 | ] 429 | }, 430 | { 431 | "cell_type": "markdown", 432 | "metadata": {}, 433 | "source": [ 434 | "## Maintaining State with Shared Variables\n", 435 | "\n", 436 | "It is also possible to make a function with an internal state. For example, let’s say we want to make an **accumulator**: at the beginning, the state is initialized to zero. Then, on each function call, the state is incremented by the function’s argument.\n", 437 | "\n", 438 | "First let’s define the accumulator function. It adds its argument to the internal state, and returns the old state value." 
439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": null, 444 | "metadata": {}, 445 | "outputs": [], 446 | "source": [ 447 | "state = shared(0)\n", 448 | "inc = T.iscalar('inc')\n", 449 | "accumulator = function([inc], state, updates=[(state, state+inc)])" 450 | ] 451 | }, 452 | { 453 | "cell_type": "markdown", 454 | "metadata": {}, 455 | "source": [ 456 | "This code introduces a couple of new concepts. The `shared` function constructs so-called **shared variables**. \n", 457 | "\n", 458 | " state = shared(0)\n", 459 | "\n", 460 | "These are hybrid symbolic and non-symbolic variables whose value may be shared between multiple functions. \n", 461 | "\n", 462 | "Shared variables can be used in symbolic expressions but they also have an internal value that defines the value taken by this symbolic variable in all the functions that use it. It is called a shared variable because its value is shared between many functions. The value can be accessed and modified by the `get_value` and `set_value` methods.\n", 463 | "\n", 464 | "The other new thing in this code is the `updates` parameter of function. \n", 465 | "\n", 466 | " updates=[(state, state+inc)\n", 467 | "\n", 468 | "`updates` must be supplied with a list of pairs of the form `(shared-variable, new expression)`. It can also be a dictionary whose keys are shared-variables and values are the new expressions. Here, the accumulator replaces the `state`‘s value with the sum of `state` and the increment amount `inc`." 
469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": null, 474 | "metadata": {}, 475 | "outputs": [], 476 | "source": [ 477 | "print(state.get_value())" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": null, 483 | "metadata": {}, 484 | "outputs": [], 485 | "source": [ 486 | "print(accumulator(1))" 487 | ] 488 | }, 489 | { 490 | "cell_type": "code", 491 | "execution_count": null, 492 | "metadata": {}, 493 | "outputs": [], 494 | "source": [ 495 | "print(state.get_value())" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": null, 501 | "metadata": {}, 502 | "outputs": [], 503 | "source": [ 504 | "print(accumulator(300))" 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "execution_count": null, 510 | "metadata": {}, 511 | "outputs": [], 512 | "source": [ 513 | "print(state.get_value())" 514 | ] 515 | }, 516 | { 517 | "cell_type": "markdown", 518 | "metadata": {}, 519 | "source": [ 520 | "It is possible to reset the state. Just use the `set_value` method:" 521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": null, 526 | "metadata": {}, 527 | "outputs": [], 528 | "source": [ 529 | "state.set_value(-1)" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": null, 535 | "metadata": {}, 536 | "outputs": [], 537 | "source": [ 538 | "print(accumulator(3))" 539 | ] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "execution_count": null, 544 | "metadata": {}, 545 | "outputs": [], 546 | "source": [ 547 | "print(state.get_value())" 548 | ] 549 | }, 550 | { 551 | "cell_type": "markdown", 552 | "metadata": {}, 553 | "source": [ 554 | "As we mentioned above, you can define more than one function to use the same shared variable. These functions can all update the value." 
555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": null, 560 | "metadata": {}, 561 | "outputs": [], 562 | "source": [ 563 | "decrementor = function([inc], state, updates=[(state, state-inc)])" 564 | ] 565 | }, 566 | { 567 | "cell_type": "code", 568 | "execution_count": null, 569 | "metadata": {}, 570 | "outputs": [], 571 | "source": [ 572 | "print(decrementor(2))" 573 | ] 574 | }, 575 | { 576 | "cell_type": "code", 577 | "execution_count": null, 578 | "metadata": {}, 579 | "outputs": [], 580 | "source": [ 581 | "print(state.get_value())" 582 | ] 583 | }, 584 | { 585 | "cell_type": "markdown", 586 | "metadata": {}, 587 | "source": [ 588 | "You might be wondering why the updates mechanism exists. You can always achieve a similar result by returning the new expressions, and working with them in NumPy as usual. \n", 589 | "\n", 590 | "While the updates mechanism can be a syntactic convenience, it is mainly there for **efficiency**. Updates to shared variables can sometimes be done more quickly using in-place algorithms (*e.g.* low-rank matrix updates). \n", 591 | "\n", 592 | "Also, Theano has more control over where and how shared variables are allocated, which is one of the important elements of getting good performance on the GPU." 593 | ] 594 | }, 595 | { 596 | "cell_type": "markdown", 597 | "metadata": {}, 598 | "source": [ 599 | "### Exercise: Create and manipulate Theano objects\n", 600 | "\n", 601 | "To give you some practice with basic Theano data structures and functions, try making the operations below work by implementing the functions that are needed." 
602 | ] 603 | }, 604 | { 605 | "cell_type": "code", 606 | "execution_count": null, 607 | "metadata": {}, 608 | "outputs": [], 609 | "source": [ 610 | "def make_vector():\n", 611 | " \"\"\"\n", 612 | " Create and return a new Theano vector.\n", 613 | " \"\"\"\n", 614 | "\n", 615 | " pass\n", 616 | "\n", 617 | "def make_matrix():\n", 618 | " \"\"\"\n", 619 | " Create and return a new Theano matrix.\n", 620 | " \"\"\"\n", 621 | "\n", 622 | " pass\n", 623 | "\n", 624 | "def elemwise_mul(a, b):\n", 625 | " \"\"\"\n", 626 | " a: A theano matrix\n", 627 | " b: A theano matrix\n", 628 | " \n", 629 | " Calcuate the elementwise product of a and b and return it\n", 630 | " \"\"\"\n", 631 | "\n", 632 | " pass\n", 633 | "\n", 634 | "def matrix_vector_mul(a, b):\n", 635 | " \"\"\"\n", 636 | " a: A theano matrix\n", 637 | " b: A theano vector\n", 638 | " \n", 639 | " Calculate the matrix-vector product of a and b and return it\n", 640 | " \"\"\"\n", 641 | "\n", 642 | " pass\n", 643 | "\n", 644 | "a = make_vector()\n", 645 | "b = make_vector()\n", 646 | "c = elemwise_mul(a, b)\n", 647 | "d = make_matrix()\n", 648 | "e = matrix_vector_mul(d, c)\n", 649 | "\n", 650 | "f = function([a, b, d], e)\n", 651 | "\n", 652 | "import numpy as np\n", 653 | "rng = np.random.RandomState([1, 2, 3])\n", 654 | "a_value = rng.randn(5).astype(a.dtype)\n", 655 | "b_value = rng.rand(5).astype(b.dtype)\n", 656 | "c_value = a_value * b_value\n", 657 | "d_value = rng.randn(5, 5).astype(d.dtype)\n", 658 | "expected = np.dot(d_value, c_value)\n", 659 | "\n", 660 | "actual = f(a_value, b_value, d_value)\n", 661 | "\n", 662 | "assert np.allclose(actual, expected)\n", 663 | "print(\"SUCCESS!\")" 664 | ] 665 | }, 666 | { 667 | "cell_type": "markdown", 668 | "metadata": {}, 669 | "source": [ 670 | "### Example: Logistic regression\n", 671 | "\n", 672 | "Here is a non-trivial example, which uses Theano to estimate the parameters of a logistic regression model using gradient information. 
We will use the bioassay example as a test case:" 673 | ] 674 | }, 675 | { 676 | "cell_type": "code", 677 | "execution_count": null, 678 | "metadata": {}, 679 | "outputs": [], 680 | "source": [ 681 | "import numpy as np\n", 682 | "\n", 683 | "rng = np.random\n", 684 | "\n", 685 | "dose = np.array([-0.86, -0.3 , -0.05, 0.73])\n", 686 | "deaths = np.array([0, 1, 3, 5])\n", 687 | "training_steps = 1000" 688 | ] 689 | }, 690 | { 691 | "cell_type": "markdown", 692 | "metadata": {}, 693 | "source": [ 694 | "We first declare Theano symbolic variables:" 695 | ] 696 | }, 697 | { 698 | "cell_type": "code", 699 | "execution_count": null, 700 | "metadata": {}, 701 | "outputs": [], 702 | "source": [ 703 | "x = T.vector(\"x\")\n", 704 | "y = T.vector(\"y\")\n", 705 | "w = theano.shared(1., name=\"w\")\n", 706 | "b = theano.shared(0., name=\"b\")\n", 707 | "\n", 708 | "print(\"Initial model:\", w.get_value(), b.get_value())" 709 | ] 710 | }, 711 | { 712 | "cell_type": "markdown", 713 | "metadata": {}, 714 | "source": [ 715 | "... 
then construct the expression graph:" 716 | ] 717 | }, 718 | { 719 | "cell_type": "code", 720 | "execution_count": null, 721 | "metadata": {}, 722 | "outputs": [], 723 | "source": [ 724 | "# Probability that target = 1\n", 725 | "p_1 = 1 / (1 + T.exp(-(x*w + b))) \n", 726 | "\n", 727 | "# The prediction threshold\n", 728 | "prediction = p_1 > 0.5 \n", 729 | "\n", 730 | "# Cross-entropy loss function\n", 731 | "xent = -y * T.log(p_1) - (5-y) * T.log(1-p_1) \n", 732 | "\n", 733 | "# The cost to minimize\n", 734 | "cost = xent.mean() \n", 735 | "\n", 736 | "# Compute the gradient of the cost\n", 737 | "gw, gb = T.grad(cost, [w, b]) " 738 | ] 739 | }, 740 | { 741 | "cell_type": "markdown", 742 | "metadata": {}, 743 | "source": [ 744 | "Compile Theano functions:" 745 | ] 746 | }, 747 | { 748 | "cell_type": "code", 749 | "execution_count": null, 750 | "metadata": {}, 751 | "outputs": [], 752 | "source": [ 753 | "step = theano.shared(10., name='step')\n", 754 | "train = theano.function(\n", 755 | " inputs=[x, y],\n", 756 | " outputs=[prediction, xent],\n", 757 | " updates=((w, w - step * gw), (b, b - step * gb), (step, step * 0.99)))\n", 758 | "predict = theano.function(inputs=[x], outputs=prediction)" 759 | ] 760 | }, 761 | { 762 | "cell_type": "markdown", 763 | "metadata": {}, 764 | "source": [ 765 | "Train model:" 766 | ] 767 | }, 768 | { 769 | "cell_type": "code", 770 | "execution_count": null, 771 | "metadata": {}, 772 | "outputs": [], 773 | "source": [ 774 | "for i in range(training_steps):\n", 775 | " pred, err = train(dose, deaths)\n", 776 | "\n", 777 | "print(\"Final model:\", w.get_value(), b.get_value())" 778 | ] 779 | }, 780 | { 781 | "cell_type": "code", 782 | "execution_count": null, 783 | "metadata": {}, 784 | "outputs": [], 785 | "source": [ 786 | "%matplotlib inline\n", 787 | "import matplotlib.pyplot as plt\n", 788 | "\n", 789 | "logit = lambda x: 1. 
/ (1 + np.exp(-x))\n", 790 | "xvals = np.linspace(-1, 1)\n", 791 | "plt.plot(xvals, logit(7.8*xvals + .85))\n", 792 | "plt.plot(dose, deaths/5., 'ro')" 793 | ] 794 | }, 795 | { 796 | "cell_type": "markdown", 797 | "metadata": {}, 798 | "source": [ 799 | "## Exercises: Gradients and functions\n", 800 | "\n", 801 | "Let's try using the Theano automatic gradient system to compute derivatives." 802 | ] 803 | }, 804 | { 805 | "cell_type": "code", 806 | "execution_count": null, 807 | "metadata": {}, 808 | "outputs": [], 809 | "source": [ 810 | "def grad_sum(x, y, z):\n", 811 | " \"\"\"\n", 812 | " x: A theano variable\n", 813 | " y: A theano variable\n", 814 | " z: A theano expression involving x and y\n", 815 | " Returns dz / dx + dz / dy\n", 816 | " \"\"\"\n", 817 | "\n", 818 | " pass\n", 819 | "\n", 820 | "x = T.scalar()\n", 821 | "y = T.scalar()\n", 822 | "z = x + y\n", 823 | "s = grad_sum(x, y, z)\n", 824 | "assert s.eval({x: 0, y: 0}) == 2\n", 825 | "print(\"SUCCESS!\")" 826 | ] 827 | }, 828 | { 829 | "cell_type": "markdown", 830 | "metadata": {}, 831 | "source": [ 832 | "### Random Numbers\n", 833 | "\n", 834 | "Because in Theano you first express everything symbolically and afterwards compile this expression to get functions, using **pseudo-random numbers** is not as straightforward as it is in NumPy.\n", 835 | "\n", 836 | "The way to think about putting randomness into Theano’s computations is to put random variables in your graph. Theano will allocate a NumPy `RandomStream` object (a random number generator) for each such variable, and draw from it as necessary. We will call this sort of sequence of random numbers a random stream." 
837 | ] 838 | }, 839 | { 840 | "cell_type": "code", 841 | "execution_count": null, 842 | "metadata": {}, 843 | "outputs": [], 844 | "source": [ 845 | "from theano.tensor.shared_randomstreams import RandomStreams\n", 846 | "\n", 847 | "srng = RandomStreams(seed=234)\n", 848 | "rv_u = srng.uniform((2,2))\n", 849 | "f = function([], rv_u)" 850 | ] 851 | }, 852 | { 853 | "cell_type": "code", 854 | "execution_count": null, 855 | "metadata": {}, 856 | "outputs": [], 857 | "source": [ 858 | "f()" 859 | ] 860 | }, 861 | { 862 | "cell_type": "code", 863 | "execution_count": null, 864 | "metadata": {}, 865 | "outputs": [], 866 | "source": [ 867 | "f()" 868 | ] 869 | }, 870 | { 871 | "cell_type": "markdown", 872 | "metadata": {}, 873 | "source": [ 874 | "## Looping in Theano\n", 875 | "\n", 876 | "The `scan` function provides the ability to write **loops** in Theano. We are not able to use Python `for` loops with Theano because Theano needs to be able to build and optimize the expression graph before compiling it into faster code, and be able to use symbolic differentiation for calculating gradients.\n", 877 | "\n", 878 | "### Simple loop with accumulation\n", 879 | "\n", 880 | "Assume that, given $k$ you want to get $A^k$ using a loop. More precisely, if $A$ is a tensor you want to compute $A^k$ elementwise. The python code might look like:\n", 881 | "\n", 882 | "```python\n", 883 | "result = 1\n", 884 | "for i in range(k):\n", 885 | " result = result * A\n", 886 | "```\n", 887 | "\n", 888 | "There are three things here that we need to handle: the initial value assigned to result, the accumulation of results in result, and the unchanging variable A. Unchanging variables are passed to scan as non_sequences. 
Initialization occurs in outputs_info, and the accumulation happens automatically.\n", 889 | "\n", 890 | "The equivalent Theano code would be:" 891 | ] 892 | }, 893 | { 894 | "cell_type": "code", 895 | "execution_count": null, 896 | "metadata": {}, 897 | "outputs": [], 898 | "source": [ 899 | "k = T.iscalar(\"k\")\n", 900 | "A = T.vector(\"A\")\n", 901 | "\n", 902 | "# Symbolic description of the result\n", 903 | "result, updates = theano.scan(fn=lambda prior_result, A: prior_result * A,\n", 904 | " outputs_info=T.ones_like(A),\n", 905 | " non_sequences=A,\n", 906 | " n_steps=k)\n", 907 | "\n", 908 | "# We only care about A**k, but scan has provided us with A**1 through A**k.\n", 909 | "# Discard the values that we don't care about. Scan is smart enough to\n", 910 | "# notice this and not waste memory saving them.\n", 911 | "final_result = result[-1]\n", 912 | "\n", 913 | "# compiled function that returns A**k\n", 914 | "power = theano.function(inputs=[A,k], outputs=final_result, updates=updates)\n", 915 | "\n", 916 | "print(power(range(10),2))\n", 917 | "print(power(range(10),4))" 918 | ] 919 | }, 920 | { 921 | "cell_type": "markdown", 922 | "metadata": {}, 923 | "source": [ 924 | "Let us go through the example line by line. What we did is first to **construct a function** (using a lambda expression) that given `prior_result` and `A` returns `prior_result * A`. The order of parameters is fixed by `scan`: the output of the prior call to `fn` is the first parameter, followed by all non-sequences.\n", 925 | "\n", 926 | "Next we **initialize the output** as a tensor with same shape and `dtype` as `A`, filled with ones. We give `A` to `scan` as a non sequence parameter and specify the number of steps `k` to iterate over our `lambda` expression.\n", 927 | "\n", 928 | "Scan **returns a tuple** containing our result (`result`) and a dictionary of updates (empty in this case). 
Note that the result is not a matrix, but a 3D tensor containing the value of $A^k$ for each step. We want the last value (after k steps) so we compile a function to return just that. \n", 929 | "\n", 930 | "Note that there is an **optimization**, that at compile time will detect that you are using just the last value of the result and ensure that scan does not store all the intermediate values that are used. So do not worry if `A` and `k` are large." 931 | ] 932 | }, 933 | { 934 | "cell_type": "markdown", 935 | "metadata": {}, 936 | "source": [ 937 | "In addition to looping a fixed number of times, scan can iterate over the leading dimension of tensors (similar to Python’s **list comprehension** `for x in a_list`).\n", 938 | "\n", 939 | "The tensor(s) to be looped over should be provided to `scan` using the `sequence` keyword argument.\n", 940 | "\n", 941 | "Here’s an example that builds a **symbolic calculation of a polynomial** from a list of its coefficients:" 942 | ] 943 | }, 944 | { 945 | "cell_type": "code", 946 | "execution_count": null, 947 | "metadata": {}, 948 | "outputs": [], 949 | "source": [ 950 | "coefficients = theano.tensor.vector(\"coefficients\")\n", 951 | "x = T.scalar(\"x\")\n", 952 | "\n", 953 | "# Generate the components of the polynomial\n", 954 | "components, updates = theano.scan(fn=lambda coefficient, power, val: coefficient * (val ** power),\n", 955 | " outputs_info=None,\n", 956 | " sequences=[coefficients, theano.tensor.arange(1000)],\n", 957 | " non_sequences=x)\n", 958 | "# Sum them up\n", 959 | "polynomial = components.sum()\n", 960 | "\n", 961 | "# Compile a function\n", 962 | "calculate_polynomial = theano.function(inputs=[coefficients, x], outputs=polynomial)\n", 963 | "\n", 964 | "# Test\n", 965 | "test_coefficients = np.asarray([1, 0, 2], dtype=np.float32)\n", 966 | "test_value = 3\n", 967 | "print(calculate_polynomial(test_coefficients, test_value))" 968 | ] 969 | }, 970 | { 971 | "cell_type": "markdown", 972 | "metadata": {}, 
973 | "source": [ 974 | "## Gradient-based sampling methods in PyMC3\n", 975 | "\n", 976 | "PyMC3 has the standard sampling algorithms like adaptive Metropolis-Hastings and adaptive slice sampling, but PyMC3's most capable step method is the No-U-Turn Sampler. NUTS is especially useful on models that have many continuous parameters, a situation where other MCMC algorithms work very slowly. It takes advantage of information about where regions of higher probability are, based on the gradient of the log posterior-density. This helps it achieve dramatically faster convergence on large problems than traditional sampling methods achieve. \n", 977 | "\n", 978 | "\n", 979 | "PyMC3 relies on Theano to analytically compute **model gradients** via automatic differentiation of the posterior density. NUTS also has several self-tuning strategies for adaptively setting the tunable parameters of Hamiltonian Monte Carlo. For random variables that are undifferentiable (namely, discrete variables) NUTS cannot be used, but it may still be used on the differentiable variables in a model that contains undifferentiable variables. " 980 | ] 981 | }, 982 | { 983 | "cell_type": "markdown", 984 | "metadata": {}, 985 | "source": [ 986 | "## Comparison of Metropolis and NUTS\n", 987 | "\n", 988 | "As an informal comparison, we will demonstrate samples generated from a simple statistical model using both the Metropolis and NUTS sampler in PyMC3. The set of examples includes a univariate linear model that is fit to simulated data via the `glm` module. \n", 989 | "\n", 990 | "```python\n", 991 | "with Model() as model:\n", 992 | " glm.glm('y ~ x', data)\n", 993 | "```\n", 994 | "\n", 995 | "The model contains three parameters (intercept, slope and sampling standard deviation), each of which is continuous, so the model can be fit by either algorithm. 
We will run a short chain for each, and compare the output graphically:" 996 | ] 997 | }, 998 | { 999 | "cell_type": "code", 1000 | "execution_count": null, 1001 | "metadata": {}, 1002 | "outputs": [], 1003 | "source": [ 1004 | "from pymc3.examples import glm_linear\n", 1005 | "from pymc3 import sample, Metropolis, NUTS" 1006 | ] 1007 | }, 1008 | { 1009 | "cell_type": "code", 1010 | "execution_count": null, 1011 | "metadata": {}, 1012 | "outputs": [], 1013 | "source": [ 1014 | "with glm_linear.model:\n", 1015 | "    trace_metropolis = sample(1000, step=Metropolis())" 1016 | ] 1017 | }, 1018 | { 1019 | "cell_type": "code", 1020 | "execution_count": null, 1021 | "metadata": {}, 1022 | "outputs": [], 1023 | "source": [ 1024 | "with glm_linear.model:\n", 1025 | "    trace_nuts = sample(1000, step=NUTS())" 1026 | ] 1027 | }, 1028 | { 1029 | "cell_type": "code", 1030 | "execution_count": null, 1031 | "metadata": {}, 1032 | "outputs": [], 1033 | "source": [ 1034 | "traceplot(trace_metropolis);" 1035 | ] 1036 | }, 1037 | { 1038 | "cell_type": "code", 1039 | "execution_count": null, 1040 | "metadata": {}, 1041 | "outputs": [], 1042 | "source": [ 1043 | "traceplot(trace_nuts);" 1044 | ] 1045 | }, 1046 | { 1047 | "cell_type": "markdown", 1048 | "metadata": {}, 1049 | "source": [ 1050 | "The samples from `Metropolis` show very poor mixing during the first 1000 iterations, and have clearly not converged. The `NUTS` samples are more homogeneous, with better mixing and less autocorrelation." 1051 | ] 1052 | }, 1053 | { 1054 | "cell_type": "markdown", 1055 | "metadata": {}, 1056 | "source": [ 1057 | "## References\n", 1058 | "\n", 1059 | "1. 
[DeepLearning documentation and tutorials](http://deeplearning.net/tutorial/contents.html)\n", 1060 | "\n" 1061 | ] 1062 | } 1063 | ], 1064 | "metadata": { 1065 | "kernelspec": { 1066 | "display_name": "Python [default]", 1067 | "language": "python", 1068 | "name": "python3" 1069 | }, 1070 | "language_info": { 1071 | "codemirror_mode": { 1072 | "name": "ipython", 1073 | "version": 3 1074 | }, 1075 | "file_extension": ".py", 1076 | "mimetype": "text/x-python", 1077 | "name": "python", 1078 | "nbconvert_exporter": "python", 1079 | "pygments_lexer": "ipython3", 1080 | "version": "3.6.4" 1081 | } 1082 | }, 1083 | "nbformat": 4, 1084 | "nbformat_minor": 1 1085 | } 1086 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/6. Model Checking-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Model Checking\n", 8 | "\n", 9 | "After running an MCMC simulation, `sample` returns a `MultiTrace` object containing the samples for all the stochastic and deterministic random variables. The final step in Bayesian computation is model checking, in order to ensure that inferences derived from your sample are valid. There are two components to model checking:\n", 10 | "\n", 11 | "1. Convergence diagnostics\n", 12 | "2. Goodness of fit\n", 13 | "\n", 14 | "Convergence diagnostics are intended to detect lack of convergence in the Markov chain Monte Carlo sample; it is used to ensure that you have not halted your sampling too early. However, a converged model is not guaranteed to be a good model. The second component of model checking, goodness of fit, is used to check the internal validity of the model, by comparing predictions from the model to the data used to fit the model. 
" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Convergence Diagnostics\n", 22 | "\n", 23 | "Valid inferences from sequences of MCMC samples are based on the\n", 24 | "assumption that the samples are derived from the true posterior\n", 25 | "distribution of interest. Theory guarantees this condition as the number\n", 26 | "of iterations approaches infinity. It is important, therefore, to\n", 27 | "determine the **minimum number of samples** required to ensure a reasonable\n", 28 | "approximation to the target posterior density. Unfortunately, no\n", 29 | "universal threshold exists across all problems, so convergence must be\n", 30 | "assessed independently each time MCMC estimation is performed. The\n", 31 | "procedures for verifying convergence are collectively known as\n", 32 | "*convergence diagnostics*.\n", 33 | "\n", 34 | "One approach to analyzing convergence is **analytical**, whereby the\n", 35 | "variance of the sample at different sections of the chain are compared\n", 36 | "to that of the limiting distribution. These methods use distance metrics\n", 37 | "to analyze convergence, or place theoretical bounds on the sample\n", 38 | "variance, and though they are promising, they are generally difficult to\n", 39 | "use and are not prominent in the MCMC literature. More common is a\n", 40 | "**statistical** approach to assessing convergence. With this approach,\n", 41 | "rather than considering the properties of the theoretical target\n", 42 | "distribution, only the statistical properties of the observed chain are\n", 43 | "analyzed. Reliance on the sample alone restricts such convergence\n", 44 | "criteria to **heuristics**. 
As a result, convergence cannot be guaranteed.\n", 45 | "Although evidence for lack of convergence using statistical convergence\n", 46 | "diagnostics will correctly imply lack of convergence in the chain, the\n", 47 | "absence of such evidence will not *guarantee* convergence in the chain.\n", 48 | "Nevertheless, negative results for one or more criteria may provide some\n", 49 | "measure of assurance to users that their sample will provide valid\n", 50 | "inferences.\n", 51 | "\n", 52 | "For most simple models, convergence will occur quickly, sometimes within\n", 53 | "the first several hundred iterations, after which all remaining\n", 54 | "samples of the chain may be used to calculate posterior quantities. For\n", 55 | "more complex models, convergence requires a significantly longer burn-in\n", 56 | "period; sometimes orders of magnitude more samples are needed.\n", 57 | "Frequently, lack of convergence will be caused by **poor mixing**. \n", 58 | "Recall that *mixing* refers to the degree to which the Markov\n", 59 | "chain explores the support of the posterior distribution. Poor mixing\n", 60 | "may stem from inappropriate proposals (if one is using the\n", 61 | "Metropolis-Hastings sampler) or from attempting to estimate models with\n", 62 | "highly correlated variables." 
63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "%matplotlib inline\n", 72 | "import numpy as np\n", 73 | "import seaborn as sns; sns.set_context('notebook')" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "from pymc3 import exp, Normal, Binomial, sample, Model\n", 83 | "\n", 84 | "# Samples for each dose level\n", 85 | "n = 5 * np.ones(4, dtype=int)\n", 86 | "# Log-dose\n", 87 | "dose = np.array([-.86, -.3, -.05, .73])\n", 88 | "deaths = np.array([0, 1, 3, 5])\n", 89 | "\n", 90 | "def invlogit(x):\n", 91 | " return exp(x) / (1 + exp(x))\n", 92 | "\n", 93 | "with Model() as bioassay_model:\n", 94 | "\n", 95 | " # Logit-linear model parameters\n", 96 | " alpha = Normal('alpha', 0, 0.01)\n", 97 | " beta = Normal('beta', 0, 0.01)\n", 98 | "\n", 99 | " # Calculate probabilities of death\n", 100 | " theta = invlogit(alpha + beta * dose)\n", 101 | "\n", 102 | " # Data likelihood\n", 103 | " deaths = Binomial('deaths', n=n, p=theta, observed=deaths)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "from pymc3 import Metropolis\n", 113 | "\n", 114 | "with bioassay_model:\n", 115 | " step = Metropolis(scaling=0.0001)\n", 116 | " bioassay_trace = sample(1000, step=step)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "from pymc3 import traceplot\n", 126 | "\n", 127 | "traceplot(bioassay_trace[500:], varnames=['alpha'])" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "### Informal Methods\n", 135 | "\n", 136 | "The most straightforward approach for assessing convergence is based on\n", 137 | "simply **plotting and inspecting traces and 
histograms** of the observed\n", 138 | "MCMC sample. If the trace of values for each of the stochastics exhibits\n", 139 | "asymptotic behavior over the last $m$ iterations, this may be\n", 140 | "satisfactory evidence for convergence. " 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "with bioassay_model:\n", 150 | " bioassay_trace = sample(10000)\n", 151 | " \n", 152 | "traceplot(bioassay_trace[9000:], varnames=['beta'])" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "A similar approach involves\n", 160 | "plotting a histogram for every set of $k$ iterations (perhaps 50-100)\n", 161 | "beyond some burn in threshold $n$; if the histograms are not visibly\n", 162 | "different among the sample intervals, this may be considered some evidence for\n", 163 | "convergence. Note that such diagnostics should be carried out for each\n", 164 | "stochastic estimated by the MCMC algorithm, because convergent behavior\n", 165 | "by one variable does not imply evidence for convergence for other\n", 166 | "variables in the analysis. " 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "import matplotlib.pyplot as plt\n", 176 | "\n", 177 | "beta_trace = bioassay_trace['beta']\n", 178 | "\n", 179 | "fig, axes = plt.subplots(2, 5, figsize=(14,6))\n", 180 | "axes = axes.ravel()\n", 181 | "for i in range(10):\n", 182 | " axes[i].hist(beta_trace[500*i:500*(i+1)])\n", 183 | "plt.tight_layout()" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "An extension of this approach can be taken\n", 191 | "when multiple parallel chains are run, rather than just a single, long\n", 192 | "chain. 
In this case, the final values of $c$ chains run for $n$\n", 193 | "iterations are plotted in a histogram; just as above, this is repeated\n", 194 | "every $k$ iterations thereafter, and the histograms of the endpoints are\n", 195 | "plotted again and compared to the previous histogram. This is repeated\n", 196 | "until consecutive histograms are indistinguishable." 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "Another *ad hoc* method for detecting lack of convergence is to examine\n", 204 | "the traces of several MCMC chains initialized with different starting\n", 205 | "values. Overlaying these traces on the same set of axes should (if\n", 206 | "convergence has occurred) show each chain tending toward the same\n", 207 | "equilibrium value, with approximately the same variance. Recall that the\n", 208 | "tendency for some Markov chains to converge to the true (unknown) value\n", 209 | "from diverse initial values is called *ergodicity*. This property is\n", 210 | "guaranteed by the reversible chains constructed using MCMC, and should\n", 211 | "be observable using this technique. Again, however, this approach is\n", 212 | "only a heuristic method, and cannot always detect lack of convergence,\n", 213 | "even though chains may appear ergodic." 
214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "with bioassay_model:\n", 223 | " \n", 224 | " bioassay_trace = sample(1000, njobs=2, start=[{'alpha':0.5}, {'alpha':5}])" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "bioassay_trace.get_values('alpha', chains=0)[0]" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "plt.plot(bioassay_trace.get_values('alpha', chains=0)[:200], 'r--')\n", 243 | "plt.plot(bioassay_trace.get_values('alpha', chains=1)[:200], 'k--')" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "A principal reason that evidence from informal techniques cannot\n", 251 | "guarantee convergence is a phenomenon called ***metastability***. Chains may\n", 252 | "appear to have converged to the true equilibrium value, displaying\n", 253 | "excellent qualities by any of the methods described above. However,\n", 254 | "after some period of stability around this value, the chain may suddenly\n", 255 | "move to another region of the parameter space. This period\n", 256 | "of metastability can sometimes be very long, and therefore escape\n", 257 | "detection by these convergence diagnostics. Unfortunately, there is no\n", 258 | "statistical technique available for detecting metastability.\n", 259 | "\n", 260 | "### Formal Methods\n", 261 | "\n", 262 | "Along with the *ad hoc* techniques described above, a number of more\n", 263 | "formal methods exist which are prevalent in the literature. 
These are\n", 264 | "considered more formal because they are based on existing statistical\n", 265 | "methods, such as time series analysis.\n", 266 | "\n", 267 | "PyMC currently includes three formal convergence diagnostic methods. The\n", 268 | "first, proposed by [Geweke (1992)](http://projecteuclid.org/DPubS?service=UI&version=1.0&verb=Display&handle=euclid.ss/1177011446), is a time-series approach that\n", 269 | "compares the mean and variance of segments from the beginning and end of\n", 270 | "a single chain.\n", 271 | "\n", 272 | "$$z = \\frac{\\bar{\\theta}_a - \\bar{\\theta}_b}{\\sqrt{S_a(0) + S_b(0)}}$$\n", 273 | "\n", 274 | "where $a$ is the early interval and $b$ the late interval, and $S_i(0)$ is the spectral density estimate at zero frequency for chain segment $i$. If the\n", 275 | "z-scores (theoretically distributed as standard normal variates) of\n", 276 | "these two segments are similar, it can provide evidence for convergence.\n", 277 | "PyMC calculates z-scores of the difference between various initial\n", 278 | "segments along the chain, and the last 50% of the remaining chain. If\n", 279 | "the chain has converged, the majority of points should fall within 2\n", 280 | "standard deviations of zero.\n", 281 | "\n", 282 | "In PyMC, diagnostic z-scores can be obtained by calling the `geweke` function. 
It\n", 283 | "accepts either (1) a single trace, (2) a Node or Stochastic object, or\n", 284 | "(3) an entire Model object:" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "from pymc3 import geweke\n", 294 | "\n", 295 | "with bioassay_model:\n", 296 | " tr = sample(2000)\n", 297 | " \n", 298 | "z = geweke(tr, intervals=15)" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "plt.scatter(*z['alpha'].T)\n", 308 | "plt.hlines([-1,1], 0, 1000, linestyles='dotted')\n", 309 | "plt.xlim(0, 1000)" 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "metadata": {}, 315 | "source": [ 316 | "The arguments expected are the following:\n", 317 | "\n", 318 | "- `x` : The trace of a variable.\n", 319 | "- `first` : The fraction of series at the beginning of the trace.\n", 320 | "- `last` : The fraction of series at the end to be compared with the section at the beginning.\n", 321 | "- `intervals` : The number of segments.\n", 322 | "\n", 323 | "Plotting the output displays the scores in series, making it easy to\n", 324 | "see departures from the standard normal assumption." 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "A second convergence diagnostic provided by PyMC is the Gelman-Rubin\n", 332 | "statistic [Gelman and Rubin (1992)](http://projecteuclid.org/DPubS?service=UI&version=1.0&verb=Display&handle=euclid.ss/1177011136). 
This diagnostic uses multiple chains to\n", 333 | "check for lack of convergence, and is based on the notion that if\n", 334 | "multiple chains have converged, by definition they should appear very\n", 335 | "similar to one another; if not, one or more of the chains has failed to\n", 336 | "converge.\n", 337 | "\n", 338 | "The Gelman-Rubin diagnostic uses an analysis of variance approach to\n", 339 | "assessing convergence. That is, it calculates both the between-chain\n", 340 | "variance (B) and within-chain variance (W), and assesses whether they\n", 341 | "are different enough to worry about convergence. Assuming $m$ chains,\n", 342 | "each of length $n$, quantities are calculated by:\n", 343 | "\n", 344 | "$$\\begin{align}B &= \\frac{n}{m-1} \\sum_{j=1}^m (\\bar{\\theta}_{.j} - \\bar{\\theta}_{..})^2 \\\\\n", 345 | "W &= \\frac{1}{m} \\sum_{j=1}^m \\left[ \\frac{1}{n-1} \\sum_{i=1}^n (\\theta_{ij} - \\bar{\\theta}_{.j})^2 \\right]\n", 346 | "\\end{align}$$\n", 347 | "\n", 348 | "for each scalar estimand $\\theta$. Using these values, an estimate of\n", 349 | "the marginal posterior variance of $\\theta$ can be calculated:\n", 350 | "\n", 351 | "$$\\hat{\\text{Var}}(\\theta | y) = \\frac{n-1}{n} W + \\frac{1}{n} B$$\n", 352 | "\n", 353 | "Assuming $\\theta$ was initialized to arbitrary starting points in each\n", 354 | "chain, this quantity will overestimate the true marginal posterior\n", 355 | "variance. At the same time, $W$ will tend to underestimate the\n", 356 | "within-chain variance early in the sampling run. 
However, in the limit\n", 357 | "as $n \\rightarrow \n", 358 | "\\infty$, both quantities will converge to the true variance of $\\theta$.\n", 359 | "In light of this, the Gelman-Rubin statistic monitors convergence using\n", 360 | "the ratio:\n", 361 | "\n", 362 | "$$\\hat{R} = \\sqrt{\\frac{\\hat{\\text{Var}}(\\theta | y)}{W}}$$\n", 363 | "\n", 364 | "This is called the potential scale reduction, since it is an estimate of\n", 365 | "the potential reduction in the scale of $\\theta$ as the number of\n", 366 | "simulations tends to infinity. In practice, we look for values of\n", 367 | "$\\hat{R}$ close to one (say, less than 1.1) to be confident that a\n", 368 | "particular estimand has converged. In PyMC, the function\n", 369 | "`gelman_rubin` will calculate $\\hat{R}$ for each stochastic node in\n", 370 | "the passed model:" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "metadata": {}, 377 | "outputs": [], 378 | "source": [ 379 | "from pymc3 import gelman_rubin\n", 380 | "\n", 381 | "gelman_rubin(bioassay_trace)" 382 | ] 383 | }, 384 | { 385 | "cell_type": "markdown", 386 | "metadata": {}, 387 | "source": [ 388 | "For the best results, each chain should be initialized to highly\n", 389 | "dispersed starting values for each stochastic node.\n", 390 | "\n", 391 | "By default, when calling the `forestplot` function using nodes with\n", 392 | "multiple chains, the $\\hat{R}$ values will be plotted alongside the\n", 393 | "posterior intervals." 
394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": null, 399 | "metadata": {}, 400 | "outputs": [], 401 | "source": [ 402 | "from pymc3 import forestplot\n", 403 | "\n", 404 | "forestplot(bioassay_trace)" 405 | ] 406 | }, 407 | { 408 | "cell_type": "markdown", 409 | "metadata": {}, 410 | "source": [ 411 | "## Autocorrelation" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": null, 417 | "metadata": {}, 418 | "outputs": [], 419 | "source": [ 420 | "from pymc3 import autocorrplot\n", 421 | "\n", 422 | "autocorrplot(tr);" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": null, 428 | "metadata": {}, 429 | "outputs": [], 430 | "source": [ 431 | "bioassay_trace['alpha'].shape" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "metadata": {}, 438 | "outputs": [], 439 | "source": [ 440 | "from pymc3 import effective_n\n", 441 | "\n", 442 | "effective_n(bioassay_trace)" 443 | ] 444 | }, 445 | { 446 | "cell_type": "markdown", 447 | "metadata": {}, 448 | "source": [ 449 | "## Goodness of Fit\n", 450 | "\n", 451 | "Checking for model convergence is only the first step in the evaluation\n", 452 | "of MCMC model outputs. It is possible for an entirely unsuitable model\n", 453 | "to converge, so additional steps are needed to ensure that the estimated\n", 454 | "model adequately fits the data. One intuitive way of evaluating model\n", 455 | "fit is to compare model predictions with the observations used to fit\n", 456 | "the model. In other words, the fitted model can be used to simulate\n", 457 | "data, and the distribution of the simulated data should resemble the\n", 458 | "distribution of the actual data.\n", 459 | "\n", 460 | "Fortunately, simulating data from the model is a natural component of\n", 461 | "the Bayesian modelling framework. 
Recall, from the discussion on\n", 462 | "imputation of missing data, the posterior predictive distribution:\n", 463 | "\n", 464 | "$$p(\\tilde{y}|y) = \\int p(\\tilde{y}|\\theta) f(\\theta|y) d\\theta$$\n", 465 | "\n", 466 | "Here, $\\tilde{y}$ represents some hypothetical new data that would be\n", 467 | "expected, taking into account the posterior uncertainty in the model\n", 468 | "parameters. Sampling from the posterior predictive distribution is easy\n", 469 | "in PyMC. The code looks identical to the corresponding data stochastic,\n", 470 | "with two modifications: (1) the node should be specified as\n", 471 | "deterministic and (2) the statistical likelihoods should be replaced by\n", 472 | "random number generators. Consider the `gelman_bioassay` example, \n", 473 | "where deaths are modeled as a binomial random variable for which\n", 474 | "the probability of death is a logit-linear function of the dose of a\n", 475 | "particular drug." 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": null, 481 | "metadata": {}, 482 | "outputs": [], 483 | "source": [ 484 | "from pymc3 import Normal, Binomial, Deterministic, invlogit\n", 485 | "\n", 486 | "# Samples for each dose level\n", 487 | "n = 5 * np.ones(4, dtype=int)\n", 488 | "# Log-dose\n", 489 | "dose = np.array([-.86, -.3, -.05, .73])\n", 490 | "\n", 491 | "with Model() as model:\n", 492 | "\n", 493 | " # Logit-linear model parameters\n", 494 | " alpha = Normal('alpha', 0, 0.01)\n", 495 | " beta = Normal('beta', 0, 0.01)\n", 496 | "\n", 497 | " # Calculate probabilities of death\n", 498 | " theta = Deterministic('theta', invlogit(alpha + beta * dose))\n", 499 | "\n", 500 | " # Data likelihood\n", 501 | " deaths = Binomial('deaths', n=n, p=theta, observed=[0, 1, 3, 5])" 502 | ] 503 | }, 504 | { 505 | "cell_type": "markdown", 506 | "metadata": {}, 507 | "source": [ 508 | "The posterior predictive distribution of deaths uses the same functional\n", 509 | "form as the data likelihood, 
in this case a binomial stochastic. Here is\n", 510 | "the corresponding sample from the posterior predictive distribution:" 511 | ] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "execution_count": null, 516 | "metadata": {}, 517 | "outputs": [], 518 | "source": [ 519 | "with model:\n", 520 | " \n", 521 | " deaths_sim = Binomial('deaths_sim', n=n, p=theta, shape=4)" 522 | ] 523 | }, 524 | { 525 | "cell_type": "markdown", 526 | "metadata": {}, 527 | "source": [ 528 | "Notice that the observed stochastic `Binomial` has been replaced with a stochastic node that is identical in every respect to `deaths`, except that its values are not fixed to be the observed data -- they are left to vary according to the values of the fitted parameters.\n", 529 | "\n", 530 | "The degree to which simulated data correspond to observations can be evaluated in at least two ways. First, these quantities can simply be compared visually. This allows for a qualitative comparison of model-based replicates and observations. If there is poor fit, the true value of the data may appear in the tails of the histogram of replicated data, while a good fit will tend to show the true data in high-probability regions of the posterior predictive distribution. The Matplot package in PyMC provides an easy way of producing such plots, via the `gof_plot` function." 
531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": null, 536 | "metadata": {}, 537 | "outputs": [], 538 | "source": [ 539 | "with model:\n", 540 | " \n", 541 | " gof_trace = sample(2000)" 542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": null, 547 | "metadata": {}, 548 | "outputs": [], 549 | "source": [ 550 | "from pymc3 import forestplot\n", 551 | "\n", 552 | "forestplot(gof_trace, varnames=['deaths_sim'])" 553 | ] 554 | }, 555 | { 556 | "cell_type": "markdown", 557 | "metadata": {}, 558 | "source": [ 559 | "## Exercise: Meta-analysis of beta blocker effectiveness\n", 560 | "\n", 561 | "Carlin (1992) considers a Bayesian approach to meta-analysis, and includes the following examples of 22 trials of beta-blockers to prevent mortality after myocardial infarction.\n", 562 | "\n", 563 | "In a random effects meta-analysis we assume the true effect (on a log-odds scale) $d_i$ in a trial $i$\n", 564 | "is drawn from some population distribution. Let $r^C_i$ denote number of events in the control group in trial $i$,\n", 565 | "and $r^T_i$ denote events under active treatment in trial $i$. Our model is:\n", 566 | "\n", 567 | "$$\\begin{aligned}\n", 568 | "r^C_i &\\sim \\text{Binomial}\\left(p^C_i, n^C_i\\right) \\\\\n", 569 | "r^T_i &\\sim \\text{Binomial}\\left(p^T_i, n^T_i\\right) \\\\\n", 570 | "\\text{logit}\\left(p^C_i\\right) &= \\mu_i \\\\\n", 571 | "\\text{logit}\\left(p^T_i\\right) &= \\mu_i + \\delta_i \\\\\n", 572 | "\\delta_i &\\sim \\text{Normal}(d, t) \\\\\n", 573 | "\\mu_i &\\sim \\text{Normal}(m, s)\n", 574 | "\\end{aligned}$$\n", 575 | "\n", 576 | "We want to make inferences about the population effect $d$, and the predictive distribution for the effect $\\delta_{\\text{new}}$ in a new trial. 
Build a model to estimate these quantities in PyMC, and (1) use convergence diagnostics to check for convergence and (2) use posterior predictive checks to assess goodness-of-fit.\n", 577 | "\n", 578 | "Here are the data:" 579 | ] 580 | }, 581 | { 582 | "cell_type": "code", 583 | "execution_count": null, 584 | "metadata": {}, 585 | "outputs": [], 586 | "source": [ 587 | "r_t_obs = [3, 7, 5, 102, 28, 4, 98, 60, 25, 138, 64, 45, 9, 57, 25, 33, 28, 8, 6, 32, 27, 22]\n", 588 | "n_t_obs = [38, 114, 69, 1533, 355, 59, 945, 632, 278,1916, 873, 263, 291, 858, 154, 207, 251, 151, 174, 209, 391, 680]\n", 589 | "r_c_obs = [3, 14, 11, 127, 27, 6, 152, 48, 37, 188, 52, 47, 16, 45, 31, 38, 12, 6, 3, 40, 43, 39]\n", 590 | "n_c_obs = [39, 116, 93, 1520, 365, 52, 939, 471, 282, 1921, 583, 266, 293, 883, 147, 213, 122, 154, 134, 218, 364, 674]\n", 591 | "N = len(n_c_obs)" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": null, 597 | "metadata": {}, 598 | "outputs": [], 599 | "source": [ 600 | "# Write your answer here" 601 | ] 602 | }, 603 | { 604 | "cell_type": "markdown", 605 | "metadata": {}, 606 | "source": [ 607 | "## References\n", 608 | "\n", 609 | "Gelman, A., & Rubin, D. B. (1992). Inference from iterative simulation using multiple sequences. Statistical Science. A Review Journal of the Institute of Mathematical Statistics, 457–472.\n", 610 | "\n", 611 | "Geweke, J., Berger, J. O., & Dawid, A. P. (1992). Evaluating the accuracy of sampling-based approaches to the calculation of posterior moments. In Bayesian Statistics 4.\n", 612 | "\n", 613 | "Brooks, S. P., Catchpole, E. A., & Morgan, B. J. T. (2000). Bayesian Animal Survival Estimation. Statistical Science. A Review Journal of the Institute of Mathematical Statistics, 15(4), 357–376. doi:10.1214/ss/1177010123\n", 614 | "\n", 615 | "Gelman, A., Meng, X., & Stern, H. (1996). Posterior predicitive assessment of model fitness via realized discrepencies with discussion. 
Statistica Sinica, 6, 733–807.\n", 616 | "\n", 617 | "Raftery, A., & Lewis, S. (1992). One long run with diagnostics: Implementation strategies for Markov chain Monte Carlo. Statistical Science. A Review Journal of the Institute of Mathematical Statistics, 7, 493–497.\n", 618 | "\n", 619 | "[CrossValidated: How to use scikit-learn's cross validation functions on multi-label classifiers](http://stats.stackexchange.com/questions/65828/how-to-use-scikit-learns-cross-validation-functions-on-multi-label-classifiers)" 620 | ] 621 | } 622 | ], 623 | "metadata": { 624 | "kernelspec": { 625 | "display_name": "Python [default]", 626 | "language": "python", 627 | "name": "python3" 628 | }, 629 | "language_info": { 630 | "codemirror_mode": { 631 | "name": "ipython", 632 | "version": 3 633 | }, 634 | "file_extension": ".py", 635 | "mimetype": "text/x-python", 636 | "name": "python", 637 | "nbconvert_exporter": "python", 638 | "pygments_lexer": "ipython3", 639 | "version": "3.6.4" 640 | } 641 | }, 642 | "nbformat": 4, 643 | "nbformat_minor": 1 644 | } 645 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/nuts_and_metropolis-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 2 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/2. Markov Chain Monte Carlo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Markov chain Monte Carlo\n", 8 | "\n", 9 | "Let's briefly cover some theory regarding Bayesian analysis using Markov chain Monte Carlo (MCMC) methods. You might wonder why a numerical simulation method like MCMC is the standard approach for fitting Bayesian models. \n", 10 | "\n", 11 | "Gelman et al. 
(2013) break down the business of Bayesian analysis into three primary steps:\n", 12 | "\n", 13 | "1. Specify a full probability model, including all parameters, data, transformations, missing values and predictions that are of interest.\n", 14 | "2. Calculate the posterior distribution of the unknown quantities in the model, conditional on the data.\n", 15 | "3. Perform model checking to evaluate the quality and suitability of the model.\n", 16 | "\n", 17 | "While each of these steps is challenging, it is the second step that is the most difficult for non-trivial models, and was a bottleneck for the adoption of Bayesian methods for decades. \n", 18 | "\n", 19 | "We can consider this in terms of what Blei et al. (2014) call the Box loop \n", 20 | "![box loop](images/boxloop.png)\n", 21 | "\n", 22 | "### Bayesian Inference\n", 23 | "\n", 24 | "At this point, we should all be familiar with **Bayes Formula**:\n", 25 | "\n", 26 | "![bayes formula](images/bayes_formula.png)\n", 27 | "\n", 28 | "The equation expresses how our belief about the value of \\\\(\\theta\\\\), as expressed by the **prior distribution** \\\\(P(\\theta)\\\\) is reallocated following the observation of the data \\\\(y\\\\), as expressed by the posterior distribution.\n", 29 | "\n", 30 | "Computing the posterior distribution is called the **inference problem**, and is usually the goal of Bayesian analysis.\n", 31 | "\n", 32 | "The innocuous denominator \\\\(P(y)\\\\) (the model **evidence**, or **marginal likelihood**) cannot be calculated directly, and is actually the expression in the numerator, integrated over all \\\\(\\theta\\\\):\n", 33 | "\n", 34 | "
\n", 35 | "\\\\[Pr(\\theta|y) = \\frac{Pr(y|\\theta)Pr(\\theta)}{\\int Pr(y|\\theta)Pr(\\theta) d\\theta}\\\\]\n", 36 | "
\n", 37 | "\n", 38 | "Computing this integral, which may involve many variables, is generally intractible with analytic methods. This is the major compuational hurdle for Bayesian analysis.\n", 39 | "\n", 40 | "### Simulation Approaches for Bayesian Computation\n", 41 | "\n", 42 | "Since analysis is off the table, a reasonable alternative is to attempt to estimate the integral using numerical methods. For example, consider the expected value of a random variable $\\mathbf{x}$:\n", 43 | "\n", 44 | "$$\\begin{gathered}\n", 45 | "\\begin{split}E[{\\bf x}] = \\int {\\bf x} f({\\bf x}) d{\\bf x}, \\qquad\n", 46 | "{\\bf x} = \\{x_1,...,x_k\\}\\end{split}\\notag\\\\\\begin{split}\\end{split}\\notag\\end{gathered}$$\n", 47 | "\n", 48 | "where $k$ (the dimension of vector $x$) is perhaps very large. If we can produce a reasonable number of random vectors $\\{{\\bf x_i}\\}$, we can use these values to approximate the unknown integral. This process is known as *Monte Carlo integration*. In general, MC integration allows integrals against probability density functions:\n", 49 | "\n", 50 | "$$\\begin{gathered}\n", 51 | "\\begin{split}I = \\int h(\\mathbf{x}) f(\\mathbf{x}) \\mathbf{dx}\\end{split}\\notag\\\\\\begin{split}\\end{split}\\notag\\end{gathered}$$\n", 52 | "\n", 53 | "to be estimated by finite sums:\n", 54 | "\n", 55 | "$$\\begin{gathered}\n", 56 | "\\begin{split}\\hat{I} = \\frac{1}{n}\\sum_{i=1}^n h(\\mathbf{x}_i),\\end{split}\\notag\\\\\\begin{split}\\end{split}\\notag\\end{gathered}$$\n", 57 | "\n", 58 | "where $\\mathbf{x}_i$ is a sample from $f$. 
This estimate is valid and useful because:\n", 59 | "\n", 60 | "- By the strong law of large numbers:\n", 61 | "\n", 62 | "$$\\begin{gathered}\n", 63 | "\\begin{split}\\hat{I} \\rightarrow I \\mbox{ with probability 1}\\end{split}\\notag\\\\\\begin{split}\\end{split}\\notag\\end{gathered}$$\n", 64 | "\n", 65 | "- Simulation error can be measured and controlled:\n", 66 | "\n", 67 | "$$Var(\\hat{I}) = \\frac{1}{n(n-1)}\\sum_{i=1}^n\n", 68 | " (h(\\mathbf{x}_i)-\\hat{I})^2$$" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "### How is this relevant to Bayesian analysis? \n", 76 | "\n", 77 | "When we observe data $y$ that we hypothesize as being obtained from a sampling model $f(y|\\theta)$, where $\\theta$ is a vector of (unknown) model parameters, a Bayesian places a *prior* distribution $p(\\theta)$ on the parameters to describe the uncertainty in the true values of the parameters. Bayesian inference, then, is obtained by calculating the *posterior* distribution, which is proportional to the product of these quantities:\n", 78 | "\n", 79 | "$$p(\\theta | y) \\propto f(y|\\theta) p(\\theta)$$\n", 80 | "\n", 81 | "unfortunately, for most problems of interest, the normalizing constant cannot be calculated because it involves mutli-dimensional integration over $\\theta$.\n", 82 | "\n", 83 | "Returning to our integral for MC sampling, if we replace $f(\\mathbf{x})$\n", 84 | "with a posterior, $p(\\theta|y)$ and make $h(\\theta)$ an interesting function of the unknown parameter, the resulting expectation is that of the posterior of $h(\\theta)$:\n", 85 | "\n", 86 | "$$E[h(\\theta)|y] = \\int h(\\theta) p(\\theta|y) d\\theta \\approx \\frac{1}{n}\\sum_{i=1}^n h(\\theta)$$\n", 87 | "\n", 88 | "We also require integrals to obtain marginal estimates from a joint model. 
If $\\theta$ is of length $K$, then inference about any particular parameter is obtained by:\n", 89 | "\n", 90 | "$$p(\\theta_i|y) \\propto \\int p(\\theta|y) d\\theta_{-i}$$\n", 91 | "\n", 92 | "where the `-i` subscript indicates all elements except the $i^{th}$." 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "### Sampling Markov Chains\n", 100 | "\n", 101 | "The expectation above assumes that the draws of $\\theta$ are **independent**. The limitation in using Monte Carlo sampling for Bayesian inference is that it is not usually feasible to make independent draws from the posterior distribution. \n", 102 | "\n", 103 | "The first \"MC\" in MCMC stands for **Markov chain**. A Markov chain is a **stochastic process**, an indexed set of random variables, where the value of a particular random variable in the set is dependent only on the random variable corresponding to the prevous index. This is a Markovian dependence structure:\n", 104 | "\n", 105 | "$$Pr(X_{t+1}=x_{t+1} | X_t=x_t, X_{t-1}=x_{t-1},\\ldots,X_0=x_0) = Pr(X_{t+1}=x_{t+1} | X_t=x_t)$$\n", 106 | "\n", 107 | "This conditioning specifies that the future depends on the current state, but not past states. Thus, the Markov chain wanders about the state space, remembering only where it has just been in the last time step. The collection of transition probabilities is sometimes called a *transition matrix* when dealing with discrete states, or more generally, a *transition kernel*.\n", 108 | "\n", 109 | "MCMC allows us to generate samples from a particular posterior distribution as a Markov chain. The magic is that the resulting sample, even though it is dependent in this way, is indistinguishable from an independent sample from the true posterior." 
110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "## Why MCMC Works: Reversible Markov Chains\n", 117 | "\n", 118 | "Markov chain Monte Carlo simulates a Markov chain for which some function of interest\n", 119 | "(*e.g.* the joint distribution of the parameters of some model) is the unique, invariant limiting distribution. An invariant distribution with respect to some Markov chain with transition kernel $Pr(y \\mid x)$ implies that:\n", 120 | "\n", 121 | "$$\\begin{gathered}\n", 122 | "\\begin{split}\\int_x Pr(y \\mid x) \\pi(x) dx = \\pi(y).\\end{split}\\notag\\\\\\begin{split}\\end{split}\\notag\n", 123 | "\\end{gathered}$$\n", 124 | "\n", 125 | "Invariance is guaranteed for any *reversible* Markov chain. Consider a Markov chain in reverse sequence:\n", 126 | "$\\{\\theta^{(n)},\\theta^{(n-1)},...,\\theta^{(0)}\\}$. This sequence is still Markovian, because:\n", 127 | "\n", 128 | "$$\\begin{gathered}\n", 129 | "\\begin{split}Pr(\\theta^{(k)}=y \\mid \\theta^{(k+1)}=x,\\theta^{(k+2)}=x_1,\\ldots ) = Pr(\\theta^{(k)}=y \\mid \\theta^{(k+1)}=x)\\end{split}\\notag\\\\\\begin{split}\\end{split}\\notag\\end{gathered}$$\n", 130 | "\n", 131 | "Forward and reverse transition probabilities may be related through Bayes theorem:\n", 132 | "\n", 133 | "$$\\begin{gathered}\n", 134 | "\\begin{split}\\end{split}\\notag\\end{gathered}$$\n", 135 | "\n", 136 | "$$\\begin{gathered}\n", 137 | "\\begin{split}\\frac{Pr(\\theta^{(k+1)}=x \\mid \\theta^{(k)}=y) \\pi^{(k)}(y)}{\\pi^{(k+1)}(x)}\\end{split}\\notag\\\\\\begin{split}\\end{split}\\notag\\end{gathered}$$\n", 138 | "\n", 139 | "Though not homogeneous in general, $\\pi$ becomes homogeneous if:\n", 140 | "\n", 141 | "- $n \\rightarrow \\infty$\n", 142 | "\n", 143 | "- $\\pi^{(i)}=\\pi$ for some $i < k$\n", 144 | "\n", 145 | "If this chain is homogeneous it is called reversible, because it satisfies the ***detailed balance equation***:\n", 146 | "\n", 147 | 
"$$\\begin{gathered}\n", 148 | "\\begin{split}\\pi(x)Pr(y \\mid x) = \\pi(y) Pr(x \\mid y)\\end{split}\\notag\\\\\\begin{split}\\end{split}\\notag\\end{gathered}$$\n", 149 | "\n", 150 | "Reversibility is important because it has the effect of balancing movement through the entire state space. When a Markov chain is reversible, $\\pi$ is the unique, invariant, stationary distribution of that chain. Hence, if $\\pi$ is of interest, we need only find the reversible Markov chain for which $\\pi$ is the limiting distribution.\n", 151 | "This is what MCMC does!" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "## The Metropolis-Hastings Algorithm\n", 159 | "\n", 160 | "One of the simplest and most flexible MCMC algorithms is the Metropolis-Hastings sampler. This algorithm generates candidate state transitions from an auxilliary distribution, and accepts or rejects each candidate probabilistically, according to the posterior distribution of the model.\n", 161 | "\n", 162 | "Let us first consider a simple Metropolis-Hastings algorithm for a single parameter, $\\theta$. We will use a standard sampling distribution, referred to as the *proposal distribution*, to produce candidate variables $q_t(\\theta^{\\prime} | \\theta)$. That is, the generated value, $\\theta^{\\prime}$, is a *possible* next value for\n", 163 | "$\\theta$ at step $t+1$. We also need to be able to calculate the probability of moving back to the original value from the candidate, or\n", 164 | "$q_t(\\theta | \\theta^{\\prime})$. 
These probabilistic ingredients are used to define an *acceptance ratio*:\n", 165 | "\n", 166 | "$$\\begin{gathered}\n", 167 | "\\begin{split}a(\\theta^{\\prime},\\theta) = \\frac{q_t(\\theta^{\\prime} | \\theta) \\pi(\\theta^{\\prime})}{q_t(\\theta | \\theta^{\\prime}) \\pi(\\theta)}\\end{split}\\notag\\\\\\begin{split}\\end{split}\\notag\\end{gathered}$$\n", 168 | "\n", 169 | "The value of $\\theta^{(t+1)}$ is then determined by:\n", 170 | "\n", 171 | "$$\\begin{gathered}\n", 172 | "\\begin{split}\\theta^{(t+1)} = \\left\\{\\begin{array}{l@{\\quad \\mbox{with prob.} \\quad}l}\\theta^{\\prime} & \\min(a(\\theta^{\\prime},\\theta^{(t)}),1) \\\\ \\theta^{(t)} & 1 - \\min(a(\\theta^{\\prime},\\theta^{(t)}),1) \\end{array}\\right.\\end{split}\\notag\\\\\\begin{split}\\end{split}\\notag\\end{gathered}$$\n", 173 | "\n", 174 | "This transition kernel implies that movement is not guaranteed at every step. It only occurs if the suggested transition is likely based on the acceptance ratio.\n", 175 | "\n", 176 | "A single iteration of the Metropolis-Hastings algorithm proceeds as follows:\n", 177 | "\n", 178 | "The original form of the algorithm specified by Metropolis required that\n", 179 | "$q_t(\\theta^{\\prime} | \\theta) = q_t(\\theta | \\theta^{\\prime})$, which reduces $a(\\theta^{\\prime},\\theta)$ to\n", 180 | "$\\pi(\\theta^{\\prime})/\\pi(\\theta)$, but this is not necessary. In either case, the state moves to high-density points in the distribution with high probability, and to low-density points with low probability. After convergence, the Metropolis-Hastings algorithm describes the full target posterior density, so all points are recurrent.\n", 181 | "\n", 182 | "1. Sample $\\theta^{\\prime}$ from $q(\\theta^{\\prime} | \\theta^{(t)})$.\n", 183 | "\n", 184 | "2. Generate a Uniform[0,1] random variate $u$.\n", 185 | "\n", 186 | "3. 
If $a(\\theta^{\\prime},\\theta) > u$ then\n", 187 | " $\\theta^{(t+1)} = \\theta^{\\prime}$, otherwise\n", 188 | " $\\theta^{(t+1)} = \\theta^{(t)}$.\n", 189 | "\n", 190 | "![rejection](images/rejection_sampling.png)\n", 191 | "### Random-walk Metropolis-Hastings\n", 192 | "\n", 193 | "A practical implementation of the Metropolis-Hastings algorithm makes use of a random-walk proposal.\n", 194 | "Recall that a random walk is a Markov chain that evolves according to:\n", 195 | "\n", 196 | "$$\n", 197 | "\\theta^{(t+1)} = \\theta^{(t)} + \\epsilon_t \\\\\n", 198 | "\\epsilon_t \\sim f(\\phi)\n", 199 | "$$\n", 200 | "\n", 201 | "As applied to the MCMC sampling, the random walk is used as a proposal distribution, whereby dependent proposals are generated according to:\n", 202 | "\n", 203 | "$$\\begin{gathered}\n", 204 | "\\begin{split}q(\\theta^{\\prime} | \\theta^{(t)}) = f(\\theta^{\\prime} - \\theta^{(t)}) = \\theta^{(t)} + \\epsilon_t\\end{split}\\notag\\\\\\begin{split}\\end{split}\\notag\\end{gathered}$$\n", 205 | "\n", 206 | "Generally, the density generating $\\epsilon_t$ is symmetric about zero,\n", 207 | "resulting in a symmetric chain. Chain symmetry implies that\n", 208 | "$q(\\theta^{\\prime} | \\theta^{(t)}) = q(\\theta^{(t)} | \\theta^{\\prime})$,\n", 209 | "which reduces the Metropolis-Hastings acceptance ratio to:\n", 210 | "\n", 211 | "$$\\begin{gathered}\n", 212 | "\\begin{split}a(\\theta^{\\prime},\\theta) = \\frac{\\pi(\\theta^{\\prime})}{\\pi(\\theta)}\\end{split}\\notag\\\\\\begin{split}\\end{split}\\notag\\end{gathered}$$\n", 213 | "\n", 214 | "The choice of the random walk distribution for $\\epsilon_t$ is frequently a normal or Student’s $t$ density, but it may be any distribution that generates an irreducible proposal chain.\n", 215 | "\n", 216 | "An important consideration is the specification of the scale parameter for the random walk error distribution. 
Large values produce random walk steps that are highly exploratory, but tend to produce proposal values in the tails of the target distribution, potentially resulting in very small acceptance rates. Conversely, small values tend to be accepted more frequently, since they tend to produce proposals close to the current parameter value, but may result in chains that mix very slowly.\n", 217 | "Some simulation studies suggest optimal acceptance rates in the range of 20-50%. It is often worthwhile to optimize the proposal variance by iteratively adjusting its value, according to observed acceptance rates early in the MCMC simulation ." 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": {}, 223 | "source": [ 224 | "# Hamiltonian Monte Carlo" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "While flexible and easy to implement, Metropolis-Hastings sampling is a random walk\n", 232 | "sampler that might not be statistically efficient for many models. In\n", 233 | "this context, and when sampling from continuous variables, Hamiltonian (or Hybrid) Monte\n", 234 | "Carlo (HMC) can prove to be a powerful tool. It avoids\n", 235 | "random walk behavior by simulating a physical system governed by\n", 236 | "Hamiltonian dynamics, potentially avoiding tricky conditional\n", 237 | "distributions in the process.\n", 238 | "\n", 239 | "![hmc comparison](images/hmc.png)\n", 240 | "\n", 241 | "In HMC, model samples are obtained by simulating a physical system,\n", 242 | "where particles move about a high-dimensional landscape, subject to\n", 243 | "potential and kinetic energies. Adapting the notation from [Neal (1993)](http://www.cs.toronto.edu/~radford/review.abstract.html),\n", 244 | "particles are characterized by a position vector or state\n", 245 | "$s \\in \\mathcal{R}^D$ and velocity vector $\\phi \\in \\mathcal{R}^D$. The\n", 246 | "combined state of a particle is denoted as $\\chi=(s,\\phi)$. 
The\n", 247 | "Hamiltonian is then defined as the sum of potential energy $E(s)$ and kinetic energy\n", 248 | "$K(\\phi)$, as follows:\n", 249 | "\n", 250 | "$$\\mathcal{H}(s,\\phi) = E(s) + K(\\phi)\n", 251 | "= E(s) + \\frac{1}{2} \\sum_i \\phi_i^2$$\n", 252 | "\n", 253 | "Instead of sampling $p(s)$ directly, HMC operates by sampling from the\n", 254 | "canonical distribution\n", 255 | "$p(s,\\phi) = \\frac{1}{Z} \\exp(-\\mathcal{H}(s,\\phi))=p(s)p(\\phi)$.\n", 256 | "Because the two variables are independent, marginalizing over $\\phi$ is\n", 257 | "trivial and recovers the original distribution of interest.\n", 258 | "\n", 259 | "**Hamiltonian Dynamics**\n", 260 | "\n", 261 | "State $s$ and velocity $\\phi$ are modified such that\n", 262 | "$\\mathcal{H}(s,\\phi)$ remains constant throughout the simulation. The\n", 263 | "differential equations are given by:\n", 264 | "\n", 265 | "$$\\begin{aligned}\\frac{ds_i}{dt} &= \\frac{\\partial \\mathcal{H}}{\\partial \\phi_i} = \\phi_i \\\\\n", 266 | "\\frac{d\\phi_i}{dt} &= - \\frac{\\partial \\mathcal{H}}{\\partial s_i}\n", 267 | "= - \\frac{\\partial E}{\\partial s_i}\n", 268 | "\\end{aligned}$$\n", 269 | "\n", 270 | "As shown in [Neal (1993)](http://www.cs.toronto.edu/~radford/review.abstract.html), \n", 271 | "the above transformation preserves volume and is\n", 272 | "reversible. The above dynamics can thus be used as transition operators\n", 273 | "of a Markov chain and will leave $p(s,\\phi)$ invariant. That chain by\n", 274 | "itself is not ergodic however, since simulating the dynamics maintains a\n", 275 | "fixed Hamiltonian $\\mathcal{H}(s,\\phi)$. HMC thus alternates Hamiltonian\n", 276 | "dynamic steps, with Gibbs sampling of the velocity. 
Because $p(s)$ and\n", 277 | "$p(\\phi)$ are independent, sampling $\\phi_{new} \\sim p(\\phi|s)$ is\n", 278 | "trivial since $p(\\phi|s)=p(\\phi)$, where $p(\\phi)$ is often taken to be\n", 279 | "the univariate Gaussian.\n", 280 | "\n", 281 | "**The Leap-Frog Algorithm**\n", 282 | "\n", 283 | "In practice, we cannot simulate Hamiltonian dynamics exactly because of\n", 284 | "the problem of time discretization. There are several ways one can do\n", 285 | "this. To maintain invariance of the Markov chain however, care must be\n", 286 | "taken to preserve the properties of *volume conservation* and *time\n", 287 | "reversibility*. The **leap-frog algorithm** maintains these properties\n", 288 | "and operates in 3 steps:\n", 289 | "\n", 290 | "$$\\begin{aligned}\n", 291 | "\\phi_i(t + \\epsilon/2) &= \\phi_i(t) - \\frac{\\epsilon}{2} \\frac{\\partial{}}{\\partial s_i} E(s(t)) \\\\\n", 292 | "s_i(t + \\epsilon) &= s_i(t) + \\epsilon \\phi_i(t + \\epsilon/2) \\\\\n", 293 | "\\phi_i(t + \\epsilon) &= \\phi_i(t + \\epsilon/2) - \\frac{\\epsilon}{2} \\frac{\\partial{}}{\\partial s_i} E(s(t + \\epsilon)) \n", 294 | "\\end{aligned}$$\n", 295 | "\n", 296 | "We thus perform a half-step update of the velocity at time\n", 297 | "$t+\\epsilon/2$, which is then used to compute $s(t + \\epsilon)$ and\n", 298 | "$\\phi(t + \\epsilon)$.\n", 299 | "\n", 300 | "**Accept / Reject**\n", 301 | "\n", 302 | "In practice, using finite stepsizes $\\epsilon$ will not preserve\n", 303 | "$\\mathcal{H}(s,\\phi)$ exactly and will introduce bias in the simulation.\n", 304 | "Also, rounding errors due to the use of floating point numbers means\n", 305 | "that the above transformation will not be perfectly reversible.\n", 306 | "\n", 307 | "HMC cancels these effects **exactly** by adding a Metropolis\n", 308 | "accept/reject stage, after $n$ leapfrog steps. 
The new state\n", 309 | "$\\chi' = (s',\\phi')$ is accepted with probability $p_{acc}(\\chi,\\chi')$,\n", 310 | "defined as:\n", 311 | "\n", 312 | "$$p_{acc}(\\chi,\\chi') = \\min \\left( 1, \\frac{\\exp(-\\mathcal{H}(s',\\phi'))}{\\exp(-\\mathcal{H}(s,\\phi))} \\right)$$\n", 313 | "\n", 314 | "**HMC Algorithm**\n", 315 | "\n", 316 | "We obtain a new HMC sample as follows:\n", 317 | "\n", 318 | "1. sample a new velocity from a univariate Gaussian distribution\n", 319 | "2. perform $n$ leapfrog steps to obtain the new state $\\chi'$\n", 320 | "3. perform accept/reject move of $\\chi'$" 321 | ] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "metadata": {}, 326 | "source": [ 327 | "## No U-Turn Sampling\n", 328 | "\n", 329 | "The major drawback of the HMC algorithm is the extensive tuning required to make it sample efficiently. There are a handful of parameters that require specification by the user:\n", 330 | "\n", 331 | "- the scaling of the momentum distribution\n", 332 | "- the step size for the leapfrog algorithm\n", 333 | "- the number of steps to be taken for the leapfrog algorithm\n", 334 | "\n", 335 | "When these parameters are poorly-chosen, the HMC algorithm can suffer severe losses in efficiency. For example, if we take steps that are too short, the simulation becomes a random walk, while steps that are too long end up retracing paths already taken.\n", 336 | "\n", 337 | "An efficient MCMC algorithm seeks to optimize mixing, while maintaining detailed balance. While HMC can be tuned on-the-fly, it requires costly burn-in runs to do so.\n", 338 | "\n", 339 | "![nuts](images/nuts.png)\n", 340 | "\n", 341 | "The No U-turn Sampling (NUTS) algorithm automatically tunes the step size and step number parameters, without any intervention from the user. To do so, NUTS constructs a binary tree of leapfrog steps by repeated doubling. 
When the trajectory of steps creates an angle of more than 90 degrees (*i.e.* a u-turn), the doubling stops, and a point is proposed.\n", 342 | "\n", 343 | "![binary doubling](images/binary_doubling.png)\n", 344 | "\n", 345 | "NUTS provides the efficiency of gradient-based MCMC sampling without extensive user intervention required to tune Hamiltonian Monte Carlo. As a result, NUTS is the default sampling algorithm for continuous variables in PyMC3." 346 | ] 347 | }, 348 | { 349 | "cell_type": "markdown", 350 | "metadata": {}, 351 | "source": [ 352 | "## References\n", 353 | "\n", 354 | "1. [Gelman, A., Carlin, J. B., Stern, H. S., Dunson, D. B., Vehtari, A., and Rubin, D. B. (2013)](http://www.stat.columbia.edu/~gelman/book/). Bayesian Data Analysis. Chapman & Hall/CRC Press, London, third edition.\n", 355 | "2. [Geyer, C. (2013)](http://www.mcmchandbook.net/HandbookChapter1.pdf) Introduction to Markov Chain Monte Carlo. In *Handbook of Markov Chain Monte Carlo*, S. Brooks, A. Gelman, G. Jones, X.L. Meng, eds. CRC Press.\n", 356 | "3. [Neal, R.M. (1993)](http://www.cs.toronto.edu/~radford/review.abstract.html) Probabilistic Inference Using Markov Chain Monte Carlo Methods, Technical Report CRG-TR-93-1, Dept. of Computer Science, University of Toronto, 144 pages.\n", 357 | "4. [Blei, David M. (2014)](https://www.annualreviews.org/doi/full/10.1146/annurev-statistics-022513-115657) Build, compute, critique, repeat: Data analysis with latent variable models. 
*Annual Review of Statistics and Its Application 1*: 203-232.\n" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": null, 363 | "metadata": {}, 364 | "outputs": [], 365 | "source": [] 366 | } 367 | ], 368 | "metadata": { 369 | "kernelspec": { 370 | "display_name": "Python [conda env:stat-rethink-pymc3]", 371 | "language": "python", 372 | "name": "conda-env-stat-rethink-pymc3-py" 373 | }, 374 | "language_info": { 375 | "codemirror_mode": { 376 | "name": "ipython", 377 | "version": 3 378 | }, 379 | "file_extension": ".py", 380 | "mimetype": "text/x-python", 381 | "name": "python", 382 | "nbconvert_exporter": "python", 383 | "pygments_lexer": "ipython3", 384 | "version": "3.6.4" 385 | } 386 | }, 387 | "nbformat": 4, 388 | "nbformat_minor": 1 389 | } 390 | -------------------------------------------------------------------------------- /notebooks/6. Model Checking.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Model Checking\n", 8 | "\n", 9 | "After running an MCMC simulation, `sample` returns a `MutliTrace` object containing the samples for all the stochastic and deterministic random variables. The final step in Bayesian computation is model checking, in order to ensure that inferences derived from your sample are valid. There are two components to model checking:\n", 10 | "\n", 11 | "1. Convergence diagnostics\n", 12 | "2. Goodness of fit\n", 13 | "\n", 14 | "Convergence diagnostics are intended to detect lack of convergence in the Markov chain Monte Carlo sample; it is used to ensure that you have not halted your sampling too early. However, a converged model is not guaranteed to be a good model. The second component of model checking, goodness of fit, is used to check the internal validity of the model, by comparing predictions from the model to the data used to fit the model. 
" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Convergence Diagnostics\n", 22 | "\n", 23 | "Valid inferences from sequences of MCMC samples are based on the\n", 24 | "assumption that the samples are derived from the true posterior\n", 25 | "distribution of interest. Theory guarantees this condition as the number\n", 26 | "of iterations approaches infinity. It is important, therefore, to\n", 27 | "determine the **minimum number of samples** required to ensure a reasonable\n", 28 | "approximation to the target posterior density. Unfortunately, no\n", 29 | "universal threshold exists across all problems, so convergence must be\n", 30 | "assessed independently each time MCMC estimation is performed. The\n", 31 | "procedures for verifying convergence are collectively known as\n", 32 | "*convergence diagnostics*.\n", 33 | "\n", 34 | "One approach to analyzing convergence is **analytical**, whereby the\n", 35 | "variance of the sample at different sections of the chain are compared\n", 36 | "to that of the limiting distribution. These methods use distance metrics\n", 37 | "to analyze convergence, or place theoretical bounds on the sample\n", 38 | "variance, and though they are promising, they are generally difficult to\n", 39 | "use and are not prominent in the MCMC literature. More common is a\n", 40 | "**statistical** approach to assessing convergence. With this approach,\n", 41 | "rather than considering the properties of the theoretical target\n", 42 | "distribution, only the statistical properties of the observed chain are\n", 43 | "analyzed. Reliance on the sample alone restricts such convergence\n", 44 | "criteria to **heuristics**. 
As a result, convergence cannot be guaranteed.\n", 45 | "Although evidence for lack of convergence using statistical convergence\n", 46 | "diagnostics will correctly imply lack of convergence in the chain, the\n", 47 | "absence of such evidence will not *guarantee* convergence in the chain.\n", 48 | "Nevertheless, negative results for one or more criteria may provide some\n", 49 | "measure of assurance to users that their sample will provide valid\n", 50 | "inferences.\n", 51 | "\n", 52 | "For most simple models, convergence will occur quickly, sometimes within\n", 53 | "the first several hundred iterations, after which all remaining\n", 54 | "samples of the chain may be used to calculate posterior quantities. For\n", 55 | "more complex models, convergence requires a significantly longer burn-in\n", 56 | "period; sometimes orders of magnitude more samples are needed.\n", 57 | "Frequently, lack of convergence will be caused by **poor mixing**. \n", 58 | "Recall that *mixing* refers to the degree to which the Markov\n", 59 | "chain explores the support of the posterior distribution. Poor mixing\n", 60 | "may stem from inappropriate proposals (if one is using the\n", 61 | "Metropolis-Hastings sampler) or from attempting to estimate models with\n", 62 | "highly correlated variables." 
63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "%matplotlib inline\n", 72 | "import numpy as np\n", 73 | "import seaborn as sns; sns.set_context('notebook')" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "from pymc3 import exp, Normal, Binomial, sample, Model\n", 83 | "\n", 84 | "# Samples for each dose level\n", 85 | "n = 5 * np.ones(4, dtype=int)\n", 86 | "# Log-dose\n", 87 | "dose = np.array([-.86, -.3, -.05, .73])\n", 88 | "deaths = np.array([0, 1, 3, 5])\n", 89 | "\n", 90 | "def invlogit(x):\n", 91 | " return exp(x) / (1 + exp(x))\n", 92 | "\n", 93 | "with Model() as bioassay_model:\n", 94 | "\n", 95 | " # Logit-linear model parameters\n", 96 | " alpha = Normal('alpha', 0, 0.01)\n", 97 | " beta = Normal('beta', 0, 0.01)\n", 98 | "\n", 99 | " # Calculate probabilities of death\n", 100 | " theta = invlogit(alpha + beta * dose)\n", 101 | "\n", 102 | " # Data likelihood\n", 103 | " deaths = Binomial('deaths', n=n, p=theta, observed=deaths)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "from pymc3 import Metropolis\n", 113 | "\n", 114 | "with bioassay_model:\n", 115 | " step = Metropolis(scaling=0.0001)\n", 116 | " bioassay_trace = sample(1000, step=step)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "from pymc3 import traceplot\n", 126 | "\n", 127 | "traceplot(bioassay_trace[500:], varnames=['alpha'])" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "### Informal Methods\n", 135 | "\n", 136 | "The most straightforward approach for assessing convergence is based on\n", 137 | "simply **plotting and inspecting traces and 
histograms** of the observed\n", 138 | "MCMC sample. If the trace of values for each of the stochastics exhibits\n", 139 | "asymptotic behavior over the last $m$ iterations, this may be\n", 140 | "satisfactory evidence for convergence. " 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "with bioassay_model:\n", 150 | " bioassay_trace = sample(10000)\n", 151 | " \n", 152 | "traceplot(bioassay_trace[9000:], varnames=['beta'])" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "A similar approach involves\n", 160 | "plotting a histogram for every set of $k$ iterations (perhaps 50-100)\n", 161 | "beyond some burn in threshold $n$; if the histograms are not visibly\n", 162 | "different among the sample intervals, this may be considered some evidence for\n", 163 | "convergence. Note that such diagnostics should be carried out for each\n", 164 | "stochastic estimated by the MCMC algorithm, because convergent behavior\n", 165 | "by one variable does not imply evidence for convergence for other\n", 166 | "variables in the analysis. " 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "import matplotlib.pyplot as plt\n", 176 | "\n", 177 | "beta_trace = bioassay_trace['beta']\n", 178 | "\n", 179 | "fig, axes = plt.subplots(2, 5, figsize=(14,6))\n", 180 | "axes = axes.ravel()\n", 181 | "for i in range(10):\n", 182 | " axes[i].hist(beta_trace[500*i:500*(i+1)])\n", 183 | "plt.tight_layout()" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "An extension of this approach can be taken\n", 191 | "when multiple parallel chains are run, rather than just a single, long\n", 192 | "chain. 
In this case, the final values of $c$ chains run for $n$\n", 193 | "iterations are plotted in a histogram; just as above, this is repeated\n", 194 | "every $k$ iterations thereafter, and the histograms of the endpoints are\n", 195 | "plotted again and compared to the previous histogram. This is repeated\n", 196 | "until consecutive histograms are indistinguishable." 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "Another *ad hoc* method for detecting lack of convergence is to examine\n", 204 | "the traces of several MCMC chains initialized with different starting\n", 205 | "values. Overlaying these traces on the same set of axes should (if\n", 206 | "convergence has occurred) show each chain tending toward the same\n", 207 | "equilibrium value, with approximately the same variance. Recall that the\n", 208 | "tendency for some Markov chains to converge to the true (unknown) value\n", 209 | "from diverse initial values is called *ergodicity*. This property is\n", 210 | "guaranteed by the reversible chains constructed using MCMC, and should\n", 211 | "be observable using this technique. Again, however, this approach is\n", 212 | "only a heuristic method, and cannot always detect lack of convergence,\n", 213 | "even though chains may appear ergodic." 
214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "with bioassay_model:\n", 223 | " \n", 224 | " bioassay_trace = sample(1000, njobs=2, start=[{'alpha':0.5}, {'alpha':5}])" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "bioassay_trace.get_values('alpha', chains=0)[0]" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "plt.plot(bioassay_trace.get_values('alpha', chains=0)[:200], 'r--')\n", 243 | "plt.plot(bioassay_trace.get_values('alpha', chains=1)[:200], 'k--')" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "A principal reason that evidence from informal techniques cannot\n", 251 | "guarantee convergence is a phenomenon called ***metastability***. Chains may\n", 252 | "appear to have converged to the true equilibrium value, displaying\n", 253 | "excellent qualities by any of the methods described above. However,\n", 254 | "after some period of stability around this value, the chain may suddenly\n", 255 | "move to another region of the parameter space. This period\n", 256 | "of metastability can sometimes be very long, and therefore escape\n", 257 | "detection by these convergence diagnostics. Unfortunately, there is no\n", 258 | "statistical technique available for detecting metastability.\n", 259 | "\n", 260 | "### Formal Methods\n", 261 | "\n", 262 | "Along with the *ad hoc* techniques described above, a number of more\n", 263 | "formal methods exist which are prevalent in the literature. 
These are\n", 264 | "considered more formal because they are based on existing statistical\n", 265 | "methods, such as time series analysis.\n", 266 | "\n", 267 | "PyMC currently includes three formal convergence diagnostic methods. The\n", 268 | "first, proposed by [Geweke (1992)](http://projecteuclid.org/DPubS?service=UI&version=1.0&verb=Display&handle=euclid.ss/1177011446), is a time-series approach that\n", 269 | "compares the mean and variance of segments from the beginning and end of\n", 270 | "a single chain.\n", 271 | "\n", 272 | "$$z = \\frac{\\bar{\\theta}_a - \\bar{\\theta}_b}{\\sqrt{S_a(0) + S_b(0)}}$$\n", 273 | "\n", 274 | "where $a$ is the early interval and $b$ the late interval, and $S_i(0)$ is the spectral density estimate at zero frequency for chain segment $i$. If the\n", 275 | "z-scores (theoretically distributed as standard normal variates) of\n", 276 | "these two segments are similar, it can provide evidence for convergence.\n", 277 | "PyMC calculates z-scores of the difference between various initial\n", 278 | "segments along the chain, and the last 50% of the remaining chain. If\n", 279 | "the chain has converged, the majority of points should fall within 2\n", 280 | "standard deviations of zero.\n", 281 | "\n", 282 | "In PyMC, diagnostic z-scores can be obtained by calling the `geweke` function. 
It\n", 283 | "accepts either (1) a single trace, (2) a Node or Stochastic object, or\n", 284 | "(3) an entire Model object:" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "from pymc3 import geweke\n", 294 | "\n", 295 | "with bioassay_model:\n", 296 | " tr = sample(2000)\n", 297 | " \n", 298 | "z = geweke(tr, intervals=15)" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "plt.scatter(*z['alpha'].T)\n", 308 | "plt.hlines([-1,1], 0, 1000, linestyles='dotted')\n", 309 | "plt.xlim(0, 1000)" 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "metadata": {}, 315 | "source": [ 316 | "The arguments expected are the following:\n", 317 | "\n", 318 | "- `x` : The trace of a variable.\n", 319 | "- `first` : The fraction of series at the beginning of the trace.\n", 320 | "- `last` : The fraction of series at the end to be compared with the section at the beginning.\n", 321 | "- `intervals` : The number of segments.\n", 322 | "\n", 323 | "Plotting the output displays the scores in series, making it easy to\n", 324 | "see departures from the standard normal assumption." 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "A second convergence diagnostic provided by PyMC is the Gelman-Rubin\n", 332 | "statistic [Gelman and Rubin (1992)](http://projecteuclid.org/DPubS?service=UI&version=1.0&verb=Display&handle=euclid.ss/1177011136). 
This diagnostic uses multiple chains to\n", 333 | "check for lack of convergence, and is based on the notion that if\n", 334 | "multiple chains have converged, by definition they should appear very\n", 335 | "similar to one another; if not, one or more of the chains has failed to\n", 336 | "converge.\n", 337 | "\n", 338 | "The Gelman-Rubin diagnostic uses an analysis of variance approach to\n", 339 | "assessing convergence. That is, it calculates both the between-chain\n", 340 | "variance (B) and within-chain variance (W), and assesses whether they\n", 341 | "are different enough to worry about convergence. Assuming $m$ chains,\n", 342 | "each of length $n$, quantities are calculated by:\n", 343 | "\n", 344 | "$$\\begin{align}B &= \\frac{n}{m-1} \\sum_{j=1}^m (\\bar{\\theta}_{.j} - \\bar{\\theta}_{..})^2 \\\\\n", 345 | "W &= \\frac{1}{m} \\sum_{j=1}^m \\left[ \\frac{1}{n-1} \\sum_{i=1}^n (\\theta_{ij} - \\bar{\\theta}_{.j})^2 \\right]\n", 346 | "\\end{align}$$\n", 347 | "\n", 348 | "for each scalar estimand $\\theta$. Using these values, an estimate of\n", 349 | "the marginal posterior variance of $\\theta$ can be calculated:\n", 350 | "\n", 351 | "$$\\hat{\\text{Var}}(\\theta | y) = \\frac{n-1}{n} W + \\frac{1}{n} B$$\n", 352 | "\n", 353 | "Assuming $\\theta$ was initialized to arbitrary starting points in each\n", 354 | "chain, this quantity will overestimate the true marginal posterior\n", 355 | "variance. At the same time, $W$ will tend to underestimate the\n", 356 | "within-chain variance early in the sampling run. 
However, in the limit\n", 357 | "as $n \\rightarrow \n", 358 | "\\infty$, both quantities will converge to the true variance of $\\theta$.\n", 359 | "In light of this, the Gelman-Rubin statistic monitors convergence using\n", 360 | "the ratio:\n", 361 | "\n", 362 | "$$\\hat{R} = \\sqrt{\\frac{\\hat{\\text{Var}}(\\theta | y)}{W}}$$\n", 363 | "\n", 364 | "This is called the potential scale reduction, since it is an estimate of\n", 365 | "the potential reduction in the scale of $\\theta$ as the number of\n", 366 | "simulations tends to infinity. In practice, we look for values of\n", 367 | "$\\hat{R}$ close to one (say, less than 1.1) to be confident that a\n", 368 | "particular estimand has converged. In PyMC, the function\n", 369 | "`gelman_rubin` will calculate $\\hat{R}$ for each stochastic node in\n", 370 | "the passed model:" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "metadata": {}, 377 | "outputs": [], 378 | "source": [ 379 | "from pymc3 import gelman_rubin\n", 380 | "\n", 381 | "gelman_rubin(bioassay_trace)" 382 | ] 383 | }, 384 | { 385 | "cell_type": "markdown", 386 | "metadata": {}, 387 | "source": [ 388 | "For the best results, each chain should be initialized to highly\n", 389 | "dispersed starting values for each stochastic node.\n", 390 | "\n", 391 | "By default, when calling the `forestplot` function using nodes with\n", 392 | "multiple chains, the $\\hat{R}$ values will be plotted alongside the\n", 393 | "posterior intervals." 
394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": null, 399 | "metadata": {}, 400 | "outputs": [], 401 | "source": [ 402 | "from pymc3 import forestplot\n", 403 | "\n", 404 | "forestplot(bioassay_trace)" 405 | ] 406 | }, 407 | { 408 | "cell_type": "markdown", 409 | "metadata": {}, 410 | "source": [ 411 | "## Autocorrelation" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": null, 417 | "metadata": {}, 418 | "outputs": [], 419 | "source": [ 420 | "from pymc3 import autocorrplot\n", 421 | "\n", 422 | "autocorrplot(tr);" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": null, 428 | "metadata": {}, 429 | "outputs": [], 430 | "source": [ 431 | "bioassay_trace['alpha'].shape" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "metadata": {}, 438 | "outputs": [], 439 | "source": [ 440 | "from pymc3 import effective_n\n", 441 | "\n", 442 | "effective_n(bioassay_trace)" 443 | ] 444 | }, 445 | { 446 | "cell_type": "markdown", 447 | "metadata": {}, 448 | "source": [ 449 | "## Goodness of Fit\n", 450 | "\n", 451 | "Checking for model convergence is only the first step in the evaluation\n", 452 | "of MCMC model outputs. It is possible for an entirely unsuitable model\n", 453 | "to converge, so additional steps are needed to ensure that the estimated\n", 454 | "model adequately fits the data. One intuitive way of evaluating model\n", 455 | "fit is to compare model predictions with the observations used to fit\n", 456 | "the model. In other words, the fitted model can be used to simulate\n", 457 | "data, and the distribution of the simulated data should resemble the\n", 458 | "distribution of the actual data.\n", 459 | "\n", 460 | "Fortunately, simulating data from the model is a natural component of\n", 461 | "the Bayesian modelling framework. 
Recall, from the discussion on\n", 462 | "imputation of missing data, the posterior predictive distribution:\n", 463 | "\n", 464 | "$$p(\\tilde{y}|y) = \\int p(\\tilde{y}|\\theta) f(\\theta|y) d\\theta$$\n", 465 | "\n", 466 | "Here, $\\tilde{y}$ represents some hypothetical new data that would be\n", 467 | "expected, taking into account the posterior uncertainty in the model\n", 468 | "parameters. Sampling from the posterior predictive distribution is easy\n", 469 | "in PyMC. The code looks identical to the corresponding data stochastic,\n", 470 | "with two modifications: (1) the node should be specified as\n", 471 | "deterministic and (2) the statistical likelihoods should be replaced by\n", 472 | "random number generators. Consider the `gelman_bioassay` example, \n", 473 | "where deaths are modeled as a binomial random variable for which\n", 474 | "the probability of death is a logit-linear function of the dose of a\n", 475 | "particular drug." 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": null, 481 | "metadata": {}, 482 | "outputs": [], 483 | "source": [ 484 | "from pymc3 import Normal, Binomial, Deterministic, invlogit\n", 485 | "\n", 486 | "# Samples for each dose level\n", 487 | "n = 5 * np.ones(4, dtype=int)\n", 488 | "# Log-dose\n", 489 | "dose = np.array([-.86, -.3, -.05, .73])\n", 490 | "\n", 491 | "with Model() as model:\n", 492 | "\n", 493 | " # Logit-linear model parameters\n", 494 | " alpha = Normal('alpha', 0, 0.01)\n", 495 | " beta = Normal('beta', 0, 0.01)\n", 496 | "\n", 497 | " # Calculate probabilities of death\n", 498 | " theta = Deterministic('theta', invlogit(alpha + beta * dose))\n", 499 | "\n", 500 | " # Data likelihood\n", 501 | " deaths = Binomial('deaths', n=n, p=theta, observed=[0, 1, 3, 5])" 502 | ] 503 | }, 504 | { 505 | "cell_type": "markdown", 506 | "metadata": {}, 507 | "source": [ 508 | "The posterior predictive distribution of deaths uses the same functional\n", 509 | "form as the data likelihood, 
in this case a binomial stochastic. Here is\n", 510 | "the corresponding sample from the posterior predictive distribution:" 511 | ] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "execution_count": null, 516 | "metadata": {}, 517 | "outputs": [], 518 | "source": [ 519 | "with model:\n", 520 | " \n", 521 | " deaths_sim = Binomial('deaths_sim', n=n, p=theta, shape=4)" 522 | ] 523 | }, 524 | { 525 | "cell_type": "markdown", 526 | "metadata": {}, 527 | "source": [ 528 | "Notice that the observed stochastic `Binomial` has been replaced with a stochastic node that is identical in every respect to `deaths`, except that its values are not fixed to be the observed data -- they are left to vary according to the values of the fitted parameters.\n", 529 | "\n", 530 | "The degree to which simulated data correspond to observations can be evaluated in at least two ways. First, these quantities can simply be compared visually. This allows for a qualitative comparison of model-based replicates and observations. If there is poor fit, the true value of the data may appear in the tails of the histogram of replicated data, while a good fit will tend to show the true data in high-probability regions of the posterior predictive distribution. The Matplot package in PyMC provides an easy way of producing such plots, via the `gof_plot` function." 
531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": null, 536 | "metadata": {}, 537 | "outputs": [], 538 | "source": [ 539 | "with model:\n", 540 | " \n", 541 | " gof_trace = sample(2000)" 542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": null, 547 | "metadata": {}, 548 | "outputs": [], 549 | "source": [ 550 | "from pymc3 import forestplot\n", 551 | "\n", 552 | "forestplot(gof_trace, varnames=['deaths_sim'])" 553 | ] 554 | }, 555 | { 556 | "cell_type": "markdown", 557 | "metadata": {}, 558 | "source": [ 559 | "## Exercise: Meta-analysis of beta blocker effectiveness\n", 560 | "\n", 561 | "Carlin (1992) considers a Bayesian approach to meta-analysis, and includes the following examples of 22 trials of beta-blockers to prevent mortality after myocardial infarction.\n", 562 | "\n", 563 | "In a random effects meta-analysis we assume the true effect (on a log-odds scale) $d_i$ in a trial $i$\n", 564 | "is drawn from some population distribution. Let $r^C_i$ denote number of events in the control group in trial $i$,\n", 565 | "and $r^T_i$ denote events under active treatment in trial $i$. Our model is:\n", 566 | "\n", 567 | "$$\\begin{aligned}\n", 568 | "r^C_i &\\sim \\text{Binomial}\\left(p^C_i, n^C_i\\right) \\\\\n", 569 | "r^T_i &\\sim \\text{Binomial}\\left(p^T_i, n^T_i\\right) \\\\\n", 570 | "\\text{logit}\\left(p^C_i\\right) &= \\mu_i \\\\\n", 571 | "\\text{logit}\\left(p^T_i\\right) &= \\mu_i + \\delta_i \\\\\n", 572 | "\\delta_i &\\sim \\text{Normal}(d, t) \\\\\n", 573 | "\\mu_i &\\sim \\text{Normal}(m, s)\n", 574 | "\\end{aligned}$$\n", 575 | "\n", 576 | "We want to make inferences about the population effect $d$, and the predictive distribution for the effect $\\delta_{\\text{new}}$ in a new trial. 
Build a model to estimate these quantities in PyMC, and (1) use convergence diagnostics to check for convergence and (2) use posterior predictive checks to assess goodness-of-fit.\n", 577 | "\n", 578 | "Here are the data:" 579 | ] 580 | }, 581 | { 582 | "cell_type": "code", 583 | "execution_count": null, 584 | "metadata": {}, 585 | "outputs": [], 586 | "source": [ 587 | "r_t_obs = [3, 7, 5, 102, 28, 4, 98, 60, 25, 138, 64, 45, 9, 57, 25, 33, 28, 8, 6, 32, 27, 22]\n", 588 | "n_t_obs = [38, 114, 69, 1533, 355, 59, 945, 632, 278,1916, 873, 263, 291, 858, 154, 207, 251, 151, 174, 209, 391, 680]\n", 589 | "r_c_obs = [3, 14, 11, 127, 27, 6, 152, 48, 37, 188, 52, 47, 16, 45, 31, 38, 12, 6, 3, 40, 43, 39]\n", 590 | "n_c_obs = [39, 116, 93, 1520, 365, 52, 939, 471, 282, 1921, 583, 266, 293, 883, 147, 213, 122, 154, 134, 218, 364, 674]\n", 591 | "N = len(n_c_obs)" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": null, 597 | "metadata": {}, 598 | "outputs": [], 599 | "source": [ 600 | "# Write your answer here" 601 | ] 602 | }, 603 | { 604 | "cell_type": "markdown", 605 | "metadata": {}, 606 | "source": [ 607 | "## References\n", 608 | "\n", 609 | "Gelman, A., & Rubin, D. B. (1992). Inference from iterative simulation using multiple sequences. Statistical Science. A Review Journal of the Institute of Mathematical Statistics, 457–472.\n", 610 | "\n", 611 | "Geweke, J., Berger, J. O., & Dawid, A. P. (1992). Evaluating the accuracy of sampling-based approaches to the calculation of posterior moments. In Bayesian Statistics 4.\n", 612 | "\n", 613 | "Brooks, S. P., Catchpole, E. A., & Morgan, B. J. T. (2000). Bayesian Animal Survival Estimation. Statistical Science. A Review Journal of the Institute of Mathematical Statistics, 15(4), 357–376. doi:10.1214/ss/1177010123\n", 614 | "\n", 615 | "Gelman, A., Meng, X., & Stern, H. (1996). Posterior predicitive assessment of model fitness via realized discrepencies with discussion. 
Statistica Sinica, 6, 733–807.\n", 616 | "\n", 617 | "Raftery, A., & Lewis, S. (1992). One long run with diagnostics: Implementation strategies for Markov chain Monte Carlo. Statistical Science. A Review Journal of the Institute of Mathematical Statistics, 7, 493–497.\n", 618 | "\n", 619 | "[CrossValidated: How to use scikit-learn's cross validation functions on multi-label classifiers](http://stats.stackexchange.com/questions/65828/how-to-use-scikit-learns-cross-validation-functions-on-multi-label-classifiers)" 620 | ] 621 | } 622 | ], 623 | "metadata": { 624 | "kernelspec": { 625 | "display_name": "Python [default]", 626 | "language": "python", 627 | "name": "python3" 628 | }, 629 | "language_info": { 630 | "codemirror_mode": { 631 | "name": "ipython", 632 | "version": 3 633 | }, 634 | "file_extension": ".py", 635 | "mimetype": "text/x-python", 636 | "name": "python", 637 | "nbconvert_exporter": "python", 638 | "pygments_lexer": "ipython3", 639 | "version": "3.6.4" 640 | } 641 | }, 642 | "nbformat": 4, 643 | "nbformat_minor": 1 644 | } 645 | -------------------------------------------------------------------------------- /notebooks/a. BEST.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Bayesian Estimation Supersedes the T-Test" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### The Problem\n", 15 | "\n", 16 | "Several statistical inference procedures involve the comparison of two groups. We may be interested in whether one group is larger than another, or simply different from the other. We require a statistical model for this because true differences are usually accompanied by measurement or stochastic noise that prevent us from drawing conclusions simply from differences calculated from the observed data. 
\n", 17 | "\n", 18 | "The *de facto* standard for statistically comparing two (or more) samples is to use a statistical test. This involves expressing a null hypothesis, which typically claims that there is no difference between the groups, and using a chosen test statistic to determine whether the distribution of the observed data is plausible under the hypothesis. This rejection occurs when the calculated test statistic is higher than some pre-specified threshold value.\n", 19 | "\n", 20 | "Unfortunately, it is not easy to conduct hypothesis tests correctly, and their results are very easy to misinterpret. Setting up a statistical test involves several subjective choices (*e.g.* statistical test to use, null hypothesis to test, significance level) by the user that are rarely justified based on the problem or decision at hand, but rather, are usually based on traditional choices that are entirely arbitrary (Johnson 1999). The evidence that it provides to the user is indirect, incomplete, and typically overstates the evidence against the null hypothesis (Goodman 1999). \n", 21 | "\n", 22 | "A more informative and effective approach for comparing groups is one based on **estimation** rather than **testing**, and is driven by Bayesian probability rather than frequentist. That is, rather than testing whether two groups are different, we instead pursue an estimate of how different they are, which is fundamentally more informative. Moreover, we include an estimate of uncertainty associated with that difference which includes uncertainty due to our lack of knowledge of the model parameters (epistemic uncertainty) and uncertainty due to the inherent stochasticity of the system (aleatory uncertainty)." 
23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "## Example: Drug trial evaluation\n", 30 | "\n", 31 | "To illustrate how this Bayesian estimation approach works in practice, we will use a fictitious example from Kruschke (2012) concerning the evaluation of a clinical trial for drug evaluation. The trial aims to evaluate the efficacy of a \"smart drug\" that is supposed to increase intelligence by comparing IQ scores of individuals in a treatment arm (those receiving the drug) to those in a control arm (those recieving a placebo). There are 47 individuals and 42 individuals in the treatment and control arms, respectively." 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": { 38 | "collapsed": false 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "%matplotlib inline\n", 43 | "import numpy as np\n", 44 | "import pymc3 as pm\n", 45 | "import pandas as pd\n", 46 | "import seaborn as sns\n", 47 | "sns.set(color_codes=True)\n", 48 | "\n", 49 | "np.random.seed(20090425)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": { 56 | "collapsed": false 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "drug = (101,100,102,104,102,97,105,105,98,101,100,123,105,103,100,95,102,106,\n", 61 | " 109,102,82,102,100,102,102,101,102,102,103,103,97,97,103,101,97,104,\n", 62 | " 96,103,124,101,101,100,101,101,104,100,101)\n", 63 | "placebo = (99,101,100,101,102,100,97,101,104,101,102,102,100,105,88,101,100,\n", 64 | " 104,100,100,100,101,102,103,97,101,101,100,101,99,101,100,100,\n", 65 | " 101,100,99,101,100,102,99,100,99)\n", 66 | "\n", 67 | "y1 = np.array(drug)\n", 68 | "y2 = np.array(placebo)\n", 69 | "y = pd.DataFrame(dict(value=np.r_[y1, y2], group=np.r_[['drug']*len(drug), ['placebo']*len(placebo)]))\n", 70 | "\n", 71 | "y.hist('value', by='group');" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | 
"The first step in a Bayesian approach to inference is to specify the full probability model that corresponds to the problem. For this example, Kruschke chooses a Student-t distribution to describe the distributions of the scores in each group. This choice adds robustness to the analysis, as a T distribution is less sensitive to outlier observations, relative to a normal distribution. The three-parameter Student-t distribution allows for the specification of a mean $\\mu$, a precision (inverse-variance) $\\lambda$ and a degrees-of-freedom parameter $\\nu$:\n", 79 | "\n", 80 | "$$ f(x|\\mu,\\lambda,\\nu) =\n", 81 | " \\frac{\\Gamma(\\frac{\\nu + 1}{2})}{\\Gamma(\\frac{\\nu}{2})}\n", 82 | " \\left(\\frac{\\lambda}{\\pi\\nu}\\right)^{\\frac{1}{2}}\n", 83 | " \\left[1+\\frac{\\lambda(x-\\mu)^2}{\\nu}\\right]^{-\\frac{\\nu+1}{2}}$$\n", 84 | " \n", 85 | "the degrees-of-freedom parameter essentially specifies the \"normality\" of the data, since larger values of $\\nu$ make the distribution converge to a normal distribution, while small values (close to zero) result in heavier tails.\n", 86 | "\n", 87 | "Thus, the likelihood functions of our model are specified as follows:\n", 88 | "\n", 89 | "$$y^{(treat)}_i \\sim T(\\nu, \\mu_1, \\sigma_1)$$\n", 90 | "\n", 91 | "$$y^{(placebo)}_i \\sim T(\\nu, \\mu_2, \\sigma_2)$$\n", 92 | "\n", 93 | "As a simplifying assumption, we will assume that the degree of normality $\\nu$ is the same for both groups. 
We will, of course, have separate parameters for the means $\\mu_k, k=1,2$ and standard deviations $\\sigma_k$.\n", 94 | "\n", 95 | "Since the means are real-valued, we will apply normal priors on them, and arbitrarily set the hyperparameters to the pooled empirical mean of the data and twice the pooled empirical standard deviation, which applies very diffuse information to these quantities (and importantly, does not favor one or the other *a priori*).\n", 96 | "\n", 97 | "$$\\mu_k \\sim N(\\bar{x}, 2s)$$" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": { 104 | "collapsed": false 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "μ_m = y.value.mean()\n", 109 | "μ_s = y.value.std() * 2\n", 110 | "\n", 111 | "with pm.Model() as model:\n", 112 | " \n", 113 | " group1_mean = pm.Normal('group1_mean', μ_m, sd=μ_s)\n", 114 | " group2_mean = pm.Normal('group2_mean', μ_m, sd=μ_s)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "The group standard deviations will be given a uniform prior over a plausible range of values for the variability of the outcome variable, IQ.\n", 122 | "\n", 123 | "In Kruschke's original model, he uses a very wide uniform prior for the group standard deviations, from the pooled empirical standard deviation divided by 1000 to the pooled standard deviation multiplied by 1000. This is a poor choice of prior, because very basic prior knowledge about measures of human cognition dictate that the variation cannot ever be as high as this upper bound. IQ is a standardized measure, and hence this constrains how variable a given population's IQ values can be. When you place such a wide uniform prior on these values, you are essentially giving a lot of prior weight on inadmissible values. 
In this example, there is little practical difference, but in general it is best to apply as much prior information as you have available to the parameterization of prior distributions. \n", 124 | "\n", 125 | "We will instead set the group standard deviations to have a $\\text{Uniform}(1,10)$ prior:" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": { 132 | "collapsed": false 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "σ_low = 1\n", 137 | "σ_high = 10\n", 138 | "\n", 139 | "with model:\n", 140 | " \n", 141 | " group1_std = pm.Uniform('group1_std', lower=σ_low, upper=σ_high)\n", 142 | " group2_std = pm.Uniform('group2_std', lower=σ_low, upper=σ_high)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "We follow Kruschke by making the prior for $\\nu$ exponentially distributed with a mean of 30; this allocates high prior probability over the regions of the parameter that describe the range from normal to heavy-tailed data under the Student-T distribution." 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": { 156 | "collapsed": false 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "with model:\n", 161 | " \n", 162 | " ν = pm.Exponential('ν_minus_one', 1/29.) + 1\n", 163 | "\n", 164 | "sns.distplot(np.random.exponential(30, size=10000), kde=False);" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "Since PyMC3 parameterizes the Student-T in terms of precision, rather than standard deviation, we must transform the standard deviations before specifying our likelihoods." 
172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": { 178 | "collapsed": true 179 | }, 180 | "outputs": [], 181 | "source": [ 182 | "with model:\n", 183 | " \n", 184 | " λ1 = group1_std**-2\n", 185 | " λ2 = group2_std**-2\n", 186 | "\n", 187 | " group1 = pm.StudentT('drug', nu=ν, mu=group1_mean, lam=λ1, observed=y1)\n", 188 | " group2 = pm.StudentT('placebo', nu=ν, mu=group2_mean, lam=λ2, observed=y2)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "Having fully specified our probabilistic model, we can turn our attention to calculating the comparisons of interest in order to evaluate the effect of the drug. To this end, we can specify deterministic nodes in our model for the difference between the group means and the difference between the group standard deviations. Wrapping them in named `Deterministic` objects signals to PyMC that we wish to record the sampled values as part of the output.\n", 196 | "\n", 197 | "As a joint measure of the groups, we will also estimate the \"effect size\", which is the difference in means scaled by the pooled estimates of standard deviation. This quantity can be harder to interpret, since it is no longer in the same units as our data, but the quantity is a function of all four estimated parameters." 
198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": { 204 | "collapsed": false 205 | }, 206 | "outputs": [], 207 | "source": [ 208 | "with model:\n", 209 | "\n", 210 | " diff_of_means = pm.Deterministic('difference of means', group1_mean - group2_mean)\n", 211 | " diff_of_stds = pm.Deterministic('difference of stds', group1_std - group2_std)\n", 212 | " effect_size = pm.Deterministic('effect size', \n", 213 | " diff_of_means / pm.sqrt((group1_std**2 + group2_std**2) / 2))\n" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "Now, we can fit the model and evaluate its output." 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": { 227 | "collapsed": false 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "with model:\n", 232 | " trace = pm.sample(2000, njobs=2)" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "We can plot the stochastic parameters of the model. PyMC's `plot_posterior` function replicates the informative histograms portrayed in Kruschke (2012). These summarize the posterior distributions of the parameters, and present a 95% credible interval and the posterior mean. The plots below are constructed with the final 1000 samples from each of the 2 chains, pooled together." 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": { 246 | "collapsed": false 247 | }, 248 | "outputs": [], 249 | "source": [ 250 | "pm.plot_posterior(trace[1000:], \n", 251 | " varnames=['group1_mean', 'group2_mean', 'group1_std', 'group2_std', 'ν_minus_one'],\n", 252 | " color='#87ceeb');" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "Looking at the group differences, we can conclude that there are meaningful differences between the two groups for all three measures. 
For these comparisons, it is useful to use zero as a reference value (`ref_val`); providing this reference value yields cumulative probabilities for the posterior distribution on either side of the value. Thus, for the difference in means, 98.9% of the posterior probability is greater than zero, which suggests the group means are credibly different. The effect size and differences in standard deviation are similarly positive.\n", 260 | "\n", 261 | "These estimates suggest that the \"smart drug\" increased both the expected scores, but also the variability in scores across the sample. So, this does not rule out the possibility that some recipients may be adversely affected by the drug at the same time others benefit." 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "metadata": { 268 | "collapsed": false 269 | }, 270 | "outputs": [], 271 | "source": [ 272 | "pm.plot_posterior(trace[1000:], \n", 273 | " varnames=['difference of means', 'difference of stds', 'effect size'],\n", 274 | " ref_val=0,\n", 275 | " color='#87ceeb');" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": {}, 281 | "source": [ 282 | "When `forestplot` is called on a trace with more than one chain, it also plots the potential scale reduction parameter, which is used to reveal evidence for lack of convergence; values near one, as we have here, suggest that the model has converged." 
283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": { 289 | "collapsed": false 290 | }, 291 | "outputs": [], 292 | "source": [ 293 | "pm.forestplot(trace[1000:], varnames=['difference of means', 'difference of stds', 'effect size'])" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": { 300 | "collapsed": false 301 | }, 302 | "outputs": [], 303 | "source": [ 304 | "pm.plots.summary(trace[1000:], \n", 305 | " varnames=['difference of means', 'difference of stds', 'effect size'])" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "## References\n", 313 | "\n", 314 | "1.\tGoodman SN. Toward evidence-based medical statistics. 1: The P value fallacy. Annals of Internal Medicine. 1999;130(12):995-1004. doi:10.7326/0003-4819-130-12-199906150-00008.\n", 315 | "2.\tJohnson D. The insignificance of statistical significance testing. Journal of Wildlife Management. 1999;63(3):763-772.\n", 316 | "3.\tKruschke JK. Bayesian estimation supersedes the t test. J Exp Psychol Gen. 2013;142(2):573-603. doi:10.1037/a0029146." 
317 | ] 318 | } 319 | ], 320 | "metadata": { 321 | "kernelspec": { 322 | "display_name": "Python 3", 323 | "language": "python", 324 | "name": "python3" 325 | }, 326 | "language_info": { 327 | "codemirror_mode": { 328 | "name": "ipython", 329 | "version": 3 330 | }, 331 | "file_extension": ".py", 332 | "mimetype": "text/x-python", 333 | "name": "python", 334 | "nbconvert_exporter": "python", 335 | "pygments_lexer": "ipython3", 336 | "version": "3.5.1" 337 | } 338 | }, 339 | "nbformat": 4, 340 | "nbformat_minor": 1 341 | } 342 | -------------------------------------------------------------------------------- /notebooks/fatality.csv: -------------------------------------------------------------------------------- 1 | ,State,Resident Population,Driving Age Population,Highway Motor Fuel Use (000),Total Lane Miles,Total Road and Street Mileage,Annual VMT (Millions),Total Highway Fatalities,Fatalities (per 100 million VMT),State Motor Fuel Taxes and Other Related Receipts,Total Highway Capital Outlay (000),Total Disburse-ments for Highways (000),Payments the HTF (000),Apportion-ments from the Federal HTF (000) 2 | 0,AL,4447,3451,3148522,195298,94311,56534,995,1.76,579812,719722,1246223,638977,589698 3 | 1,AK,626,458,338750,25991,12823,4613,103,2.23,27817,321612,501359,65940,378674 4 | 2,AZ,5130,3908,2999157,118437,55195,49768,1036,2.08,565982,960137,2040266,583068,494747 5 | 3,AR,2673,2073,1959484,198161,97600,29167,652,2.24,398717,468053,817387,415571,397312 6 | 4,CA,33871,25599,17017620,371689,168076,306649,3753,1.22,2945156,2721334,6750225,3025732,2795250 7 | 5,CO,4301,3322,2450177,176993,85409,41771,681,1.63,521721,730129,1391910,423763,367548 8 | 6,CT,3405,2651,1697878,44474,20845,30756,342,1.11,545671,568931,1304378,312507,439532 9 | 7,DE,783,610,429413,12558,5779,8240,123,1.49,103965,297648,594641,79594,128749 10 | 8,DC,572,469,192440,3774,1425,3498,49,1.4,31727,164529,244216,33728,117381 11 | 
9,FL,15982,12742,8648333,253349,116649,152136,2999,1.97,1612070,2448336,4207948,1554162,1390224 12 | 10,GA,8186,6251,6030954,241087,114727,105010,1541,1.47,431243,1106272,1567212,1189533,1023963 13 | 11,HI,1211,949,417929,9255,4281,8543,131,1.53,68872,148304,272268,69351,154425 14 | 12,ID,1293,969,847974,95178,46456,13534,276,2.04,202874,260689,491604,178492,253889 15 | 13,IL,12419,9530,6293151,288879,138372,102866,1418,1.38,1231728,1836253,3446580,1053743,986434 16 | 14,IN,6080,4682,4371604,193637,93608,70862,875,1.23,746424,1035129,1932198,767408,688839 17 | 15,IA,2926,2281,1993887,232920,113377,29433,445,1.51,394458,696081,1493639,353281,345026 18 | 16,KS,2688,2058,1676445,274014,134582,28130,461,1.64,358989,697463,1206470,346783,338426 19 | 17,KY,4041,3161,2850498,164231,79267,46803,820,1.75,439785,1078252,1650763,577037,525325 20 | 18,LA,4468,3395,2742677,127883,60900,40849,937,2.29,544329,767993,1300553,527753,464400 21 | 19,ME,1274,1010,847317,46346,22670,14190,169,1.19,174259,224728,487571,162787,153306 22 | 20,MD,5296,4085,2889534,67017,30494,50174,588,1.17,643009,594511,1599413,541915,476674 23 | 21,MA,6349,5008,3122005,74505,35311,52796,433,0.82,644389,2238138,3524344,545690,536063 24 | 22,MI,9938,7628,5822391,256155,121979,97792,1382,1.41,1047898,2136479,2747958,1074219,961800 25 | 23,MN,4919,3783,3154032,271176,132250,52601,625,1.19,595997,697358,1692476,403760,439011 26 | 24,MS,2844,2160,2035655,151701,73498,35536,949,2.67,397597,697252,1039192,428679,365747 27 | 25,MO,5595,4292,3977442,251209,123039,67083,1157,1.72,674002,1006426,1818178,754241,719347 28 | 26,MT,902,701,660133,141978,69567,9882,237,2.4,195390,300018,473807,140430,301755 29 | 27,NE,1711,1315,1188911,188273,92791,18081,276,1.53,307043,383934,744905,241167,224419 30 | 28,NV,1998,1538,1188724,79050,37854,17639,323,1.83,305124,424280,650984,215455,228039 31 | 29,NH,1235,961,759891,31366,15211,12021,126,1.05,136478,189689,387468,137452,148580 32 | 
30,NJ,8414,6545,4748655,78163,36022,67446,731,1.08,525253,1994253,4502639,865079,781862 33 | 31,NM,1819,1370,1285461,124841,59927,22760,430,1.89,238882,463011,1162422,269496,307801 34 | 32,NY,18976,14797,6516320,239035,112783,129057,1458,1.13,1406054,2582541,5306825,1249954,1485648 35 | 33,NC,8049,6291,5088090,209335,99813,89504,1472,1.64,1054849,1464209,2621330,918638,825844 36 | 34,ND,642,502,483722,175349,86609,7217,86,1.19,102201,180072,384538,101377,194296 37 | 35,OH,11353,8790,6570881,248722,116964,105898,1351,1.28,1484302,1650422,3350560,1158013,1006181 38 | 36,OK,3450,2666,2478132,232710,112634,43355,652,1.5,414272,809152,1417329,500974,446540 39 | 37,OR,3421,2673,1919249,136866,66902,35010,451,1.29,385359,357751,1010377,381740,384990 40 | 38,PA,12281,9694,6323548,249169,119642,102337,1520,1.49,1698159,2323646,4516621,1238907,1449850 41 | 39,RI,1048,827,450802,12812,6052,8359,80,0.96,134571,129527,255637,82095,180896 42 | 40,SC,4012,3115,2831976,136123,64921,45538,1065,2.34,467948,502049,970218,554376,483066 43 | 41,SD,754,577,562591,169060,83471,8432,173,2.05,116489,346269,465690,101194,211222 44 | 42,TN,5689,4446,3759136,183640,87419,65732,1306,1.99,777581,836144,1439811,759820,685545 45 | 43,TX,20851,15618,13252841,639853,301035,220064,3769,1.71,2700214,3421427,5664524,2573239,2199108 46 | 44,UT,2233,1599,1333773,87435,41852,22597,373,1.65,314163,691200,1072340,249715,283695 47 | 45,VT,608,479,403551,29359,14273,6811,79,1.16,87255,138578,287124,70411,133812 48 | 46,VA,7078,5529,4575296,152328,70393,74801,930,1.24,774161,1270665,2678129,867264,775292 49 | 47,WA,5894,4553,3180398,167211,80209,53330,632,1.19,725356,704342,1871259,588415,544878 50 | 48,WV,1808,1455,1091359,76671,37277,19242,410,2.13,295148,673882,1170434,220408,329354 51 | 49,WI,5363,4157,3061051,231340,112359,57266,799,1.4,795105,886798,1663266,602560,572783 52 | 50,WY,493,382,590437,56780,27326,8090,152,1.88,100435,270786,395725,151317,228408 53 | 
-------------------------------------------------------------------------------- /notebooks/images/95_ci_driving_cars.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/springcoil/probabilisticprogrammingprimer/5a50472efd95220f2c6eed07c8c1b909b6f958f9/notebooks/images/95_ci_driving_cars.png -------------------------------------------------------------------------------- /notebooks/images/bayes_formula.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/springcoil/probabilisticprogrammingprimer/5a50472efd95220f2c6eed07c8c1b909b6f958f9/notebooks/images/bayes_formula.png -------------------------------------------------------------------------------- /notebooks/images/binary_doubling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/springcoil/probabilisticprogrammingprimer/5a50472efd95220f2c6eed07c8c1b909b6f958f9/notebooks/images/binary_doubling.png -------------------------------------------------------------------------------- /notebooks/images/boxloop.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/springcoil/probabilisticprogrammingprimer/5a50472efd95220f2c6eed07c8c1b909b6f958f9/notebooks/images/boxloop.png -------------------------------------------------------------------------------- /notebooks/images/f.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/springcoil/probabilisticprogrammingprimer/5a50472efd95220f2c6eed07c8c1b909b6f958f9/notebooks/images/f.png -------------------------------------------------------------------------------- /notebooks/images/hmc.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/springcoil/probabilisticprogrammingprimer/5a50472efd95220f2c6eed07c8c1b909b6f958f9/notebooks/images/hmc.png -------------------------------------------------------------------------------- /notebooks/images/nuts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/springcoil/probabilisticprogrammingprimer/5a50472efd95220f2c6eed07c8c1b909b6f958f9/notebooks/images/nuts.png -------------------------------------------------------------------------------- /notebooks/images/rejection_sampling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/springcoil/probabilisticprogrammingprimer/5a50472efd95220f2c6eed07c8c1b909b6f958f9/notebooks/images/rejection_sampling.png -------------------------------------------------------------------------------- /notebooks/mcmc-animate.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/springcoil/probabilisticprogrammingprimer/5a50472efd95220f2c6eed07c8c1b909b6f958f9/notebooks/mcmc-animate.gif -------------------------------------------------------------------------------- /notebooks/mcmc.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/springcoil/probabilisticprogrammingprimer/5a50472efd95220f2c6eed07c8c1b909b6f958f9/notebooks/mcmc.sqlite -------------------------------------------------------------------------------- /notebooks/nuts_and_metropolis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Probabilistic Programming Primer: What's NUTS and MCMC?#\n", 8 | "\n", 9 | "The aim with this chapter is to give a very high level overview of what MCMC is, and why you should care. 
\n", 10 | "\n", 11 | "You'll hear a lot in textbooks or in other documents/documentation about Probabilistic Programming - MCMC, and stuff. \n", 12 | "I can personally remember being very intimidated by this. My aim by the end of this section is to demystify this. " 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [] 21 | } 22 | ], 23 | "metadata": { 24 | "kernelspec": { 25 | "display_name": "Python [conda env:stat-rethink-pymc3]", 26 | "language": "python", 27 | "name": "conda-env-stat-rethink-pymc3-py" 28 | }, 29 | "language_info": { 30 | "codemirror_mode": { 31 | "name": "ipython", 32 | "version": 3 33 | }, 34 | "file_extension": ".py", 35 | "mimetype": "text/x-python", 36 | "name": "python", 37 | "nbconvert_exporter": "python", 38 | "pygments_lexer": "ipython3", 39 | "version": "3.6.4" 40 | } 41 | }, 42 | "nbformat": 4, 43 | "nbformat_minor": 2 44 | } 45 | -------------------------------------------------------------------------------- /notebooks/poisson_dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/springcoil/probabilisticprogrammingprimer/5a50472efd95220f2c6eed07c8c1b909b6f958f9/notebooks/poisson_dag.png -------------------------------------------------------------------------------- /notebooks/trace.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/springcoil/probabilisticprogrammingprimer/5a50472efd95220f2c6eed07c8c1b909b6f958f9/notebooks/trace.sqlite -------------------------------------------------------------------------------- /notebooks/untitled.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/springcoil/probabilisticprogrammingprimer/5a50472efd95220f2c6eed07c8c1b909b6f958f9/notebooks/untitled.txt 
-------------------------------------------------------------------------------- /src/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "/Users/peadarcoyle/miniconda3/envs/ppp/bin/python" 3 | } -------------------------------------------------------------------------------- /src/pip-delete-this-directory.txt: -------------------------------------------------------------------------------- 1 | This file is placed here by pip to indicate the source was put 2 | here by pip. 3 | 4 | Once this package is successfully installed this source code will be 5 | deleted (unless you remove this file). 6 | -------------------------------------------------------------------------------- /src/pyro_example.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | from torch.nn.functional import normalize # noqa: F401 7 | 8 | import pyro 9 | from pyro.distributions import Bernoulli, Normal # noqa: F401 10 | from pyro.infer import SVI, JitTrace_ELBO, Trace_ELBO 11 | from pyro.optim import Adam 12 | 13 | 14 | """ 15 | Bayesian Regression 16 | Learning a function of the form: 17 | y = wx + b 18 | """ 19 | 20 | 21 | # generate toy dataset 22 | def build_linear_dataset(N, p, noise_std=0.01): 23 | X = np.random.rand(N, p) 24 | # use random integer weights from [0, 7] 25 | w = np.random.randint(4, size=p) 26 | print('w = {}'.format(w)) 27 | # set b = 1 28 | y = np.matmul(X, w) + np.repeat(1, N) + np.random.normal(0, noise_std, size=N) 29 | y = y.reshape(N, 1) 30 | X, y = torch.tensor(X), torch.tensor(y) 31 | data = torch.cat((X, y), 1) 32 | assert data.shape == (N, p + 1) 33 | return data 34 | 35 | 36 | # NN with one linear layer 37 | class RegressionModel(nn.Module): 38 | def __init__(self, p): 39 | super(RegressionModel, self).__init__() 40 | self.linear = nn.Linear(p, 1) 41 | 42 | def 
forward(self, x): 43 | # x * w + b 44 | return self.linear(x) 45 | 46 | 47 | N = 1000 # size of toy data 48 | p = 4 # number of features 49 | 50 | softplus = nn.Softplus() 51 | regression_model = RegressionModel(p) 52 | 53 | 54 | def model(data): 55 | # Create unit normal priors over the parameters 56 | loc = data.new_zeros(torch.Size((1, p))) 57 | scale = 2 * data.new_ones(torch.Size((1, p))) 58 | bias_loc = data.new_zeros(torch.Size((1,))) 59 | bias_scale = 2 * data.new_ones(torch.Size((1,))) 60 | w_prior = Normal(loc, scale).independent(1) 61 | b_prior = Normal(bias_loc, bias_scale).independent(1) 62 | priors = {'linear.weight': w_prior, 'linear.bias': b_prior} 63 | # lift module parameters to random variables sampled from the priors 64 | lifted_module = pyro.random_module("module", regression_model, priors) 65 | # sample a regressor (which also samples w and b) 66 | lifted_reg_model = lifted_module() 67 | 68 | with pyro.iarange("map", N, subsample=data): 69 | x_data = data[:, :-1] 70 | y_data = data[:, -1] 71 | # run the regressor forward conditioned on inputs 72 | prediction_mean = lifted_reg_model(x_data).squeeze(-1) 73 | pyro.sample("obs", Normal(prediction_mean, 1), 74 | obs=y_data) 75 | 76 | 77 | def guide(data): 78 | w_loc = torch.randn(1, p, dtype=data.dtype, device=data.device) 79 | w_log_sig = -3 + 0.05 * torch.randn(1, p, dtype=data.dtype, device=data.device) 80 | b_loc = torch.randn(1, dtype=data.dtype, device=data.device) 81 | b_log_sig = -3 + 0.05 * torch.randn(1, dtype=data.dtype, device=data.device) 82 | # register learnable params in the param store 83 | mw_param = pyro.param("guide_mean_weight", w_loc) 84 | sw_param = softplus(pyro.param("guide_log_scale_weight", w_log_sig)) 85 | mb_param = pyro.param("guide_mean_bias", b_loc) 86 | sb_param = softplus(pyro.param("guide_log_scale_bias", b_log_sig)) 87 | # gaussian guide distributions for w and b 88 | w_dist = Normal(mw_param, sw_param).independent(1) 89 | b_dist = Normal(mb_param, 
sb_param).independent(1) 90 | dists = {'linear.weight': w_dist, 'linear.bias': b_dist} 91 | # overloading the parameters in the module with random samples from the guide distributions 92 | lifted_module = pyro.random_module("module", regression_model, dists) 93 | # sample a regressor 94 | return lifted_module() 95 | 96 | 97 | # get array of batch indices 98 | def get_batch_indices(N, batch_size): 99 | all_batches = np.arange(0, N, batch_size) 100 | if all_batches[-1] != N: 101 | all_batches = list(all_batches) + [N] 102 | return all_batches 103 | 104 | 105 | def main(args): 106 | pyro.clear_param_store() 107 | data = build_linear_dataset(N, p) 108 | if args.cuda: 109 | # make tensors and modules CUDA 110 | data = data.cuda() 111 | softplus.cuda() 112 | regression_model.cuda() 113 | 114 | # perform inference 115 | optim = Adam({"lr": 0.05}) 116 | elbo = JitTrace_ELBO() if args.jit else Trace_ELBO() 117 | svi = SVI(model, guide, optim, loss=elbo) 118 | for j in range(args.num_epochs): 119 | if args.batch_size == N: 120 | # use the entire data set 121 | epoch_loss = svi.step(data) 122 | else: 123 | # mini batch 124 | epoch_loss = 0.0 125 | perm = torch.randperm(N) if not args.cuda else torch.randperm(N).cuda() 126 | # shuffle data 127 | data = data[perm] 128 | # get indices of each batch 129 | all_batches = get_batch_indices(N, args.batch_size) 130 | for ix, batch_start in enumerate(all_batches[:-1]): 131 | batch_end = all_batches[ix + 1] 132 | batch_data = data[batch_start: batch_end] 133 | epoch_loss += svi.step(batch_data) 134 | if j % 100 == 0: 135 | print("epoch avg loss {}".format(epoch_loss/float(N))) 136 | 137 | 138 | if __name__ == '__main__': 139 | parser = argparse.ArgumentParser(description="parse args") 140 | parser.add_argument('-n', '--num-epochs', default=1000, type=int) 141 | parser.add_argument('-b', '--batch-size', default=N, type=int) 142 | parser.add_argument('--cuda', action='store_true') 143 | parser.add_argument('--jit', action='store_true') 
144 | args = parser.parse_args() 145 | main(args) -------------------------------------------------------------------------------- /src/pyro_example_blank.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | from torch.nn.functional import normalize # noqa: F401 7 | 8 | import pyro 9 | from pyro.distributions import Bernoulli, Normal # noqa: F401 10 | from pyro.infer import SVI, JitTrace_ELBO, Trace_ELBO 11 | from pyro.optim import Adam 12 | 13 | 14 | """ 15 | Bayesian Regression 16 | Learning a function of the form: 17 | y = wx + b 18 | """ 19 | 20 | 21 | # generate toy dataset 22 | def build_linear_dataset(N, p, noise_std=0.01): 23 | X = np.random.rand(N, p) 24 | # use random integer weights from [0, 7] 25 | w = np.random.randint(4, size=p) 26 | print('w = {}'.format(w)) 27 | # set b = 1 28 | y = np.matmul(X, w) + np.repeat(1, N) + np.random.normal(0, noise_std, size=N) 29 | y = y.reshape(N, 1) 30 | X, y = torch.tensor(X), torch.tensor(y) 31 | data = torch.cat((X, y), 1) 32 | assert data.shape == (N, p + 1) 33 | return data 34 | 35 | 36 | # NN with one linear layer 37 | class RegressionModel(nn.Module): 38 | def __init__(self, p): 39 | super(RegressionModel, self).__init__() 40 | self.linear = nn.Linear(p, 1) 41 | 42 | def forward(self, x): 43 | # x * w + b 44 | return self.linear(x) 45 | 46 | 47 | N = 1000 # size of toy data 48 | p = 4 # number of features 49 | 50 | softplus = nn.Softplus() 51 | regression_model = RegressionModel(p) 52 | 53 | def model(data): 54 | # Create unit normal priors over the parameters 55 | loc = data.new_zeros(torch.Size((1, p))) 56 | scale = 2 * data.new_ones(torch.Size((1, p))) 57 | bias_loc = data.new_zeros(torch.Size((1,))) 58 | bias_scale = 2 * data.new_ones(torch.Size((1,))) 59 | w_prior = Normal(loc, scale).independent(1) 60 | b_prior = Normal(bias_loc, bias_scale).independent(1) 61 | priors = 
{'linear.weight': w_prior, 'linear.bias': b_prior} 62 | # lift module parameters to random variables sampled from the priors 63 | lifted_module = pyro.random_module("module", regression_model, priors) 64 | # sample a regressor (which also samples w and b) 65 | lifted_reg_model = lifted_module() 66 | 67 | with pyro.iarange("map", N, subsample=data): 68 | x_data = data[:, :-1] 69 | y_data = data[:, -1] 70 | # run the regressor forward conditioned on inputs 71 | prediction_mean = lifted_reg_model(x_data).squeeze(-1) 72 | pyro.sample("obs", Normal(prediction_mean, 1), 73 | obs=y_data) 74 | 75 | def guide(data): 76 | """ 77 | In order to do inference we’re going to need a guide, i.e. a parameterized family of 78 | distributions over w and b. Writing down a guide will proceed in close analogy to the construction of our model, 79 | with the key difference that the guide parameters need to be trainable. 80 | """ 81 | w_loc = torch.randn(1, p, dtype=data.dtype, device=data.device) 82 | w_log_sig = -3 + 0.05 * torch.randn(1, p, dtype=data.dtype, device=data.device) 83 | b_loc = torch.randn(1, dtype=data.dtype, device=data.device) 84 | b_log_sig = -3 + 0.05 * torch.randn(1, dtype=data.dtype, device=data.device) 85 | # register learnable params in the param store 86 | mw_param = pyro.param("guide_mean_weight", w_loc) 87 | sw_param = softplus(pyro.param("guide_log_scale_weight", w_log_sig)) 88 | mb_param = pyro.param("guide_mean_bias", b_loc) 89 | sb_param = softplus(pyro.param("guide_log_scale_bias", b_log_sig)) 90 | # gaussian guide distributions for w and b 91 | w_dist = Normal(mw_param, sw_param).independent(1) 92 | b_dist = Normal(mb_param, sb_param).independent(1) 93 | dists = {'linear.weight': w_dist, 'linear.bias': b_dist} 94 | # overloading the parameters in the module with random samples from the guide distributions 95 | lifted_module = pyro.random_module("module", regression_model, dists) 96 | # sample a regressor 97 | return lifted_module() 98 | 99 | # get array of 
batch indices 100 | def get_batch_indices(N, batch_size): 101 | all_batches = np.arange(0, N, batch_size) 102 | if all_batches[-1] != N: 103 | all_batches = list(all_batches) + [N] 104 | return all_batches 105 | 106 | def main(args): 107 | pyro.clear_param_store() 108 | data = build_linear_dataset(N, p) 109 | if args.cuda: 110 | # make tensors and modules CUDA 111 | data = data.cuda() 112 | softplus.cuda() 113 | regression_model.cuda() 114 | 115 | # perform inference 116 | optim = Adam({"lr": 0.05}) 117 | elbo = JitTrace_ELBO() if args.jit else Trace_ELBO() 118 | svi = SVI(model, guide, optim, loss=elbo) 119 | for j in range(args.num_epochs): 120 | if args.batch_size == N: 121 | # use the entire data set 122 | epoch_loss = svi.step(data) 123 | else: 124 | # mini batch 125 | epoch_loss = 0.0 126 | perm = torch.randperm(N) if not args.cuda else torch.randperm(N).cuda() 127 | # shuffle data 128 | data = data[perm] 129 | # get indices of each batch 130 | all_batches = get_batch_indices(N, args.batch_size) 131 | for ix, batch_start in enumerate(all_batches[:-1]): 132 | batch_end = all_batches[ix + 1] 133 | batch_data = data[batch_start: batch_end] 134 | epoch_loss += svi.step(batch_data) 135 | if j % 100 == 0: 136 | print("epoch avg loss {}".format(epoch_loss/float(N))) 137 | 138 | 139 | if __name__ == '__main__': 140 | parser = argparse.ArgumentParser(description="parse args") 141 | parser.add_argument('-n', '--num-epochs', default=1000, type=int) 142 | parser.add_argument('-b', '--batch-size', default=N, type=int) 143 | parser.add_argument('--cuda', action='store_true') 144 | parser.add_argument('--jit', action='store_true') 145 | args = parser.parse_args() 146 | main(args) 147 | --------------------------------------------------------------------------------