├── .gitignore ├── ds-102-book ├── content │ ├── chapters │ │ ├── 01 │ │ │ ├── .gitignore │ │ │ ├── fdp.png │ │ │ ├── row_wise_rates.png │ │ │ ├── nhst_outcomes_diagram.png │ │ │ ├── intro.md │ │ │ ├── p_values.csv │ │ │ ├── 00_figure_data_generation.ipynb │ │ │ └── 05_decision_theory.ipynb │ │ ├── 04 │ │ │ ├── data │ │ │ │ ├── .gitignore │ │ │ │ └── bp_treatment.csv │ │ │ ├── causal_graph1.png │ │ │ ├── causal_graph2.png │ │ │ ├── figures │ │ │ │ ├── rain.png │ │ │ │ ├── berkson.png │ │ │ │ ├── spurious.png │ │ │ │ ├── instagram.png │ │ │ │ ├── restaurant.png │ │ │ │ ├── restaurants.png │ │ │ │ ├── sunburn_icecream.png │ │ │ │ └── mozzarella_civile_phd.png │ │ │ ├── intro.md │ │ │ ├── 02_quantifying_association.ipynb │ │ │ ├── 04_randomized_experiments.ipynb │ │ │ └── 06_instrumental_variables.ipynb │ │ ├── 02 │ │ │ ├── chain_gm.png │ │ │ ├── collider_gm.png │ │ │ ├── heights_gm.png │ │ │ ├── review_model.png │ │ │ ├── explorer-graph.png │ │ │ ├── kc_hierarchical.png │ │ │ ├── review_model_simple.png │ │ │ ├── intro.md │ │ │ ├── 03_graphical_models.ipynb │ │ │ └── 04_inference.ipynb │ │ ├── 03 │ │ │ ├── tree_small2.jpeg │ │ │ ├── backprop_filled.jpg │ │ │ ├── backprop_forward.jpg │ │ │ ├── intro.md │ │ │ ├── figures │ │ │ │ └── credible_interval_comparison.png │ │ │ ├── 01_prediction.ipynb │ │ │ ├── turbines.csv │ │ │ ├── 02_regression_review.ipynb │ │ │ └── 00_figure_generation.ipynb │ │ └── 05 │ │ │ └── intro.md │ ├── LICENSE.md │ └── intro.md └── myst.yml ├── logo.png ├── favicon.ico ├── requirements.txt ├── README.md ├── .github └── workflows │ ├── a11y.yml │ └── deploy-jb2.yml ├── CONTRIBUTING.md ├── CONDUCT.md └── LICENSE /.gitignore: -------------------------------------------------------------------------------- 1 | _build 2 | .ipynb_checkpoints 3 | .DS_Store 4 | -------------------------------------------------------------------------------- /ds-102-book/content/chapters/01/.gitignore: -------------------------------------------------------------------------------- 1 | manufacturing.csv 2 | -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ds-102/ds-102-book/HEAD/logo.png -------------------------------------------------------------------------------- /favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ds-102/ds-102-book/HEAD/favicon.ico -------------------------------------------------------------------------------- /ds-102-book/content/chapters/04/data/.gitignore: -------------------------------------------------------------------------------- 1 | restaurants.csv 2 | restaurants_counterfactuals.csv 3 | -------------------------------------------------------------------------------- /ds-102-book/content/chapters/01/fdp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ds-102/ds-102-book/HEAD/ds-102-book/content/chapters/01/fdp.png -------------------------------------------------------------------------------- /ds-102-book/content/chapters/02/chain_gm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ds-102/ds-102-book/HEAD/ds-102-book/content/chapters/02/chain_gm.png -------------------------------------------------------------------------------- /ds-102-book/content/chapters/02/collider_gm.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ds-102/ds-102-book/HEAD/ds-102-book/content/chapters/02/collider_gm.png -------------------------------------------------------------------------------- /ds-102-book/content/chapters/02/heights_gm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ds-102/ds-102-book/HEAD/ds-102-book/content/chapters/02/heights_gm.png -------------------------------------------------------------------------------- /ds-102-book/content/chapters/02/review_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ds-102/ds-102-book/HEAD/ds-102-book/content/chapters/02/review_model.png -------------------------------------------------------------------------------- /ds-102-book/content/chapters/03/tree_small2.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ds-102/ds-102-book/HEAD/ds-102-book/content/chapters/03/tree_small2.jpeg -------------------------------------------------------------------------------- /ds-102-book/content/chapters/04/causal_graph1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ds-102/ds-102-book/HEAD/ds-102-book/content/chapters/04/causal_graph1.png -------------------------------------------------------------------------------- /ds-102-book/content/chapters/04/causal_graph2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ds-102/ds-102-book/HEAD/ds-102-book/content/chapters/04/causal_graph2.png -------------------------------------------------------------------------------- /ds-102-book/content/chapters/04/figures/rain.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ds-102/ds-102-book/HEAD/ds-102-book/content/chapters/04/figures/rain.png -------------------------------------------------------------------------------- /ds-102-book/content/chapters/01/row_wise_rates.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ds-102/ds-102-book/HEAD/ds-102-book/content/chapters/01/row_wise_rates.png -------------------------------------------------------------------------------- /ds-102-book/content/chapters/02/explorer-graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ds-102/ds-102-book/HEAD/ds-102-book/content/chapters/02/explorer-graph.png -------------------------------------------------------------------------------- /ds-102-book/content/chapters/02/kc_hierarchical.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ds-102/ds-102-book/HEAD/ds-102-book/content/chapters/02/kc_hierarchical.png -------------------------------------------------------------------------------- /ds-102-book/content/chapters/03/backprop_filled.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ds-102/ds-102-book/HEAD/ds-102-book/content/chapters/03/backprop_filled.jpg -------------------------------------------------------------------------------- /ds-102-book/content/chapters/03/backprop_forward.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ds-102/ds-102-book/HEAD/ds-102-book/content/chapters/03/backprop_forward.jpg -------------------------------------------------------------------------------- /ds-102-book/content/chapters/04/figures/berkson.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ds-102/ds-102-book/HEAD/ds-102-book/content/chapters/04/figures/berkson.png -------------------------------------------------------------------------------- /ds-102-book/content/chapters/04/figures/spurious.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ds-102/ds-102-book/HEAD/ds-102-book/content/chapters/04/figures/spurious.png -------------------------------------------------------------------------------- /ds-102-book/content/chapters/04/figures/instagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ds-102/ds-102-book/HEAD/ds-102-book/content/chapters/04/figures/instagram.png -------------------------------------------------------------------------------- /ds-102-book/content/chapters/04/figures/restaurant.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ds-102/ds-102-book/HEAD/ds-102-book/content/chapters/04/figures/restaurant.png -------------------------------------------------------------------------------- /ds-102-book/content/chapters/01/nhst_outcomes_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ds-102/ds-102-book/HEAD/ds-102-book/content/chapters/01/nhst_outcomes_diagram.png -------------------------------------------------------------------------------- /ds-102-book/content/chapters/02/review_model_simple.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ds-102/ds-102-book/HEAD/ds-102-book/content/chapters/02/review_model_simple.png -------------------------------------------------------------------------------- /ds-102-book/content/chapters/04/figures/restaurants.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ds-102/ds-102-book/HEAD/ds-102-book/content/chapters/04/figures/restaurants.png -------------------------------------------------------------------------------- /ds-102-book/content/chapters/03/intro.md: -------------------------------------------------------------------------------- 1 | # Chapter 3: Prediction 2 | 3 | This chapter will cover prediction with generalized linear models and nonparametric models. 4 | -------------------------------------------------------------------------------- /ds-102-book/content/chapters/05/intro.md: -------------------------------------------------------------------------------- 1 | # Chapter 5: Tail Bounds and Concentration Inequalities 2 | 3 | This chapter covers tail bounds and concentration inequalities. 
4 | -------------------------------------------------------------------------------- /ds-102-book/content/chapters/04/figures/sunburn_icecream.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ds-102/ds-102-book/HEAD/ds-102-book/content/chapters/04/figures/sunburn_icecream.png -------------------------------------------------------------------------------- /ds-102-book/content/chapters/04/figures/mozzarella_civile_phd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ds-102/ds-102-book/HEAD/ds-102-book/content/chapters/04/figures/mozzarella_civile_phd.png -------------------------------------------------------------------------------- /ds-102-book/content/chapters/01/intro.md: -------------------------------------------------------------------------------- 1 | # Chapter 1: Decisions and Hypothesis Testing 2 | 3 | *This chapter is a work in progress.* 4 | 5 | We start by looking at binary decision-making. 6 | -------------------------------------------------------------------------------- /ds-102-book/content/chapters/04/intro.md: -------------------------------------------------------------------------------- 1 | # Chapter 4: Causal Inference 2 | 3 | *This chapter is a work in progress.* 4 | 5 | This chapter will cover the fundamentals of causal inference. 6 | -------------------------------------------------------------------------------- /ds-102-book/content/chapters/02/intro.md: -------------------------------------------------------------------------------- 1 | # Chapter 2: Bayesian Inference 2 | 3 | *This chapter is a work in progress.* 4 | 5 | This chapter will cover the fundamentals of Bayesian inference. 6 | -------------------------------------------------------------------------------- /ds-102-book/content/chapters/03/figures/credible_interval_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ds-102/ds-102-book/HEAD/ds-102-book/content/chapters/03/figures/credible_interval_comparison.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter-book>=2 2 | matplotlib 3 | numpy 4 | seaborn 5 | pandas 6 | ghp-import 7 | scipy 8 | scikit-learn 9 | statsmodels 10 | arviz 11 | bambi 12 | torch 13 | pymc 14 | daft-pgm 15 | -------------------------------------------------------------------------------- /ds-102-book/content/LICENSE.md: -------------------------------------------------------------------------------- 1 | # License for this book 2 | 3 | All content in this book (i.e., any files and content in the `content/` folder) 4 | is licensed under the [Creative Commons Attribution-ShareAlike 4.0 International](https://creativecommons.org/licenses/by-sa/4.0/) 5 | (CC BY-SA 4.0) license.
-------------------------------------------------------------------------------- /ds-102-book/content/chapters/04/data/bp_treatment.csv: -------------------------------------------------------------------------------- 1 | treated,is_old,bp_change 2 | 0,0,-0.2 3 | 0,0,-3.4 4 | 0,0,-4.0 5 | 0,0,-5.6 6 | 0,0,-2.9 7 | 0,0,-3.8 8 | 0,0,-4.7 9 | 0,1,-0.7 10 | 0,1,-0.9 11 | 1,0,-5.2 12 | 1,0,-4.8 13 | 1,0,-5.9 14 | 1,0,-6.1 15 | 1,1,-1.1 16 | 1,1,-1.5 17 | 1,1,-2.1 18 | 1,1,-0.8 19 | 1,1,-1.7 20 | 1,1,-4.9 21 | 1,1,-5.4 22 | -------------------------------------------------------------------------------- /ds-102-book/content/intro.md: -------------------------------------------------------------------------------- 1 | # Data, Inference, and Decisions 2 | 3 | This is a draft textbook for Data 102, created as a supplement to other course material. 4 | 5 | This work is licensed under [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/deed.en). 6 | 7 | ## Acknowledgements 8 | 9 | This course and its content would not be possible without a large team of 10 | faculty and students who developed the material and refined it over the course 11 | of many semesters. 12 | 13 | Jupyter Book was originally created by Sam Lau and Chris Holdgraf 14 | with the support of the **UC Berkeley Data Science Education Program and the Berkeley 15 | Institute for Data Science**. 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data, Inference, and Decisions 2 | 3 | [![Jupyter Book (via myst) GitHub Pages Deploy](https://github.com/ds-102/ds-102-book/actions/workflows/deploy-jb2.yml/badge.svg)](https://github.com/ds-102/ds-102-book/actions/workflows/deploy-jb2.yml) [![Accessibility Checks](https://github.com/ds-102/ds-102-book/actions/workflows/a11y.yml/badge.svg)](https://github.com/ds-102/ds-102-book/actions/workflows/a11y.yml) 4 | 5 | Textbook for Berkeley's Data 102 course 6 | 7 | ## Usage 8 | 9 | ### Building the book 10 | 11 | If you'd like to develop and/or build the Data, Inference, and Decisions book, you should: 12 | 13 | 1. Clone this repository 14 | 2. Run `pip install -r requirements.txt` (it is recommended you do this within a virtual environment) 15 | 3. (Optional) Edit the book's source files located in the `ds-102-book/` directory 16 | 4. Run `jupyter-book start`. A server will start and you can view the book in your browser. Updates to the source files will update the book in real time. 17 | 18 | ### Hosting the book 19 | 20 | The book is deployed using GitHub Pages. 21 | 22 | ## Contributors 23 | 24 | We welcome and recognize all contributions. You can see a list of current contributors in the [contributors tab](https://github.com/rameshvs/ds-102-book/graphs/contributors). 25 | 26 | ## Credits 27 | 28 | This project is created using the excellent open source [Jupyter Book project](https://jupyterbook.org/) and the [executablebooks/cookiecutter-jupyter-book template](https://github.com/executablebooks/cookiecutter-jupyter-book).
29 | -------------------------------------------------------------------------------- /.github/workflows/a11y.yml: -------------------------------------------------------------------------------- 1 | name: Accessibility Checks 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | 9 | jobs: 10 | axe-check: 11 | defaults: 12 | run: 13 | working-directory: ds-102-book 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Checkout code 17 | uses: actions/checkout@v4 18 | 19 | # 1. Install Jupyter Book & Build 20 | - name: Install Jupyter Book (via myst) 21 | run: npm install -g jupyter-book 22 | 23 | - name: Build Jupyter Book 24 | run: jupyter-book build --html 25 | 26 | # 2. Setup Node environment for Axe 27 | - name: Setup Node.js 28 | uses: actions/setup-node@v4 29 | with: 30 | node-version: '20' 31 | 32 | - name: Install Axe CLI and HTTP Server 33 | run: npm install -g @axe-core/cli http-server 34 | 35 | # 3. Serve, Discover URLs, and Run Axe 36 | - name: Run Axe Checks 37 | run: | 38 | # A. Start a local server in the background 39 | npx http-server ./_build/html -p 8080 -s & 40 | sleep 5 41 | 42 | # B. Generate the URL list from the actual build artifacts 43 | cd _build/html 44 | URLS=$(find . -name "*.html" -not -path "*/build/*" | sed 's|^\./||' | sed 's|^|http://localhost:8080/|' | tr '\n' ' ') 45 | 46 | # C. Run Axe CLI on the discovered URLs 47 | axe $URLS \ 48 | --tags wcag2a,wcag2aa,wcag21a,wcag21aa \ 49 | --save axe-report.json \ 50 | --exit 51 | 52 | - name: Upload Accessibility Report 53 | if: always() 54 | uses: actions/upload-artifact@v4 55 | with: 56 | name: axe-report 57 | path: _build/html/axe-report.json -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Contributions are welcome, and they are greatly appreciated! Every little bit 4 | helps, and credit will always be given. You can contribute in the ways listed below. 5 | 6 | ## Report Bugs 7 | 8 | Report bugs using GitHub issues. 9 | 10 | If you are reporting a bug, please include: 11 | 12 | * Your operating system name and version. 13 | * Any details about your local setup that might be helpful in troubleshooting. 14 | * Detailed steps to reproduce the bug. 15 | 16 | ## Fix Bugs 17 | 18 | Look through the GitHub issues for bugs. Anything tagged with "bug" and "help 19 | wanted" is open to whoever wants to implement it. 20 | 21 | ## Implement Features 22 | 23 | Look through the GitHub issues for features. Anything tagged with "enhancement" 24 | and "help wanted" is open to whoever wants to implement it. 25 | 26 | ## Write Documentation 27 | 28 | Data, Inference, and Decisions could always use more documentation, whether as part of the 29 | official Data, Inference, and Decisions docs, in docstrings, or even on the web in blog posts, 30 | articles, and such. 31 | 32 | ## Submit Feedback 33 | 34 | The best way to send feedback is to file an issue on GitHub. 35 | 36 | If you are proposing a feature: 37 | 38 | * Explain in detail how it would work. 39 | * Keep the scope as narrow as possible, to make it easier to implement. 40 | * Remember that this is a volunteer-driven project, and that contributions 41 | are welcome :) 42 | 43 | ## Get Started 44 | 45 | Ready to contribute? Here's how to set up `Data, Inference, and Decisions` for local development. 46 | 47 | 1. Fork the repo on GitHub. 48 | 2. Clone your fork locally. 49 | 3. 
Install your local copy into a virtualenv, e.g., using `conda`. 50 | 4. Create a branch for local development and make changes locally. 51 | 5. Commit your changes and push your branch to GitHub. 52 | 6. Submit a pull request through the GitHub website. 53 | 54 | ## Code of Conduct 55 | 56 | Please note that the Data, Inference, and Decisions project is released with a [Contributor Code of Conduct](CONDUCT.md). By contributing to this project you agree to abide by its terms. 57 | -------------------------------------------------------------------------------- /.github/workflows/deploy-jb2.yml: -------------------------------------------------------------------------------- 1 | # This file was created automatically with `jupyter-book init --gh-pages` 🪄 💚 2 | # Ensure your GitHub Pages settings for this repository are set to deploy with **GitHub Actions**. 3 | 4 | name: Jupyter Book (via myst) GitHub Pages Deploy 5 | on: 6 | push: 7 | # Runs on pushes targeting the default branch 8 | branches: [main] 9 | env: 10 | # `BASE_URL` determines, relative to the root of the domain, the URL that your site is served from. 11 | # E.g., if your site lives at `https://mydomain.org/myproject`, set `BASE_URL=/myproject`. 12 | # If, instead, your site lives at the root of the domain, at `https://mydomain.org`, set `BASE_URL=''`. 13 | BASE_URL: /${{ github.event.repository.name }} 14 | 15 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 16 | permissions: 17 | contents: read 18 | pages: write 19 | id-token: write 20 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. 21 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 22 | concurrency: 23 | group: 'pages' 24 | cancel-in-progress: false 25 | jobs: 26 | deploy: 27 | environment: 28 | name: github-pages 29 | url: ${{ steps.deployment.outputs.page_url }} 30 | defaults: 31 | run: 32 | working-directory: ds-102-book 33 | runs-on: ubuntu-latest 34 | steps: 35 | - uses: actions/checkout@v4 36 | - name: Setup Pages 37 | uses: actions/configure-pages@v3 38 | - uses: actions/setup-node@v4 39 | with: 40 | node-version: 18.x 41 | - name: Install Jupyter Book (via myst) 42 | run: npm install -g jupyter-book 43 | - name: Build HTML Assets 44 | run: jupyter-book build --html 45 | - name: Install uv 46 | uses: astral-sh/setup-uv@v4 47 | - name: Generate redirects 48 | run: | 49 | uv run https://raw.githubusercontent.com/pancakereport/jb1-redirect-generator/main/generate_redirects.py \ 50 | --base-url https://data102.org/ds-102-book/ 51 | - name: Upload artifact 52 | uses: actions/upload-pages-artifact@v3 53 | with: 54 | path: 'ds-102-book/_build/html' 55 | - name: Deploy to GitHub Pages 56 | id: deployment 57 | uses: actions/deploy-pages@v4 58 | -------------------------------------------------------------------------------- /ds-102-book/content/chapters/01/p_values.csv: -------------------------------------------------------------------------------- 1 | pvalue,is_alternative 2 | 0.0401306148585278,0 3 | 0.0004355184345501,1 4 | 0.2511288640204292,0 5 | 0.3813993774219155,0 6 | 0.6210165696994836,0 7 | 0.0005127250338293,1 8 | 0.6165744277674163,0 9 | 0.6768423869141591,0 10 | 0.4044883638113907,1 11 | 0.1077482075264597,0 12 | 0.4710338282425249,0 13 | 0.5457053191960356,0 14 | 0.2763602731106821,0 15 | 0.980498076130198,0 16 | 0.7589065052540647,0 17 | 0.3111174861403785,0 18 | 0.00247254407267,0 19 | 0.0532053336064183,1 20 | 
0.5433843669450287,0 21 | 0.952673922523798,0 22 | 7.550189628613424e-06,1 23 | 0.8653087584574276,0 24 | 0.330297514871708,0 25 | 0.6172250817714626,0 26 | 0.7705454057682809,0 27 | 0.0131877140899846,1 28 | 0.0856505901585873,1 29 | 0.0109349511963193,0 30 | 0.2805910933597149,0 31 | 0.145970694854551,1 32 | 0.6801005553052915,0 33 | 0.9865424447328072,0 34 | 0.7564030269818318,0 35 | 0.0049767281889351,1 36 | 0.2639250832525113,0 37 | 0.4939866986615047,0 38 | 0.1538669217731585,1 39 | 0.6576489278222185,0 40 | 0.4609660577396227,0 41 | 0.0077329105599599,1 42 | 0.0164309426989428,0 43 | 0.003677967706878,1 44 | 0.9820762182755142,0 45 | 0.0001090103228518,0 46 | 0.0808665427268469,1 47 | 0.0258760314808369,0 48 | 0.040958614185707,1 49 | 0.7267304639800592,0 50 | 0.0044704472587663,1 51 | 0.9788434120576012,0 52 | 0.0589638765273701,0 53 | 0.1801317071156826,1 54 | 0.209103321587016,0 55 | 0.4316202288496276,0 56 | 0.4377729332554143,0 57 | 0.2082360782325001,0 58 | 0.0001622446758947,1 59 | 0.0014461999402815,1 60 | 0.0107485462272249,0 61 | 0.1565349917315902,0 62 | 0.3213676413785715,0 63 | 0.0151140956750291,0 64 | 0.0356138828795129,1 65 | 0.1198971670159513,0 66 | 0.0038420256459548,0 67 | 0.0249309668737955,1 68 | 0.0011302480548279,0 69 | 0.7348576719789774,0 70 | 0.4597694650209541,0 71 | 0.0323421362825291,1 72 | 0.0303796996968643,1 73 | 0.2863887681519448,0 74 | 0.106062517807776,0 75 | 0.0431440058187606,0 76 | 0.0187665046373697,0 77 | 0.1069968191038566,0 78 | 0.0133034910397425,0 79 | 0.0203154651716473,0 80 | 0.2143255807347974,0 81 | 0.0213534944802327,0 82 | 0.1463456818020398,0 83 | 0.2124191471200944,1 84 | 0.0002187552446574,1 85 | 0.6470213915215514,0 86 | 0.2264100954870737,1 87 | 0.0077013107169201,1 88 | 0.008767611909691,1 89 | 0.6980873865819376,0 90 | 0.5487033948153001,0 91 | 0.9314022049652886,0 92 | 0.4017864945592472,0 93 | 0.3929117898647605,0 94 | 0.001127760747436,1 95 | 0.0333022721906001,0 96 | 0.1554242085820154,0 97 | 0.8479260583634631,0 98 | 0.2664349234201569,1 99 | 0.8221850764119517,0 100 | 0.1184454542842929,1 101 | 0.5144673939015094,0 102 | -------------------------------------------------------------------------------- /CONDUCT.md: -------------------------------------------------------------------------------- 1 | 2 | # Code of Conduct 3 | 4 | ## Our Pledge 5 | 6 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 
7 | 8 | ## Our Standards 9 | 10 | Examples of behavior that contributes to creating a positive environment include: 11 | 12 | * Using welcoming and inclusive language 13 | * Being respectful of differing viewpoints and experiences 14 | * Gracefully accepting constructive criticism 15 | * Focusing on what is best for the community 16 | * Showing empathy towards other community members 17 | 18 | Examples of unacceptable behavior by participants include: 19 | 20 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 21 | * Trolling, insulting/derogatory comments, and personal or political attacks 22 | * Public or private harassment 23 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 24 | * Other conduct which could reasonably be considered inappropriate in a professional setting 25 | 26 | ## Our Responsibilities 27 | 28 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 29 | 30 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 31 | 32 | ## Scope 33 | 34 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 35 | 36 | ## Enforcement 37 | 38 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 39 | 40 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 41 | 42 | ## Attribution 43 | 44 | This Code of Conduct is adapted from the [Contributor Covenant, version 1.4](http://contributor-covenant.org/version/1/4). 
45 | -------------------------------------------------------------------------------- /ds-102-book/myst.yml: -------------------------------------------------------------------------------- 1 | version: 1 2 | project: 3 | title: Data, Inference, and Decisions 4 | authors: 5 | - name: Data 102 Staff 6 | github: ds-102/ds-102-book 7 | license: CC-BY-SA-4.0 8 | settings: 9 | output_matplotlib_strings: remove 10 | toc: 11 | - file: content/intro.md 12 | - file: content/chapters/01/intro.md 13 | children: 14 | - file: content/chapters/01/01_decisions_and_errors.ipynb 15 | title: Binary Decision-Making and Error Rates 16 | - file: content/chapters/01/02_hypothesis_testing.ipynb 17 | title: Hypothesis Testing and p-Values 18 | - file: content/chapters/01/03_multiple_tests.ipynb 19 | title: Multiple Hypothesis Testing 20 | - file: content/chapters/01/04_binary_classification.ipynb 21 | title: Binary Classification 22 | - file: content/chapters/01/05_decision_theory.ipynb 23 | title: Decision Theory 24 | - file: content/chapters/02/intro.md 25 | children: 26 | - file: content/chapters/02/01_parameter_estimation.ipynb 27 | title: Parameter Estimation and Bayesian Inference Fundamentals 28 | - file: content/chapters/02/02_hierarchical_models.ipynb 29 | title: Hierarchical Bayesian Models 30 | - file: content/chapters/02/03_graphical_models.ipynb 31 | title: Graphical Models 32 | - file: content/chapters/02/04_inference.ipynb 33 | title: Bayesian Inference 34 | - file: content/chapters/02/05_inference_with_sampling.ipynb 35 | title: Bayesian Inference with Sampling 36 | - file: content/chapters/03/intro.md 37 | children: 38 | - file: content/chapters/03/01_prediction.ipynb 39 | title: Prediction 40 | - file: content/chapters/03/02_regression_review.ipynb 41 | title: Linear Regression Review 42 | - file: content/chapters/03/03_glms.ipynb 43 | title: Generalized Linear Models 44 | - file: content/chapters/03/04_model_checking.ipynb 45 | title: Model Checking 46 | - file: content/chapters/03/05_uncertainty_quantification.ipynb 47 | title: Uncertainty Quantification 48 | - file: content/chapters/03/06_nonparametric.ipynb 49 | title: Nonparametric Methods 50 | - file: content/chapters/03/07_neural_networks.ipynb 51 | title: Neural Networks 52 | - file: content/chapters/04/intro.md 53 | children: 54 | - file: content/chapters/04/01_association_correlation_causation.ipynb 55 | title: Understanding Association 56 | - file: content/chapters/04/02_quantifying_association.ipynb 57 | title: Quantifying Association 58 | - file: content/chapters/04/03_causality_potential_outcomes.ipynb 59 | title: Causality and Potential Outcomes 60 | - file: content/chapters/04/04_randomized_experiments.ipynb 61 | title: Causality in Randomized Experiments 62 | - file: content/chapters/04/05_observational_studies_unconfoundedness.ipynb 63 | title: 'Causality in Observational Studies: Unconfoundedness' 64 | - file: content/chapters/04/06_instrumental_variables.ipynb 65 | title: 'Causality in Observational Studies: Natural Experiments' 66 | - file: content/chapters/05/intro.md 67 | children: 68 | - file: content/chapters/05/01_concentration.ipynb 69 | title: Tail Bounds and Concentration Inequalities 70 | site: 71 | options: 72 | folders: true 73 | hide_authors: true 74 | fav_icon: ../favicon.ico 75 | logo: ../logo.png 76 | logo_alt: "Data 102: Data, Inference, and Decisions Text " 77 | template: book-theme 78 | -------------------------------------------------------------------------------- 
/ds-102-book/content/chapters/03/01_prediction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "1f4c4491", 6 | "metadata": {}, 7 | "source": [ 8 | "# Prediction" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "b7bf6c8b", 14 | "metadata": {}, 15 | "source": [ 16 | "*You may find it helpful to review [Chapter 15 of the Data 8 textbook](https://inferentialthinking.com/chapters/15/Prediction.html), which looks at prediction using linear regression and $k$-nearest neighbors.*\n", 17 | "\n", 18 | "In many cases, our goal is **prediction**: to predict one variable from several others. We'll usually call the prediction variable $y$, and the other variables $x$. Here are a few examples of prediction in real-world settings:\n", 19 | "\n", 20 | "* A subscription-based company might be interested in predicting whether a certain customer will renew their subscription next month ($y$), using demographic information, historical usage data, and general market trends ($x$). This information will help them make important business decisions on whether to send promotions to that customer, or by aggregating many predictions, to estimate how much revenue the company will bring in over the next month.\n", 21 | "* A YouTube content producer might be interested in predicting the number of views on their next video ($y$), using information about their YouTube channel (follower count, etc.) and information about the video itself (length, number of guest stars, production cost, etc.) ($x$). This information could be useful when looking for advertisers who want to understand how many people will see the ad if they watch the video.\n", 22 | "\n", 23 | "In each of these examples, we can see that we have multiple predictors $x$ (usually we write these as $x_1, x_2, \\ldots, x_d$, or just as a vector $x$), and we're predicting a single target variable $y$ (usually scalar-valued). The predictions can be binary, otherwise discrete, or continuous.\n", 24 | "\n", 25 | "Here's the general framework that we'll use: we'll start by assuming that we have some pairs of known examples, $(x_1, y_1), (x_2, y_2), \\ldots, (x_n, y_n)$, where each pair contains a vector $x_i$ of predictors for data point $i$, and a known value of the prediction target for data point $i$. In the examples above, this might be historical information on customers who did and didn't renew in the past, or data on previously released videos with known view counts and channel information at the time of release. We'll use these points to learn a relationship between $x$ and $y$, and then apply what we learned to new points $x_{n+1}, x_{n+2}, \\ldots$. Any points we use to learn that relationship are referred to as the training set.\n", 26 | "\n", 27 | "You've already seen this pattern used several times before:\n", 28 | "* ...in linear regression, where we **fit** or **train** a linear model using $(x_1, y_1), \\ldots, (x_n, y_n)$ and then use the learned coefficients to make predictions on new data points.\n", 29 | "* ...in $k$-nearest neighbors classification, where we store the entire training set, and then when classifying (i.e., predicting classification labels for) new points, we find the $k$ closest points in the training set, and then use a majority vote of their labels as our prediction.\n", 30 | "\n", 31 | "Note that prediction is not necessarily the same as causality! 
A YouTuber might find that the view count on their most recent video is a strong predictor of the view count for their next video, but one doesn't necessarily have a causal relationship to the other. This doesn't mean it's any less useful for prediction, though. So, we'll limit ourselves to the world of making and understanding predictions, and avoid drawing any conclusions about causality or lack thereof for now. In the next chapter, we'll build a better understanding of the distinction between the two, and discover when we can in fact reason about causality." 32 | ] 33 | } 34 | ], 35 | "metadata": { 36 | "kernelspec": { 37 | "display_name": "Python 3 (ipykernel)", 38 | "language": "python", 39 | "name": "python3" 40 | }, 41 | "language_info": { 42 | "codemirror_mode": { 43 | "name": "ipython", 44 | "version": 3 45 | }, 46 | "file_extension": ".py", 47 | "mimetype": "text/x-python", 48 | "name": "python", 49 | "nbconvert_exporter": "python", 50 | "pygments_lexer": "ipython3", 51 | "version": "3.11.5" 52 | } 53 | }, 54 | "nbformat": 4, 55 | "nbformat_minor": 5 56 | } 57 | -------------------------------------------------------------------------------- /ds-102-book/content/chapters/04/02_quantifying_association.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "latest-detail", 6 | "metadata": {}, 7 | "source": [ 8 | "# Quantifying Association" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "seasonal-pattern", 14 | "metadata": {}, 15 | "source": [ 16 | "We'll start our journey into causal inference by looking at several ways of quantifying association.\n", 17 | "\n", 18 | "Throughout, we'll assume we have two random variables, $Z$ and $Y$, and sometimes a third variable $X$. Although most of the methods we'll describe can be used regardless of how we interpret them, it will be helpful when we move to causality to think of $Z$ as either a treatment or covariate, to think of $Y$ as an outcome, and to think of $X$ as a potential confounding variable." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "infrared-grace", 24 | "metadata": {}, 25 | "source": [ 26 | "## Continuous Data: Correlation and Regression\n", 27 | "\n", 28 | "### Correlation coefficient\n", 29 | "\n", 30 | "There are a few different ways to measure correlation. The most common is the **Pearson correlation** (also called the correlation coefficient), usually denoted with $r$ or $\\rho$ (the Greek letter *rho*):\n", 31 | "\n", 32 | "$$\n", 33 | " \\rho_{ZY} = \\frac{\\text{cov}(Z, Y)}{\\sqrt{\\text{var}(Z)\\text{var}(Y)}}\n", 34 | "$$\n", 35 | "\n", 36 | "This is a good measure of the linear association between $Z$ and $Y$. For a refresher on Pearson correlation, see the [Data 8 textbook](https://www.inferentialthinking.com/chapters/15/1/Correlation.html) and the [Data 140 textbook](http://prob140.org/textbook/content/Chapter_13/01_Covariance.html#correlation).\n", 37 | "\n", 38 | "### Linear regression\n", 39 | "\n", 40 | "If we were to fit a linear model to predict $Y$ from $Z$, it would look something like:\n", 41 | "\n", 42 | "$$Y = \\alpha + \\beta Z + \\varepsilon.$$\n", 43 | "\n", 44 | "As usual, we assume that $\\varepsilon$ is zero-mean noise, with the additional property that cov$(Z, \\varepsilon) = 0$. We've talked a lot about how to interpret this equation as a predictive model, but now we'll look at it slightly differently.
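Before doing so, a quick numerical sanity check may help. The sketch below is ours, not part of the original notebook: the simulated data, the chosen constants, and names like `beta_hat` are invented for illustration. It simply simulates the linear model above and computes the quantities discussed in the surrounding text.

```python
# A minimal simulation sketch (ours, with invented constants):
# draw (Z, Y) pairs from Y = alpha + beta * Z + eps, then compare the
# empirical Pearson correlation and the slope estimate cov(Z, Y) / var(Z).
import numpy as np

rng = np.random.default_rng(0)
n = 100_000
alpha, beta = 1.0, 2.5

z = rng.normal(size=n)
y = alpha + beta * z + rng.normal(scale=0.5, size=n)  # noise independent of z

rho = np.corrcoef(z, y)[0, 1]                      # Pearson correlation rho_ZY
beta_hat = np.cov(z, y, ddof=0)[0, 1] / np.var(z)  # cov(Z, Y) / var(Z)

print(f"rho = {rho:.3f}")
print(f"beta_hat = {beta_hat:.3f} (true beta = {beta})")
print(f"rho * sqrt(var(Y)/var(Z)) = {rho * np.sqrt(np.var(y) / np.var(z)):.3f}")
```

Up to sampling noise, the last two printed values agree with each other and with the true slope, previewing the identities derived in the next paragraph.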
\n", 45 | "\n", 46 | "We'll think of this equation as simply a descriptive explanation of a relationship between $Z$ and $Y$, where the most important part of the relationship is $\\beta$. We can use all the same computational machinery we've already developed to fit the model and compute $\\beta$, and the interpretation is subject to the limitations we've already learned about (e.g., it doesn't capture nonlinear association, it can be impacted by outliers, etc.). While it's common to describe $\\beta$ as quantifying the \"effect\" of $Z$ on $Y$, it's important to understand the limitations of the word \"effect\" here: linear regression can only tell us the *predictive* effect, rather than the causal effect.\n", 47 | "\n", 48 | "So, we'll use the coefficient $\\beta$ as a means to quantify the relationship between $Z$ and $y$. Starting from our assumption that cov$(Z, \\varepsilon) = 0$ and using properties of covariance, we can show that $\\beta = \\frac{\\text{cov}(Z, Y)}{\\text{var}(Z)}$. From here, we can also show that $\\beta = \\rho_{ZY} \\sqrt{\\frac{\\text{var}(Y)}{\\text{var}(Z)}}$ (as you may have seen empirically in Data 8).\n", 49 | "\n", 50 | "For example, suppose we're interested in quantifying the relationship between the number of years of schooling an individual has received ($Z$) and their income ($Y$). If we were to compute the coefficient $\\beta$, it would provide a way of quantifying the association between these two variables.\n", 51 | "\n", 52 | "### Multiple linear regression\n", 53 | "\n", 54 | "Suppose we are now interested in quantifying the relationship between two variables $x$ and $y$, but we also want to account for or \"control for\" the effect of a third variable, $w$. Assuming a linear relationship between them, we can extend our earlier relationship:\n", 55 | "\n", 56 | "$$Y = \\alpha + \\beta Z + \\gamma X + \\varepsilon.$$\n", 57 | "\n", 58 | "In this case, we can interpret $\\beta$ as a measure of association between $Z$ and $Y$ while **controlling for** or **adjusting for** the effect of a third variable $X$.\n", 59 | "\n", 60 | "Here are some refresher resources on linear regression:\n", 61 | "* [Prob 140 textbook Chapter 24](http://prob140.org/textbook/content/Chapter_24/00_Simple_Linear_Regression.html)\n", 62 | "* Data 100 textbook: [Chapter 14](https://www.textbook.ds100.org/ch/14/linear_models.html) and [Chapter 18](https://www.textbook.ds100.org/ch/18/mult_model.html)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 2, 68 | "id": "87ddc26c", 69 | "metadata": { 70 | "tags": [ 71 | "remove-input" 72 | ] 73 | }, 74 | "outputs": [ 75 | { 76 | "data": { 77 | "text/html": [ 78 | "\n", 79 | " \n", 87 | " " 88 | ], 89 | "text/plain": [ 90 | "" 91 | ] 92 | }, 93 | "execution_count": 2, 94 | "metadata": {}, 95 | "output_type": "execute_result" 96 | } 97 | ], 98 | "source": [ 99 | "from IPython.display import YouTubeVideo\n", 100 | "YouTubeVideo('qlOiejELwDA')" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "id": "complimentary-institution", 106 | "metadata": {}, 107 | "source": [ 108 | "## Binary Data: (A Different Kind of) Risk" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "id": "separate-spread", 114 | "metadata": {}, 115 | "source": [ 116 | "Correlation and regression coefficients are a fine way to measure association between continuous numerical variables, but what if our data are categorical? We'll restrict ourselves to binary data in this section for simplicity. 
We'll look at three commonly used metrics for these cases: risk difference (RD), risk ratio (RR), and odds ratio (OR).\n", 117 | "\n", 118 | "When dealing with categorical data, we'll often start by visualizing the data in a contingency table. We've already seen an example of this when we looked at Simpson's Paradox.\n", 119 | " \n", 120 | "\n", 121 | "| | $y=0$ | $y=1$ |\n", 122 | "| --- | --- | --- |\n", 123 | "|$z=0$ | $n_{00}$ | $n_{01}$ |\n", 124 | "|$z=1$ | $n_{10}$ | $n_{11}$ |\n", 125 | "\n", 126 | "Note that these are different from the 2x2 tables that we used at the beginning of the course: there, the rows and columns represented reality and our decisions, respectively. Here, they represent two different observed variables in our data. Just as with those tables, there's no standard convention about what to put in the rows vs the columns.\n", 127 | "\n", 128 | "For example, suppose we are interested in examining the relationship between receiving a vaccine ($z$) for a particular virus and being infected with that virus ($y$). We'll look at a study conducted on that vaccine. We'll use $z=1$ to indicate getting the vaccine, and $y=1$ to indicate being infected with the virus. In this case, for example, $n_{10}$ would represent the number of people in the study who received the vaccine and did not get infected.\n", 129 | "\n", 130 | "Most of the metrics we'll discuss are based on the **risk**, which represents the probability of $y=1$ given a particular value of $z$: the risk for $z=1$ is $P(y=1 | z=1)$ and the risk for $z=0$ is $P(y=1 | z=0)$. Note that this definition is completely unrelated to the risk that we learned about in Chapter 1.\n", 131 | "\n", 132 | "In our vaccination example, the term *risk* has an intuitive interpretation: it represents your risk of being infected given whether or not you were vaccinated. \n", 133 | "\n", 134 | "The **risk difference (RD)** is defined as follows:\n", 135 | "\n", 136 | "$$\n", 137 | "\\begin{align}\n", 138 | "RD \n", 139 | " &= \\underbrace{P(Y=1 \\mid Z=1)}_{\\text{risk for }Z=1}\n", 140 | " - \\underbrace{P(Y=1 \\mid Z=0)}_{\\text{risk for }Z=0} \\\\\n", 141 | " &= \\quad\\,\\overbrace{\\frac{n_{11}}{n_{10} + n_{11}}}^{} \\quad\\,\\,-\\quad\\, \\overbrace{\\frac{n_{01}}{n_{00} + n_{01}}}^{}\n", 142 | "\\end{align}\n", 143 | "$$\n", 144 | "\n", 145 | "Returning to the vaccine example: if the vaccine works as intended (i.e., there's a strong association between being vaccinated and being infected), your risk of being infected should decrease after being vaccinated, and the risk difference should be a negative number far from 0. On the other hand, if there's little to no relationship between vaccination and infection, then the two terms should be very similar, and the risk difference should be close to 0. \n", 146 | "\n", 147 | "We can see the same fact mathematically. If $Z$ and $Y$ are independent, then $P(Y=1 \\mid Z=1) = P(Y=1 \\mid Z=0) = P(Y=1)$, so the two terms are equal. This means that they cancel and that the risk difference is 0.\n", 148 | "\n", 149 | "The **risk ratio (RR)**, also sometimes called the relative risk, is defined similarly as the ratio (instead of the difference) between the two quantities above:\n", 150 | "\n", 151 | "$$\n", 152 | "RR = \\frac{P(Y=1 \\mid Z=1)}{P(Y=1 \\mid Z=0)}\n", 153 | "$$\n", 154 | "\n", 155 | "We can use similar reasoning as above to conclude that this ratio should be 1 when $Z$ and $Y$ are independent.\n", 156 | "\n", 157 | "The third commonly used measure is the **odds ratio (OR)**.
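Before unpacking the odds ratio, here's a short sketch computing the first two metrics for a vaccine study laid out as a contingency table like the one above. It's our own illustration: the counts are invented, not taken from any real trial.

```python
# Hypothetical contingency table (counts invented for illustration):
# rows are z (0 = unvaccinated, 1 = vaccinated),
# columns are y (0 = not infected, 1 = infected).
import numpy as np

counts = np.array([[400, 100],   # n00, n01
                   [480,  20]])  # n10, n11

risk_z0 = counts[0, 1] / counts[0].sum()  # P(y=1 | z=0) = n01 / (n00 + n01)
risk_z1 = counts[1, 1] / counts[1].sum()  # P(y=1 | z=1) = n11 / (n10 + n11)

rd = risk_z1 - risk_z0  # risk difference
rr = risk_z1 / risk_z0  # risk ratio

print(f"risk if unvaccinated = {risk_z0:.2f}, risk if vaccinated = {risk_z1:.2f}")
print(f"RD = {rd:+.2f}, RR = {rr:.2f}")
```

With these made-up counts, the vaccinated group's risk (0.04) is far below the unvaccinated group's (0.20), so the RD is negative and far from 0, and the RR is well below 1, exactly the pattern described above. With those two metrics in hand, we return to the odds ratio.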
It's the ratio of two odds, where each odds is itself a ratio:\n", 158 | "\n", 159 | "$$\n", 160 | "OR = \\frac{%\n", 161 | " \\overbrace{P(Y=1|Z=1)/P(Y=0|Z=1)}^{\\text{odds of }y\\text{ in the presence of }Z}}{%\n", 162 | " \\underbrace{P(Y=1|Z=0)/P(Y=0|Z=0)}_{\\text{odds of }y\\text{ in the absence of }Z}}\n", 163 | "$$\n", 164 | "\n", 165 | "While this looks more complicated, we can show that it simplifies to:\n", 166 | "\n", 167 | "$$\n", 168 | "OR = \\frac{n_{00}}{n_{10}} \\cdot \\frac{n_{11}}{n_{01}}\n", 169 | "$$" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 2, 175 | "id": "35de73fb", 176 | "metadata": { 177 | "tags": [ 178 | "remove-input" 179 | ] 180 | }, 181 | "outputs": [ 182 | { 183 | "data": { 184 | "text/html": [ 185 | "\n", 186 | " \n", 194 | " " 195 | ], 196 | "text/plain": [ 197 | "" 198 | ] 199 | }, 200 | "execution_count": 2, 201 | "metadata": {}, 202 | "output_type": "execute_result" 203 | } 204 | ], 205 | "source": [ 206 | "from IPython.display import YouTubeVideo\n", 207 | "YouTubeVideo('aBIcozKxogo')" 208 | ] 209 | } 210 | ], 211 | "metadata": { 212 | "celltoolbar": "Edit Metadata", 213 | "kernelspec": { 214 | "display_name": "Python 3 (ipykernel)", 215 | "language": "python", 216 | "name": "python3" 217 | }, 218 | "language_info": { 219 | "codemirror_mode": { 220 | "name": "ipython", 221 | "version": 3 222 | }, 223 | "file_extension": ".py", 224 | "mimetype": "text/x-python", 225 | "name": "python", 226 | "nbconvert_exporter": "python", 227 | "pygments_lexer": "ipython3", 228 | "version": "3.11.5" 229 | } 230 | }, 231 | "nbformat": 4, 232 | "nbformat_minor": 5 233 | } 234 | -------------------------------------------------------------------------------- /ds-102-book/content/chapters/03/turbines.csv: -------------------------------------------------------------------------------- 1 | t_state,p_year,t_built,t_cap 2 | AK,1997.0,6,390.0 3 | AK,1999.0,6,475.0 4 | AK,2000.0,2,100.0 5 | AK,2001.0,1,1500.0 6 | AK,2002.0,1,100.0 7 | AK,2003.0,2,130.0 8 | AK,2004.0,4,260.0 9 | AK,2006.0,6,495.0 10 | AK,2007.0,2,450.0 11 | AK,2008.0,21,1475.0 12 | AK,2009.0,14,5600.0 13 | AK,2010.0,13,1900.0 14 | AK,2011.0,4,400.0 15 | AK,2012.0,45,50175.0 16 | AK,2013.0,4,2700.0 17 | AK,2014.0,2,0.0 18 | AK,2015.0,4,0.0 19 | AK,2018.0,4,900.0 20 | AK,2019.0,1,900.0 21 | AR,2003.0,1,100.0 22 | AZ,2009.0,30,63000.0 23 | AZ,2010.0,31,65100.0 24 | AZ,2011.0,6,11000.0 25 | AZ,2012.0,62,99200.0 26 | AZ,2015.0,15,30000.0 27 | AZ,2020.0,97,0.0 28 | CA,1981.0,11,0.0 29 | CA,1982.0,937,323.0 30 | CA,1983.0,432,28080.0 31 | CA,1984.0,155,10070.0 32 | CA,1985.0,714,40375.0 33 | CA,1986.0,151,17806.0 34 | CA,1987.0,213,21310.0 35 | CA,1988.0,277,35410.0 36 | CA,1989.0,288,0.0 37 | CA,1990.0,327,73700.0 38 | CA,1991.0,1,225.0 39 | CA,1992.0,1,500.0 40 | CA,1994.0,30,14700.0 41 | CA,1995.0,42,9450.0 42 | CA,1996.0,13,0.0 43 | CA,1997.0,207,10920.0 44 | CA,1998.0,2,450.0 45 | CA,1999.0,186,133475.0 46 | CA,2001.0,108,64800.0 47 | CA,2002.0,106,105272.0 48 | CA,2003.0,141,198180.0 49 | CA,2004.0,104,92690.0 50 | CA,2005.0,93,61250.0 51 | CA,2006.0,152,220000.0 52 | CA,2007.0,21,63000.0 53 | CA,2008.0,71,94000.0 54 | CA,2009.0,172,279725.0 55 | CA,2010.0,212,427500.0 56 | CA,2011.0,370,981700.0 57 | CA,2012.0,788,1625180.0 58 | CA,2013.0,115,269550.0 59 | CA,2014.0,35,106900.0 60 | CA,2015.0,94,189800.0 61 | CA,2016.0,3,1500.0 62 | CA,2017.0,23,47400.0 63 | CA,2018.0,114,329850.0 64 | CA,2019.0,41,132800.0 65 | CA,2020.0,25,0.0 66 | CO,1999.0,29,21750.0 67 | CO,2001.0,48,39600.0 
68 | CO,2003.0,108,162000.0 69 | CO,2004.0,5,7500.0 70 | CO,2005.0,1,50.0 71 | CO,2006.0,40,60000.0 72 | CO,2007.0,591,776000.0 73 | CO,2008.0,1,900.0 74 | CO,2009.0,83,178100.0 75 | CO,2010.0,36,52800.0 76 | CO,2011.0,262,500500.0 77 | CO,2012.0,308,496000.0 78 | CO,2013.0,18,31800.0 79 | CO,2014.0,153,263250.0 80 | CO,2015.0,232,394400.0 81 | CO,2016.0,36,60860.0 82 | CO,2017.0,36,75000.0 83 | CO,2018.0,300,598000.0 84 | CO,2019.0,27,59400.0 85 | CO,2020.0,407,795760.0 86 | CT,2010.0,1,100.0 87 | CT,2015.0,2,5700.0 88 | DE,2010.0,1,2000.0 89 | FL,2013.0,1,0.0 90 | GU,2016.0,1,275.0 91 | HI,2001.0,1,1500.0 92 | HI,2006.0,36,40560.0 93 | HI,2007.0,14,21000.0 94 | HI,2011.0,12,30000.0 95 | HI,2012.0,53,114000.0 96 | HI,2013.0,1,0.0 97 | HI,2015.0,3,0.0 98 | HI,2017.0,5,3300.0 99 | HI,2020.0,7,24150.0 100 | IA,1992.0,1,250.0 101 | IA,1995.0,1,65.0 102 | IA,1997.0,2,1200.0 103 | IA,1998.0,3,2250.0 104 | IA,1999.0,260,194700.0 105 | IA,2001.0,91,81750.0 106 | IA,2002.0,107,72406.0 107 | IA,2003.0,32,45570.0 108 | IA,2004.0,108,162000.0 109 | IA,2005.0,152,202310.0 110 | IA,2006.0,67,104650.0 111 | IA,2007.0,161,267340.0 112 | IA,2008.0,912,1625780.0 113 | IA,2009.0,534,876400.0 114 | IA,2010.0,5,7100.0 115 | IA,2011.0,282,646700.0 116 | IA,2012.0,385,814230.0 117 | IA,2013.0,20,43700.0 118 | IA,2014.0,218,511428.0 119 | IA,2015.0,226,524594.0 120 | IA,2016.0,304,704982.0 121 | IA,2017.0,195,397470.0 122 | IA,2018.0,499,1098825.0 123 | IA,2019.0,771,1726800.0 124 | IA,2020.0,510,787340.0 125 | ID,2005.0,50,75000.0 126 | ID,2009.0,34,71400.0 127 | ID,2010.0,134,205800.0 128 | ID,2011.0,155,265100.0 129 | ID,2012.0,168,355200.0 130 | IL,1997.0,1,550.0 131 | IL,2004.0,1,660.0 132 | IL,2005.0,34,56100.0 133 | IL,2007.0,358,590550.0 134 | IL,2008.0,130,218550.0 135 | IL,2009.0,430,636808.0 136 | IL,2010.0,284,498150.0 137 | IL,2011.0,405,692450.0 138 | IL,2012.0,493,823535.0 139 | IL,2015.0,153,273700.0 140 | IL,2016.0,93,184000.0 141 | IL,2017.0,139,305800.0 142 | IL,2018.0,232,527100.0 143 | IL,2019.0,184,538135.0 144 | IL,2020.0,333,508000.0 145 | IN,2008.0,88,130550.0 146 | IN,2009.0,529,909700.0 147 | IN,2010.0,184,303200.0 148 | IN,2011.0,1,900.0 149 | IN,2012.0,128,202700.0 150 | IN,2013.0,1,900.0 151 | IN,2014.0,101,200850.0 152 | IN,2015.0,65,149500.0 153 | IN,2017.0,106,217600.0 154 | IN,2018.0,61,200400.0 155 | IN,2020.0,221,146640.0 156 | KS,2001.0,170,112200.0 157 | KS,2005.0,100,150000.0 158 | KS,2006.0,67,100500.0 159 | KS,2008.0,222,450300.0 160 | KS,2009.0,73,199000.0 161 | KS,2010.0,47,60900.0 162 | KS,2011.0,112,199800.0 163 | KS,2012.0,802,1439100.0 164 | KS,2013.0,141,255750.0 165 | KS,2014.0,1,0.0 166 | KS,2015.0,414,796470.0 167 | KS,2016.0,376,682815.0 168 | KS,2017.0,277,658800.0 169 | KS,2018.0,201,543050.0 170 | KS,2019.0,162,469650.0 171 | KS,2020.0,353,392505.0 172 | MA,2001.0,1,660.0 173 | MA,2005.0,1,100.0 174 | MA,2006.0,2,2460.0 175 | MA,2007.0,1,1500.0 176 | MA,2008.0,3,1300.0 177 | MA,2009.0,13,10300.0 178 | MA,2010.0,9,6150.0 179 | MA,2011.0,16,24400.0 180 | MA,2012.0,34,55250.0 181 | MA,2013.0,2,3360.0 182 | MA,2014.0,1,600.0 183 | MA,2016.0,4,8000.0 184 | MA,2017.0,1,1500.0 185 | MA,2019.0,4,9600.0 186 | MD,2010.0,31,70300.0 187 | MD,2011.0,20,50000.0 188 | MD,2014.0,16,40000.0 189 | MD,2015.0,12,30000.0 190 | MD,2017.0,1,750.0 191 | ME,2006.0,7,10500.0 192 | ME,2007.0,22,31500.0 193 | ME,2008.0,3,4500.0 194 | ME,2009.0,64,128100.0 195 | ME,2010.0,41,91700.0 196 | ME,2011.0,72,130600.0 197 | ME,2012.0,19,34200.0 198 | ME,2014.0,3,8550.0 199 | 
ME,2015.0,57,173250.0 200 | ME,2016.0,91,287700.0 201 | ME,2017.0,8,22800.0 202 | ME,2020.0,22,0.0 203 | MI,1996.0,1,600.0 204 | MI,2001.0,2,1900.0 205 | MI,2002.0,22,14872.0 206 | MI,2008.0,80,126800.0 207 | MI,2009.0,7,14350.0 208 | MI,2010.0,10,20500.0 209 | MI,2011.0,121,208350.0 210 | MI,2012.0,353,615400.0 211 | MI,2013.0,103,175100.0 212 | MI,2014.0,207,368180.0 213 | MI,2016.0,44,80400.0 214 | MI,2017.0,101,249050.0 215 | MI,2018.0,19,43700.0 216 | MI,2019.0,119,286000.0 217 | MI,2020.0,293,271100.0 218 | MN,1997.0,1,230.0 219 | MN,1998.0,139,104250.0 220 | MN,1999.0,172,128380.0 221 | MN,2000.0,18,11880.0 222 | MN,2001.0,41,28930.0 223 | MN,2002.0,18,16400.0 224 | MN,2003.0,164,218650.0 225 | MN,2004.0,28,41850.0 226 | MN,2005.0,81,122200.0 227 | MN,2006.0,81,147950.0 228 | MN,2007.0,263,404200.0 229 | MN,2008.0,269,453575.0 230 | MN,2009.0,41,61600.0 231 | MN,2010.0,228,397450.0 232 | MN,2011.0,332,542930.0 233 | MN,2012.0,152,266670.0 234 | MN,2013.0,1,0.0 235 | MN,2014.0,32,48000.0 236 | MN,2015.0,100,200000.0 237 | MN,2016.0,145,291800.0 238 | MN,2017.0,100,200000.0 239 | MN,2018.0,42,88475.0 240 | MN,2019.0,71,166825.0 241 | MN,2020.0,237,203000.0 242 | MO,2007.0,27,56700.0 243 | MO,2008.0,52,105800.0 244 | MO,2009.0,73,146000.0 245 | MO,2010.0,101,150000.0 246 | MO,2012.0,1,0.0 247 | MO,2016.0,78,155950.0 248 | MO,2017.0,164,300000.0 249 | MO,2019.0,1,0.0 250 | MO,2020.0,492,242000.0 251 | MT,2005.0,108,136170.0 252 | MT,2006.0,6,9000.0 253 | MT,2007.0,8,2000.0 254 | MT,2008.0,83,124500.0 255 | MT,2009.0,69,103500.0 256 | MT,2010.0,8,12000.0 257 | MT,2012.0,171,259000.0 258 | MT,2013.0,1,0.0 259 | MT,2014.0,12,19720.0 260 | MT,2016.0,13,29900.0 261 | MT,2018.0,48,104810.0 262 | MT,2020.0,123,79900.0 263 | NC,2009.0,1,0.0 264 | NC,2017.0,104,208000.0 265 | ND,1997.0,2,200.0 266 | ND,2001.0,1,900.0 267 | ND,2002.0,3,3500.0 268 | ND,2003.0,41,61500.0 269 | ND,2005.0,22,33000.0 270 | ND,2006.0,50,78320.0 271 | ND,2007.0,112,174420.0 272 | ND,2008.0,247,372900.0 273 | ND,2009.0,297,488100.0 274 | ND,2010.0,132,221100.0 275 | ND,2011.0,9,21000.0 276 | ND,2012.0,80,235000.0 277 | ND,2013.0,1,1600.0 278 | ND,2014.0,64,204800.0 279 | ND,2015.0,118,257500.0 280 | ND,2016.0,314,602760.0 281 | ND,2017.0,122,245050.0 282 | ND,2018.0,45,148050.0 283 | ND,2019.0,220,473140.0 284 | ND,2020.0,194,0.0 285 | NE,1998.0,2,1320.0 286 | NE,2001.0,1,660.0 287 | NE,2005.0,36,59400.0 288 | NE,2009.0,27,81000.0 289 | NE,2010.0,43,60100.0 290 | NE,2011.0,85,124500.0 291 | NE,2012.0,73,122000.0 292 | NE,2013.0,46,74800.0 293 | NE,2014.0,161,273700.0 294 | NE,2015.0,47,80190.0 295 | NE,2016.0,221,437500.0 296 | NE,2017.0,44,98560.0 297 | NE,2018.0,211,557550.0 298 | NE,2019.0,72,160020.0 299 | NE,2020.0,259,231240.0 300 | NH,2008.0,12,24000.0 301 | NH,2009.0,1,100.0 302 | NH,2012.0,57,147000.0 303 | NH,2015.0,5,14250.0 304 | NH,2019.0,9,28800.0 305 | NJ,2005.0,5,7500.0 306 | NJ,2012.0,1,1500.0 307 | NM,1999.0,1,660.0 308 | NM,2002.0,2,1352.0 309 | NM,2003.0,138,205320.0 310 | NM,2004.0,60,60000.0 311 | NM,2005.0,140,140000.0 312 | NM,2006.0,90,90000.0 313 | NM,2008.0,1,1500.0 314 | NM,2009.0,40,99400.0 315 | NM,2010.0,64,102400.0 316 | NM,2011.0,28,50400.0 317 | NM,2012.0,14,27300.0 318 | NM,2013.0,5,0.0 319 | NM,2014.0,21,34750.0 320 | NM,2015.0,134,268000.0 321 | NM,2016.0,16,31650.0 322 | NM,2017.0,260,569600.0 323 | NM,2018.0,22,48400.0 324 | NM,2019.0,84,220500.0 325 | NM,2020.0,359,0.0 326 | NV,2012.0,67,151800.0 327 | NV,2014.0,1,0.0 328 | NY,2000.0,17,18150.0 329 | NY,2001.0,19,28500.0 330 | 
NY,2002.0,1,250.0 331 | NY,2005.0,83,136950.0 332 | NY,2006.0,112,184800.0 333 | NY,2007.0,31,54500.0 334 | NY,2008.0,188,282000.0 335 | NY,2009.0,345,567500.0 336 | NY,2010.0,2,200.0 337 | NY,2011.0,67,132650.0 338 | NY,2012.0,78,232600.0 339 | NY,2013.0,52,84240.0 340 | NY,2014.0,16,25920.0 341 | NY,2015.0,6,1100.0 342 | NY,2016.0,40,77800.0 343 | NY,2017.0,7,1500.0 344 | NY,2018.0,77,158400.0 345 | NY,2019.0,1,0.0 346 | NY,2020.0,7,0.0 347 | OH,2003.0,2,3600.0 348 | OH,2004.0,2,3600.0 349 | OH,2006.0,1,225.0 350 | OH,2009.0,4,380.0 351 | OH,2010.0,15,3500.0 352 | OH,2011.0,58,101600.0 353 | OH,2012.0,166,317975.0 354 | OH,2013.0,2,3400.0 355 | OH,2014.0,1,900.0 356 | OH,2015.0,7,7500.0 357 | OH,2016.0,49,102300.0 358 | OH,2017.0,34,72000.0 359 | OH,2018.0,49,112500.0 360 | OH,2019.0,6,9000.0 361 | OH,2020.0,105,126000.0 362 | OK,1983.0,1,0.0 363 | OK,2001.0,1,100.0 364 | OK,2003.0,113,176250.0 365 | OK,2005.0,182,298200.0 366 | OK,2006.0,40,60000.0 367 | OK,2007.0,85,154500.0 368 | OK,2008.0,91,141900.0 369 | OK,2009.0,153,299100.0 370 | OK,2010.0,195,352260.0 371 | OK,2011.0,257,524900.0 372 | OK,2012.0,596,1127050.0 373 | OK,2014.0,369,648100.0 374 | OK,2015.0,710,1399960.0 375 | OK,2016.0,602,1457525.0 376 | OK,2017.0,323,850725.0 377 | OK,2018.0,272,543245.0 378 | OK,2019.0,33,100050.0 379 | OK,2020.0,440,1120550.0 380 | OR,1998.0,38,25080.0 381 | OR,2001.0,180,128940.0 382 | OR,2002.0,102,64800.0 383 | OR,2003.0,41,41000.0 384 | OR,2005.0,50,75000.0 385 | OR,2006.0,67,109200.0 386 | OR,2007.0,260,445600.0 387 | OR,2008.0,102,182350.0 388 | OR,2009.0,403,753950.0 389 | OR,2010.0,129,282500.0 390 | OR,2011.0,209,417050.0 391 | OR,2012.0,253,632500.0 392 | OR,2016.0,6,9900.0 393 | OR,2017.0,25,50000.0 394 | OR,2018.0,1,0.0 395 | OR,2019.0,56,200850.0 396 | OR,2020.0,128,0.0 397 | PA,2001.0,16,24000.0 398 | PA,2003.0,63,94500.0 399 | PA,2006.0,25,50000.0 400 | PA,2007.0,65,118500.0 401 | PA,2008.0,32,67200.0 402 | PA,2009.0,211,383500.0 403 | PA,2011.0,21,41200.0 404 | PA,2012.0,279,550200.0 405 | PA,2016.0,14,39900.0 406 | PA,2019.0,25,90000.0 407 | PR,2012.0,58,124600.0 408 | PR,2013.0,3,825.0 409 | RI,2006.0,1,660.0 410 | RI,2009.0,2,200.0 411 | RI,2012.0,6,6525.0 412 | RI,2016.0,15,45000.0 413 | RI,2017.0,1,1500.0 414 | RI,2018.0,7,21000.0 415 | SD,2001.0,4,2816.0 416 | SD,2002.0,1,108.0 417 | SD,2003.0,28,41250.0 418 | SD,2007.0,36,54000.0 419 | SD,2008.0,59,88500.0 420 | SD,2009.0,68,126400.0 421 | SD,2010.0,229,396000.0 422 | SD,2011.0,51,75000.0 423 | SD,2014.0,11,20350.0 424 | SD,2015.0,98,173050.0 425 | SD,2018.0,18,41400.0 426 | SD,2019.0,203,503940.0 427 | SD,2020.0,386,323000.0 428 | TN,2000.0,3,1980.0 429 | TN,2004.0,15,27000.0 430 | TX,1999.0,123,89020.0 431 | TX,2001.0,851,894820.0 432 | TX,2002.0,5,2704.0 433 | TX,2003.0,186,200500.0 434 | TX,2005.0,434,676680.0 435 | TX,2006.0,397,740700.0 436 | TX,2007.0,980,1709800.0 437 | TX,2008.0,1694,2687700.0 438 | TX,2009.0,1327,2125050.0 439 | TX,2010.0,353,681300.0 440 | TX,2011.0,133,282900.0 441 | TX,2012.0,920,1830680.0 442 | TX,2013.0,84,141100.0 443 | TX,2014.0,964,1805190.0 444 | TX,2015.0,1797,3622844.0 445 | TX,2016.0,1211,2564940.0 446 | TX,2017.0,904,2305115.0 447 | TX,2018.0,919,2254580.0 448 | TX,2019.0,1568,3926120.0 449 | TX,2020.0,1423,2003960.0 450 | UT,2000.0,1,225.0 451 | UT,2005.0,1,660.0 452 | UT,2008.0,9,18900.0 453 | UT,2009.0,98,205000.0 454 | UT,2011.0,68,102000.0 455 | UT,2015.0,1,0.0 456 | UT,2016.0,28,63890.0 457 | VA,2020.0,2,12000.0 458 | VT,1997.0,12,6550.0 459 | VT,2009.0,2,200.0 460 | 
VT,2010.0,2,200.0 461 | VT,2011.0,16,40000.0 462 | VT,2012.0,25,73000.0 463 | VT,2013.0,1,100.0 464 | VT,2017.0,15,30000.0 465 | WA,2001.0,270,178200.0 466 | WA,2002.0,37,48100.0 467 | WA,2003.0,12,15600.0 468 | WA,2005.0,83,149600.0 469 | WA,2006.0,260,428100.0 470 | WA,2007.0,165,340500.0 471 | WA,2008.0,104,213000.0 472 | WA,2009.0,243,546300.0 473 | WA,2010.0,161,294800.0 474 | WA,2011.0,158,369800.0 475 | WA,2012.0,119,235150.0 476 | WA,2014.0,117,266800.0 477 | WA,2015.0,1,0.0 478 | WA,2020.0,94,0.0 479 | WI,1999.0,17,11220.0 480 | WI,2001.0,20,30000.0 481 | WI,2004.0,1,0.0 482 | WI,2008.0,215,341850.0 483 | WI,2009.0,37,54100.0 484 | WI,2010.0,11,20200.0 485 | WI,2011.0,90,162000.0 486 | WI,2012.0,11,18000.0 487 | WI,2013.0,1,0.0 488 | WI,2017.0,49,98000.0 489 | WV,2002.0,44,66000.0 490 | WV,2008.0,132,264000.0 491 | WV,2010.0,67,100500.0 492 | WV,2011.0,76,133600.0 493 | WV,2012.0,8,19200.0 494 | WV,2016.0,49,102500.0 495 | WV,2020.0,20,56200.0 496 | WY,1998.0,2,1300.0 497 | WY,1999.0,40,29500.0 498 | WY,2000.0,31,18820.0 499 | WY,2001.0,49,49000.0 500 | WY,2002.0,12,8112.0 501 | WY,2003.0,80,144000.0 502 | WY,2005.0,2,1320.0 503 | WY,2008.0,226,434500.0 504 | WY,2009.0,275,478200.0 505 | WY,2010.0,185,311300.0 506 | WY,2016.0,46,85100.0 507 | WY,2020.0,407,226560.0 508 | -------------------------------------------------------------------------------- /ds-102-book/content/chapters/04/04_randomized_experiments.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "moderate-constitution", 6 | "metadata": {}, 7 | "source": [ 8 | "# Causal Inference in Randomized Experiments" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "lined-technical", 14 | "metadata": {}, 15 | "source": [ 16 | "In this section, we'll build a deeper understanding of randomized experiments. Recall from the last section the table of potential outcomes:\n", 17 | "\n", 18 | "| Unit | Outcome if not treated | Outcome if treated | Treated or not? |\n", 19 | "| --- | --- | --- | --- |\n", 20 | "| 1 | ? | $Y_1(1)$ | $Z_1=1$ |\n", 21 | "| 2 | $Y_2(0)$ | ? | $Z_2=0$ |\n", 22 | "| 3 | $Y_3(0)$ | ? | $Z_3=0$ |\n", 23 | "| 4 | ? | $Y_4(1)$ | $Z_4=1$ |\n", 24 | "| 5 | $Y_5(0)$ | ? | $Z_5=0$ |\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "statistical-mortgage", 30 | "metadata": {}, 31 | "source": [ 32 | "## Randomized experiments and potential outcomes\n", 33 | "\n", 34 | "In a randomized experiment, we protect ourselves from dealing with confounding variables by randomizing units into treatment and control. In other words, we choose each $Z_i$ randomly, and we choose it independent of whatever $Y_i(0)$ and $Y_i(1)$ might be. Mathematically, we can write:\n", 35 | "\n", 36 | "$$\n", 37 | "\\begin{align}\n", 38 | " \\big(Y_i(0), Y_i(1)\\big) \\perp \\!\\!\\! \\perp Z_i\n", 39 | "\\end{align}\n", 40 | "$$\n", 41 | "\n", 42 | "Remember, this doesn't mean that the treatment is independent from the observed outcome! It only means that the treatment is independent from the *pair* of potential outcomes. The observed outcome $Y_{i,obs} = Y_i(0)(1-Z_i) + Y_i(1) Z_i$ always depends on the treatment decision. 
In other words, knowing the treatment decision $Z_i$ always gives us information about *which* of the two potential outcomes ($Y_i(0)$ or $Y_i(1)$) we observed (except in the uninteresting scenario where the treatment is completely unrelated to the outcome), but it tells us nothing about the values of the potential outcomes themselves.\n", 43 | "\n", 44 | "For example, consider a double-blind vaccine trial. We can consider the potential outcomes for a particular subject: they represent what happens to that subject if they get the vaccine ($Y_i(1)$) or if they don't get the vaccine ($Y_i(0)$). This is the pair of potential outcomes, $\\big(Y_i(0), Y_i(1)\\big)$. Next, consider the treatment decision $Z_i$: this represents whether the subject got the vaccine ($Z_i = 1$) or a placebo ($Z_i = 0$). These are independent: knowing whether or not a subject got the vaccine/placebo gives us no information about the pair of potential outcomes: it only gives us information about which one of the two we observe in the real world.\n", 45 | "\n", 46 | "### Computing the average treatment effect\n", 47 | "\n", 48 | "You may remember learning that in a randomized controlled trial, we can determine causality by using the difference in means between the treatment and control groups. Let's now show mathematically that this is true. Recall from the previous section our definition of the ATE $\\tau$:\n", 49 | "\n", 50 | "$$\\tau = E[Y_i(1) - Y_i(0)] = E[Y_i(1)] - E[Y_i(0)]$$\n", 51 | "\n", 52 | "If $Z_i$ and $Y_i(\\cdot)$ are independent, then $E[Y_i(\\cdot)] = E[Y_i(\\cdot)|Z_i]$. In other words, conditioning on $Z_i$ shouldn't change the expectation, as long as $Z_i$ and $Y_i(\\cdot)$ are independent.\n", 53 | "\n", 54 | "$$\n", 55 | "\\begin{align}\n", 56 | " \\tau\n", 57 | " &= E[Y_i(1)] - E[Y_i(0)] \\\\\n", 58 | " &= E[Y_i(1)|Z_i=1] - E[Y_i(0)|Z_i=0] \\quad{\\scriptsize (\\text{if }(Y_i(0), Y_i(1)) \\perp \\!\\! \\perp Z_i)}\n", 59 | "\\end{align}\n", 60 | "$$\n", 61 | "\n", 62 | "These two terms correspond to the mean outcomes in the treatment and control groups, respectively. If we have $n$ observations $(Z_1, Y_{1,obs}), \\ldots, (Z_n, Y_{n,obs})$, then our empirical estimate for the ATE is just:\n", 63 | "\n", 64 | "$$\n", 65 | "\\begin{align}\n", 66 | " \\hat{\\tau}\n", 67 | " &= \\underbrace{\\left[\\frac{1}{n_1} \\sum_{i: Z_i = 1} Y_i\\right]}_{=\\bar{Y}_{obs,1}} - \\underbrace{\\left[\\frac{1}{n_0} \\sum_{i: Z_i = 0} Y_i\\right]}_{=\\bar{Y}_{obs,0}},\n", 68 | "\\end{align}\n", 69 | "$$\n", 70 | "\n", 71 | "where $n_1$ is the number of treated units and $n_0$ is the number of untreated units, and $\\bar{Y}_{obs,1}$ and $\\bar{Y}_{obs,0}$ are the means of the treatment and control groups respectively. This quantity $\\hat{\\tau}$, which you've most likely seen and used before (e.g., in Data 8), has many names. Here are a few of them:\n", 72 | "* The **difference in means**\n", 73 | "* The **simple difference in mean outcomes / simple difference in observed means (SDO)**\n", 74 | "* The **Neyman estimator**\n", 75 | "* The ***prima facie* causal effect**, $\\tau_{PF}$ (*prima facie* is Latin for \"at first sight\").
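To make the difference in means concrete, here's a minimal simulation sketch (not part of the original notebook; the sample size, effect size, and noise level are made-up illustration values). We generate both potential outcomes, assign treatment at random, and compute $\hat{\tau}$ from the observed outcomes alone:

```python
import numpy as np

rng = np.random.default_rng(0)

# Hypothetical randomized experiment: each unit has two potential outcomes,
# and treatment is assigned at random, independently of both of them.
n = 1_000
y0 = rng.normal(loc=10.0, scale=2.0, size=n)  # potential outcomes Y_i(0)
y1 = y0 + 3.0                                 # potential outcomes Y_i(1): true ATE = 3
z = rng.integers(0, 2, size=n)                # random treatment assignments Z_i

# We only ever observe one potential outcome per unit.
y_obs = np.where(z == 1, y1, y0)

# The difference in means (the Neyman estimator).
tau_hat = y_obs[z == 1].mean() - y_obs[z == 0].mean()
print(tau_hat)  # should land close to the true ATE of 3
```

Because $Z_i$ was drawn independently of $\big(Y_i(0), Y_i(1)\big)$, the estimate concentrates around the true ATE as $n$ grows.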
" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "id": "635043c0", 82 | "metadata": { 83 | "tags": [ 84 | "remove-input" 85 | ] 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "from IPython.display import YouTubeVideo\n", 90 | "YouTubeVideo(\"dmrMZ5vERx4\")" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "id": "f32c9f16", 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "from IPython.display import YouTubeVideo\n", 101 | "YouTubeVideo(\"0o_m_GIfe6I\")" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "id": "natural-language", 107 | "metadata": {}, 108 | "source": [ 109 | "## (Optional) Fixed-sample assumption: Fisher and Neyman\n", 110 | "\n", 111 | "In this section, we'll analyze randomized experiments and the Neyman estimator under the fixed-sample assumption. Recall that in this setting, we assume that $Z_i$ (which we observe) is random, but that $Y_i(0)$ and $Y_i(1)$ (which are unknown) are fixed. In this case, the statement of independence above doesn't really make sense, since $Y_i(0)$ and $Y_i(1)$ are not random. However, we can still compute the Neyman estimator. We'll develop some properties of the estimator, then use those to construct a confidence interval for the estimated ATE. Finally, we'll look at two different hypothesis tests used in randomized experiments.\n", 112 | "\n", 113 | "### Properties of the Neyman estimator\n", 114 | "The Neyman estimator has two useful properties: the first is that it's unbiased, and the second is that its variance can be bounded by the estimated variances within the two groups:\n", 115 | "\n", 116 | "$$\n", 117 | "\\begin{align*}\n", 118 | "E[\\hat{\\tau}] &= \\tau \\\\\n", 119 | "\\text{var}(\\hat{\\tau}) &\\leq \\frac{\\hat{\\sigma}_1^2}{n_1} + \\frac{\\hat{\\sigma}_0^2}{n_0},\n", 120 | "\\end{align*}\n", 121 | "$$\n", 122 | "\n", 123 | "where $\\hat{\\sigma}_k$ is the sample standard deviation of the potential treatment outcomes $Y_1(k), \\ldots, Y_n(k)$:\n", 124 | "\n", 125 | "$$\n", 126 | "\\begin{align*}\n", 127 | " \\hat{\\sigma}_k &= \n", 128 | " \\frac{1}{n - 1} \\sum_{i} \\big(Y_i(k) - \\bar{Y}(k)\\big)\n", 129 | "\\end{align*}\n", 130 | "$$\n", 131 | "Since this depends on the counterfactual outcomes that we don't get to observe, we'll typically approximate it by replacing the true sample variances with the sample variances within each observed group in our data, and call this $\\hat{V}$:\n", 132 | "\n", 133 | "\n", 134 | "$$\n", 135 | "\\begin{align*}\n", 136 | " \\hat{V} &= \n", 137 | " \\Bigg[\\frac{1}{n_1}\n", 138 | " \\underbrace{%\n", 139 | " \\frac{1}{n_1 - 1} \\sum_{i: Z_i = 1} \\big(Y_i - \\bar{Y}_{obs,1}\\big)^2}_{%\n", 140 | " \\text{sample std. dev. of treatment group}}\\Bigg] +\n", 141 | " \\Bigg[\\frac{1}{n_0}\n", 142 | " \\underbrace{%\n", 143 | " \\frac{1}{n_0 - 1} \\sum_{i: Z_i = 0} \\big(Y_i - \\bar{Y}_{obs,0}\\big)^2}_{%\n", 144 | " \\text{sample std. dev. 
of control group}}\\Bigg]\n", 145 | "\\end{align*}\n", 146 | "$$\n", 147 | "\n", 148 | "It can be shown that under certain regularity conditions, as the number of samples grows larger, the distribution of the quantity $\\frac{\\hat{\\tau} - \\tau}{\\sqrt{\\hat{V}}}$ converges to a normal distribution $N(0, \\sigma^2)$, where $\\sigma^2 \\leq 1$.\n", 149 | "\n", 150 | "### Confidence intervals for the ATE\n", 151 | "\n", 152 | "Given the fact above, we can construct an asymptotically valid $95\\%$ confidence interval for $\\tau$ as follows:\n", 153 | "$$\n", 154 | "\\begin{align*}\n", 155 | " \\left(\\hat{\\tau} - 1.96\\sqrt{\\hat{V}}, \\hat{\\tau} + 1.96\\sqrt{\\hat{V}}\\right)\n", 156 | "\\end{align*}\n", 157 | "$$\n", 158 | "\n", 159 | "### Hypothesis testing for causal effects\n", 160 | "\n", 161 | "In the classic null hypothesis significance testing framework, there are two different null hypotheses commonly used when measuring causal effects in randomized trials:\n", 162 | "\n", 163 | "1. **Fisher's strong null** (also known as Fisher's sharp null) states that for every unit, the treatment effect $Y_i(1) - Y_i(0) = 0$.\n", 164 | "2. **Neyman's weak null** states that the average treatment effect is 0. In other words, even if some individual treatment effects are positive and some are negative, they average out to 0.\n", 165 | "\n", 166 | "The first is a much stricter null hypothesis (hence the name \"strong null\"), since it states that all the treatment effects are 0. The second is looser, requiring only that the treatment effects average out to 0. Because of this, the strong null is often easier to reject.\n", 167 | "\n", 168 | "To test a hypothesis against Neyman's weak null, we construct the test statistic $\\frac{\\hat{\\tau}}{\\sqrt{\\hat{V}}}$. Under the Neyman weak null hypothesis, this should approximately follow a standard normal distribution.\n", 169 | "\n", 170 | "To test a hypothesis against Fisher's strong null, we need some stronger mathematical machinery. We'll limit ourselves to cases where the treatment and outcome are both binary, and use a technique called **permutation testing**. In this technique, we randomly shuffle the treatment/control labels, and use that to build a null distribution for the difference in outcomes (for a refresher on this technique, see the [Data 8 textbook](https://www.inferentialthinking.com/chapters/12/1/AB_Testing.html)). Instead of randomly shuffling, however, Fisher's exact test looks at every single possible permutation, and computes a $p$-value in closed form. (A small simulation sketch of these ideas appears at the end of this section.)" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "id": "attached-joshua", 176 | "metadata": {}, 177 | "source": [ 178 | "## (Optional) Complications with randomized experiments" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "id": "federal-humanity", 184 | "metadata": {}, 185 | "source": [ 186 | "Randomized experiments present several challenges that make them infeasible in some circumstances. Here are some of the issues that come up. Note that this list is far from exhaustive!" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "id": "economic-office", 192 | "metadata": {}, 193 | "source": [ 194 | "### Compliance\n", 195 | "\n", 196 | "Carrying out a randomized experiment requires that the units follow their treatment/no-treatment assignment. This is easier to ensure in some experiments than others. Consider the following examples:\n", 197 | "\n", 198 | "1. 
An experiment to determine whether a new fertilizer increases crop yield\n", 199 | "2. A double-blind vaccine trial\n", 200 | "3. An experiment on whether using a mindfulness app for at least 20 minutes a day causes better sleep\n", 201 | "4. An experiment on whether eating a certain amount of chocolate causes improved heart function\n", 202 | "\n", 203 | "For the first two, we can guarantee that the treatment will be properly followed. As experimenters, we know that if a certain unit (plant, person, etc.) is assigned to the treatment group, it will receive the treatment.\n", 204 | "\n", 205 | "For the last two, however, this is more difficult to ensure. While we can ask subjects in a research study to use an app for a certain amount of time per day, we can't guarantee that every subject will follow the instructions perfectly. \n", 206 | "\n", 207 | "In particular, in some randomized experiments, we can't guarantee that units will be **compliant** with the treatment assignment. We can't solve this by simply removing the units that were non-compliant, since this could introduce bias and/or confounding." 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "id": "helpful-feature", 213 | "metadata": {}, 214 | "source": [ 215 | "### External validity\n", 216 | "\n", 217 | "**External validity** refers to whether or not a finding from a randomized experiment will apply to a broader population of interest. Poor external validity often arises due to sampling bias, contrived situations in an experiment that don't reflect real-world conditions, or other similar effects.\n", 218 | "\n", 219 | "Consider the following example:\n", 220 | "\n", 221 | "> Suppose we want to determine whether watching a 15-minute video about common pitfalls and misunderstandings of probability helps people make better decisions about whether news stories they read are valid. We recruit a random sample of Data 102 students, randomly assign half to watch the video, and assign the other half not to watch the video. We then evaluate how well the students can critically evaluate several news stories. We find that the videos have no effect: everyone in our sample does an excellent job of evaluation, regardless of whether or not they watched the video.\n", 222 | "\n", 223 | "In this case, our randomized experiment shows no causal effect, but our sample is not representative of the population at large: we can expect that most Data 102 students, who are probability experts, already know the common pitfalls in the 15-minute video. Among a larger population, however, the same might not be true!" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "id": "b1dc1cd1", 229 | "metadata": {}, 230 | "source": [ 231 | "### Ethical considerations\n", 232 | "\n", 233 | "Randomized experiments might not always be ethical. Suppose we want to determine whether or not the death penalty acts as a deterrent: in other words, does instituting the death penalty cause a reduction in crime?\n", 234 | "\n", 235 | "Randomly assigning the death penalty would be profoundly unethical, so no randomized experiment can answer this question."
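To tie the optional Fisher and Neyman material above together, here's a small simulation sketch (made-up data, not from the original notebook) that computes the Neyman estimator $\hat{\tau}$, the variance estimate $\hat{V}$, the asymptotic 95% confidence interval, and a permutation-test $p$-value:

```python
import numpy as np

rng = np.random.default_rng(1)

# Hypothetical experiment with a true ATE of 1.
z = rng.integers(0, 2, size=500)
y = 1.0 * z + rng.normal(size=500)

y1, y0 = y[z == 1], y[z == 0]
n1, n0 = len(y1), len(y0)

# Neyman estimator and the variance estimate V-hat.
tau_hat = y1.mean() - y0.mean()
v_hat = y1.var(ddof=1) / n1 + y0.var(ddof=1) / n0

# Asymptotic 95% confidence interval for the ATE.
ci = (tau_hat - 1.96 * np.sqrt(v_hat), tau_hat + 1.96 * np.sqrt(v_hat))

# Permutation test: shuffle the treatment labels to build a null
# distribution for the difference in means.
null_stats = []
for _ in range(10_000):
    z_perm = rng.permutation(z)
    null_stats.append(y[z_perm == 1].mean() - y[z_perm == 0].mean())
p_value = np.mean(np.abs(null_stats) >= abs(tau_hat))

print(tau_hat, ci, p_value)
```

Fisher's exact test replaces the random shuffles with an exhaustive enumeration over all possible assignments, which is feasible only for small experiments with binary treatments and outcomes.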
236 | ] 237 | } 238 | ], 239 | "metadata": { 240 | "celltoolbar": "Edit Metadata", 241 | "kernelspec": { 242 | "display_name": "Python 3 (ipykernel)", 243 | "language": "python", 244 | "name": "python3" 245 | }, 246 | "language_info": { 247 | "codemirror_mode": { 248 | "name": "ipython", 249 | "version": 3 250 | }, 251 | "file_extension": ".py", 252 | "mimetype": "text/x-python", 253 | "name": "python", 254 | "nbconvert_exporter": "python", 255 | "pygments_lexer": "ipython3", 256 | "version": "3.11.5" 257 | } 258 | }, 259 | "nbformat": 4, 260 | "nbformat_minor": 5 261 | } 262 | -------------------------------------------------------------------------------- /ds-102-book/content/chapters/02/03_graphical_models.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "scrolled": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "from scipy import stats\n", 14 | "from IPython.display import YouTubeVideo\n", 15 | "\n", 16 | "%matplotlib inline\n", 17 | "\n", 18 | "import matplotlib.pyplot as plt\n", 19 | "import seaborn as sns\n", 20 | "sns.set()" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "# Graphical Models, Probability Distributions, and Independence" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Graphical Models\n", 35 | "\n" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "A **graphical model** provides a visual representation of a Bayesian hierarchical model. These models are sometimes known as Bayesian networks, or **Bayes nets**.\n", 43 | "\n", 44 | "We represent each random variable with a node (circle), and a directed edge (arrow) between two random variables indicates that the distribution for the child variable is conditioned on the parent variable. When drawing graphical models, we usually start with the variables that don't depend on any others. These are usually, but not always, unobserved parameters of interest, like $\\theta$ in the product review example below. Then, we proceed by drawing a node for each variable that depends on those, and so on. Variables that are observed are shaded in.\n", 45 | "\n", 46 | "We'll draw graphical models for the three examples we've seen in previous sections: the product review model, the kidney cancer model, and the exoplanet model.\n", 47 | "\n" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "### Graphical model for product reviews\n", 55 | "\n", 56 | "In our product review model, we have the following random variables:\n", 57 | "\n", 58 | "$$\n", 59 | "\\begin{align}\n", 60 | " x_i | \\theta &\\sim \\mathrm{Bernoulli}(\\theta) \\\\\n", 61 | " \\theta &\\sim \\mathrm{Beta}(a, b)\n", 62 | "\\end{align}\n", 63 | "$$\n", 64 | "\n", 65 | "In this case, we start with a node for the product quality $\\theta$, and then add one node for each review $x_i$, all of which depend on $\\theta$. The nodes for the observed reviews $x_i$ are shaded in, while the node for the hidden (unobserved) product quality $\\theta$ is not:\n", 66 | "\n", 67 | "![](review_model_simple.png)\n", 68 | "\n", 69 | "This visual representation shows us the structure of the model, by making it clear that each review $x_i$ depends on the quality $\\theta$. 
But just as before, this model is simple enough that we already knew that. Next, we'll look at the graphical model for a more interesting example.\n" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "### Graphical model for kidney cancer death risk\n", 77 | "\n", 78 | "Recall the full hierarchical model for the kidney cancer death risk example:\n", 79 | "\n", 80 | "$$\n", 81 | "\\begin{align*}\n", 82 | "a &\\sim \\mathrm{Uniform}(0, 50) \\\\\n", 83 | "b &\\sim \\mathrm{Uniform}(0, 300000) \\\\\n", 84 | "\\theta_i &\\sim \\mathrm{Beta}(a, b), & i \\in \\{1, 2, \\ldots, C\\} \\\\\n", 85 | "y_i &\\sim \\mathrm{Binomial}(\\theta_i, n_i), & i \\in \\{1, 2, \\ldots, C\\}\n", 86 | "\\end{align*}\n", 87 | "$$\n", 88 | "\n", 89 | "* $y_i$ represents the number of kidney cancer deaths in county $i$ (out of a population of $n_i$).\n", 90 | "* $\\theta_i$ represents the kidney cancer death rate for county $i$.\n", 91 | "* $a$ and $b$ represent the parameters of the shared prior for the county-level rates.\n", 92 | "\n", 93 | "In order to draw the graphical model, we need to draw one node per random variable, and draw arrows to indicate dependency. We know that:\n", 94 | "\n", 95 | "* We need a node for $a$ and a node for $b$.\n", 96 | "* We need one node for each $\\theta_i$ and one node for each $y_i$.\n", 97 | "* Each $\\theta_i$ depends on $a$ and $b$.\n", 98 | "* Each $y_i$ depends on $\\theta_i$ and $n_i$.\n", 99 | "* Because $n_i$ is a fixed number, we'll draw it as a dot.\n", 100 | "\n", 101 | "So, our full graphical model looks like:\n", 102 | "\n", 103 | "![](kc_hierarchical.png)\n" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "### (Optional) Example: Graphical Model for Exoplanet Model\n", 111 | "\n", 112 | "*Text coming soon: see video*" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "tags": [ 120 | "remove-input" 121 | ] 122 | }, 123 | "outputs": [], 124 | "source": [ 125 | "YouTubeVideo('e6CoEsLiMXc')" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "## Relating graphical models to probability distributions\n", 133 | "\n", 134 | "When we were drawing graphical models above, we drew one node per variable, and started from the \"top,\" working with the variables that didn't depend on any others. We then worked our way through the model, ending with observed variables. When looking at a graphical model to derive the corresponding joint distribution of all the variables in the model, we follow a similar process. For example, in the kidney cancer death rate model, we can write the joint distribution of all the variables in our model by starting at the roots (i.e., the nodes that have no parents), and then proceeding through their children, expressing the joint distribution as a product. \n", 135 | "\n", 136 | "So, we start with $p(a)$ and $p(b)$, then $p(\\theta_i | a, b)$ (for $i \\in \\{1, \\ldots, C\\}$), then $p(y_i | \\theta_i)$:\n", 137 | "\n", 138 | "$$\n", 139 | "p(a, b, \\theta_1, \\ldots, \\theta_C, y_1, \\ldots, y_C) = p(a)p(b) \\prod_{i=1}^C p(\\theta_i\\mid a, b) p(y_i\\mid\\theta_i)\n", 140 | "$$\n", 141 | "\n", 142 | "Factoring the distribution this way helps us understand and mathematically demonstrate the independence and dependence relationships in our graphical models, as we'll see shortly."
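The factorization also tells us how to simulate from the model: sample the roots first, then each child given its parents (sometimes called ancestral sampling). Here's a minimal sketch (not part of the original notebook); the number of counties and the populations are made up for illustration:

```python
import numpy as np

rng = np.random.default_rng(42)

# Ancestral sampling: follow the graph from roots to leaves, sampling
# each node conditioned on its parents.
C = 5                                              # number of counties (illustrative)
n = np.array([1_000, 5_000, 20_000, 500, 80_000])  # fixed county populations n_i

a = rng.uniform(0, 50)                 # roots: a and b have no parents
b = rng.uniform(0, 300_000)
theta = rng.beta(a, b, size=C)         # theta_i | a, b
y = rng.binomial(n, theta)             # y_i | theta_i (with fixed n_i)
print(theta, y)
```

Reading the code top to bottom mirrors reading the joint distribution left to right: each line of sampling corresponds to one factor in the product.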
143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": { 149 | "tags": [ 150 | "remove-input" 151 | ] 152 | }, 153 | "outputs": [], 154 | "source": [ 155 | "YouTubeVideo('TzY3-EYwipk')" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "## Independence and Conditional Independence" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "### Review: independence and conditional independence\n", 170 | "\n", 171 | "We say that two random variables $w$ and $v$ are **independent** if knowing the value of one tells us nothing about the distribution of the other. Notationally, we write $w \\perp\\!\\!\\!\\perp v$. The following statements are all true for independent random variables $w$ and $v$:\n", 172 | "\n", 173 | "* If $w$ and $v$ are independent ($w \\perp\\!\\!\\!\\perp v$), then the joint distribution $p(w, v)$ can be written as the product of the marginal distributions: $p(w, v) = p(w)p(v)$.\n", 174 | "* If $w$ and $v$ are independent ($w \\perp\\!\\!\\!\\perp v$), then the conditional distributions are equal to the marginal distributions: $p(w|v) = p(w)$ and $p(v|w) = p(v)$. \n", 175 | "\n", 176 | "***Exercise**: using the definition of conditional distributions, show that the two conditions above are mathematically equivalent.*\n", 177 | "\n", 178 | "We say that two random variables $w$ and $v$ are **conditionally independent** given a third random variable $u$ if, when we condition on $u$, knowing the value of one of $v$ or $w$ tells us nothing about the distribution of the other. Notationally, we write $w \\perp\\!\\!\\!\\perp v \\mid u$, and mathematically this means that $p(w, v \\mid u) = p(w\\mid u) p(v \\mid u)$.\n", 179 | "\n", 180 | "For example, suppose $x_1$ and $x_2$ are the heights of two people randomly sampled from a very specific population with some average height $\\mu$: this population could be college students, or second-graders, or Olympic swimmers, or some other group entirely.\n", 181 | "\n", 182 | "If we know the value of $\\mu$, then $x_1$ and $x_2$ are conditionally independent, because they're random samples from the same distribution with known mean $\\mu$. For example, if we are given that $\\mu = 4'1''$, then knowing $x_1$ does not tell us anything about $x_2$.\n", 183 | "\n", 184 | "Suppose instead that we don't know the value of $\\mu$. Then, we find out that $x_1 = 7' 1''$. In this case, we might guess that the 'specific population' is likely a very tall group, such as NBA players. This will affect our belief about the distribution of $x_2$ (i.e., we should expect the second person to be tall too). So, in this case:\n", 185 | "\n", 186 | "* $x_1$ and $x_2$ are conditionally independent given $\\mu$: $x_1 \\perp\\!\\!\\!\\perp x_2 \\mid \\mu$.\n", 187 | "* $x_1$ and $x_2$ are not unconditionally independent: it is not true that $x_1 \\perp\\!\\!\\!\\perp x_2$." 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": { 194 | "tags": [ 195 | "remove-input" 196 | ] 197 | }, 198 | "outputs": [], 199 | "source": [ 200 | "YouTubeVideo('WhqyUmqkSE8')" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "### Independence and conditional independence in graphical models\n", 208 | "\n", 209 | "The structure of a graphical model can tell us a lot about the independence relationships between the variables in our model. 
Specifically, we can determine whether two random variables are unconditionally independent or conditionally independent given a third variable, just by looking at the structure of the model. Let's look at a few examples to illustrate this. We'll start with the height example we just saw:\n", 210 | "\n", 211 | "![](heights_gm.png)\n", 212 | "\n", 213 | "From our reasoning above, we know that $x_1 \\perp\\!\\!\\!\\perp x_2 \\mid \\mu$, but that $x_1$ and $x_2$ are not unconditionally independent. This is true in general for any three variables in a graphical model in this configuration. \n", 214 | "\n", 215 | "***Exercise**: mathematically prove the results stated above.*\n", 216 | "\n", 217 | "**Solution**: To show that $x_1$ and $x_2$ are not unconditionally independent, we must show that $p(x_1, x_2) \\neq p(x_1)p(x_2)$. We can compute $p(x_1, x_2)$ by looking at the joint distribution over all three variables and then marginalizing over $\\mu$:\n", 218 | "\n", 219 | "$$\n", 220 | "\\begin{align*}\n", 221 | "p(x_1, x_2) \n", 222 | "&= \\int p(x_1, x_2, \\mu) d\\mu \\\\\n", 223 | "&= \\int p(\\mu) p(x_1 \\mid \\mu) p(x_2 \\mid \\mu) d\\mu\n", 224 | "\\end{align*}\n", 225 | "$$\n", 226 | "\n", 227 | "Unfortunately, there is no way to factor this integral into separate terms involving only $x_1$ and only $x_2$. In other words, in general, the integral above will not equal $p(x_1)p(x_2)$, so the variables are not unconditionally independent.\n", 228 | "\n", 229 | "What about conditional independence given $\\mu$? We need to show that $p(x_1, x_2\\mid\\mu) = p(x_1\\mid\\mu) p(x_2\\mid\\mu)$:\n", 230 | "\n", 231 | "$$\n", 232 | "\\begin{align*}\n", 233 | "p(x_1, x_2 \\mid \\mu) \n", 234 | "&= \\frac{p(x_1, x_2, \\mu)}{p(\\mu)} \\\\\n", 235 | "&= \\frac{p(\\mu) p(x_1 \\mid \\mu) p(x_2 \\mid \\mu)}{p(\\mu)} \\\\\n", 236 | "&= p(x_1 \\mid \\mu) p(x_2 \\mid \\mu)\n", 237 | "\\end{align*}\n", 238 | "$$\n", 239 | "\n", 240 | "This mathematical result aligns with the intuition we built in the previous section.\n", 241 | "\n", 242 | "Let's look at another example:\n", 243 | "\n", 244 | "![](chain_gm.png)\n", 245 | "\n", 246 | "In this example, $x$ and $z$ are not unconditionally independent. Intuitively, we can see that $y$ depends on $x$, and $z$ depends on $y$, so that $x$ and $z$ are dependent. \n", 247 | "\n", 248 | "But, $x$ and $z$ are conditionally independent given $y$: the lack of an arrow directly from $x$ to $z$ tells us that $z$ only depends on $x$ through $y$.\n", 249 | "\n", 250 | "***Exercise**: mathematically prove the results stated above.*\n", 251 | "\n", 252 | "Let's look at a third example:\n", 253 | "\n", 254 | "![](collider_gm.png)\n", 255 | "\n", 256 | "In this example, $x$ and $z$ are unconditionally independent, but given $y$, they are conditionally dependent. Why? Let's look at an example that will help us build intuition for this result. Suppose that:\n", 257 | "\n", 258 | "* $y$ is whether or not I have a stuffy nose.\n", 259 | "* $x$ is whether or not I am sick (with a cold, flu, COVID, etc.)\n", 260 | "* $z$ is whether or not I have seasonal allergies.\n", 261 | "\n", 262 | "First, we can see that the description matches the graphical model: whether or not I have a stuffy nose depends on whether or not I'm sick, and whether or not I have allergies. But, sickness and allergies don't affect each other. 
In other words, if I don't know anything about whether I have a stuffy nose, then my sickness and allergies are independent of each other.\n", 263 | "\n", 264 | "Now, suppose I wake up one morning with a stuffy nose (i.e., $y=1$), and I'm trying to determine whether I'm sick or have allergies. I look at the weather forecast, and see that the pollen counts are very high. As soon as I hear this information, I'm a lot more certain that $z=1$. But, even though the weather forecast didn't directly tell me anything about whether or not I'm sick, my belief that I'm sick drops significantly: my symptoms have been **explained away** by the more likely cause, allergies. \n", 265 | "\n", 266 | "In other words, conditioned on a value of $y$ (stuffy nose), knowing something about $z$ (allergies) gives me information about the distribution of $x$ (sickness). This is precisely the definition of conditional dependence.\n", 267 | "\n", 268 | "***Exercise**: mathematically prove the results above.*\n", 269 | "\n", 270 | "These results can be formalized and generalized in the **d-separation** or **Bayes' ball** algorithm. While this algorithm is beyond the scope of this textbook, we'll look at a variant of it in a few chapters when we talk about causality." 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": { 277 | "tags": [ 278 | "remove-input" 279 | ] 280 | }, 281 | "outputs": [], 282 | "source": [ 283 | "YouTubeVideo('lpujKeK90RM')" 284 | ] 285 | } 286 | ], 287 | "metadata": { 288 | "celltoolbar": "Edit Metadata", 289 | "kernelspec": { 290 | "display_name": "Python 3 (ipykernel)", 291 | "language": "python", 292 | "name": "python3" 293 | }, 294 | "language_info": { 295 | "codemirror_mode": { 296 | "name": "ipython", 297 | "version": 3 298 | }, 299 | "file_extension": ".py", 300 | "mimetype": "text/x-python", 301 | "name": "python", 302 | "nbconvert_exporter": "python", 303 | "pygments_lexer": "ipython3", 304 | "version": "3.11.5" 305 | } 306 | }, 307 | "nbformat": 4, 308 | "nbformat_minor": 2 309 | } 310 | -------------------------------------------------------------------------------- /ds-102-book/content/chapters/02/04_inference.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "scrolled": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "from scipy import stats\n", 14 | "from IPython.display import YouTubeVideo\n", 15 | "\n", 16 | "%matplotlib inline\n", 17 | "\n", 18 | "import matplotlib.pyplot as plt\n", 19 | "import seaborn as sns\n", 20 | "sns.set()" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "# Bayesian Inference" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "In this section, we'll focus on computing and using posterior distributions in more sophisticated Bayesian models. We'll start by discussing why posterior distributions are useful in Bayesian inference, and then explain why they're hard to compute. Then, in the next section, we'll learn about approximating distributions using sampling."
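As a quick coda to the graphical-models notebook above: the explaining-away effect in the stuffy-nose example is easy to check by simulation. In this sketch (all probabilities made up, not from the original notebook), sickness and allergies are generated independently, but become negatively correlated once we condition on a stuffy nose:

```python
import numpy as np

rng = np.random.default_rng(7)

# x = sick, z = allergies (independent causes); y = stuffy nose (collider).
N = 1_000_000
x = rng.random(N) < 0.1        # P(sick) = 0.1
z = rng.random(N) < 0.2        # P(allergies) = 0.2
y = x | z                      # stuffy nose if sick or allergic

# Unconditionally, x and z are independent:
print(np.corrcoef(x, z)[0, 1])          # approximately 0

# Conditioned on y = 1, they become dependent (explaining away):
print(np.corrcoef(x[y], z[y])[0, 1])    # noticeably negative
```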
35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "## Why we need posterior distributions\n", 42 | "\n", 43 | "In general, we need the posterior distribution so that we can make statements and decisions about our unknown quantity of interest, $\\theta$. We saw that for simple models like the product review model or the model for heights, it was easy to compute the posterior exactly, because we chose a conjugate prior.\n", 44 | "\n", 45 | "In the product review example:\n", 46 | "* Our parameter of interest $\\theta$ represents the probability of a positive review.\n", 47 | "* If we chose a Beta prior, i.e., $\\theta \\sim \\mathrm{Beta}(\\alpha, \\beta)$, then the posterior distribution also belonged to the Beta family: $\\theta | x \\sim \\mathrm{Beta}\\left(\\alpha + \\sum x_i, \\beta + n - \\sum x_i\\right)$.\n", 48 | "* This made it easy to determine things like the MAP estimate or LMSE estimate, simply by using known properties of the Beta distribution.\n", 49 | "\n", 50 | "But what if our posterior distribution didn't have such a convenient form? In that case, we would have to compute the posterior (and any estimates from it) ourselves:\n", 51 | "\n", 52 | "$$\n", 53 | "\\begin{align}\n", 54 | "p(\\theta|x)\n", 55 | "&= \\frac{p(x|\\theta)p(\\theta)}{p(x)} \\\\\n", 56 | "&= \\frac{p(x|\\theta)p(\\theta)}{\\int p(x|\\theta)p(\\theta)\\,d\\theta} \\\\\n", 57 | "\\end{align}\n", 58 | "$$\n", 59 | "\n", 60 | "In general, the integral in the denominator could be impossible to compute. We call the denominator the **normalizing constant**: it's a constant because it doesn't depend on $\\theta$, and it's normalizing because we need it for the distribution or density to sum or integrate to 1. \n", 61 | "\n", 62 | "In the next section, we'll see a few examples that illustrate why computing the normalizing constant is hard, but first, let's look at three examples of why we might need to know it in the first place.\n", 63 | "\n", 64 | "### Computing probabilities\n", 65 | "\n", 66 | "Suppose we want to know the probability that $\\theta$ is greater than 0.7, given the observed data. In this case, we can set up an integral to compute this:\n", 67 | "\n", 68 | "$$\n", 69 | "\\begin{align}\n", 70 | "P(\\theta > 0.7 | x)\n", 71 | "&= \\int_{0.7}^1 p(\\theta|x) \\, d\\theta \\\\\n", 72 | "&= \\int_{0.7}^1 \\frac{p(x|\\theta)p(\\theta)}{p(x)} \\, d\\theta \\\\\n", 73 | "&= \\frac{1}{p(x)} \\int_{0.7}^1 p(x|\\theta)p(\\theta) \\, d\\theta\n", 74 | "\\end{align}\n", 75 | "$$\n", 76 | "\n", 77 | "In the last step, we used the fact that $p(x)$ doesn't depend on $\\theta$.\n", 78 | "\n", 79 | "If we don't know $p(x)$, then our probability will be off by an unknown factor. For example, suppose the true probability is 0.9, the integral is 0.0009, and the normally-unknown denominator $p(x)$ is $0.001$. 
In this case, if we don't know the normalizing constant, there's no way we can determine the probability: we'll always be wrong by an unknown factor, which means that our answer is useless.\n", 80 | "\n" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 2, 86 | "metadata": { 87 | "tags": [ 88 | "remove-input" 89 | ] 90 | }, 91 | "outputs": [ 92 | { 93 | "data": { 94 | "text/html": [ 95 | "\n", 96 | " \n", 104 | " " 105 | ], 106 | "text/plain": [ 107 | "" 108 | ] 109 | }, 110 | "execution_count": 2, 111 | "metadata": {}, 112 | "output_type": "execute_result" 113 | } 114 | ], 115 | "source": [ 116 | "YouTubeVideo('WOS7iFlsN5c')" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "### MAP Estimation \n", 124 | "\n", 125 | "Suppose we want to compute the MAP estimate:\n", 126 | "\n", 127 | "$$\n", 128 | "\\begin{align}\n", 129 | "\\hat{\\theta}_{MAP} \n", 130 | "&= \\underset{\\theta}{\\operatorname{argmax}} p(\\theta|x) \\\\\n", 131 | "&= \\underset{\\theta}{\\operatorname{argmax}} \\frac{p(x|\\theta)p(\\theta)}{p(x)} \\\\\n", 132 | "&= \\underset{\\theta}{\\operatorname{argmax}} p(x|\\theta)p(\\theta) \\\\\n", 133 | "\\end{align}\n", 134 | "$$\n", 135 | "\n", 136 | "In the last step, we used the fact that $p(x)$ doesn't depend on $\\theta$.\n", 137 | "\n", 138 | "If $\\theta$ is low-dimensional and continuous, we can often optimize this either analytically or numerically. If $\\theta$ is discrete and doesn't take on too many different values, we can search over all possible values. However, if $\\theta$ is discrete and takes on an intractably large number of possible values, then we'd need to search over all of them, which would be impossible. Similarly, if $\\theta$ is high-dimensional, then the search can also be intractable.\n", 139 | "\n", 140 | "To summarize: for low-dimensional continuous variables, or discrete random variables with a low number of possible values, we can compute the MAP estimate without needing to know the exact posterior. For higher-dimensional random variables and/or discrete random variables with many possible values, this won't work.\n", 141 | "\n", 142 | "### LMSE Estimation\n", 143 | "Suppose we want to compute the LMSE estimate. Recall the definition of conditional expectation (see Data 140 textbook, [Chapter 9](http://prob140.org/textbook/content/Chapter_09/02_Expectation_by_Conditioning.html) and [Chapter 15](http://prob140.org/textbook/content/Chapter_15/03_Expectation.html)):\n", 144 | "\n", 145 | "$$\n", 146 | "\\begin{align}\n", 147 | "\\hat{\\theta}_{LMSE} \n", 148 | "&= E_{\\theta|x}[\\theta] \\\\\n", 149 | "&= \\int \\theta \\cdot p(\\theta|x) \\, d\\theta \\\\\n", 150 | "&= \\int \\theta \\cdot \\frac{p(x|\\theta)p(\\theta)}{p(x)} \\, d\\theta \\\\\n", 151 | "&= \\frac{1}{p(x)} \\int \\theta \\cdot p(x|\\theta)p(\\theta)\\, d\\theta \n", 152 | "\\end{align}\n", 153 | "$$\n", 154 | "\n", 155 | "In order to compute the LMSE estimate, we need to compute the denominator, $p(x)$. If we don't know it, then our estimate will be off by a multiplicative factor that we don't know, making it effectively useless.\n", 156 | "\n", 157 | "The same is true for computing the expected value of any other function of $\\theta$, or any other probability involving the posterior distribution. 
For example, answering the question \"according to the posterior distribution, what is the variance of $\\theta$?\" will lead to the same problem.\n", 158 | "\n", 159 | "To summarize: any computations involving the posteriors (probabilities, expectations, etc.) require us to have the full normalized distribution: the numerator in Bayes' rule isn't enough." 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "\n", 167 | "## Why computing posterior distributions is hard" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "In simple models like our product review model or our model for heights, it was easy to compute the exact posterior for the unknown variable that we were interested in. This happened because we chose a conjugate prior. In most other cases, computing the exact posterior is hard! Here are two examples:" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "### One-dimensional non-conjugate prior\n", 182 | "\n", 183 | "Let's return to the product review example, but this time, instead of a Beta prior, we choose $p(\\theta) = \\frac{\\pi}{2}\\cos\\left(\\frac{\\pi}{2} \\theta\\right)$ for $\\theta \\in [0, 1]$. \n", 184 | "\n", 185 | "The posterior is then:\n", 186 | "$$\n", 187 | "\\begin{align}\n", 188 | "p(\\theta|x) \n", 189 | " &\\propto p(x|\\theta)p(\\theta) \\\\\n", 190 | " &\\propto \\Big[\\theta^{\\left[\\sum_i x_i\\right]}(1-\\theta)^{\\left[\\sum_i (1-x_i)\\right]}\\Big]\\cos\\left(\\frac{\\pi}{2}\\theta\\right)\n", 191 | "\\end{align}\n", 192 | "$$\n", 193 | "\n", 194 | "This distribution looks much more complicated: we can't reduce it to a known distribution at all. So, in order to properly compute $p(\\theta|x)$, we'd need to figure out the normalizing constant. This requires solving the integral:\n", 195 | "\n", 196 | "$$\n", 197 | "\\begin{align}\n", 198 | "p(x) &= \\int_0^1 \\Big[\\theta^{\\left[\\sum_i x_i\\right]}(1-\\theta)^{\\left[\\sum_i (1-x_i)\\right]}\\Big]\\cos\\left(\\frac{\\pi}{2}\\theta\\right)\\,d\\theta\n", 199 | "\\end{align}\n", 200 | "$$\n", 201 | "\n", 202 | "This integral is difficult to solve in closed form. In this specific example, since this is a one-dimensional problem, we could take advantage of numerical integration. In other words, for a particular sequence of values $x_1, \\ldots, x_n$, we can plug them in and compute a numerical approximation to the integral, and then find the normalizing constant that way. As we saw above, we don't need the normalizing constant if we're only interested in the MAP estimate, but we can't compute the LMSE estimate without it." 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "### Multi-dimensional example\n", 210 | "\n", 211 | "Consider the exoplanet model from last time: $x_i$ is the (observed) radius of planet $i$, $z_i$ is whether the planet belongs to group 0 (small, possibly habitable planets) or group 1 (large, likely uninhabitable planets), and $\\mu_0$ and $\\mu_1$ are the mean radii of those two groups, respectively.\n", 212 | "\n", 213 | "$$\n", 214 | "\\begin{align}\n", 215 | " z_i &\\sim \\mathrm{Bernoulli}(\\pi) & i = 1, \\ldots, n \\\\\n", 216 | " \\mu_k &\\sim \\mathcal{N}(\\mu_p, \\sigma_p) & k =0, 1 \\\\\n", 217 | " x_i | z_i, \\mu_0, \\mu_1 &\\sim \\mathcal{N}(\\mu_{z_i}, \\sigma) & i = 1, \\ldots, n\\\\\n", 218 | "\\end{align}\n", 219 | "$$\n", 220 | "\n", 221 | "We can write the likelihood and prior. 
To simplify, we'll write $\\mathcal{N}(y; m, s) = \\frac{1}{s \\sqrt{2\\pi}} \\exp\\left\\{-\\frac{1}{2s^2}(y - m)^2\\right\\}$\n", 222 | "\n", 223 | "$$\n", 224 | "\\begin{align}\n", 225 | " p(z_i) &= \\pi^{z_i}(1-\\pi)^{1-z_i} \\\\\n", 226 | " p(\\mu_k) &= \\mathcal{N}(\\mu_k; \\mu_p, \\sigma_p) \\\\\n", 227 | " p(x_i | z_i, \\mu_0, \\mu_1) &= \\mathcal{N}(x_i; \\mu_{z_i}, \\sigma)\n", 228 | "\\end{align}\n", 229 | "$$\n", 230 | "\n", 231 | "We can try computing the posterior over the hidden variables $z_i$, $\\mu_0$, and $\\mu_1$. We'll use the notation $z_{1:n}$ to represent $z_1, \\dots, z_n$ (and similarly for $x_{1:n}$).\n", 232 | "\n", 233 | "$$\n", 234 | "\\begin{align}\n", 235 | " p(z_{1:n}, \\mu_0, \\mu_1 | x_{1:n}) &\\propto p(\\mu_0)p(\\mu_1)\\prod_i \\left[p(z_i) p(x_i | z_i, \\mu_0, \\mu_1)\\right]\n", 236 | "\\end{align}\n", 237 | "$$\n", 238 | "\n", 239 | "This distribution is more complicated than anything we've seen up until now. It's the joint distribution over $n+2$ random variables (the group labels $z_1, \\ldots, z_n$ and the two group means $\\mu_0$ and $\\mu_1$).\n", 240 | "\n", 241 | "Computing the normalization constant $p(x_{1:n})$ requires a complicated combination of sums and integrals:\n", 242 | "\n", 243 | "$$\n", 244 | "\\begin{align}\n", 245 | "p(x_{1:n}) &= \\sum_{z_1=0}^1 \\sum_{z_2=0}^1 \\ldots \\sum_{z_n=0}^1 \\int \\int p(\\mu_0)p(\\mu_1)\\prod_i \\left[p(z_i) p(x_i | z_i, \\mu_0, \\mu_1)\\right] d\\mu_0 d\\mu_1\n", 246 | "\\end{align}\n", 247 | "$$\n", 248 | "\n", 249 | "For our dataset of over 500 planets, the sums alone would require a completely intractable amount of computation:" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 3, 255 | "metadata": {}, 256 | "outputs": [ 257 | { 258 | "data": { 259 | "text/plain": [ 260 | "429049853758163107186368799942587076079339706258956588087153966199096448962353503257659977541340909686081019461967553627320124249982290238285876768194691072" 261 | ] 262 | }, 263 | "execution_count": 3, 264 | "metadata": {}, 265 | "output_type": "execute_result" 266 | } 267 | ], 268 | "source": [ 269 | "2**517" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "Worse still, we can't even compute the MAP estimate for the labels $z_i$: in order to find the one that maximizes the numerator, we'd have to search over all $2^{517}$ combinations, which is also completely intractable. \n", 277 | "\n", 278 | "Even in this fairly simple model, with two groups, we've found that exact inference is completely hopeless: there's no way we can compute the exact posterior for all our unknowns. In the rest of this section, we'll talk about ways to get around this problem using approximations to the posterior distribution.\n", 279 | "\n", 280 | "Specifically, our approximation methods will take advantage of what we've learned. We know that the hardest part of computing posterior distributions is computing the normalization constant $p(x)$. So, we'll build methods that start with the unnormalized posterior $p(x|\\theta)p(\\theta)$ and use that to give us an approximation of the actual posterior $p(\\theta|x) = \\frac{p(x|\\theta)p(\\theta)}{p(x)}$. While there are multiple families of methods to provide such approximations, in this textbook we'll focus on ones that use **samples** to approximate the distribution." 
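For the one-dimensional cosine-prior example above, the numerical integration mentioned earlier is straightforward to carry out. Here's a sketch with a made-up dataset of ten Bernoulli observations (not from the original notebook); the prior's constant factor is omitted since it cancels when we normalize:

```python
import numpy as np
from scipy.integrate import quad

# Made-up reviews: 1 = positive, 0 = negative.
x = np.array([1, 0, 1, 1, 0, 1, 1, 1, 0, 1])
k, n = x.sum(), len(x)

def unnormalized_posterior(theta):
    # Likelihood times the cosine prior, up to a constant factor.
    return theta**k * (1 - theta)**(n - k) * np.cos(np.pi * theta / 2)

# Numerically approximate the normalizing constant.
norm_const, _ = quad(unnormalized_posterior, 0, 1)

# With it, posterior quantities become computable, e.g. the LMSE estimate:
numer, _ = quad(lambda t: t * unnormalized_posterior(t), 0, 1)
print(numer / norm_const)   # E[theta | x]
```

This works only because $\theta$ is one-dimensional; for the exoplanet model, the analogous computation is the intractable sum-and-integral above, which is exactly why we turn to sampling-based approximations.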
281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 4, 286 | "metadata": { 287 | "tags": [ 288 | "remove-input" 289 | ] 290 | }, 291 | "outputs": [ 292 | { 293 | "data": { 294 | "text/html": [ 295 | "\n", 296 | " \n", 304 | " " 305 | ], 306 | "text/plain": [ 307 | "" 308 | ] 309 | }, 310 | "execution_count": 4, 311 | "metadata": {}, 312 | "output_type": "execute_result" 313 | } 314 | ], 315 | "source": [ 316 | "YouTubeVideo('BlQ6IVoJ0X8')" 317 | ] 318 | } 319 | ], 320 | "metadata": { 321 | "celltoolbar": "Edit Metadata", 322 | "kernelspec": { 323 | "display_name": "Python 3 (ipykernel)", 324 | "language": "python", 325 | "name": "python3" 326 | }, 327 | "language_info": { 328 | "codemirror_mode": { 329 | "name": "ipython", 330 | "version": 3 331 | }, 332 | "file_extension": ".py", 333 | "mimetype": "text/x-python", 334 | "name": "python", 335 | "nbconvert_exporter": "python", 336 | "pygments_lexer": "ipython3", 337 | "version": "3.11.5" 338 | } 339 | }, 340 | "nbformat": 4, 341 | "nbformat_minor": 2 342 | } 343 | -------------------------------------------------------------------------------- /ds-102-book/content/chapters/01/00_figure_data_generation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "id": "52c6b1ae", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "from sklearn.metrics import roc_curve\n", 13 | "\n", 14 | "%matplotlib inline\n", 15 | "\n", 16 | "import matplotlib.pyplot as plt\n", 17 | "import seaborn as sns\n", 18 | "\n", 19 | "sns.set()" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 60, 25 | "id": "efdbb32c", 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "data": { 30 | "text/plain": [ 31 | "[]" 32 | ] 33 | }, 34 | "execution_count": 60, 35 | "metadata": {}, 36 | "output_type": "execute_result" 37 | }, 38 | { 39 | "data": { 40 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAiYAAAGgCAYAAACez6weAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8pXeV/AAAACXBIWXMAAA9hAAAPYQGoP6dpAAAsQElEQVR4nO3dfWxU153/8c/M2DM22GNwfhgTHAIiJV53VYNr81CZ0GaFVG27WxTxR+nPbWKCg35tcQIJhKgkPNRZokLW1E0cmgpII5bCCtqkTa0qpK1Wm+wucaDadkOJkl2Fp/iBXRsbsJkZz5zfH6wnHmyDrz1z587M+yVV2DdnxsdfXO7H554HlzHGCAAAwAHcye4AAADAIIIJAABwDIIJAABwDIIJAABwDIIJAABwDIIJAABwDIIJAABwDIIJAABwjKxkd8AqY4wikcTsCed2uxL23vgUdbYHdbYPtbYHdbZHIursdrvkcrnG1DblgkkkYtTVdS3u75uV5dbUqZPV29ungYFI3N8fN1Bne1Bn+1Bre1BneySqzoWFk+XxjC2Y8CgHAAA4BsEEAAA4BsEEAAA4BsEEAAA4BsEEAAA4BsEEAAA4BsEEAAA4BsEEAAA4BsEEAAA4BsEEAAA4xoSCSXNzs775zW/esk13d7cef/xxVVVVqaqqSk8//bT6+vom8mUBAECaGncweeWVV9TU1HTbdvX19Tp//ny0/TvvvKPt27eP98sCAIA0ZvkQv46ODn3ve9/TyZMnNWfOnFu2/cMf/qB3331XLS0tmjt3riRpx44dWrNmjTZs2KDp06ePr9cAACAtWQ4m77//vgoKCvTLX/5SL774oi5evDhq2/fee0/Tpk2LhhJJWrhwoVwul06ePKm//uu/Hl+vASDOjDEKhqyfphqOGF0PDCgQDHPqbQJRZ3uEI0bGmKT2wXIwuf/++3X//fePqW1HR4dmzJgRc83r9WrKlClqa2uz+qWjsrLiP2fX43HH/InEoM72oM7WGGPU8NOT+vBCT7K7AiTdX8wu1Pe+9fmE3GvHwnIwsaK/v19er3fYdZ/Pp0AgMK73dLtdmjp18kS7Niq/Pzdh741PUWd7UOfRGWMUCIYlSdeDYUIJMITfnyuXy5WUr53QYJKTk6NgMDjseiAQ0KRJk8b1npGIUW9v/Ff1eDxu+f256u3tVzjMMGGiUGd7UOdbuzFC8t6IYeSF9ffJl+0Z83u5PS7583PVe6VfkXByh8DTGXW2h9vj0rQ78nTlyvW4/tvh9+eOeQQ3ocGkuLhYb731Vsy1YDCoy5cvT2jiayKfL4bDEZ5f2oA62yMT6jyeuSGB0MgjJPeUFCjX67H0m2KWx60cX5b6+9waMOld62SizvbI8rjlcrmS+m9HQoNJVVWVdu/erbNnz+ruu++WJJ04cUKSVFFRkcgvDSBFWQkaRkbPHTylc51Xx/319qyrjo6QeLPdSRu+BnBDXINJOBxWV1eX8vPzlZOTo/LyclVUVGj9+vXatm2b+vr6tHXrVq1YsYKlwgCGMcZo58FT+uiiPfM97ikpUP6kbMII4CBxDSZtbW36q7/6K+3cuVMPPPCAXC6XXnjhBW3fvl0PPvigfD6fvvzlL+upp56K55cFYLPxLq29nUAoPK5QMqsoT5trKuSStYDBCAngPC6T7AXLFoXDEXV1XYv7+2ZluTV16mR1d19L+2fyyUSd7ZHIOts1qjH0EcvtJDNg8DNtD+psj0TVubBwsjMmvwJIH4OjJOMd1bCCRyxA5iKYALit0UZJrIxqWMEjFiBzEUyAFDOW+R3x3r57pFESRjUAJALBBEghEWO040DrhJbHTtTgKAmjGgASgWACpAjjgFDCKAmARCOYACkiGIpEQ8n0qbnaWls16vLYrCy3pkyZpMuX++I6s55REgCJRjABUtDW2irleEf/v29W1o3tu31ejzxuggSA1MGZ6ECKMPp0yyGrG4kBQKogmAApwJgbZ8IAQLrjUQ7gQDcvCQ6EwtH5JbOK8uTN5ncKAOmJYAIk0HjOlLndibmbayqYgAogbRFMgARJxJky95QUJGSnVQBwCoIJkADGGF3pC00olIx0Yi7LdQGkO4IJEGcjjZSM50wZQgiATEQwAeIsGIrEhBJ2SwWAsSOYAAm0Z101oQQALGDNIZBAvmwPoQQALCCYAHFkjFEgFE52NwAgZfEoB4iTRCwPBoBMQzABJmhwE7VAKDxs0is7tAKANQQTYAJGGyVh0isAjA+/zgHjNNomaiwPBoDxY8QEGIdbbaLGxmgAMH4EE+AWRjuEb6T5JIySAMDEEUyAIYYGkdud8juI+SQAED8EE+B/jWe5LyMlABBfBBNAtz4NeKRTfgcxnwQA4otggox3u9OACR8AYB+CCTLOzRNamcgKAM5BMEFGud08EiayAkByEUyQ9oaOkNw8OjIUIyUAkHwEE6S1W42QDJ1HIjGXBACcgGCCtBYMRUYMJYyOAIAzEUyQtowxCoTC0c9ZaQMAzkcwQVoa6RGOL9sjn9dzi1cBAJKNYIKUN9J5NiMtAfZmc5g2ADgdwQQpbSzbyLMEGABSB79CIqXdavmvxCRXAEg1jJgg5Qw+ujEy2n6gNXr95uW/EpNcASDVEEyQUkZ7dDOrKI+REQBIAzzKQUoZaV+SWUV5eqa2ilACAGmAEROkrMFHNzyuAYD0QTBBShicVzJ0wzT2JQGA9EMwgeONZUkwACA9MMcEjjfSkmA2TAOA9MSICRwtYkZeEsy8EgBITwQTOJYxRjsOtKqju18SS4IBIBMwFg5HMsboSl9I5zqvSpKmT81lSTAAZABGTOA4I0123VpbJTehBADSHsEEjjF0SfDNJwPfvNU8ACA9EUzgCKMtCeZkYADILAQTJNVooyQSJwMDQCYimCBpbjVKwpJgAMhMBBMkzUgH8jFKAgCZjWCCpDEy0Y8ZJQEASAQTJIkxRs8dPBX9nAP5AADSODZYi0Qiampq0tKlS1VeXq7Vq1fr7Nmzo7a/dOmSNmzYoEWLFmnRokV69NFH1d7ePqFOI/UFQuHo5mmzivI49wYAIGkcwaS5uVmHDx9WQ0ODjhw5IpfLpbq6OgWDwRHbr1+/Xm1tbTpw4IAOHDig9vZ2ffvb355wx5G6jDFq+Ol70c8311Tw+AYAIMliMAkGg9q/f7/WrVunZcuWqbS0VI2Njero6NDx48eHte/t7VVra6vq6upUVlamsrIyPfLII3r//ffV3d0dt28CqcMYo56rQZ3r+HS0hM3TAACDLM0xOXPmjK5du6bFixdHr/n9fpWVlam1tVVf+cpXYtr7fD5NmjRJr732mhYuXChJev311zV79mwVFBSMv9NZ8R/293jcMX8i/iLG6Jl97+ps+5XotS0PVSqbYBJ3/Dzbh1rbgzrbwwl1thRMBueGzJgxI+Z6UVGR2trahrX3+Xx69tlntWPHDlVWVsrlcmnatGk6ePCg3O7xfdNut0tTp04e12vHwu/PTdh7ZzJjjB77+3+KCSV/MbtQxUV+HuMkED/P9qHW9qDO9khmnS0Fk/7+G8fPe7
[remainder of base64-encoded PNG output omitted: an ROC curve plotting true positive rate against false positive rate for the simulated manufacturing data]", 41 | "text/plain": [ 42 | "
" 43 | ] 44 | }, 45 | "metadata": {}, 46 | "output_type": "display_data" 47 | } 48 | ], 49 | "source": [ 50 | "np.random.seed(2021)\n", 51 | "true_defect_prob = 0.2\n", 52 | "\n", 53 | "N = 1000\n", 54 | "is_defective = np.random.random(N) < true_defect_prob\n", 55 | "\n", 56 | "defective_yhats = np.random.beta(2.5, 1, N)\n", 57 | "defective_yhats[defective_yhats > 1] = 1\n", 58 | "defective_yhats[defective_yhats < 0] = 0\n", 59 | "\n", 60 | "good_yhat_failure_prob = 0.1\n", 61 | "detection_failures = np.random.random(N) < good_yhat_failure_prob\n", 62 | "good_yhats = np.random.beta(1, 3, N) * (1-detection_failures) + np.random.beta(10, 1, N) * detection_failures\n", 63 | "#plt.hist(good_yhats, bins=50);\n", 64 | "\n", 65 | "yhat = good_yhats.copy()\n", 66 | "yhat[is_defective] = defective_yhats[is_defective]\n", 67 | "\n", 68 | "mfg = pd.DataFrame({'is_defective': is_defective, 'predicted_prob': yhat})\n", 69 | "mfg.to_csv('manufacturing.csv', index=False)\n", 70 | "\n", 71 | "fpr, tpr, threshs = roc_curve(is_defective, yhat)\n", 72 | "plt.plot(fpr, tpr)" 73 | ] 74 | } 75 | ], 76 | "metadata": { 77 | "kernelspec": { 78 | "display_name": "Python 3 (ipykernel)", 79 | "language": "python", 80 | "name": "python3" 81 | }, 82 | "language_info": { 83 | "codemirror_mode": { 84 | "name": "ipython", 85 | "version": 3 86 | }, 87 | "file_extension": ".py", 88 | "mimetype": "text/x-python", 89 | "name": "python", 90 | "nbconvert_exporter": "python", 91 | "pygments_lexer": "ipython3", 92 | "version": "3.11.5" 93 | } 94 | }, 95 | "nbformat": 4, 96 | "nbformat_minor": 5 97 | } 98 | -------------------------------------------------------------------------------- /ds-102-book/content/chapters/03/02_regression_review.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "id": "0f911158", 7 | "metadata": { 8 | "tags": [ 9 | "hide-cell" 10 | ] 11 | }, 12 | "outputs": [], 13 | "source": [ 14 | "from IPython.display import YouTubeVideo" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "id": "professional-sauce", 20 | "metadata": {}, 21 | "source": [ 22 | "# Regression review" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "id": "sufficient-advisory", 28 | "metadata": {}, 29 | "source": [ 30 | "*This section is a review of linear regression. You may find it helpful to review [Section 15.4 of the Data 100 textbook](https://learningds.org/ch/15/linear_multi.html), which covers the corresponding material.*\n", 31 | "\n", 32 | "Recall that regression is a form of supervised learning. Given some data $x$ (typically a scalar or a vector), we're trying to predict a single value $y$. You've seen cases where $y$ is a real number (linear regression) or a binary value $\\in \\{0, 1\\}$ (logistic regression). Let's briefly review the setup for linear regression.\n", 33 | "\n", 34 | "We have a collection of data $(x_1, y_1), \\ldots, (x_n, y_n)$. We're trying to predict $y_i$ from $x_i$, but our prediction won't be perfect. We'll use the notation $\\hat{y}_i$ to represent the predicted value for data point $i$. We start by discussing how to get the predictions, and then move on to the relationship between the predictions and the actual observed values." 
35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "id": "aquatic-weekly", 40 | "metadata": {}, 41 | "source": [ 42 | "## One dimension\n", 43 | "\n", 44 | "In one dimension, we have data in the form $(x_1, y_1), \ldots, (x_n, y_n)$, where each $x_i$ is a scalar and each $y_i$ is a scalar. We form a linear prediction\n", 45 | "\n", 46 | "$$\n", 47 | "\hat{y}_i = ax_i + b,\n", 48 | "$$\n", 49 | "\n", 50 | "where $a$ is a slope and $b$ is an intercept. In one-dimensional linear regression, we compute $a$ and $b$ from the observed data points $(x_1, y_1), \ldots, (x_n, y_n)$." 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "id": "single-effort", 56 | "metadata": {}, 57 | "source": [ 58 | "## Multiple dimensions\n", 59 | "\n", 60 | "In multiple linear regression, we still have data in the form $(x_1, y_1), \ldots, (x_n, y_n)$, but now each $x_i \in \mathbb{R}^d$ is a $d$-dimensional vector. We can write\n", 61 | "\n", 62 | "$$\n", 63 | "x_i = \left(x_{i1}, x_{i2}, \ldots, x_{id}\right)\n", 64 | "$$\n", 65 | "\n", 66 | "Each entry of this vector corresponds to a different aspect of this data point that we're using in our prediction. We call each of these a **predictor** or **feature**.\n", 67 | "\n", 68 | "In multiple linear regression, we form our prediction for data point $i$, $\hat{y}_i$, as follows:\n", 69 | "\n", 70 | "$$\n", 71 | "\hat{y}_i = \sum_j \beta_j x_{ij}\n", 72 | "$$\n", 73 | "\n", 74 | "The $d$-dimensional vector $\beta = (\beta_1, \ldots, \beta_d)$ contains the **coefficients** for each predictor: linear regression involves figuring out what these are. We can write this in vector notation using the vectors $\beta$ and $x_i$:\n", 75 | "\n", 76 | "$$\n", 77 | "\hat{y}_i = \beta^T x_i = x_i^T \beta\n", 78 | "$$\n", 79 | "\n", 80 | "We can take this notation one step further, and construct a matrix with all the $x$ values for all data points and all features.\n", 81 | "\n", 82 | "$$\n", 83 | "X = \begin{pmatrix}\n", 84 | " x_{11} & x_{12} & \cdots & x_{1d} \\\n", 85 | " x_{21} & x_{22} & \cdots & x_{2d} \\\n", 86 | " \vdots & \vdots & \ddots & \vdots \\\n", 87 | " x_{n1} & x_{n2} & \cdots & x_{nd}\n", 88 | " \end{pmatrix}\n", 89 | "$$\n", 90 | "\n", 91 | "One entry of this matrix, $x_{ij}$, represents feature $j$ for data point $i$. If we consider the entire vector of predictions $\hat{y} = \left(\hat{y}_1, \ldots, \hat{y}_n\right)$, we can write the predictions in a fully vectorized way:\n", 92 | "\n", 93 | "$$\n", 94 | "\hat{y} = X\beta\n", 95 | "$$\n", 96 | "\n", 97 | "We can interpret each coefficient $\beta_j$ in the context of the model as follows: \"if $x_j$ increases by $t$ and all the other features stay the same, then the model predicts that $y$ should increase by about $\beta_j \times t$.\"" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "id": "03cd9b60", 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "YouTubeVideo('rEUf3bW32jM')" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "id": "meaning-virginia", 113 | "metadata": {}, 114 | "source": [ 115 | "## Likelihoods and loss functions\n", 116 | "\n", 117 | "In order to compute the vector of coefficients $\beta$, we need some way to connect the predictions $\hat{y}_i$ (which are based on $\beta$) with the actual observed values $y_i$. You've seen two ways of doing this:\n", 118 | "1. A loss function between the prediction $\hat{y}$ and the observed value $y$; we can minimize this loss function to find $\beta$. 
\n", 119 | "2. A probabilistic model that describes the errors $\epsilon = y - \hat{y}$ as random variables, and tries to maximize the likelihood of the data under that model.\n" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "id": "undefined-twenty", 125 | "metadata": {}, 126 | "source": [ 127 | "### Loss functions\n", 128 | "\n", 129 | "Recall that in ordinary least-squares linear regression, we try to find the value of $\beta$ that minimizes the mean squared error (MSE). We can write the MSE as follows:\n", 130 | "\n", 131 | "$$\n", 132 | "\text{MSE} = \frac{1}{n}\sum_i (y_i - \beta^T x_i)^2\n", 133 | "$$\n", 134 | "\n", 135 | "We can also write it as the [$\ell_2$ norm](https://en.wikipedia.org/wiki/Norm_(mathematics)#Euclidean_norm) of the vector $y - \hat{y}$:\n", 136 | "\n", 137 | "$$\n", 138 | "\text{MSE} = \frac{1}{n}\left\|y - \hat{y}\right\|_2^2 = \frac{1}{n}\left\|y - X\beta\right\|_2^2\n", 139 | "$$\n", 140 | "\n", 141 | "where for any vector $z$, the $\ell_2$ norm of $z$ is $\|z\|_2 = \sqrt{\sum_i z_i^2}$.\n", 142 | "\n", 143 | "We want to choose a value for $\beta$ that makes this as small as possible:\n", 144 | "\n", 145 | "$$\n", 146 | "\hat{\beta} = \text{argmin}_\beta \|y - X\beta\|_2^2\n", 147 | "$$" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "id": "formal-sending", 153 | "metadata": {}, 154 | "source": [ 155 | "### Likelihood and noise\n", 156 | "\n", 157 | "*We'll build heavily off the formulation in this section, so make sure you have a solid understanding of what's going on here!*\n", 158 | "\n", 159 | "We can also describe the errors in our model:\n", 160 | "\n", 161 | "$$\n", 162 | "y_i = \beta^T x_i + \epsilon_i,\n", 163 | "$$\n", 164 | "\n", 165 | "where $\epsilon_i \sim N(0, \sigma^2)$ is a random variable that represents the noise, or error, in the observed value. We can vectorize this noise, too: we'll write $\epsilon = (\epsilon_1, \ldots, \epsilon_n)$ so that the vector $\epsilon$ has a multivariate normal distribution $\epsilon \sim N(0, \sigma^2 I_n)$. We can then write:\n", 166 | "\n", 167 | "$$\n", 168 | "y = X\beta + \epsilon,\n", 169 | "$$\n", 170 | "\n", 171 | "or equivalently, using properties of the normal distribution,\n", 172 | "\n", 173 | "$$\n", 174 | "y | \beta \sim N(X\beta, \sigma^2 I_n).\n", 175 | "$$\n", 176 | "\n", 177 | "We can interpret this equation as telling us that the **average prediction** is $X\beta$. This equation is a likelihood model: it tells us the likelihood of data $y$ given the parameter(s) $\beta$. Note that we are treating $X$ as fixed and known, so there's no probability model associated with it. We'll generally focus on this version, rather than the previous one.\n", 178 | "\n", 179 | "Let's think about the implicit assumptions we're making by choosing a normal likelihood. Recall that under a normal distribution, we're very unlikely to see values more than $3\sigma$ away from the mean. That means that we're implicitly assuming that the vast majority of $y$-values we see will be within $3\sigma$ of the average prediction $X\beta$. 
This helps explain why linear regression is sensitive to outliers: the likelihood of a point very far away from the average prediction is very small, but the likelihood of several points all somewhat far away is much higher.\n", 180 | "\n", 181 | "This model is often referred to as **ordinary least squares, or OLS**.\n", 182 | "\n", 183 | "Under this model, one reasonable way to estimate $\beta$ is to choose the value that maximizes the likelihood. When choosing a value of $\beta$ to maximize the likelihood, we note that we don't actually care about the normalizing constant in the normal distribution. So, we can write:\n", 184 | "\n", 185 | "$$\n", 186 | "\begin{align}\n", 187 | "\hat{\beta} \n", 188 | " &= \text{argmax}_\beta \exp\left\{-\frac{1}{2}(y - X\beta)^T(\sigma^2 I_n)^{-1}(y-X\beta)\right\} \\\n", 189 | " &= \text{argmax}_\beta \exp\left\{-\frac{1}{2\sigma^2}\|y - X\beta\|_2^2\right\}\n", 190 | "\end{align}\n", 191 | "$$\n", 192 | "\n", 193 | "Just as we did in our earlier foray into maximum likelihood estimation, we'll take advantage of the fact that the $\log$ function is monotonically increasing, and optimize the log-likelihood. Furthermore, we'll make this a minimization rather than a maximization. In general, for any well-behaved function $f$:\n", 194 | "\n", 195 | "$$\n", 196 | "\begin{align}\n", 197 | "\text{argmax}_\theta f(\theta) \n", 198 | "&= \text{argmax}_\theta \log(f(\theta)) \\\n", 199 | "&= \text{argmin}_\theta \left[-\log(f(\theta))\right] \\\n", 200 | "\end{align}\n", 201 | "$$\n", 202 | "\n", 203 | "So, we can write:\n", 204 | "\n", 205 | "$$\n", 206 | "\begin{align}\n", 207 | "\hat{\beta} \n", 208 | " &= \text{argmax}_\beta \exp\left\{-\frac{1}{2\sigma^2}\|y - X\beta\|_2^2\right\} \\\n", 209 | " &= \text{argmin}_\beta \left[\frac{1}{2\sigma^2}\|y - X\beta\|_2^2 \right]\\\n", 210 | " &= \text{argmin}_\beta \|y - X\beta\|_2^2\n", 211 | "\end{align}\n", 212 | "$$\n", 213 | "\n", 214 | "So, we've found that maximizing the Gaussian likelihood of the data is exactly equivalent to minimizing the squared loss. This is true in general for regression problems: we can arrive at the same answer by either choosing a loss function and minimizing it, or choosing a corresponding likelihood and maximizing it.\n", 215 | "\n", 216 | "### Uncertainty in regression predictions\n", 217 | "\n", 218 | "It's important to remember that when making a prediction for a new data point, there are multiple sources of uncertainty. Recall that our model states that $y = X\beta + \epsilon$, so our prediction for new data is $\hat{y} = X\hat{\beta}$. When making a prediction, we have some uncertainty about the first term $X\hat{\beta}$, because the coefficients we estimate depend on the random data. We also have additional uncertainty from the noise term $\epsilon$, depending on how much variability the model estimates in the data around the average predictions." 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "id": "split-tokyo", 224 | "metadata": {}, 225 | "source": [ 226 | "## Logistic regression" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "id": "aggregate-cooperation", 232 | "metadata": {}, 233 | "source": [ 234 | "Recall that in logistic regression, we're trying to predict binary outputs: $y_i \in \{0, 1\}$. 
We're trying to predict the **probability** that $y_i$ will be 1, which we'll call $\\hat{y}$:\n", 235 | "\n", 236 | "$$\n", 237 | "\\hat{y}_i = \\sigma(\\beta^T x_i),\n", 238 | "$$\n", 239 | "\n", 240 | "where $\\sigma$ is the sigmoid function, which converts real values to values between 0 and 1. To find $\\beta$, we minimize the binary cross-entropy loss:\n", 241 | "\n", 242 | "$$\n", 243 | "\\hat{\\beta} = \\text{argmin}_\\beta \\sum_i -\\left[ y_i \\ln(\\hat{y}_i) + (1-y_i) \\ln(1-\\hat{y}_i) \\right]\n", 244 | "$$\n", 245 | "\n", 246 | "You'll show on the discussion worksheet that if we assume the likelihood model for $y$ is Bernoulli with parameter $\\sigma(\\beta^T x_i)$, then maximizing the likelihood is equivalent to minimizing the binary cross-entropy loss.\n", 247 | "\n", 248 | "*For a deeper refresher on logistic regression, see [Chapter 23 of the Data 100 textbook](https://www.textbook.ds100.org/ch/23/classification_intro.html). Note that our notation is slightly different:*\n", 249 | "* We're using $\\beta$ instead of $\\theta$ for the coefficients\n", 250 | "* We're using $\\hat{y}$ for the predictions instead of $f_{\\hat{\\theta}}$." 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 3, 256 | "id": "e4ce69c9", 257 | "metadata": { 258 | "tags": [ 259 | "remove-input" 260 | ] 261 | }, 262 | "outputs": [ 263 | { 264 | "data": { 265 | "text/html": [ 266 | "\n", 267 | " \n", 275 | " " 276 | ], 277 | "text/plain": [ 278 | "" 279 | ] 280 | }, 281 | "execution_count": 3, 282 | "metadata": {}, 283 | "output_type": "execute_result" 284 | } 285 | ], 286 | "source": [ 287 | "YouTubeVideo('-xHu9FfVKqg')" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "id": "bed49e72", 293 | "metadata": {}, 294 | "source": [ 295 | "## Fitting models with `scikit-learn`" 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "id": "9cbdd9a0", 301 | "metadata": {}, 302 | "source": [ 303 | "You may recall that the easiest way to fit a linear model is to use the `LinearRegression` model from `scikit-learn`. We can then inspect the coefficients and intercept using the `coef_` and `intercept_` attributes respectively, and make predictions on new data points using the `predict()` method." 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 4, 309 | "id": "a2215dd9", 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "import numpy as np\n", 314 | "from sklearn.linear_model import LinearRegression" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 14, 320 | "id": "11900766", 321 | "metadata": {}, 322 | "outputs": [ 323 | { 324 | "data": { 325 | "text/plain": [ 326 | "(-0.0638684061258262, array([ 0.15201513, -0.62278861]))" 327 | ] 328 | }, 329 | "execution_count": 14, 330 | "metadata": {}, 331 | "output_type": "execute_result" 332 | } 333 | ], 334 | "source": [ 335 | "x = np.random.normal(size=[8, 2])\n", 336 | "y = np.random.normal(size=8)\n", 337 | "\n", 338 | "model = LinearRegression()\n", 339 | "model.fit(x, y)\n", 340 | "model.intercept_, model.coef_" 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "id": "2e6151a2", 346 | "metadata": {}, 347 | "source": [ 348 | "In this chapter, we'll use two new packages for regression in frequentist and Bayesian paradigms. 
The first is [statsmodels](https://www.statsmodels.org/stable/index.html), which fits linear models using a frequentist approach; the second is [Bambi](https://bambinos.github.io/bambi/), which uses `PyMC` for approximate inference in Bayesian linear models." 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "id": "f52cc47c", 354 | "metadata": {}, 355 | "source": [ 356 | "## Known, unknown, random, and fixed" 357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "id": "98534348", 362 | "metadata": {}, 363 | "source": [ 364 | "In regression modeling, we typically use a probability model for the observed target values $y$, but we tend to assume that the predictors $X$ are **fixed and known**. To summarize:\n", 365 | "\n", 366 | "| Variable | Description | Known or unknown? | Fixed or random? (Bayesian) | Fixed or random? (frequentist) |\n", 367 | "|---|---|---|---|---|\n", 368 | "| $x$ | Predictors | Known | Fixed, known | Fixed, known |\n", 369 | "| $y$ | Target values | Known for training set, unknown for test set | Random | Random |\n", 370 | "| $\beta$ | Coefficients | Unknown | Random | Fixed |" 371 | ] 372 | } 373 | ], 374 | "metadata": { 375 | "celltoolbar": "Edit Metadata", 376 | "kernelspec": { 377 | "display_name": "Python 3 (ipykernel)", 378 | "language": "python", 379 | "name": "python3" 380 | }, 381 | "language_info": { 382 | "codemirror_mode": { 383 | "name": "ipython", 384 | "version": 3 385 | }, 386 | "file_extension": ".py", 387 | "mimetype": "text/x-python", 388 | "name": "python", 389 | "nbconvert_exporter": "python", 390 | "pygments_lexer": "ipython3", 391 | "version": "3.11.5" 392 | } 393 | }, 394 | "nbformat": 4, 395 | "nbformat_minor": 5 396 | } 397 | -------------------------------------------------------------------------------- /ds-102-book/content/chapters/01/05_decision_theory.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "24b4e387", 6 | "metadata": {}, 7 | "source": [ 8 | "# Statistical Decision Theory" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "22007ae2", 14 | "metadata": {}, 15 | "source": [ 16 | "Up until now, we've used error rates to help us understand tradeoffs in binary decision-making. In this section, we'll introduce a more general theoretical framework to understand and quantify errors we make, and start to explore the theoretical branch of statistics known as **statistical decision theory**.\n", 17 | "\n", 18 | "Remember our setup: we have some **unknown quantity** $\theta$ that we're interested in. We collect data $x$. Our data are random, and come from the distribution $p(x|\theta)$. We use the data to reason about $\theta$. Often, we'll want to use the data to compute an estimate for $\theta$, but sometimes, we may want to do something slightly different. In order to describe \"the thing we do to the data\", we'll use the notation $\delta(x)$. This represents the result of applying some procedure $\delta$ to the data. For example, $\delta$ might be the sample average of many data points, or the result of logistic regression. The obvious next question is: how do we choose which procedure $\delta$ to use? We'll decide by quantifying how \"good\" each $\delta$ is, and then trying to choose the \"best\" one.\n", 19 | "\n", 20 | "\"Good\" is a very abstract notion: to quantify it, we'll need a quantitative measure of how good (or to be more precise, how bad) our procedure $\delta$ is. 
We'll call this a **loss function**. Notationally, we'll write $\ell(\delta(x), \theta)$ to represent the loss associated with the outcome $\delta(x)$ if the true value is $\theta$. To summarize:\n", 21 | "\n", 22 | "$$\n", 23 | "\begin{align*}\n", 24 | " \text{Variable/notation} & \quad \text{What it means} \\\n", 25 | " \hline\n", 26 | " \theta & \quad \text{unknown quantity/quantities of interest: parameter(s)} \\\n", 27 | " x & \quad \text{observed data} \\\n", 28 | " p(x|\theta) & \quad \text{probability distribution for data $x$ (depends on $\theta$)} \\\n", 29 | " \delta(x) & \quad \text{decision or result computed from $x$, often an estimate of $\theta$} \\\n", 30 | " \ell(\delta(x), \theta) & \quad \text{loss (badness) for output $\delta(x)$ and true parameter(s) $\theta$}\n", 31 | "\end{align*}\n", 32 | "$$" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "id": "0de533ea", 38 | "metadata": {}, 39 | "source": [ 40 | "## Examples" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "id": "932a33da", 46 | "metadata": {}, 47 | "source": [ 48 | "That's a very abstract definition: let's make it more concrete with a few examples." 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "id": "d0d8e020", 54 | "metadata": {}, 55 | "source": [ 56 | "### Binary decision: 0-1 loss\n", 57 | "\n", 58 | "For our first example, we'll return to our binary decision-making setting. In that case:\n", 59 | "* Our unknown parameter $\theta$ is binary, and corresponds to reality, which we've been calling $R$.\n", 60 | "* Our data $x$ were whatever we used to compute the p-value.\n", 61 | "* The decision $\delta$ is a binary decision, which we've been calling $D$.\n", 62 | "\n", 63 | "| | D = $\delta(x)$ = 0 | D = $\delta(x)$ = 1 |\n", 64 | "| ---: | :---: | :---: |\n", 65 | "| $R=\theta=0$ | TN loss | FP loss | \n", 66 | "| $R=\theta=1$ | FN loss | TP loss |\n", 67 | "\n", 68 | "Here are a few concrete examples, and what each of these quantities would represent:\n", 69 | "\n", 70 | "| Example | Unknown $\theta$ / $R$ | Data $x$ | Decision $\delta$ / $D$ |\n", 71 | "| --- | --- | --- | --- |\n", 72 | "| Disease testing | Whether a person has a disease | Collected clinical data (blood sample, vital signs, etc.) | Should we give the person treatments for that disease? |\n", 73 | "| Finding oil wells | Whether underground oil exists in a certain area | Readings from seismic sensors, etc. | Should we drill for oil in this location? |\n", 74 | "| Product recommendation | Will a user buy this product? | User behavior, interest in similar products, etc. | Should we recommend the product to the user? |\n", 75 | "\n", 76 | "\n", 77 | "Note that we haven't really talked much about $p(x|\theta)$, since we've been working with $\delta(x)$ directly: we'll discuss this more in the next chapter.\n", 78 | "\n", 79 | "Our loss function will depend on the problem we're solving. 
Since in this case, both the inputs ($\theta$/$R$ and $\delta$/$D$) are binary, we can write the loss in a 2x2 table that looks exactly like the ones we've seen before.\n", 80 | "If both kinds of error (false positive and false negative) are equally bad, we can use the simplest loss function, the **0-1 loss**:\n", 81 | "\n", 82 | "$$\n", 83 | "\ell(\delta(x), \theta) = \begin{cases}\n", 84 | " 0 & \text{if }\theta = \delta(x) \\\n", 85 | " 1 & \text{if }\theta \neq \delta(x)\n", 86 | "\end{cases}\n", 87 | "$$\n", 88 | "\n", 89 | "**Exercise**: Suppose we have a situation where a false positive is five times worse than a false negative. How would you write the loss function?" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "id": "35cdc4e1", 95 | "metadata": {}, 96 | "source": [ 97 | "### Continuous decision: $\ell_2$ loss\n", 98 | "\n", 99 | "Now, suppose our parameter $\theta$ is continuous, and $\delta(x)$ is our estimate of the parameter from the data. To make things a little more concrete, $\theta$ could be the average height of people in a population, and $x$ could be the heights of people in a random sample from that population. In this case, our loss shouldn't just be right vs. wrong: we should use a loss function that's smaller when our estimate is close, and larger when our estimate is far away. \n", 100 | "\n", 101 | "You've probably already seen one before: the squared error loss, also known as the **$\ell_2$ loss**:\n", 102 | "\n", 103 | "$$\n", 104 | "\ell(\delta(x), \theta) = \big(\delta(x) - \theta\big)^2\n", 105 | "$$\n", 106 | "\n", 107 | "We'll analyze the $\ell_2$ loss a little more later.\n", 108 | "\n", 109 | "**Exercise**: Suppose we have a situation where it's much worse to make a guess that's too high, compared to a guess that's too low. How would you construct a loss function for this problem?" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "id": "dec2db0b", 115 | "metadata": {}, 116 | "source": [ 117 | "## Known and unknown" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "id": "61ee72fe", 123 | "metadata": {}, 124 | "source": [ 125 | "At this point, you may be wondering: if $\theta$ is unknown, how can we ever compute the loss function? It's important to keep in mind that when we apply $\delta(x)$ on real data, we don't know $\theta$. But right now, we're building up some machinery to help us analyze different procedures. In other words, we're trying to get to a place where we can answer questions like \"what procedures are most likely to give us estimates that are close to $\theta$?\"" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "id": "b92a9abd", 131 | "metadata": {}, 132 | "source": [ 133 | "## Fixed and random: finding the average loss in Bayesian and frequentist approaches" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "id": "882506a5", 139 | "metadata": {}, 140 | "source": [ 141 | "The loss function is a function of $\delta(x)$, which is the procedure result for particular data $x$, and the particular parameter $\theta$. This isn't particularly useful to us: we'd like to understand how the loss does \"on average\". But in order to compute any kind of averages, we need to decide what's random and what's fixed. This is an important fork in the road: we can either take the Bayesian or the frequentist route. 
Let's examine what happens if we try each one:" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "id": "6b07474d", 147 | "metadata": {}, 148 | "source": [ 149 | "### Frequentist loss analysis" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "id": "35db21e0", 155 | "metadata": {}, 156 | "source": [ 157 | "In the frequentist world, we assume that our unknown $\theta$ is **fixed**. The data are the only random piece. So, we're going to look at the average across different possibilities for the data $x$. Since the data come from the distribution $p(x|\theta)$, which depends on $\theta$, we should expect that this \"averaging\" will produce something that depends on $\theta$. We'll call our average the **frequentist risk**:\n", 158 | "\n", 159 | "$$\n", 160 | "\begin{align*}\n", 161 | " R(\theta) \n", 162 | " &= E_{x|\theta}\left[\ell(\delta(x), \theta)\right] \\\n", 163 | " &= \begin{cases} \n", 164 | " \displaystyle \sum_x \ell(\delta(x), \theta) p(x|\theta) & \text{if $x$ discrete} \\\n", 165 | " \displaystyle \int \ell(\delta(x), \theta) p(x|\theta) dx & \text{if $x$ continuous} \n", 166 | " \end{cases}\n", 167 | "\end{align*}\n", 168 | "$$" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "id": "fa3b3455", 174 | "metadata": {}, 175 | "source": [ 176 | "The frequentist risk is a function of $\theta$. It tells us: for a particular value of $\theta$, how poorly does the procedure $\delta$ do if we average over all possible datasets?" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "id": "87657f77", 182 | "metadata": {}, 183 | "source": [ 184 | "### Bayesian loss analysis" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "id": "71f41991", 190 | "metadata": {}, 191 | "source": [ 192 | "In the Bayesian world, we assume that our unknown $\theta$ is **random**. Since we observe a particular dataset $x$, we'll be a lot more interested in the randomness in $\theta$ than the randomness in $x$. So, we'll condition on the particular dataset we got, and look at the average across different possibilities for the unknown parameter $\theta$. We'll call our average the **Bayesian posterior risk**:\n", 193 | "\n", 194 | "$$\n", 195 | "\begin{align*}\n", 196 | " \rho(x) \n", 197 | " &= E_{\theta|x}\left[\ell(\delta(x), \theta)\right] \\\n", 198 | " &= \begin{cases} \n", 199 | " \displaystyle \sum_\theta \ell(\delta(x), \theta) p(\theta|x) & \text{if $\theta$ discrete} \\\n", 200 | " \displaystyle \int \ell(\delta(x), \theta) p(\theta|x) d\theta & \text{if $\theta$ continuous} \n", 201 | " \end{cases}\n", 202 | "\end{align*}\n", 203 | "$$" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "id": "e9450455", 209 | "metadata": {}, 210 | "source": [ 211 | "The Bayesian posterior risk is a function of $x$. It tells us: given that we observed a particular dataset $x$, how poorly does the procedure $\delta$ do, averaged over all possible values of the parameter $\theta$?" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "id": "8e0151dd", 217 | "metadata": {}, 218 | "source": [ 219 | "### Comparing frequentist and Bayesian risk" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "id": "188096cb", 225 | "metadata": {}, 226 | "source": [ 227 | "Operationally, both of these look kind of similar: we're averaging the loss with respect to some conditional probability distribution. 
But conceptually, they're very different: the frequentist risk fixes the parameter, and averages over all the data; the Bayesian posterior risk fixes the data, and averages over all parameters." 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "id": "8e0f7356", 233 | "metadata": {}, 234 | "source": [ 235 | "## Example: frequentist risk for $\ell_2$ loss and the bias-variance tradeoff" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "id": "fcf84569", 241 | "metadata": {}, 242 | "source": [ 243 | "Let's work through an example computing the frequentist risk using the $\ell_2$ loss. We'll find that the result can give us some important insights.\n" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "id": "fe76a9f9", 249 | "metadata": {}, 250 | "source": [ 251 | "$$\n", 252 | "\begin{align}\n", 253 | "R(\theta) \n", 254 | "&= E_{x|\theta}\left[\ell(\delta(x), \theta)\right] \\\n", 255 | "&= E_{x|\theta}\Big[\big(\delta(x) - \theta\big)^2\Big] \\\n", 256 | "\end{align}\n", 257 | "$$" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "id": "fff65cfe", 263 | "metadata": {}, 264 | "source": [ 265 | "To make the math work out later, we'll add and subtract the term $E_{x|\theta}[\delta(x)]$. Before we work out the result, let's think about what this term means. It's the average value of the procedure $\delta$: in other words, for a particular $\theta$, it tells us what value of $\delta(x)$ we should expect to get, averaged across different possible values of $x$." 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "id": "2d1f1fc0", 271 | "metadata": {}, 272 | "source": [ 273 | "$$\n", 274 | "\begin{align}\n", 275 | "R(\theta) \n", 276 | "&= E_{x|\theta}\Big[\big(\delta(x) - \theta\big)^2\Big] \\\n", 277 | "&= E_{x|\theta}\Big[\big(\n", 278 | " \delta(x) \overbrace{- E_{x|\theta}[\delta(x)] + E_{x|\theta}[\delta(x)]}^{=0} - \theta\n", 279 | "\big)^2\Big] \\\n", 280 | "&= E_{x|\theta}\Big[\big(\n", 281 | " \underbrace{\delta(x) - E_{x|\theta}[\delta(x)]}_{\text{prediction minus avg. prediction}} + \n", 282 | " \underbrace{E_{x|\theta}[\delta(x)] - \theta}_{\text{avg. prediction minus true value}}\n", 283 | "\big)^2\Big] \\\n", 284 | "\end{align}\n", 285 | "$$" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "id": "d2930c38", 291 | "metadata": {}, 292 | "source": [ 293 | "To make the math a little easier to read, we'll write $\delta = \delta(x)$ and $\bar{\delta} = E_{x|\theta}[\delta(x)]$:" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "id": "b9ef7fbc", 299 | "metadata": {}, 300 | "source": [ 301 | "$$\n", 302 | "\begin{align}\n", 303 | "R(\theta) \n", 304 | "&= E_{x|\theta}\Big[\big(\n", 305 | " \delta(x) - E_{x|\theta}[\delta(x)] + E_{x|\theta}[\delta(x)] - \theta\n", 306 | "\big)^2\Big] \\\n", 307 | "&= E_{x|\theta}\Big[\big(\n", 308 | " \delta - \bar{\delta} + \bar{\delta} - \theta\n", 309 | "\big)^2\Big] \\\n", 310 | "&= E_{x|\theta}\Big[\n", 311 | " \big(\delta - \bar{\delta}\big)^2 +\n", 312 | " \underbrace{2\big(\delta - \bar{\delta}\big)\big(\bar{\delta} - \theta\big)}_{=0} + \n", 313 | " \big(\bar{\delta} - \theta\big)^2\n", 314 | "\Big] \\\n", 315 | "&= E_{x|\theta}\Big[\big(\delta - \bar{\delta}\big)^2\Big] + \n", 316 | " E_{x|\theta}\Big[\big(\bar{\delta} - \theta\big)^2\Big] \\\n", 317 | "&= \underbrace{E_{x|\theta}\Big[\big(\delta - \bar{\delta}\big)^2\Big]}_{\text{variance of }\delta(x)} + \n", 318 | " \big(\underbrace{\bar{\delta} - \theta}_{\text{bias of }\delta(x)}\big)^2 \\\n", 319 | "\end{align}\n", 320 | "$$\n", 321 | "\n", 322 | "(The cross term in the third line has zero expectation: $\bar{\delta} - \theta$ is a constant with respect to $x$, and $E_{x|\theta}\big[\delta - \bar{\delta}\big] = 0$ by the definition of $\bar{\delta}$.)" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "id": "e289f8d1", 328 | "metadata": {}, 329 | "source": [ 330 | "We've shown that for the $\ell_2$ loss, the frequentist risk is the sum of two terms, called the **variance** and the square of the **bias**.\n", 331 | "\n", 332 | "The **variance**, $E_{x|\theta}\Big[\big(\delta(x) - E_{x|\theta}[\delta(x)]\big)^2\Big]$, answers the question: as the data varies, how far away will $\delta$ be from its average value? In general, if your procedure $\delta$ is very sensitive to variations in the data, your variance will be high.\n", 333 | "\n", 334 | "The **bias**, $E_{x|\theta}[\delta(x)] - \theta$, answers the question: how far is the average value of $\delta$ from the true parameter $\theta$? In general, if your procedure $\delta$ does a good job of capturing the complexity of predicting $\theta$, your bias will be low.\n", 335 | "\n", 336 | "When trying to reduce the risk (average loss), most methods try to reduce the variance and/or the bias. Many methods for estimation and prediction try to deal with the tradeoff between variance and bias: ideally we'd like both to be as small as possible, but we often need to accept a little more of one in order to make big reductions in the other." 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "id": "d25579a1", 342 | "metadata": {}, 343 | "source": [ 344 | "## Bayes risk" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "id": "f386209b", 350 | "metadata": {}, 351 | "source": [ 352 | "The two risks above are obtained by taking the expectation with respect to either the data $x$ or the parameter $\theta$. What if we take the expectation with respect to both? 
The **Bayes risk** is exactly that:\n", 351 | "\n", 352 | "$$\n", 353 | "\begin{align*}\n", 354 | "R(\delta) \n", 355 | "&= E_{x, \theta} [\ell(\delta(x), \theta)] \\\n", 356 | "&= E_\theta [R(\theta)] \\\n", 357 | "&= E_x [\rho(x)]\n", 358 | "\end{align*}\n", 359 | "$$\n", 360 | "\n", 361 | "where the last two equalities follow from Fubini's theorem (i.e., that we can do the integrals for the expectations in either order and get the same result). The Bayes risk is a single number that summarizes the procedure $\delta$. The name is somewhat misleading: it isn't really Bayesian or frequentist." 362 | ] 363 | } 364 | ], 365 | "metadata": { 366 | "kernelspec": { 367 | "display_name": "Python 3 (ipykernel)", 368 | "language": "python", 369 | "name": "python3" 370 | }, 371 | "language_info": { 372 | "codemirror_mode": { 373 | "name": "ipython", 374 | "version": 3 375 | }, 376 | "file_extension": ".py", 377 | "mimetype": "text/x-python", 378 | "name": "python", 379 | "nbconvert_exporter": "python", 380 | "pygments_lexer": "ipython3", 381 | "version": "3.11.5" 382 | } 383 | }, 384 | "nbformat": 4, 385 | "nbformat_minor": 5 386 | } 387 | -------------------------------------------------------------------------------- /ds-102-book/content/chapters/04/06_instrumental_variables.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Instrumental variables\n", 8 | "\n", 9 | "*Special thanks to [Yan Shuo Tan](https://sites.google.com/view/yanshuotan/home), who wrote most of this section's content.*" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Review and introduction" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "To briefly recap what we have learnt so far:\n", 24 | "1. We defined a superpopulation model, i.e. a distribution for $(X_i,Z_i,Y_i(0),Y_i(1))$:\n", 25 | " * $Z$ is the (binary) treatment decision,\n", 26 | " * $Y(0)$ and $Y(1)$ are the potential outcomes in the universes where the unit wasn't/was treated,\n", 27 | " * $X$ is a confounding variable (in other words, it has a causal effect on $Z$ and on $Y$).\n", 28 | "\n", 29 | "   So far, we haven't needed to make any assumptions about the distribution of these variables in general (only that it exists).\n", 30 | "2. We defined our quantity of interest, the average treatment effect (ATE): $\tau = E[Y(1) - Y(0)]$, which tells us the average effect of the treatment. We saw that this is impossible to estimate unless we make further assumptions.\n", 31 | "\n", 32 | "3. We saw that in a randomized experiment, we have the following:\n", 33 | " * The treatment decisions are random, and therefore are **independent of the potential outcomes**.\n", 34 | " * In other words, $\big(Y(0),Y(1)\big)\perp\!\!\perp Z$.\n", 35 | "\n", 36 | "In this section, we'll investigate how we can estimate the ATE in situations where we have unknown confounding variables. We'll rely on **natural experiments** to help us. Note that you've probably seen [natural experiments before in Data 8, when learning about John Snow's study of cholera](https://inferentialthinking.com/chapters/02/2/snow-s-grand-experiment.html)."
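,
"\n",
"Before moving on, here's a minimal simulation sketch of point 3 above (the data-generating process, the seed, and all the numbers are invented for illustration): when $Z$ is randomized, the difference in means of the observed outcomes lands close to the ATE $\tau$, even though each unit reveals only one of its two potential outcomes.\n",
"\n",
"```python\n",
"import numpy as np\n",
"\n",
"rng = np.random.default_rng(42)\n",
"n = 100_000\n",
"\n",
"# Illustrative superpopulation: potential outcomes depend on a covariate X\n",
"X = rng.normal(size=n)\n",
"Y0 = X + rng.normal(size=n)    # Y(0)\n",
"Y1 = Y0 + 2.0                  # Y(1): true ATE tau = 2\n",
"\n",
"# Randomized experiment: Z is independent of (Y(0), Y(1))\n",
"Z = rng.integers(0, 2, size=n)\n",
"Y = np.where(Z == 1, Y1, Y0)   # we only ever observe one potential outcome\n",
"\n",
"print(Y[Z == 1].mean() - Y[Z == 0].mean())  # close to the true ATE of 2\n",
"```"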
37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "## Linear structural model (LSM)\n", 44 | "\n", 45 | "In some fields (such as economics), it is typical to work with *structural models*, which place some restrictions on the joint distribution of all the variables, and in doing so, make it easier to estimate the parameters of the model.\n", 46 | "\n", 47 | "We will work with the **linear structural model** relating our outcome $Y$ to our treatment $Z$ and confounding variable(s) $X$:\n", 48 | "$$\n", 49 | "Y = \alpha + \tau Z + \beta^TX + \epsilon,\n", 50 | "$$\n", 51 | "\n", 52 | "where $\epsilon$ has mean zero, and is independent of $Z$ and $X$ (in economics, we say that $\epsilon$ is *exogenous*). We sometimes further assume that $\epsilon \sim \mathcal{N}(0,\sigma^2)$, but this is not necessary for any of the analysis we're going to do.\n", 53 | "\n", 54 | "*Note: in general, we often add the further structural equation $Z = f(X, \delta)$ where $\delta$ is an exogenous noise variable, and $f$ encodes the structural relationship between $X$ and $Z$. We won't go into this level of detail, but when reading this equation, you should assume that $\textrm{Cov}(Z,X)$ is not necessarily 0.*\n", 55 | "\n", 56 | "This is not quite the same as the *linear model* that we have seen when we learned about GLMs, and that you've seen in previous classes! While it looks very similar, the linear model we worked with before is a statement about associations and predictions, while this linear structural model is a statement about intervention and action. \n", 57 | "\n", 58 | "Specifically, this model assumes that for unit $i$, if we could set $Z_i = 1$, we would observe $Y_i(1) = \alpha + \tau + \beta^TX_i + \epsilon_i$, and if we could set $Z_i = 0$, we would observe $Y_i(0) = \alpha + \beta^TX_i + \epsilon_i$. (If $Z$ is not binary, then there will be a potential outcome for each possible value of $Z$.) This is a subtle but important point that also situates the linear structural model as a special case of the potential outcomes framework!\n", 59 | "\n", 60 | "From this, we see that the average treatment effect in this model is $\tau$ (can you show this is true?), and furthermore, that the individual treatment effect for every unit is\n", 61 | "\n", 62 | "$$Y_i(1) - Y_i(0) = \tau.$$\n", 63 | "\n", 64 | "In other words, the linear structural model is making an implicit assumption that the treatment effect is constant across all units." 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "### Causal graphs and LSMs\n", 72 | "\n", 73 | "Apart from the causal effect of $Z$ on $Y$, the linear structural model also does something new relative to what we've seen before: it asserts the causal relationships between the other variables, i.e. it tells us how $Z$ and $Y$ change if we manipulate $X$. \n", 74 | "\n", 75 | "The above linear structural model can be represented graphically as follows:\n", 76 | "\n", 77 | "![Causal graph for the linear structural model](causal_graph1.png)\n", 78 | "\n", 79 | "As a reminder, the arrows from $X$ into $Z$ and $Y$ assert that $X$ causes both $Z$ and $Y$ (i.e. intervening on $X$ changes the values of $Z$ and $Y$), and the arrow from $Z$ into $Y$ asserts that $Z$ causes $Y$."
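,
"\n",
"As an aside, the \"can you show this is true?\" prompt above has a one-line answer. It is just algebra on the structural equations already given (nothing new is assumed): for any unit $i$,\n",
"\n",
"$$\n",
"Y_i(1) - Y_i(0) = \big(\alpha + \tau \cdot 1 + \beta^TX_i + \epsilon_i\big) - \big(\alpha + \tau \cdot 0 + \beta^TX_i + \epsilon_i\big) = \tau,\n",
"$$\n",
"\n",
"so the individual treatment effect is $\tau$ for every unit, and therefore the ATE is $E[Y(1) - Y(0)] = \tau$ as well."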
80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "## Confounding and omitted variable bias" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "In many scenarios, confounding is complicated and involves many different variables, and it may be impossible to collect, observe, or describe all of them. In that case, we must treat $X$ as unobserved. If this happens, then just as before, we're in trouble because of *confounding*. Here are some examples. In each one, we've only listed one possible confounder $X$, but there are likely many more: can you think of at least one for each example?\n", 94 | "\n", 95 | "| Treatment $Z$ | Outcome $Y$ | Possible confounder(s) $X$ |\n", 96 | "| --- | --- | --- |\n", 97 | "| Health insurance | Health outcomes | Socioeconomic background |\n", 98 | "| Military service | Salary | Socioeconomic background |\n", 99 | "| Family size | Whether the mother is in the labor force | Socioeconomic background |\n", 100 | "| Years of schooling | Salary | Socioeconomic background |\n", 101 | "| Smoking | Lung cancer | Socioeconomic background |\n", 102 | "\n", 103 | "Note that in most of these examples, socioeconomic background is a confounder. This is particularly common in economics and econometrics, where most of the methods in this section originated.\n", 104 | "\n", 105 | "Let's be a bit more precise about quantifying the effect of confounding. Specifically, we'll assume the linear structural model above, and then see what happens when we naively try to fit a linear regression to $Y$ using $Z$, without accounting for $X$. \n", 106 | "\n", 107 | "Let $\hat{\tau}_{OLS}$ be the solution of the least squares problem $\min_{\tau,\alpha} \mathbb{E}[(\alpha + \tau Z - Y)^2]$. We then get\n", 108 | "\n", 109 | "$$\n", 110 | "\begin{align}\n", 111 | " \hat{\tau}_{OLS} \n", 112 | " & = \frac{\text{Cov}(Y,Z)}{\text{Var}(Z)} \\\n", 113 | " & = \frac{\text{Cov}(\alpha + \tau Z + \beta^TX + \epsilon,Z)}{\text{Var}(Z)} \\\n", 114 | " & = \frac{\text{Cov}(\tau Z,Z)}{\text{Var}(Z)} + \frac{\text{Cov}(\beta^TX,Z)}{\text{Var}(Z)} \\\n", 115 | " & = \underbrace{\tau}_\text{true ATE} + \underbrace{\beta^T\frac{\text{Cov}(X,Z)}{\text{Var}(Z)}}_{\text{bias involving }X}.\n", 116 | "\end{align}\n", 117 | "$$\n", 118 | "\n", 119 | "The second term is a bias in the $\hat{\tau}_{OLS}$ estimator: in other words, it's the difference between the estimator and the true value, and it depends on the omitted (i.e., unobserved) variable $X$. So, we'll call this term $\beta^T\frac{\text{Cov}(X,Z)}{\text{Var}(Z)}$ the **omitted variable bias**.\n", 120 | "\n", 121 | "*Remark: $\frac{\text{Cov}(Y,Z)}{\text{Var}(Z)}$ is the infinite population version of the typical formula $\hat{\tau}_{OLS} = (Z^TZ)^{-1}Z^TY$, where we now use $Z$ and $Y$ to denote matrices/vectors.*\n", 122 | "\n", 123 | "**Why can't we just adjust for confounding?** Having such confounders is problematic because in order to avoid omitted variable bias, we need to have observed them, and added them to our regression (collecting such data may not always be feasible for a number of reasons). Furthermore, there could always be *other* confounders that we are unaware of, which leaves our causal conclusions under an inescapable cloud of doubt."
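,
"\n",
"To see the omitted variable bias formula in action, here's a small simulation sketch (the structural equations, coefficients, and seed below are invented for illustration): we generate data from a linear structural model, run the naive regression of $Y$ on $Z$, and compare the result to $\tau$ plus the bias term.\n",
"\n",
"```python\n",
"import numpy as np\n",
"\n",
"rng = np.random.default_rng(0)\n",
"n = 200_000\n",
"tau, beta = 2.0, 3.0                # true ATE and confounder effect (illustrative)\n",
"\n",
"X = rng.normal(size=n)              # confounder\n",
"Z = 0.8 * X + rng.normal(size=n)    # treatment depends on X\n",
"Y = 1.0 + tau * Z + beta * X + rng.normal(size=n)\n",
"\n",
"# Naive OLS slope of Y on Z: the population formula Cov(Y, Z) / Var(Z)\n",
"tau_ols = np.cov(Y, Z)[0, 1] / np.var(Z)\n",
"omitted_variable_bias = beta * np.cov(X, Z)[0, 1] / np.var(Z)\n",
"\n",
"print(tau_ols, tau + omitted_variable_bias)  # the two should roughly agree\n",
"```"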
124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "## Instrumental Variables" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "Is there a middle way between a randomized experiment and assuming unconfoundedness, which is sometimes unrealistic?\n", 138 | "\n", 139 | "One way forward is when nature provides us with a \"partial\" natural experiment, i.e. we have a truly randomized \"instrument\" that injects an element of partial randomization into the treatment variable of interest. This is the idea of instrumental variables. We will first define the concept mathematically, and then illustrate what it means through a few examples.\n", 140 | "\n", 141 | "**Definition:** Assume the linear structural model defined above. We further assume a variable $W$ such that $Z = \alpha' + \gamma W + (\beta')^TX + \delta$, with $\gamma \neq 0$ (relevance), and $W$ independent of $X$, $\delta$, and $\epsilon$ (exogeneity). Such a $W$ is called an *instrumental variable*.\n", 142 | "\n", 143 | "*Remark:* This replaces the earlier equation $Z = f(X,\delta)$ from before.\n", 144 | "\n", 145 | "Let us now see how to use $W$ to identify the ATE $\tau$." 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "$$\n", 153 | "\begin{align}\n", 154 | " \textrm{Cov}(Y,W) & = \textrm{Cov}(\alpha + \tau Z + \beta^TX + \epsilon,W) \\\n", 155 | " & = \tau \textrm{Cov}(Z,W) \\\n", 156 | " & = \tau \textrm{Cov}(\alpha' + \gamma W + (\beta')^TX + \delta, W) \\\n", 157 | " & = \tau\gamma \textrm{Var}(W),\n", 158 | "\end{align}\n", 159 | "$$\n", 160 | "\n", 161 | "where the second and fourth equalities follow from the exogeneity of $W$. Meanwhile, a similar computation with $Z$ and $W$ gives us\n", 162 | "$$\n", 163 | "\textrm{Cov}(Z,W) = \gamma\textrm{Var}(W).\n", 164 | "$$\n", 165 | "\n", 166 | "Putting everything together gives\n", 167 | "$$\n", 168 | "\tau = \frac{\frac{\textrm{Cov}(Y,W)}{\textrm{Var}(W)}}{\frac{\textrm{Cov}(Z,W)}{\textrm{Var}(W)}}.\n", 169 | "$$\n", 170 | "\n", 171 | "In other words, $\tau$ is the ratio between the (infinite population) regression coefficient of $W$ on $Y$, and that of $W$ on $Z$.\n", 172 | "\n", 173 | "This motivates the **instrumental variable estimator** of the ATE in finite samples:\n", 174 | "\n", 175 | "$$\n", 176 | "\hat{\tau}_{IV} = \frac{\overbrace{(W^TW)^{-1}W^TY}^{\text{OLS coeff. of W for Y}}}{\underbrace{(W^TW)^{-1}W^TZ}_{\text{OLS coeff. of W for Z}}},\n", 177 | "$$\n", 178 | "\n", 179 | "where again, abusing notation, $W$, $Z$ and $Y$ refer to the vectors of observations. If $\alpha' = 0$, then this is a plug-in estimator of $\tau$, and is consistent.\n", 180 | "\n", 181 | "**Further interpretation for binary $W$:** When $W$ is binary, we can show that\n", 182 | "\n", 183 | "$$\n", 184 | "\tau = \frac{\mathbb{E}[Y|W=1] - \mathbb{E}[Y|W=0]}{\mathbb{E}[Z|W=1] - \mathbb{E}[Z|W=0]}.\n", 185 | "$$\n", 186 | "\n", 187 | "Hence, we can think of IV as measuring the ratio of the prima facie treatment effect of $W$ on $Y$ to that of $W$ on $Z$."
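,
"\n",
"Continuing the illustrative simulation from the omitted variable bias section (again, every structural equation and number here is invented for illustration), we can check numerically that the IV ratio recovers $\tau$ even though $X$ is never used:\n",
"\n",
"```python\n",
"import numpy as np\n",
"\n",
"rng = np.random.default_rng(1)\n",
"n = 200_000\n",
"tau = 2.0                                     # true ATE (illustrative)\n",
"\n",
"X = rng.normal(size=n)                        # confounder, treated as unobserved\n",
"W = rng.normal(size=n)                        # instrument: independent of X and the noise\n",
"Z = 1.5 * W + 0.8 * X + rng.normal(size=n)    # relevance: gamma = 1.5 is nonzero\n",
"Y = 1.0 + tau * Z + 3.0 * X + rng.normal(size=n)\n",
"\n",
"tau_naive = np.cov(Y, Z)[0, 1] / np.var(Z)         # biased by confounding\n",
"tau_iv = np.cov(Y, W)[0, 1] / np.cov(Z, W)[0, 1]   # Cov(Y, W) / Cov(Z, W)\n",
"\n",
"print(tau_naive, tau_iv)  # biased value vs. approximately 2\n",
"```"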
188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "### Causal graph for instrumental variables\n", 195 | "\n", 196 | "The relationships between $W$, $Z$, $X$, and $Y$ can be represented as the following causal graph:\n", 197 | "\n", 198 | "![Causal graph with arrows from W to Z, from Z to Y, and from the latent confounder X to both Z and Y](causal_graph2.png)\n", 199 | "\n", 200 | "How to read this graph:\n", 201 | "- The arrow from $W$ into $Z$ shows that $W$ has a causal effect on $Z$.\n", 202 | "- The absence of any arrow into $W$ means that $W$ is exogenous, i.e., no variable in the diagram causes $W$, and in particular $W$ is independent of $X$.\n", 203 | "- The absence of an arrow from $W$ into $Y$ means that the only effect of $W$ on $Y$ is through $Z$: this is the *exclusion restriction*.\n", 204 | "- We shaded in $W$, $Z$, and $Y$ because these nodes are observed, but $X$ is unshaded because it is latent (unobserved).\n", 205 | "\n", 206 | "Note that we do not need to know or even be aware of what $X$ is in order for instrumental variables to work! It doesn't matter how many confounders there are, or whether we're even able to list all of them: as long as we can guarantee that they have no causal relationship to the instrument (exogeneity), and that the instrument affects $Y$ only through $Z$ (the exclusion restriction), instrumental variables will work." 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "## Examples of instrumental variables" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "Let's examine what we might use as instrumental variables for the five examples from the table in the previous section. The first four are taken from the econometrics literature:\n", 221 | "\n", 222 | "**Example 1:** $Z$ is health insurance, $Y$ is health outcomes, $X$ is socioeconomic background. Baicker et al. (2013) used the 2008 expansion of Medicaid in Oregon, which allocated coverage via lottery. The instrument $W$ here was the lottery assignment. We previously talked about why this was an imperfect experiment for compliance reasons (only a fraction of individuals who won the lottery enrolled in Medicaid), so IV provides a way of overcoming this limitation.\n", 223 | "\n", 224 | "**Example 2:** $Z$ is military service, $Y$ is salary, $X$ is socioeconomic background. Angrist (1990) used the Vietnam-era draft lottery as the instrument $W$, and found that among white veterans, there was a 15% drop in earnings compared to non-veterans.\n", 225 | "\n", 226 | "**Example 3:** $Z$ is family size, $Y$ is mother's employment, $X$ is socioeconomic background. Angrist and Evans (1998) used sibling sex composition (in other words, the sexes assigned at birth of the first two children) as the IV. This is plausible because sibling sex composition is essentially random, and parents in the US with two children of the same sex are more likely to have a third child than parents with two children of different sexes.\n", 227 | "\n", 228 | "**Example 4:** $Z$ is years of schooling, $Y$ is salary, $X$ is socioeconomic background. Card (1993) used geographical variation in college proximity as the instrumental variable.\n", 229 | "\n", 230 | "**Example 5:** $Z$ is smoking, $Y$ is lung cancer, $X$ is socioeconomic background. Unfortunately, this example does not lend itself well to an instrumental variable: despite decades of searching, nobody has yet found one that is convincing.
This leads to an important lesson: **not every problem is amenable to the use of instrumental variables, or even natural experiments!**\n", 231 | "\n", 232 | "As we see in these examples, sometimes you need to be quite ingenious to come up with an appropriate instrumental variable. Joshua Angrist, David Card, and Guido Imbens, who are named in several of these examples, are phenomenally good at this: in fact, they won the Nobel Prize in economics for their collected body of work!" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "## Extensions\n", 240 | "\n", 241 | "### Multiple treatments/instruments, and two-stage least squares\n", 242 | "\n", 243 | "So far, we have considered a scalar treatment $Z$ and a scalar instrument $W$. It is also possible to consider vector-valued instruments and treatments. To generalize IV to this setting, we need to recast the IV estimator from the previous sections as follows.\n", 244 | "\n", 245 | "First define the conditional expectation $\\tilde{Z} = \\mathbb{E}[Z|W]$, and observe that $\\tilde{Z} = \\alpha' + \\gamma W$.\n", 246 | "\n", 247 | "If we regress $Y$ on $\\tilde{Z}$, the regression coefficient we obtain is\n", 248 | "$$\n", 249 | "\\begin{align}\n", 250 | "\\frac{\\textrm{Cov}(\\tilde{Z},Y)}{\\textrm{Var}(\\tilde{Z})} & = \\frac{\\textrm{Cov}(\\tilde{Z}, \\alpha + \\tau Z + \\beta^TX + \\epsilon)}{\\textrm{Var}(\\tilde{Z})} \\\\\n", 251 | "& = \\frac{\\textrm{Cov}(\\tilde{Z}, \\tau Z)}{\\textrm{Var}(\\tilde{Z})} \\\\\n", 252 | "& = \\tau\\frac{\\textrm{Cov}(\\tilde{Z}, Z)}{\\textrm{Var}(\\tilde{Z})} \\\\\n", 253 | "& = \\tau.\n", 254 | "\\end{align}\n", 255 | "$$\n", 256 | "\n", 257 | "Here, the 2nd equality holds because $W$ (and hence $\\tilde{Z}$) is independent of $X$ and $\\epsilon$, while the 4th equality holds because $\\textrm{Cov}(\\mathbb{E}[Z|W], Z) = \\textrm{Var}(\\mathbb{E}[Z|W])$, a property of conditional expectations (one can also check this by hand by expanding out $Z = \\alpha' + \\gamma W + (\\beta')^TX + \\delta$).\n", 258 | "\n", 259 | "In finite samples, we thus arrive at the following algorithm:\n", 260 | "\n", 261 | "**Two-stage least squares algorithm (2SLS):**\n", 262 | "- Step 1: Regress $Z$ on $W$ to get $\\tilde{Z} = W\\hat{\\gamma} = W(W^TW)^{-1}W^TZ$.\n", 263 | "- Step 2: Regress $Y$ on $\\tilde{Z}$ to get $\\hat{\\tau}_{2SLS} = (\\tilde{Z}^T\\tilde{Z})^{-1}\\tilde{Z}^TY$.\n", 264 | "\n", 265 | "For the scalar setting, it is easy to see that $\\hat{\\tau}_{2SLS} = \\hat{\\tau}_{IV}$, but the benefit of this formulation is that it applies directly to vector-valued $Z$ and $W$. (A short code sketch of 2SLS appears at the end of this section.)\n", 266 | "\n", 267 | "### (Optional) A non-parametric perspective on instrumental variables\n", 268 | "\n", 269 | "In this notebook, we have introduced instrumental variables in the context of structural linear models. What if our model is nonlinear?\n", 270 | "\n", 271 | "In an amazing coincidence, for a binary instrument $W$ and binary treatment $Z$, the expression\n", 272 | "\n", 273 | "$$\n", 274 | "\\tau = \\frac{\\mathbb{E}[Y|W=1] - \\mathbb{E}[Y|W=0]}{\\mathbb{E}[Z|W=1] - \\mathbb{E}[Z|W=0]}\n", 275 | "$$\n", 276 | "\n", 277 | "retains a meaning beyond the linear model setting: it identifies a *local average treatment effect*, the effect among \"compliers\" whose treatment status is changed by the instrument. This is the subject of a groundbreaking 1996 [paper](https://www.jstor.org/stable/2291629?seq=1) by Angrist, Imbens, and Rubin."
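,
"\n",
"To close this section, here is a minimal `numpy` sketch of the 2SLS algorithm above, reusing the simulated `y`, `z`, and `w` from the earlier sketches. It is illustrative only: a real analysis would use an IV package that also reports standard errors.\n",
"\n",
"```python\n",
"def two_stage_least_squares(Y, Z, W):\n",
"    # Columns of ones add intercepts; this works for vector-valued Z and W too.\n",
"    n = len(Y)\n",
"    W1 = np.column_stack([np.ones(n), W])\n",
"    # Stage 1: regress Z on W and keep the fitted values Z_tilde\n",
"    gamma_hat, *_ = np.linalg.lstsq(W1, Z, rcond=None)\n",
"    Z_tilde = W1 @ gamma_hat\n",
"    # Stage 2: regress Y on Z_tilde; the slope estimates tau\n",
"    Z1 = np.column_stack([np.ones(n), Z_tilde])\n",
"    coef, *_ = np.linalg.lstsq(Z1, Y, rcond=None)\n",
"    return coef[1:]  # drop the intercept\n",
"\n",
"print(two_stage_least_squares(y, z, w))  # again close to tau = 2.0\n",
"```"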
278 | ] 279 | } 280 | ], 281 | "metadata": { 282 | "kernelspec": { 283 | "display_name": "Python 3 (ipykernel)", 284 | "language": "python", 285 | "name": "python3" 286 | }, 287 | "language_info": { 288 | "codemirror_mode": { 289 | "name": "ipython", 290 | "version": 3 291 | }, 292 | "file_extension": ".py", 293 | "mimetype": "text/x-python", 294 | "name": "python", 295 | "nbconvert_exporter": "python", 296 | "pygments_lexer": "ipython3", 297 | "version": "3.11.5" 298 | } 299 | }, 300 | "nbformat": 4, 301 | "nbformat_minor": 4 302 | } 303 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Attribution-ShareAlike 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. A licensor may make special requests, 49 | such as asking that all changes be marked or described. 
50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. More considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution-ShareAlike 4.0 International Public 58 | License 59 | 60 | By exercising the Licensed Rights (defined below), You accept and agree 61 | to be bound by the terms and conditions of this Creative Commons 62 | Attribution-ShareAlike 4.0 International Public License ("Public 63 | License"). To the extent this Public License may be interpreted as a 64 | contract, You are granted the Licensed Rights in consideration of Your 65 | acceptance of these terms and conditions, and the Licensor grants You 66 | such rights in consideration of benefits the Licensor receives from 67 | making the Licensed Material available under these terms and 68 | conditions. 69 | 70 | 71 | Section 1 -- Definitions. 72 | 73 | a. Adapted Material means material subject to Copyright and Similar 74 | Rights that is derived from or based upon the Licensed Material 75 | and in which the Licensed Material is translated, altered, 76 | arranged, transformed, or otherwise modified in a manner requiring 77 | permission under the Copyright and Similar Rights held by the 78 | Licensor. For purposes of this Public License, where the Licensed 79 | Material is a musical work, performance, or sound recording, 80 | Adapted Material is always produced where the Licensed Material is 81 | synched in timed relation with a moving image. 82 | 83 | b. Adapter's License means the license You apply to Your Copyright 84 | and Similar Rights in Your contributions to Adapted Material in 85 | accordance with the terms and conditions of this Public License. 86 | 87 | c. BY-SA Compatible License means a license listed at 88 | creativecommons.org/compatiblelicenses, approved by Creative 89 | Commons as essentially the equivalent of this Public License. 90 | 91 | d. Copyright and Similar Rights means copyright and/or similar rights 92 | closely related to copyright including, without limitation, 93 | performance, broadcast, sound recording, and Sui Generis Database 94 | Rights, without regard to how the rights are labeled or 95 | categorized. For purposes of this Public License, the rights 96 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 97 | Rights. 98 | 99 | e. Effective Technological Measures means those measures that, in the 100 | absence of proper authority, may not be circumvented under laws 101 | fulfilling obligations under Article 11 of the WIPO Copyright 102 | Treaty adopted on December 20, 1996, and/or similar international 103 | agreements. 104 | 105 | f. Exceptions and Limitations means fair use, fair dealing, and/or 106 | any other exception or limitation to Copyright and Similar Rights 107 | that applies to Your use of the Licensed Material. 108 | 109 | g. License Elements means the license attributes listed in the name 110 | of a Creative Commons Public License. The License Elements of this 111 | Public License are Attribution and ShareAlike. 112 | 113 | h. Licensed Material means the artistic or literary work, database, 114 | or other material to which the Licensor applied this Public 115 | License. 116 | 117 | i. 
Licensed Rights means the rights granted to You subject to the 118 | terms and conditions of this Public License, which are limited to 119 | all Copyright and Similar Rights that apply to Your use of the 120 | Licensed Material and that the Licensor has authority to license. 121 | 122 | j. Licensor means the individual(s) or entity(ies) granting rights 123 | under this Public License. 124 | 125 | k. Share means to provide material to the public by any means or 126 | process that requires permission under the Licensed Rights, such 127 | as reproduction, public display, public performance, distribution, 128 | dissemination, communication, or importation, and to make material 129 | available to the public including in ways that members of the 130 | public may access the material from a place and at a time 131 | individually chosen by them. 132 | 133 | l. Sui Generis Database Rights means rights other than copyright 134 | resulting from Directive 96/9/EC of the European Parliament and of 135 | the Council of 11 March 1996 on the legal protection of databases, 136 | as amended and/or succeeded, as well as other essentially 137 | equivalent rights anywhere in the world. 138 | 139 | m. You means the individual or entity exercising the Licensed Rights 140 | under this Public License. Your has a corresponding meaning. 141 | 142 | 143 | Section 2 -- Scope. 144 | 145 | a. License grant. 146 | 147 | 1. Subject to the terms and conditions of this Public License, 148 | the Licensor hereby grants You a worldwide, royalty-free, 149 | non-sublicensable, non-exclusive, irrevocable license to 150 | exercise the Licensed Rights in the Licensed Material to: 151 | 152 | a. reproduce and Share the Licensed Material, in whole or 153 | in part; and 154 | 155 | b. produce, reproduce, and Share Adapted Material. 156 | 157 | 2. Exceptions and Limitations. For the avoidance of doubt, where 158 | Exceptions and Limitations apply to Your use, this Public 159 | License does not apply, and You do not need to comply with 160 | its terms and conditions. 161 | 162 | 3. Term. The term of this Public License is specified in Section 163 | 6(a). 164 | 165 | 4. Media and formats; technical modifications allowed. The 166 | Licensor authorizes You to exercise the Licensed Rights in 167 | all media and formats whether now known or hereafter created, 168 | and to make technical modifications necessary to do so. The 169 | Licensor waives and/or agrees not to assert any right or 170 | authority to forbid You from making technical modifications 171 | necessary to exercise the Licensed Rights, including 172 | technical modifications necessary to circumvent Effective 173 | Technological Measures. For purposes of this Public License, 174 | simply making modifications authorized by this Section 2(a) 175 | (4) never produces Adapted Material. 176 | 177 | 5. Downstream recipients. 178 | 179 | a. Offer from the Licensor -- Licensed Material. Every 180 | recipient of the Licensed Material automatically 181 | receives an offer from the Licensor to exercise the 182 | Licensed Rights under the terms and conditions of this 183 | Public License. 184 | 185 | b. Additional offer from the Licensor -- Adapted Material. 186 | Every recipient of Adapted Material from You 187 | automatically receives an offer from the Licensor to 188 | exercise the Licensed Rights in the Adapted Material 189 | under the conditions of the Adapter's License You apply. 190 | 191 | c. No downstream restrictions. 
You may not offer or impose 192 | any additional or different terms or conditions on, or 193 | apply any Effective Technological Measures to, the 194 | Licensed Material if doing so restricts exercise of the 195 | Licensed Rights by any recipient of the Licensed 196 | Material. 197 | 198 | 6. No endorsement. Nothing in this Public License constitutes or 199 | may be construed as permission to assert or imply that You 200 | are, or that Your use of the Licensed Material is, connected 201 | with, or sponsored, endorsed, or granted official status by, 202 | the Licensor or others designated to receive attribution as 203 | provided in Section 3(a)(1)(A)(i). 204 | 205 | b. Other rights. 206 | 207 | 1. Moral rights, such as the right of integrity, are not 208 | licensed under this Public License, nor are publicity, 209 | privacy, and/or other similar personality rights; however, to 210 | the extent possible, the Licensor waives and/or agrees not to 211 | assert any such rights held by the Licensor to the limited 212 | extent necessary to allow You to exercise the Licensed 213 | Rights, but not otherwise. 214 | 215 | 2. Patent and trademark rights are not licensed under this 216 | Public License. 217 | 218 | 3. To the extent possible, the Licensor waives any right to 219 | collect royalties from You for the exercise of the Licensed 220 | Rights, whether directly or through a collecting society 221 | under any voluntary or waivable statutory or compulsory 222 | licensing scheme. In all other cases the Licensor expressly 223 | reserves any right to collect such royalties. 224 | 225 | 226 | Section 3 -- License Conditions. 227 | 228 | Your exercise of the Licensed Rights is expressly made subject to the 229 | following conditions. 230 | 231 | a. Attribution. 232 | 233 | 1. If You Share the Licensed Material (including in modified 234 | form), You must: 235 | 236 | a. retain the following if it is supplied by the Licensor 237 | with the Licensed Material: 238 | 239 | i. identification of the creator(s) of the Licensed 240 | Material and any others designated to receive 241 | attribution, in any reasonable manner requested by 242 | the Licensor (including by pseudonym if 243 | designated); 244 | 245 | ii. a copyright notice; 246 | 247 | iii. a notice that refers to this Public License; 248 | 249 | iv. a notice that refers to the disclaimer of 250 | warranties; 251 | 252 | v. a URI or hyperlink to the Licensed Material to the 253 | extent reasonably practicable; 254 | 255 | b. indicate if You modified the Licensed Material and 256 | retain an indication of any previous modifications; and 257 | 258 | c. indicate the Licensed Material is licensed under this 259 | Public License, and include the text of, or the URI or 260 | hyperlink to, this Public License. 261 | 262 | 2. You may satisfy the conditions in Section 3(a)(1) in any 263 | reasonable manner based on the medium, means, and context in 264 | which You Share the Licensed Material. For example, it may be 265 | reasonable to satisfy the conditions by providing a URI or 266 | hyperlink to a resource that includes the required 267 | information. 268 | 269 | 3. If requested by the Licensor, You must remove any of the 270 | information required by Section 3(a)(1)(A) to the extent 271 | reasonably practicable. 272 | 273 | b. ShareAlike. 274 | 275 | In addition to the conditions in Section 3(a), if You Share 276 | Adapted Material You produce, the following conditions also apply. 277 | 278 | 1. 
The Adapter's License You apply must be a Creative Commons 279 | license with the same License Elements, this version or 280 | later, or a BY-SA Compatible License. 281 | 282 | 2. You must include the text of, or the URI or hyperlink to, the 283 | Adapter's License You apply. You may satisfy this condition 284 | in any reasonable manner based on the medium, means, and 285 | context in which You Share Adapted Material. 286 | 287 | 3. You may not offer or impose any additional or different terms 288 | or conditions on, or apply any Effective Technological 289 | Measures to, Adapted Material that restrict exercise of the 290 | rights granted under the Adapter's License You apply. 291 | 292 | 293 | Section 4 -- Sui Generis Database Rights. 294 | 295 | Where the Licensed Rights include Sui Generis Database Rights that 296 | apply to Your use of the Licensed Material: 297 | 298 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 299 | to extract, reuse, reproduce, and Share all or a substantial 300 | portion of the contents of the database; 301 | 302 | b. if You include all or a substantial portion of the database 303 | contents in a database in which You have Sui Generis Database 304 | Rights, then the database in which You have Sui Generis Database 305 | Rights (but not its individual contents) is Adapted Material, 306 | including for purposes of Section 3(b); and 307 | 308 | c. You must comply with the conditions in Section 3(a) if You Share 309 | all or a substantial portion of the contents of the database. 310 | 311 | For the avoidance of doubt, this Section 4 supplements and does not 312 | replace Your obligations under this Public License where the Licensed 313 | Rights include other Copyright and Similar Rights. 314 | 315 | 316 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 317 | 318 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 319 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 320 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 321 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 322 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 323 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 324 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 325 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 326 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 327 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 328 | 329 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 330 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 331 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 332 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 333 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 334 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 335 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 336 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 337 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 338 | 339 | c. The disclaimer of warranties and limitation of liability provided 340 | above shall be interpreted in a manner that, to the extent 341 | possible, most closely approximates an absolute disclaimer and 342 | waiver of all liability. 343 | 344 | 345 | Section 6 -- Term and Termination. 346 | 347 | a. 
This Public License applies for the term of the Copyright and 348 | Similar Rights licensed here. However, if You fail to comply with 349 | this Public License, then Your rights under this Public License 350 | terminate automatically. 351 | 352 | b. Where Your right to use the Licensed Material has terminated under 353 | Section 6(a), it reinstates: 354 | 355 | 1. automatically as of the date the violation is cured, provided 356 | it is cured within 30 days of Your discovery of the 357 | violation; or 358 | 359 | 2. upon express reinstatement by the Licensor. 360 | 361 | For the avoidance of doubt, this Section 6(b) does not affect any 362 | right the Licensor may have to seek remedies for Your violations 363 | of this Public License. 364 | 365 | c. For the avoidance of doubt, the Licensor may also offer the 366 | Licensed Material under separate terms or conditions or stop 367 | distributing the Licensed Material at any time; however, doing so 368 | will not terminate this Public License. 369 | 370 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 371 | License. 372 | 373 | 374 | Section 7 -- Other Terms and Conditions. 375 | 376 | a. The Licensor shall not be bound by any additional or different 377 | terms or conditions communicated by You unless expressly agreed. 378 | 379 | b. Any arrangements, understandings, or agreements regarding the 380 | Licensed Material not stated herein are separate from and 381 | independent of the terms and conditions of this Public License. 382 | 383 | 384 | Section 8 -- Interpretation. 385 | 386 | a. For the avoidance of doubt, this Public License does not, and 387 | shall not be interpreted to, reduce, limit, restrict, or impose 388 | conditions on any use of the Licensed Material that could lawfully 389 | be made without permission under this Public License. 390 | 391 | b. To the extent possible, if any provision of this Public License is 392 | deemed unenforceable, it shall be automatically reformed to the 393 | minimum extent necessary to make it enforceable. If the provision 394 | cannot be reformed, it shall be severed from this Public License 395 | without affecting the enforceability of the remaining terms and 396 | conditions. 397 | 398 | c. No term or condition of this Public License will be waived and no 399 | failure to comply consented to unless expressly agreed to by the 400 | Licensor. 401 | 402 | d. Nothing in this Public License constitutes or may be interpreted 403 | as a limitation upon, or waiver of, any privileges and immunities 404 | that apply to the Licensor or You, including from the legal 405 | processes of any jurisdiction or authority. 406 | 407 | 408 | ======================================================================= 409 | 410 | Creative Commons is not a party to its public 411 | licenses. Notwithstanding, Creative Commons may elect to apply one of 412 | its public licenses to material it publishes and in those instances 413 | will be considered the “Licensor.” The text of the Creative Commons 414 | public licenses is dedicated to the public domain under the CC0 Public 415 | Domain Dedication. 
Except for the limited purpose of indicating that 416 | material is shared under a Creative Commons public license or as 417 | otherwise permitted by the Creative Commons policies published at 418 | creativecommons.org/policies, Creative Commons does not authorize the 419 | use of the trademark "Creative Commons" or any other trademark or logo 420 | of Creative Commons without its prior written consent including, 421 | without limitation, in connection with any unauthorized modifications 422 | to any of its public licenses or any other arrangements, 423 | understandings, or agreements concerning use of licensed material. For 424 | the avoidance of doubt, this paragraph does not form part of the 425 | public licenses. 426 | 427 | Creative Commons may be contacted at creativecommons.org. 428 | -------------------------------------------------------------------------------- /ds-102-book/content/chapters/03/00_figure_generation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "d5d19570", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "from scipy import integrate, stats\n", 13 | "\n", 14 | "%matplotlib inline\n", 15 | "\n", 16 | "import matplotlib.pyplot as plt\n", 17 | "import seaborn as sns\n", 18 | "\n", 19 | "sns.set()" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "id": "9636d88f", 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "data": { 30 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAxoAAACuCAYAAACx83usAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjcsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvTLEjVAAAAAlwSFlzAAAPYQAAD2EBqD+naQAAOW5JREFUeJzt3Qd0VEUXB/Ar6YSElkJLQhokIQkQOoROgNA7BJDepQgiKIoICFJF6U0QUVEB5ZMuSJcSCCX0DtKR3qt8587yXhZIQjZ5u+/tvv/vnBxmI+yOb7Mvc2fu3HnnxYsXLwgAAAAAAEBBmZR8MgAAAAAAAAQaAAAAAABgFljRAAAAAAAAxSHQAAAAAAAAxSHQAAAAAAAAxSHQAAAAAAAAxSHQAAAAAAAAxSHQAAAAAAAAxSHQAAAAAAAAxSHQAAAAAAAAxSHQAAAAAAAAxSHQAAAAAAAAxSHQAAAAAAAAxSHQAAAAAAAAxdkr/5Sgd7du3aJ169bRs2fPqGLFiuTt7a12lwDAyuzZs4cOHDhAfn5+FB0dTZkyYV4MANLu7t27Yizy6NEjcQ/JmzcvLp8KcOcGRc2cOZN8fX2pcePG1Lx5c9EeN24cvXjxAlcaAN7qypUrVL16dYqKiqI2bdqIyQpuHzt2DFcPANLkxx9/FJMUDRo0oBYtWoj2559/Tv/99x+uoIW98wIjQFDIqFGj6OOPP072v/EHfMiQIbjWAJCif//9l8qUKUMnT5584795ennRzvh4MWAAAEjJtGnTqEePHsn+t/fff58mTJiAi2dBCDRAEcuWLaO6devKj8vWbETubm60auE8+Xt//vknxcTE4IoDwBueP39O1apVow0bNojH2Ty8qWbTNrRl1RI6f/q4+F7xEiVo+7ZtZGdnhysIAG/YuHEjVa1aVdxPxD2jci3KnScvLVvwLb14uZqxaNEikXUBloFAAzLszp07FBoaShcvXhSPm3T5gLr2+VC0f50zhWaNHy7a+QMC6MihQ+Tk5ISrDgCvmDp1Kr333nuinTWnF01csJzy5PWhO7duUq+4WnTxn9Piv02ePFn+ewAAEt6LERERQSdOnBCPY1t2pr6DhtE777xDy3+dT18PNYxLvHPlohPHj1OWLFlw8SwAezQgwzgtSgoywktXpC69+8v/rUm77hRRrLRonzl1iubO/Q5XHADe2JcxaNAg+XH/UVNEkMHcs2Wn/l8kpToMG/6FGFAAABgbPXq0HGQERRSn3gOHiCCD1WramkpVrGa431y+TFOmTsXFsxAEGpAh58+fFzOMzMHJmfp9Nlr+YIsfsEyZqOuHSXszRn75JT19+hRXHQBkY8aModu3b4t2dK3GVLps+VeuDk9WRFerLdpXr1ymb7/9FlcPAGTXr1+nsWPHiradnT31HTKG7O2TCqvyuKRL/6TAg4vUPHjwAFfQAhBoQIbwB1sKHGo270A+fvnf+DsFI4pS8XKVRfvcP2fFfg4AAGkD+PTp00XbwdGJuvcfnOyFadm1j9ye8M1EVLIDANk333xD9+/fF+1K9VtQgdCwN66Ob0AwVahh2Et67d9/6eeff8EVtAAEGpChdAcuZ8scnV2oVcfuKf7dRm26yO1pM2bhqgOAwBVgpJnFyvXjyMs7V7JXJjgskiJLlBHtk8eP0d9//40rCABiNXTixIniStjZO1Cbrr1TGYt0ldvTZ2EsYgkINCDdeBZSypWu2rA15fTwTPHvRpWpQF65DYfl/LVmtUi5AgB9e/jwoShFyewdHOndLr1S/fuxjVvJ7akzDJMcAKBvc+fOlVMvy8U2ojz5fFP8u6GRUZQ/qKBo79y+jY4cOWKxfuoVAg1IFz71e9bL2YB3MmWiuPZJKxbJ4XKU1Rs0F20+MOfXhYtw5QF07pdffqFbt26JdsmqdShXntRP7i0fU5tc3dxFe9kff9C
TJ08s0k8A0CY+Ck5KvWRxHZI/P0PCezRqNIyTH/+04Gez9g8QaEA68T6LCxcuiHbhslUo98sKMampWLOe3EagAQDSagZr2LLdWy+Ik7MLla5UXbTv3rlNa9euxUUE0DE+d+fo0aOiXaBIKQoqYFitSI20T4MtWrzYrP0DBBqgwAChfou2afo3foEFyScgWLTjt2+ly5cv4/oD6NTu3bspPj5etH2CwqhwVIk0/Tte1ZD89MtCs/UPAKxrLFKneZs0/RtO4w6JjBLtwwcPyCVxwTyQOgUmO3funDjlm3nk8aGyFaum6d/xkmX5lyUqebnzt99/x9UH0HFetSS26buvlMVOTfFylcjZJbNoL1/2h3wCMADoy40bN2jJkiWi7Z7dg6rWrJPmf2s8YfELUrnNCoEGmOznn5NyGivVbirOykirctVi5fayFStx9QF0useL92dIJW1r1m+c5n/L6VMlog3lsm/duEG7du0yWz8BQLsWLVokl9cvV7MBOTo6pfnflqtaS26vWLnKLP0DAwQaYLKffvpJbseaMEBgQaERlDV7DtHevHEjDu8D0KG//vpLnJ/BipSrSm4vN3inlXQuD1u2AoMEAD0yHovUrN/EpH+b18+fcvv4yanc0hkcoDwEGmCSQ4cO0d69e0XbP7Qw+foHmvYDlykTRZWpKNr37t6hHTt24B0A0Jkff/xRbler28jkfx9VtoLcXv0yjRMA9JXCvWnTJtH29g2g0PDCJj9HsbKGscizp09p3br1ivcRDBBogEkWLFggtyvVapiuqyd9uNlSpE8B6Aofzvf7y/1ZLq5uFF05xuTnyJXXl/LlN0xy7N65g+7cuaN4PwFAuzj1kvd6sgo1G6R5j5exYmUrye3lSJ8yGwQakGb8oZaWKvnsjOp10xlolEv6cP+5GrORAHorjX3v3j3RLlmllkl51a9vCme8GXzt2r8U7SMAWFMKt2lpU5KipaIpk52daK9Zg7GIuSDQgDTjNKdTp06JdlixsuTh6Z2uq+fhlUs+mTNx7266e/cu3gUAHQ4QqqcjbUoipWCyNX+ty3C/AMA6HD58mPbs2SOncPvkD0jX8/DhnyERhjK3p04cR8l9M0GgAekaIFROZ9qUJKJYafmU8O3bt+NdANCBmzdv0ooVK0Q7a04vKlY6Ot3PFR5VUk6X2LJli2J9BADrGYukN4VbElncMBZhmzZtztBzQfIQaIDJ5SjtHRxNqlednPCXgQbbvBkfbgA9WLx4sVxprmxMXbJ7mbaQHm5Zs1H+4BDRPnQgESujADpM4Y6p0yBDzxderJTc/mvDxgz3D96EQAPSZN26dXT16lXRLlKuCmVxz5qhK8ezkRIEGgD6qzZVs0HTDD9feFQpeWX077+3Zvj5AEDb4uPj5RTu0Kgy5OmVK0PPV6hICayMmhkCDTB5qbJq7fTnVUu8cuelnLnyyns/njx5gncCwIadP3+eNm40zBh6+/inqxxlSoEGw2wkgO1TMoWb8aSpf3CoaB85uB8V7MwAgQa81cOHD+m3336Ty1FWqFpdkasWFFFMfv7du3fjnQDQSTnK6HSWo3xdhFHaw5bN2KcBoKsU7ti6ijxvoZcZFrwyunUrVkaVhkAD3mr58uVy/nOJyrHk6OSsyFULiiwht5E+BaCjU3wzUG3KmGeuPJQrr49o70nYiZVRABu2fv16unLlimgXLleF3DKYwv16cRqGlVHlIdAA007xraPMAIEFv1zRYKg8BWC7jhw5Iq9a+hWMoPyBwYo9d6GX6VOPHz+iffv2Kfa8AKDdsYgSKdzJ7Rndtg1VMJWGQAPSXI7SPYcnlSxbXrErlss3kFyzuMkbvADA9gcIFWIzViXmdSERReX23xgkANh8CrezaxaqUCVGsefmldGcLzeV79uzW6RQgXIQaECqFi1aJKcjlI2pl6FylG/88NnZUUThovJG0UuXLuHdALDxcpQ16yk3E/l6oLFtByYsAGzR0qVLk1K4K9UiJ2cXs9xH7t29Q8eOHVP0ufUOgQakeSayRv0mil+tiKKGUznZzp078W4A2BhOi5TKUYZElSEv79yKPn9ASCGyt3cQ7QTcQwBsfiwSU7ex4s9f0HjCYvsOxZ9fzxBoQIrOnTtnVI4ygApFFlH8akUWTdqngfQpANseIFSupexqBnN0dKKAgmGiferEMbp9+7birwEA6rlx4watXLlStLN6eFPJstFmDTT+RqChKAQakKIFCxbI7fKxypSjfF1hBBoANotPAZfKUTo4OlE1hcpRpjRI4DStXbt2meU1AEAdCxcuFPcSVjamrqIp3JKChZLO9dmFlVFFIdCANM1ExpohbYrlypOXcuXKJadOYRMWgO1Ys2YNXbt2TbSLRFcjN3d3s7zOqxvCkfYAYLsp3E3N8hqubu7k4x8k2ocOJNLjx4/N8jp6hEADkrV//35KTEwU7YCwouSbP8AsV4pXSUqWNJSWu3XrFp04cQLvCICN+OGHH+R2VQVLY6eW9rAdFewAbMaZM2fkc7Zy+QVSWHik2V6rYIQhPfzpkyfy+AcyDoEGJOvbb7+V25UUrFednBIlkg7u27EDs5EAtpJXLZWjdHXPRuUrVzPba/FMZGbXLKK9exeKSgDYirlz58rtCrGNzJLCLSkYnjRhsWXrNrO9jt4g0IA3PHr0iObPny/nVdcyU9pUcoGGdKgXAFh/uoOUfhAd20hs2jaXTJkyUXCYYabzyqWLdPXqVbO9FgBYxvPnz2nOnDlyaey6jVuY9fWMV0bjdyWY9bX0BIEGvOH3338Xs5GseKVYypo9u1mvUtGiSR/uPXv24B0BsHK8KXvWrFny43pNW5v9NYNCI+R2QgImLACs3erVq8UZWyyydGWxp9Oc/INDxKQF27d3r1lfS08QaMAbZs+eLbdrN21l9ivk5eVFefMabiB79+4VgxQAsF5c2IH3ebGAQkWpQKih/Kw5BYaGy23MRgLY1lgktnFLs7+es0tmeUP48aOH5cOKIWMQaMAr+ETMdevWibZXvvxUsozy9apTW9XgGvinT5/GuwJgxaZPny63qzcy/wCBBRkFGglYGQWwahcuXBCngbOsOb2oYtXqFnldaWX02dOndPDgQYu8pq1DoAGv+Prrr+V2TMOWZt14ZQzpUwC24cqVK3I5ShdXN6pZt6FFXtfXP5gcnZxFO3Ef0h4ArNnkyZPp2bNnol2pbnOyd3CwyOsap2BiZVQZCDRAdv36dfruu+9E28nFlRq2aGOxq2McaGBDOID1mjZtmpxyULlBHLm+rAZlbnb29hRQIFS0z546SXfu3LHI6wKAsu7fv08zZswQbQ4wmrbpaLFLbLwyGr8TG8KVgEADZDNnzqSHDx+KdoU6TSlrtmwWuzpY0QCwjYp1U6dOFe1MdnbUvG1ni75+oNFsJO/3AgDrM2/ePLp586Zol6xaj7y9DYf6WkJgSFKgsXcvitMoAYEGyDMIEyZMEG1Ol2rerotFr4yfnx9lf1ndCpWnAKx38+a///4rV6zLk9fHoq9vPBu5A+UpAawOr4aOGTNGftyifVeLvr
5b1myU6+V96/DBA6LELmQMAg2Q8yHlAULlWuRnppPAU8LBjbSqcfnyZfEFANaDV0NHjhwpP47r1MvifQgymo1M2I3ZSABrw+dmnD17VrTDS1WgUDOeBP62ldGHD+7TiRMnLP76tgaBBohcZmkGgQ/Fad9rgCpXBelTANa9N+PSpUuiHVWhBoVHFrZ4H/wLhIqULYY6+ADWN1kxYsQI+XE7lcYixhMWOxOwTyOjEGgAff755/IBfaWq1aPgAgVVuSoINACst9LUsGHD5NXJDr3VGSA4ObuQr1EdfOlkcgDQPp7wlA7oK1yuKhUuWlyVfhinYO7chcM/MwqBhs4dOHCAJk6cKNoOjk7Uuc9HqvUFgQaAdRo4cKA4A4eVjW1MBUMLqdYXKe3h+bNnqIMPYCVOnTpFo0aNEm1elezc9xPV+mJc4hZ7RjMOgYbON1116NBB3uxUt00P8s2fX7X+FCxYkFxcXEQbJW4BrMPKlStFlRiWOYs7vffhZ6r2xzjtYcfOXar2BQDejscgnTp1ElXrWEyTdlQwNEy1S5fTKxdly5FTtA8k7qMXL16o1hdbgEBDxwYPHkw7d+6UTwFv3623qv2xs7OjyMhIeXZDmiEFAG3iog1t27aVH7foMYA8vbxU7ZNx2sMubAgH0LzRo0fT+vXrRTubhzd1fl+d1EsJp38GhhhWNW7euC5OKYf0Q6ChU/Pnz5c3gNvZO9CAUVPJ+eVqgpqM06cSExNV7QsApF4Su379+nK1uogylan5ux1Uv2TGdfAT9+5TtS8AkLolS5aISU9pgP/ByEmU1T2r6pfNeMJiNyYsMgSBhg4tXLhQpExJWvQYSIWLRpEWFClSRG7jwC0Abbp79y41aNCA4uPj5VnIQV9OpEyZ1P+VwnXwvfPkE+1DB/fTf//9p3aXACAZy5cvp7i4OPkzWr99bypdroImrpXxhMXO3dgQnhHq/1YAi3n27JmoMNW8eXPRZpUbtqa2Xd7TzLuAQANA2w4dOkTR0dG0du1a8djZNQsNnfoDeXh6klZIg4QH9++JNEwA0NaejLFjx4oVUWlfRqmY+tRN5ZQpY4EhSQUt9uzZq2pfrB0CDR3gD/LPP/8s9j8MHTpU3tgUXbspDfh8lFiu1IqIiAh5VhQrGgDacfLkSerdu7dIb5TSGjO7ZaUhk36gsEJJVVq0NkhA2gOAdgrQ/Pbbb1S8eHEaMGCAXIiGDwkePGai2KepFXn9AkS5bLY/ESmYGWGfoX8NmnT16lVKSEgQX7t27aINGza8srGaS8c17tSPOvXsq4lUB2OZM2emAgUK0JEjR0Tp3adPn5KDg4Pa3QLQFV7xPHz4sLh/8H1k+/bt4k9jufMH0afjZ1OBkBDSmtfTHpo1a6pqfwD06Pr166+MRTZu3Ci+J+FJztqtu1PPDz/RVJDBuD98AOiRxN30z5nT4mBjd3d3tbtllRBo2IDTp0/TH3/8Qdu2bRMDgrNnz6b4dwMjilG3AUOpSJQ6B+GkNX2KAw2e/eA/eZUDAMyHA/p169aJdCgpqOBTepPj5JKZajRrT516fkAumTNr8m0x3siJtAcAy+DD9v73v//R1q1bxX0ktbRF3wLh1HXAUCpZppxm3x6esOBAg/EqLqeMgukQaFgpXnJctGgRffXVV/KGzJS4ZHGjIuWqUc0GzalM+YqaSpVKDqdmcKqXlD6FQAPAPM6dO0fjx4+n77//nm7evJnq380XGELRNRpQvaZx5Onlrem3xCt3PsrinpXu3bmNtAcAM+KN3DzRyfeRLVu2pPp3nVxcqXDZylS9fjOqUCVG82MR4wmL+F0JCDTSCYGGFeLZgs6dO4tNma9zdHYhvwLhFFSoMBUoFEmFIouSn3+g5lKk0rohnE/lfPfdd1XtD4Ct4dWK4cOH07hx48Rqxus8cvtQQFhhCg6NoJDIIhQWXpjcs2YjayHq4BcsRPt2bqWrly+JdFIvlc/3ALA1vPLJY5HkTs92cHQi3wKFKCiMxyKFKSyyKPkHBmkuRSqtKZgJKHGbbgg0rAivYnzyySfi/AvjkyrFTGP1elSucjUKLhhGdvbW/bYWLlxYbmNDOICyeO9Fw4YN6ejRo/L3HJycqVj56lS+ei0qXqoc5fDQTgWpjAwSONBg+/bto5iYGLW7BGATePwxYsQIGjJkyCvlo3P5BVL56vWpXJUYKhgaTvZWvr/SPzhETNLy/2PiPmwITy/rHpHqrG59ixYtaMWKFfL3/ApGUMe+n1DpaO2nQ5nC29ubcufOTZcuXRKBBt/UbOn/D0AtK1euFOWt+X4iHdYZG9eRWnd6j3LaQHCRUuWpnQm7EWgAKFTFsk2bNuI8LknegILUvs8gqlC1uk39rnZ2yUx58wfSuVPH6eiRQyhOk04INKzAvXv3qFatWnL+I1eNiuvxEbXu1J3srXz1IrX0KQ40OG+c88h9fX3V7hKAVVu6dCk1btxYTpXKFxhKg8ZMpeCQULJFSHsAUD7I4NXQVatWicccVDTq+D51fK8vOTg62uTl5hRMDjSeojhNullP4r5OPX78mOrVqycHGVy3/vMpP1G7br1sNshgOLgPQDlcTapJkyZykBFVoQZN+WmZzQYZzDcgmOztDakbqIMPkPGS182aNZODDK4+99FXc6hb34E2G2QwnMmTcQg0NIxThnr06EHr168Xj11c3Wj49AWicpStQ6ABoIxjx46JIIPLRbNS1erSiEnfUuYsrjZ9iXnwkz/YcMbHyePH6MGDB2p3CcBqDRw4UKyKSkVnPps0n6pUjyVb98qZPAmGUrdgGgQaGjZ58mSaM2eOXMHh8yk/UGSRKNIDBBoAGceHTPGKqHRgZ+GyVejzcVNtejU0udlI3szJB4ACgOm4/DWX0mdcbOb90bM0ff6FkoKMAo09e/eq2hdrhUBDo7hKSv/+/eXHPT4bR1ElSpFeBAUFkaurYcYVlacA0qd3795ydak8/gXo869mWH0lmPTORu5GeUoAkx0/fpy6d+8uP+7x8QiKKKmfg+uye3hSDk/DuUEH9+97peInpA0CDY3WuG/VqpWc6lAzrhPVadiU9IRLykllbvnk81u3bqndJQCrwlVh5s2bJ9rOmV1p6MS5lMXNjfTEOL96FwINAJPwnq7WrVvLaYedOnWiWs3a6O4qSveR27duieI0YBoEGho0ePBgOnjwoFwZ5r0PPyU9Mk6f4hUeAEibK1euUNeuXeXHnQYMp/wBgbq7fFwxRoKVUQDTjBo1iuLj40U7ODiYvv76a11eQuOVUdxHTIdAQ2MSExPlD7O9oyN9NHoKOTs5kx5hnwZA+nz44YeiNDQrUaUW1WsSp8tL6ermTrnyGUpjHz54QBx6CgBvd+LECXEoH+PTvH/44Qc5nVlvgoxXRrEh3GQINDSENyx269ZN/mXYoF0vKhgaRnqFQAPAdBs2bKD58+fL5bD7fTbapg7RMlVQSIT488GD+3Ty5Em1uwOgebwPoWfPnqK8Puvbty+VLFmS9OqVM3n2YEO4qRBoaMjcuXNp27Ztou3t409tu/UiPQsPDxd7NRiWK
wHejvd1cUlsSaueH5GHp22d+J2RfRp79uxRtS8A1mDx4sW0evVq0c6XLx8NGTKE9Cy3T35xSjhL3IdAw1QINDR0+vcnn3wiP+4x6EvdpkxJXFxcKCTEUAef96xIm+MBIHkzZsygw4cPi7Z/WGFq0rKt7i+V8WwkAg2A1PEqBqdeSr755hvKkiWLri8bp44FFDRkl5z/5yyK05gIgYZGjB8/XmzgZMUq1qSyFSqp3SVNKFq0qFz9QhpAAUDyZ2YMGzZMftxz0JfyiqCeGa9ooKgEQOqmTZtGZ86cEe1q1apRw4YNccleS5/CfcQ0+C2kARxgjB07VrQz2dlT1w/0WWUqOdinAZA2fA+5du2aaJesWocii+rjcM+38cyVh7K4ZxNtpGACpIwP9vziiy/kx2PGjNH1/q6UDu7bjRRMkyDQ0ACehbx//75oV67fgvwDg9TukiYDDaQ9ACTv0qVLRif3OlDXD5LSMPWOB0r5CxjSHi5fviy+AOBNo0ePpuvXr4s2n+UlZRTAa2fyJGCvlykQaKiMlyhnzpwp2k4urtSxV1JuJJB8aB/DbCRA8kaOHCkfqlWlYSvy9fPHpTKSvyDSHgBSc/XqVbm0vqOj4ysrG0CUPzhETkXdhw3hJkGgoYEDcZ49eybateI6kqeX4ah7MPD09KS8efPKgQaX3QOAV1czZs2aJdpOLpmpQ88PcHleI61oSPcRAHhzn+jDhw9Fu3v37pQ/f35cIiNOzi7k42/INjl65DCK05gAgYaK+Cj7OXPmiLZzZldq2aG7mt3RfPoU54+ePXtW7e4AaG5vhlTvvmqjd8nDQ9/lbJPjVwAlbgFSwnu7pkyZItpOTk40YMAAXKxUNoQ/e/qUDh06hGuURgg0VM6H5GpKrHrTdpQte3Y1u6NZ2BAOkHIhienTp4u2o5MzteqUdIYGJMnjF0AOjk6ijRUNgFdNmDBB3ifaqVMnypMnDy5RMnAmT/og0FDJxYsXafbs2aLt5JyZWnXEakZKEGgAvD3doUK9OPLw8MKlSoa9vQP5BBYQ7WPHjsmDKgC9u3nzJk2aNEm0HRwcaODAgWp3ySpK3O7ajQ3haYVAQwPpDtUav0s5cnqo1RXNM658gdlIgKR0h6lTp8rpDjXjOuPSpML/ZfoU7/Pav38/rhXAywP57t69K65F+/btycfHB9clTSsa2OuVVgg0VMDl4/gEX+aAdIe38vf3Jzc3N9FGoAFgMHnyZHlmvlWb9pTNA4UkUoMN4QCvunfvHk2cOFG07e3t6eOPP8YlSkW2HB7k4Z1btA/sT0RxmjRCoKHSyZtSukOlus1RaeotuKScVOaWN4PfuHHD/G8SgIbx/UPavGlnZ0fde/dVu0ua518waTYSExYARHPnzhWpU6xly5aoNGXCqsbdOyhOk1YINCzs0aNHYiaSvZMpE7Xo0M3SXbD6fRr79u1TtS8Aavv+++/lU8CbN29O+Xx81e6S5vkFh8ptHP4Jevf8+XP53Az2wQcoi50WgUZn8uA+kjYINCzsp59+EpViWLEKNXCwVhphQziAwX///SefAs4wQEgbF9csFBRkqIOfmJgon18EoEdLliyhU6dOiXZMTAxFRkaq3SWr26exM2G3qn2xFgg0LIg3IXKVGElcR5SiTCsEGgAGy5YtE5WTWOXKlSkqKgqXxsT7CK8sHz9+HNcNdGvcuHFyu3///qr2xVorT+3GhvA0QaBhQatWrZIPeQmKKEaRUSUs+fJWrVChQiIXnSG/GvTMeLICqxmmkfZ6MdxHQK+2bt1K27dvF+2IiAixogFpk9vHjzK7ZhHt/YlI404LBBoqzSA0bou9GaZwdnamsLAw0eZgTSoNDKAnO3fupE2bNol2SEgIxcbGqt0lq4KVUYBXxyL9+vWjd955B5fFhOI0AS8LS1w8fw7FadIAgYaF8KahdevWibZnXj+qXL2WpV7a5gYJnFstrQwB6Hk1g3/pQdrhTB7QuxMnToj9GSx37twUFxendpesep8GitO8HX5LqTBAqN+qs5wGBOmbjdy9G5uwQF/OnDlDCxcuFG0vLy9q3bq12l2yOjyw8vT0lO8hvG8OQE8mTJgg/9z36tVLHPYJ6d+nsSN+Fy7fWyDQsIDz58/TL7/8Itqu7tmpXlPMIKRHsWLF5HZ8fLxi7w+AtZzgyxWnWM+ePUU6IZiGU0Sk+wiXB+bgDUBPhwXz2RnM1dWVunbtqnaXrFJwWFKFrm07dqjaF2uAQMMC+ORNqZRiTOPW5JLZ1RIva3N4gCCliuzAhxt05NatWzR79mzRdnFxoe7du6vdJatVqlQpuY37COjJ9OnT5cOCO3ToQDly5FC7S1bJPziEnF1cRDth5061u6N5CDTM7M6dOzRjxgzRtndwoGZtOpn7JW1WlixZKDzcsGS5f/9+unfvntpdArCImTNnyj/v7dq1Iw8PD1z5dEKgAXrEJZ0nTZok2jxh9/7776vdJatlZ29PwWGGCnYXzv9Dly9fVrtLmoZAw8y+/fZbEWywsjUakqeXt7lfUheDBE4hSUhIULs7AGb35MkTsSoqpf707dsXVz0DSpYsKbelEp8AejosuFGjRhQQEKB2l6xaaGTS+UW4j6QOgYYZcboU51VLWnRAukNGlS5dWm4j7QH04Ndff6ULFy6Idr169Sg4OFjtLlm1nDlzyteQqwFyIAdgy3jz91dffSU/xvk7GRdiFGhs2YoJi9Qg0DCjxYsX09mzZ0U7vFRFCi4Yas6X013aA2YRQA8DBBzQZ777CJ/Hg/KUYOtWr15NBw8eFO2yZcu+MmEHGQ80tmPPaKoQaFhogNC0HVYzlMCHlLm5uYk2VjTA1q1fv14+wbpEiRIUHR2tdpdsAvZpgJ5gskJ5nrnykId3btHeuzuBnj9/boZXsQ0INMxky5Yt4hRfli8wlMqUr2iul9IVPn+EB1zs4sWLonQwgF4GCDjBVxlYGQW94BW7tWvXinZgYCDVr19f7S7ZjJAIw6rG/Xt36fDhw2p3R7MQaFhggNCgTRcMEBRkvOyL9CmwVfyLa8WKFaLt5+dHjRs3VrtLNqNw4cLyQWVYGQVbZrw3gytN4bBg5YQWTkqf2rptm4LPbFsQaJjB8ePH6Y8//hDtrB7eVLNeI3O8jG4h7QH0coKvpE+fPmRvb69qf2yJo6MjRUUZBgknTpwQB5kB2Bpe9V+wYIFoZ8+endq3b692l2wKNoSnDQINM/j666/FHg0W26wdOTkaZs5A+UDj77//xmUFm3P16lX6/vvvRdvd3Z06duyodpdsemV069atqvYFwBwmT55MT58+Fe1u3bqJ08BB2RPCM9nZifb2bbiHpASBhsJ4Zmzu3Lmi7eSSmRq1bKv0S+iet7e3XJ5y165d9ODBA91fE7AtU6dOFRWRWJcuXUSwAcoy3li/adMmXF6wKffv3xcngTMHBwfq2bOn2l2yOS6ZXSk4NEK0jx89QteuXVO7S5qEQENh/MF++PChaJev3ZSyZ8+h9EsAEVWsaNhcz7M12KcBtoTvH1OmTBFtzqfu1auX2l2ySeXLl5fb
CDTA1vCE582bN0U7Li6O8uTJo3aXbFJkiTJyG/eR5CHQUNCjR4/EUiV7J1MmatG+m5JPD0YqVKggt/HhBlsyf/58eWasWbNm5Ovrq3aXbJKnpyeFhYWJdkJCAt27d0/tLgEogkutcgq3pF+/friyZhJRLCnQ+Gv9RlznZCDQUNC8efPo8uXLoh1Vvjr55fdX8unBCAINsEXPnj2jMWPGyI9xgq9l7iM8MMM+DbAVCxcupJMnT4p2tWrVRJU1MI/wqJJyVVFMeiYPgYaZBggtO/dW6qkhGVzuk7/Ytm3b6MmTJ7hOYHMDhGLFiqndJZuGCQuwNVyI5ssvv5Qff/zxx6r2x9a5Zc1G/sGhon3oQCLdvn1b7S5pDgINhfz666906tQp0Q4rEU2RRZPqK4N5BwmcssabwgGs2X///UcjR46UHw8aNEjV/ugBAg2wNXz2TmJiolyhsXLlymp3STf7NPgejkqYb0KgoQD+4TKeQYjr3EeJpwUTBgkbNyI3Eqzb8uXL6cCBA3Lp1UqVKqndJZuXN29ecVqydHCfVMgDwFpXM0aMGPHKZIWU1gPmE1EsqVT22vUbcKlfg0BDAcuWLZMHCAHhUVSqbFLZRDB/5Sm2YQM+3GC9MEBQf8KC0y9RwQ6sGe8R4FRiFh4eTnXq1FG7S7oLNDYg0HgDAg2FBwjNO/XGDIKFBAUFiRlJtnnzZpFCBWCNOFDmGXUWERFBtWvXVrtLumGcWrJmzRpV+wKQEcapl7w3I1MmDPEsIbuHJ/kFFhDtfXsS5LLCYICfwgziX0zx8fGinTcwhCpXq5HRp4Q04iXh6tWrizanPHCwAWCNhg0bJrcxQLCsmJgYub169WoLvzqAMng17s8//xTtgIAAURobLKdY2UpyKv3atWtx6Y0g0Mjgasann34qP8ZqhuXVrFlTbq9atUqFHgBkzLp16+TUP16la9q0KS6pBeXKlYuKFCki2rt376arV6/i+oPVGTx4sNz+6KOPyN7eXtX+6E3x6KSV0aXLV6raF61BoJHBvRk7d+6UVzOq166v1PsCacQlQKXlYcxGgrVPVgwdOhQDBBXUqJG0Ei3NCgNYCy6GIs2i82pGu3bt1O6S7kQWL02OTs6ivWbNn+LeDgYINNKJl8eMZxBa9/iQ7Ozs0vt0kE45cuSgEiVKiPbBgwfp/PnzuJZgNVauXClv3ixUqBA1b95c7S6R3ldGMWEB1oQHtMZjkc8//5wcHBxU7ZMeOTm7yJvCL1+8QIcPH1a7S5qBQCOdFi9eTPv27RNtv5BIqlqjlpLvC6RzNhKDBLDWAQKvZmCyQh1ly5alLFmyyCsaPJEEYC37RKX9iSEhIdSyZUu1u6RbxcsllSRfhvQpGQKNdHj69OkrA4Q2732ISlMamY3kswgArMGiRYvEngBWtGhRatiwodpd0i1HR0eqUqWKaPMejYSEBLW7BPBWHBAbH+yJyQp1lTDep7Fsmap90RIEGukwY8YMOnr0qGgHRRSn8pWrKf2+gAlKlixJnp6e8obwBw8e4PqBpj1+/JgGDhwoPx4+fDhKUarM+MyB33//XdW+AKTFjz/+KAfFhQsXpiZNmuDCqcg3sADl9vET7W1/b6br16/j/UCgYTqujzxkyBD5cdcBQ7CaoTJON2nQoIFc5hbpU6B1EydOpNOnT4t21apVqVYtpF6qrX79+vK9nFNjsZkTtOz+/fuiFLZk3LhxmKxQGd8/oqsZ7uXPnz+n//3vD7W7pAlY0TDRF198QTdu3BDtUjH1qUiUYSMyqKtRo0Zy+7ffflO1LwCp+ffff8V9RPrFNH78eExWaICXlxeVL19etI8dO4bNnKBpfN+4cOGCvBrHFRhBfdHVkg5b/XXRYlX7ohUINExw/PhxmjRpkmg7ODpR1w8+Mdf7Aibi/Gp3d3e57PCTJ09wDUGTeEX0zp07ot2xY0eR8gDam7BA+hRoFQcYo0ePFm0+L2Ps2LFqdwleComMohye3qK9ft1aunfvnu6vDQKNNOJl9B49eoiN4Cy2ZWfy8fHV/Q+QljZz1q1bV7Rv3bqFkzlBk+Lj42n69OmizVWOeG8GaIfxhvxff/1V1b4ApKRPnz7yXsTu3buLalOgDXyuV7mqsaL95PFj+uMPpE8h0Eijn376SR685vDOQx26v2++n1RIF+MTlb///ntcRdCUZ8+eUZcuXeTcf17Z4FOpQTt8fX2pVKlSop2YmCiXMAfQiqVLl4o9RIyLoPC5GaAtFWsYJj3ZnO/mkd4h0EgD3pPRt29f+XGXgV+Q68ua66AdsbGx5OHhIdpLliwRKxsAWvHNN9/IA1dOl3r/fUxWaFHbtm3l9rx5GCSAdnAaTs+ePeXHEyZMEIfWgrZEFC9D3nnyifb6v9bSxYsXSc8QaKRBv379xAZOFlWxBg7n03D6lHRYEZcPXbhwodpdAhBOnDhBn332mbwBfObMmSK3GrSHT2fne4lUPlRKlwVQ2yeffEL//POPaPPmbxzOp930qWr1mspnnfzww4+kZwg03oKXKKVZLefMrtTnk5GWeF9AgdnI7777DtcRNJEy9e6778o51bzXi89+AW3iGeJ69erJh/fx2TwAWjgBnMtiM2dnZ5o2bRqq1WlYTL1mcnvO3O90XS4bgUYqLl26RF27dpUft+8/jPLkzWuJ9wXSiU9YDg8PF+2tW7fSnj17cC1BVV9++SVt375dtIOCgmjUqFF4R6xowmLKlCmq9gWA07fbtWsnXwiuOMX3EtCuvH7+VKioYULp6JFDtGnTJtIrBBop4MNW+IMtnezIKVMNmxnSckC7OC3lvffeeyUvHkAtHGAMHTpUPlhy/vz5otoUaH+/V/78+UWbDwA9dOiQ2l0CneKZcC4iIeX5x8TEvLJPA7SrXlx7uT1u/FekVwg0UsCVHP7880/RzprTiwYMx6Fa1oLTVLJnzy7aCxYsoMuXL6vdJdChK1euUOPGjcWkhZRfXbp0abW7BWnAQWHv3r3lx1LKCoAaB/NJVab499rcuXNxAriVqFC9DuX0MlQWXL5sKZ06dYr0CIFGMrjusXRyL2/q6TJ4AuXMaahmBNrn6uoqZoAYH9w3efJktbsEOsMbiHlTsTQLWaFCBfr000/V7haYoEOHDvLqE5fLlgqCAFjKunXraODAga/sO8yL9G2rYe/gQPVbdpBXpvS6qoFA4zWc09+qVSv5cY8Bn1FIsTKWfl8ggzh9Sqrqw+lT165dwzUFix7uuXHjRvE4T5484vA3BwcHvANWJGvWrOLkdvbw4UPsrQGLOnLkiDgbiqsWMZ6okIoUgPWo3bQ1Obm4iPbs2bPo3LlzpDcINIycPn2aatWqJR8Zzx/yuI5J+f5gPXx8fKhTp06ize/nmDFj1O4S6ATvyZg9e7Zoc3CxaNEi8vb2VrtbkA48m8wVfqRN4RcuXMB1BIsUoqlZs6bYBM64jYP5rJN7thzUoKVhwuLpkyc0dNhw0hsEGi/xL5AaNWrI+fxly5YVZW15czFYJ86Jd3JyEu1JkyaJQBLAnDiXX9r8LaXclCmDFVFrlTt3bnn
jLZ/Ng/Q3MDdO0ePA4uzZs/Lhnr/88ovYNwTWqVmHHuTiakjDnPfdXDp48CDpCQINInEATsWKFen48ePiooSEhIh9Gi4vl7vAOuXLl0+uQPXo0SPq06eP2l0CGzZu3LhXfsa++uoratGihap9AmVWNdzd3eUc+S1btuCyglnwRGflypUpMTFRPPbz86OVK1fKP39gvasaTdt1l89V6ta9u67O1dB9oHHgwAEqX748nTx5UlyQgIAAUc4wZ86car83oIAhQ4aIWUm2dOlSkcYCoCTOoebVsw8//FD+3uDBg6lv37640DbAw8ODRowYIT/u1q2bmLgAUBJPdHLRCGm2mzd9c+VL6fcXWP+qRu58fqK9ZfNmmjNnDumFrgONFStWiBQpXtFgBQoUEBs4fX191e4aKIRngiZMmCA/7ty5s/x+A2QUn/bN1aVGjhwpf2/48OE0bNgwXFwb0r17dypWrJho80DQOKgEyKgNGzZQqVKl5KwKHoPwWITHJGAbnJxdqNenX8qPe/XuTUePHiU90GWgwSVPeTm8Tp06dPfuXfE9/iXCH3ZOtwHb0qxZM2rSpIlo37p1SzzmKjIAGbF3714qXry4vErGpbC//vpr5PHbIM6P57QpaWM4l8z++eef1e4WWDlOo+FJiWrVqtHNmzfF98LCwsQp0oGBgWp3DxRWonwVqtnYcPDzwwcPqHGTJnTnzh2bv866CzR2794tNmdyFSIpR44P1eIPNpYobRNv6J81a5bId2U7duwQJYylg9QATJ2oGDVqFJUsWZIOHz4svufm5ib2dWEfkO0KDw9/ZXW0bdu2cgljAFPxyhjvDeX0Xul3EW8C37p1q/y7CmxPj4+Gk29AsGgfPHBAjD/5d4ot002gcfXqVbH8zTOQHGxIpSc54OAa95kzZ1a7i2BG2bJlo99//10+gIvbvFGXK8kApAVPTHC6ZUREBH388cfiUD5WpEgREbzWrl0bF9LGde3aVT5bgwcH/J5zHj1AWnHJ2n79+olqUhxUSKuhXL6W9xHy+S1gu1wyu9KQb+ZQFvds4vHatWupXr36dP/+fbJVNh9o8OEoPMuYP39+mj59uryKERoaStu3bxe5tvwhB9tXtGhRkeYiHZzG7SpVqmDPBqSKZxuXLFkicqh5YHns2DF5pax///7iPsL3E7B9/J5PmzZNnLfEeHDAKbicMqenKjKQvrMxBgwYIFYreGVMWsXgAjSbN28WKxvSIbNg23wDgmn4lO/J8WX5/dWrV4miRNIeHVtjkyNsvvnzKkVsbKwIMLi2vZSTzzPaY8eOFfnVUVFRancVLIzPSuEUF2kFi2eUeGaJBw+cLwsg4Y16XD3K39+fGjZsSDt37pT/W7ly5SghIUHcS6SzWkAfeKJi8eLF4meC8coWVxjje8uhQ4fU7h5oCI87eJKifv36YoM33y+kA4G5fD7vz+DKl1yUBvQlPKokjZr1C2V2dROP9+zZI8akXBbd1lKp3nlhA9Mw/KZw3em///5b1JzmTd2vp8TwwJKXvXkFw5S9GNtOXqfDl+6Qbw6kVint9LV7VDnEi4K8DB80S+JBI28KP3PmjPy9oKAgcThXmzZtKHv27BbvE6hfwz4+Pp7++usvkSJ14sSJN/4Op0lxKVvOq9XSYZ6Xbz+i5YkXySd7ZsqUSTv90oob95+QXSaiRlH5FHvfeGJi0KBBYvBovGmcC0/wfYQHj1gt1xcOOjlw4AksHousW7fujcIjPDHB6XdckMYaK1yuOnCZrt17TLncDYURIGNOHztMQ/t2ogtnDEcsMF714jPA2rVrR56enlZ/ia0m0OC65bzP4sqVK3Tq1CmxxMQDAd6MyasTKUWA/EHu0KED9ejRI11vGAIN2ww0pApUvXr1oh9++OGV7/NggZcxOa2Kq5Hx4JKDUy0NLMF0fKvjyi58D7lw4YK4f/B9hL/27duXYgodDxZ5dZTvIfynFn8OEGhYPtCQrFq1irp06SLSdI3lypVLpFiVLl1a3Ef4IFjsBbR+PIkpjUVOnz4tj0WOHDkiZqVTOmMlT548ooAA/86x5sIzCDSU9/D+fZo86jNa8/tPr6Rg8u8eLl7EVcn4HsLp3/xzZG0TGIoGGv/++69IS+EInvMPpS8+0Mr4cXLf5w8npzy9/sWlv/hDbUoJMC5Ry7nUTZs2FadsZuRNQaBhu4GGhHPseZaaZ59SwrNQPj4+4hAl3qzHX3xGB6ficSoF59byn8ZtaUCT3J/JfY+X0uvWravrzYB8O+IKcDwrmJb7hvEXzzDzuRbJ3UeuX78u7iNpTY/j9zA6Olrk38fFxYmbu5Yh0FAv0GD8O4/z7nmvBv8eTO3wP5784j+N7yN8f3n9PsJf/LsrpftFSn9yeVReTdFiQGwpPKHwv//9T6QpmTIOMR6LvH4vkcYit2/fTnM/OKDgYJNXQKtXry4msawdAg3zePDkGe3bt5c2L5hMa1avSvHvOTo6irEI/07iIjfSfcTV1fWVe4fxvSQt9w3jNt+P+Hdfjhw5tBVo8NNwNRbpVEtL4kNtuNQkf1WqVEmUIVTiJrs88RKNWH6I7jx6RnZIR1Dc8/9eUA5XB/ooNpRqRag/w8M/u3xaJ/+Ckk6KtzQe3PLGQL3iqiv16tWz+OvyTZor0vE9hGegq1atalUB34/bz9JXa47Rs/+sYoHa4vj3k7ODHQ2pG0a1I80XNPIgdeHCheJrzZo1qp0gvmzZMl1XQeM9VFJFJ0vijd1cNILvI3zKN89A21LAx2OiYcsO0oMnzymTDf1/aeUe5WifiT6KDaFIt4f07bffiol7XilTA4/neaVfiZ9fRUscKF0xgZ+P69N7eXm98sUzQpxPHxwcLP7kQYI5zNx0ki7eVucXhV7ce/yMZmw6pYlAo1ChQjR+/HgaN26cCDR4Hwdv+OUNnpxWc/bsWXkjH5iHOWb8eKWIZ2ak+4e3t7f44gOxpHsIr4Ja23K0sR93/EPX79vWBkKl8YTRzE2nzBpo8IF+7777rvjiGfFdu3aJewiXVOc0G76PcNoez6KD9dxH+Pl49Vq6f0j3Er5v8D1Euo/weMWW8Zjoyh2UhDeneVvP0tJe0TR69GjxxfcNaSzCk6F8D+EvU1bWbCp1inPeOV+V0xP4g2n8xb/EU/se36A5YDD+4iUiNa3Yf4nG/XmU7j1CNSJzcXO2pw+qF9REoPE2/FHhGUr+gPMyOv/JgwneACh98c++1Jb+zet/Jvc9xp8BnoW0ppl0pfG14MEZz+Kk5b7x+tfr9xDOibfmAMKUe9XY1Ufp/mPcq1KSxcme+tdQ/17D6Tl3796V7yP8xXsMje8j0r2E/25K94uU/uRSy7wqZ0sz6abi68tjEd5PYco4hL84bSS5sYier6cEYyLt3KMevRyLSPcRTu97fQzytrGI9Ofr3+Ofd07501zqFAAAAAAAgMT2p/oAAAAAAMDiEGgAAAAAAIDiEGgAAAAAAIDiEGgAAAAAAIDiEGgAAAAAAAACDQAAAAAA0D6saAAAAAAAgOIQaAAAAAAAgOIQaAAAAAAAgOIQaA
AAAAAAgOIQaAAAAAAAgOIQaAAAAAAAgOIQaAAAAAAAgOIQaAAAAAAAACnt/4AiWViUCI28AAAAAElFTkSuQmCC", 31 | "text/plain": [ 32 | "
" 33 | ] 34 | }, 35 | "metadata": {}, 36 | "output_type": "display_data" 37 | } 38 | ], 39 | "source": [ 40 | "\n", 41 | "N = 1000\n", 42 | "t = np.linspace(-9, 5, N)\n", 43 | "dx = t[1] - t[0]\n", 44 | "p = (\n", 45 | " stats.norm(0, 1).pdf(t)\n", 46 | " + stats.norm(-2.8, 1.5).pdf(t)\n", 47 | ") / 2\n", 48 | "areas = np.zeros([N, N])\n", 49 | "for i in range(N):\n", 50 | " for j in range(i, N):\n", 51 | " areas[i, j] = integrate.trapezoid(p[i:j], dx=dx)\n", 52 | "cred = 0.8\n", 53 | "ends = np.argmin((areas - cred)**2, axis=1)\n", 54 | "fin = np.where(ends == N-1)[0][0]\n", 55 | "starts = np.arange(fin)\n", 56 | "ends = ends[:fin]\n", 57 | "sizes = ends - starts\n", 58 | "\n", 59 | "best_start = np.argmin(sizes)\n", 60 | "lefty_start = best_start - 120\n", 61 | "righty_start = fin-2\n", 62 | "\n", 63 | "color = 'tab:blue'\n", 64 | "f, axes = plt.subplots(1, 3, figsize=(10, 2), dpi=100)\n", 65 | "for start, ax in zip([lefty_start, best_start, righty_start], axes.flatten()):\n", 66 | " ax.plot(t, p, color='black', lw=2)\n", 67 | " end = ends[start]\n", 68 | " ax.fill_between(t[start:end], p[start:end], color=color, alpha=0.3)\n", 69 | " ax.axis('off')\n", 70 | " ax.plot([t[start], t[end]], [0, 0], color=color, lw=2)\n", 71 | " \n", 72 | "f.savefig('figures/credible_interval_comparison.png')" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "id": "76c73e7e", 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [] 82 | } 83 | ], 84 | "metadata": { 85 | "kernelspec": { 86 | "display_name": "venv", 87 | "language": "python", 88 | "name": "python3" 89 | }, 90 | "language_info": { 91 | "codemirror_mode": { 92 | "name": "ipython", 93 | "version": 3 94 | }, 95 | "file_extension": ".py", 96 | "mimetype": "text/x-python", 97 | "name": "python", 98 | "nbconvert_exporter": "python", 99 | "pygments_lexer": "ipython3", 100 | "version": "3.13.5" 101 | } 102 | }, 103 | "nbformat": 4, 104 | "nbformat_minor": 5 105 | } 106 | --------------------------------------------------------------------------------