├── .maint ├── contributors.json └── creators.json ├── .gitignore ├── tox.ini ├── requirements.txt ├── Makefile ├── README.md ├── notebooks ├── data │ └── orings.csv ├── _toc.yml ├── utils │ └── utils.py ├── utils.py ├── _config.yml ├── 11-ModelingCategoricalRelationships.py ├── 08-HypothesisTesting.py ├── index.md ├── 09-StatisticalPower.py ├── 11-ModelingCategoricalRelationships.ipynb ├── 05-Probability.py ├── 07-ResamplingAndSimulation.py ├── 09-StatisticalPower.ipynb ├── 06-Sampling.py ├── 08-HypothesisTesting.ipynb ├── 07-ResamplingAndSimulation.ipynb ├── 10-BayesianStatistics.py ├── 05-Probability.ipynb ├── 03-DataVisualization.py ├── 04-FittingSimpleModels.py ├── 02-SummarizingData.py ├── 10-BayesianStatistics.ipynb ├── 13-GeneralLinearModel.py ├── 06-Sampling.ipynb └── 03-DataVisualization.ipynb ├── .zenodo.json ├── .github └── workflows │ └── deploy-book.yml ├── LICENSE.md └── CONTRIBUTING.md /.maint/contributors.json: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints 2 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E501, E128, E402, W291, E127, W293, E265 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter-book 2 | matplotlib 3 | seaborn 4 | numpy 5 | pandas 6 | scikit-learn 7 | nhanes 8 | jupytext 9 | -------------------------------------------------------------------------------- /.maint/creators.json: -------------------------------------------------------------------------------- 1 | 2 | [ 3 | { 4 | "affiliation": "Department of Psychology, Stanford University", 5 | "name": "Poldrack, Russell A.", 6 | "orcid": "0000-0001-6755-0259" 7 | } 8 | ] 9 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | build: 2 | jupyter-book build notebooks/ 3 | cleanbuild: clean 4 | jupyter-book build notebooks/ 5 | clean: 6 | -rm -rf notebooks/_build/ 7 | 8 | current_dir = $(shell pwd) 9 | 10 | shell: 11 | docker run -it -p 9994:9994 -v $(shell pwd):/book -w /book --platform linux/x86_64 --entrypoint=bash $(DOCKER_USERNAME)/statsthinking21 12 | 13 | # docker run --platform linux/x86_64 -p 9994:9994 -it --entrypoint=bash -v $(current_dir):/analysis poldrack/statsthinking21 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![deploy-book](https://github.com/statsthinking21/statsthinking21-python/workflows/deploy-book/badge.svg) 2 | 3 | This repo houses the in-progress Python companion to [Statistical Thinking for the 21st Century](https://statsthinking21.org/). 4 | 5 | The ultimate goal of this companion is to replicate all of the R code used in both the core statistics book and the R companion. 6 | 7 | Please see the [Contributor's guide](CONTRIBUTING.md) for more details on how to contribute. 
8 | -------------------------------------------------------------------------------- /notebooks/data/orings.csv: -------------------------------------------------------------------------------- 1 | ,Temperature,Erosion,Blowby,Total,DamageIndex 2 | 1,53,3,2,5,11 3 | 2,57,1,0,1,4 4 | 3,58,1,0,1,4 5 | 4,63,1,0,1,2 6 | 5,66,0,0,0,0 7 | 6,67,0,0,0,0 8 | 7,67,0,0,0,0 9 | 8,67,0,0,0,0 10 | 9,68,0,0,0,0 11 | 10,69,0,0,0,0 12 | 11,70,1,0,1,4 13 | 12,70,0,0,0,0 14 | 13,70,1,0,1,4 15 | 14,70,0,0,0,0 16 | 15,72,0,0,0,0 17 | 16,73,0,0,0,0 18 | 17,75,0,0,0,0 19 | 18,75,0,2,1,4 20 | 19,76,0,0,0,0 21 | 20,76,0,0,0,0 22 | 21,78,0,0,0,0 23 | 22,79,0,0,0,0 24 | 23,81,0,0,0,0 -------------------------------------------------------------------------------- /notebooks/_toc.yml: -------------------------------------------------------------------------------- 1 | format: jb-book 2 | root: index 3 | chapters: 4 | - file: 01-IntroductionToPython.ipynb 5 | - file: 02-SummarizingData.ipynb 6 | - file: 03-DataVisualization.ipynb 7 | - file: 04-FittingSimpleModels.ipynb 8 | - file: 05-Probability.ipynb 9 | - file: 06-Sampling.ipynb 10 | - file: 07-ResamplingAndSimulation.ipynb 11 | - file: 08-HypothesisTesting.ipynb 12 | - file: 09-StatisticalPower.ipynb 13 | - file: 10-BayesianStatistics.ipynb 14 | - file: 11-ModelingCategoricalRelationships.ipynb 15 | - file: 13-GeneralLinearModel.ipynb 16 | -------------------------------------------------------------------------------- /.zenodo.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Python companion to Statistical Thinking for the 21st Century", 3 | "description": "

This is a companion to the main statistics text; it provides an introduction to performing statistical computations using Python.

", 4 | "contributors": [ 5 | { 6 | "affiliation": "Department of Psychology, Stanford University", 7 | "name": "Poldrack, Russell A.", 8 | "orcid": "0000-0001-6755-0259" 9 | } 10 | ], 11 | "keywords": [ 12 | "statistics", 13 | "Python", 14 | "data science", 15 | "preprocessing" 16 | ], 17 | "license": "CC-BY-NC", 18 | "related_identifiers": [ 19 | { 20 | "identifier": "https://statsthinking21.org", 21 | "relation": "documents", 22 | "scheme": "url" 23 | } 24 | ], 25 | "upload_type": "book" 26 | } 27 | -------------------------------------------------------------------------------- /.github/workflows/deploy-book.yml: -------------------------------------------------------------------------------- 1 | name: deploy-book 2 | 3 | # Only run this when the master branch changes 4 | on: 5 | push: 6 | branches: 7 | - master 8 | 9 | # This job installs dependencies, build the book, and pushes it to `gh-pages` 10 | jobs: 11 | deploy-book: 12 | runs-on: ubuntu-latest 13 | container: poldrack/statsthinking21 14 | 15 | steps: 16 | - uses: actions/checkout@v4 17 | 18 | # Build the book 19 | - name: Build the book 20 | run: | 21 | jupyter-book build notebooks 22 | 23 | # save artifacts 24 | - name: Archive production artifacts 25 | uses: actions/upload-artifact@v4 26 | with: 27 | name: notebooks 28 | path: notebooks 29 | 30 | # Push the book's HTML to github-pages 31 | - name: GitHub Pages action 32 | uses: peaceiris/actions-gh-pages@v3.6.1 33 | with: 34 | github_token: ${{ secrets.GITHUB_TOKEN }} 35 | publish_dir: ./notebooks/_build/html 36 | -------------------------------------------------------------------------------- /notebooks/utils/utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | 5 | def threshold_df_correlation(df, thresh=0.3, 6 | use_absolute_corr=True, show_corrs=True): 7 | """ 8 | return correlations in a data frame that exceed a particular threshold 9 | """ 10 | cc = df.corr() 11 | if use_absolute_corr: 12 | cc = np.abs(cc) 13 | cc_exceeds_thresh = np.where(cc > thresh) 14 | exceedence = pd.DataFrame() 15 | for variable_idx in range(len(cc_exceeds_thresh[0])): 16 | row = cc_exceeds_thresh[0][variable_idx] 17 | col = cc_exceeds_thresh[1][variable_idx] 18 | if row == col: 19 | continue 20 | exceedence.loc[variable_idx, 'rowvar'] = cc.index[row] 21 | exceedence.loc[variable_idx, 'colvar'] = cc.index[col] 22 | exceedence.loc[variable_idx, 'corr'] = cc.iloc[row, col] 23 | if show_corrs: 24 | with pd.option_context('display.max_rows', None): 25 | print(exceedence) 26 | 27 | return(exceedence) 28 | -------------------------------------------------------------------------------- /notebooks/utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | 5 | def threshold_df_correlation(df, thresh=0.3, 6 | use_absolute_corr=True, show_corrs=False): 7 | """ 8 | return correlations in a data frame that exceed a particular threshold 9 | """ 10 | cc = df.corr() 11 | if use_absolute_corr: 12 | cc = np.abs(cc) 13 | cc_exceeds_thresh = np.where(cc > thresh) 14 | exceedence = pd.DataFrame() 15 | for variable_idx in range(len(cc_exceeds_thresh[0])): 16 | row = cc_exceeds_thresh[0][variable_idx] 17 | col = cc_exceeds_thresh[1][variable_idx] 18 | if row == col: 19 | continue 20 | exceedence.loc[variable_idx, 'rowvar'] = cc.index[row] 21 | exceedence.loc[variable_idx, 'colvar'] = cc.index[col] 22 | exceedence.loc[variable_idx, 'corr'] = 
cc.iloc[row, col] 23 | if show_corrs: 24 | with pd.option_context('display.max_rows', None): 25 | print(exceedence) 26 | 27 | return(exceedence) 28 | 29 | 30 | def find_matching_variables(df, substring): 31 | """ 32 | find variables in a data frame that contain a substring 33 | """ 34 | return([i for i in df.columns if i.find(substring) > -1]) 35 | -------------------------------------------------------------------------------- /notebooks/_config.yml: -------------------------------------------------------------------------------- 1 | ####################################################################################### 2 | # Book settings 3 | title : Python Companion to Statistical Thinking in the 21st Century 4 | author: Russell A. Poldrack 5 | # logo: 'qe-logo-large.png' 6 | 7 | # Information about where the book exists on the web 8 | description: >- 9 | A guide to statistical data analysis using Python 10 | 11 | ####################################################################################### 12 | # Execution settings 13 | execute: 14 | execute_notebooks : cache 15 | 16 | ####################################################################################### 17 | # HTML-specific settings 18 | html: 19 | home_page_in_navbar : false 20 | 21 | # ####################################################################################### 22 | # Interact link settings 23 | notebook_interface : "notebook" 24 | 25 | ####################################################################################### 26 | # Launch button settings 27 | repository: 28 | url : https://github.com/statsthinking21/statsthinking21-python 29 | path_to_book : "notebooks" 30 | 31 | binder: 32 | binderhub_url : "https://mybinder.org" 33 | text : "Launch binder" 34 | 35 | latex: 36 | latex_engine : "xelatex" 37 | latex_documents: 38 | targetname: book.tex 39 | -------------------------------------------------------------------------------- /notebooks/11-ModelingCategoricalRelationships.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # formats: ipynb,py:percent 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.4.2 10 | # kernelspec: 11 | # display_name: Python 3 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] 17 | # # Modeling categorical relationships in Python 18 | # 19 | # So far we have discussed the general concept of statistical modeling and hypothesis testing, and applied them to some simple analyses. In this chapter we will focus on the modeling of *categorical* relationships, by which we mean relationships between variables that are measured qualitatively. These data are usually expressed in terms of counts; that is, for each value of the variable (or combination of values of multiple variables), how many observations take that value? For example, when we count how many people from each major are in our class, we are fitting a categorical model to the data. 20 | # As an example, we will use the NHANES dataset to ask whether there is a relationship between being a smoker and having ever had cancer (of any type). 
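# %% [markdown]
# As a quick illustration of what such count data look like, here is a minimal sketch using a tiny made-up dataset (not the NHANES variables): `pd.crosstab` tallies how many observations fall into each combination of values of two categorical variables.

# %%
import pandas as pd

# made-up example data, for illustration only
toy_df = pd.DataFrame({'smoker': [True, False, True, False, False, True],
                       'had_cancer': [False, False, True, False, True, False]})
# count the observations in each combination of the two variables
pd.crosstab(toy_df['smoker'], toy_df['had_cancer'])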
21 | 22 | # %% 23 | from nhanes.load import load_NHANES_data 24 | nhanes_data = load_NHANES_data() 25 | adult_nhanes_data = nhanes_data.query('AgeInYearsAtScreening > 17') 26 | 27 | # clean up smoking variables 28 | adult_nhanes_data.loc[adult_nhanes_data['SmokedAtLeast100CigarettesInLife'] == 0, 'DoYouNowSmokeCigarettes'] = 'Not at all' 29 | adult_nhanes_data.loc[:, 'SmokeNow'] = (adult_nhanes_data['DoYouNowSmokeCigarettes'] != 'Not at all') 30 | 31 | categorical_df = adult_nhanes_data[['SmokeNow', 'EverToldYouHadCancerOrMalignancy']].dropna().astype('int').rename(columns={'EverToldYouHadCancerOrMalignancy': 'HadCancer'}) 32 | 33 | 34 | # %% [markdown] 35 | # ## The Pearson Chi-squared test 36 | # The Pearson Chi-squared test is used to test for an association between two categorical variables, against the null hypothesis of independence. We will use the `statsmodels.stats.Table` function for this, which has a number of useful features. 37 | 38 | # %% 39 | import statsmodels.api as sm 40 | table = sm.stats.Table.from_data(categorical_df, shift_zeros=False) 41 | table.table_orig 42 | 43 | # %% [markdown] 44 | # We can also see the predicted frequencies under the null hypothesis of independence, which are stored in the `.fittedvalues` element: 45 | 46 | # %% 47 | table.fittedvalues 48 | 49 | # %% [markdown] 50 | # Using these, we can compute the chi-squared statistic: 51 | 52 | # %% 53 | import numpy as np 54 | orig_vector = np.ravel(table.table_orig) 55 | independence_vector = np.ravel(table.fittedvalues) 56 | squared_resid = (orig_vector - independence_vector)**2 57 | chi2 = np.sum(squared_resid/independence_vector) 58 | chi2 59 | 60 | 61 | # %% [markdown] 62 | # We can confirm this by comparing it to the result from the built-in function to compute the association: 63 | # 64 | # chi2_result = table.test_nominal_association() 65 | # print(chi2_result) 66 | 67 | # %% [markdown] 68 | # We can also see the standardized residuals: 69 | 70 | # %% 71 | table.standardized_resids 72 | 73 | 74 | # %% [markdown] 75 | # This shows that there is an unexpectedly large number of people who smoke but don't have cancer, and similarly an unexpectedly low number of smokers who report having had cancer before. Does this tell us that smoking results in lower rates of cancer? 76 | -------------------------------------------------------------------------------- /notebooks/08-HypothesisTesting.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # formats: ipynb,py:percent 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.15.2 10 | # kernelspec: 11 | # display_name: Python 3 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] 17 | # # Hypothesis testing in Python 18 | # In this chapter we will present several examples of using Python to perform hypothesis testing. 19 | # 20 | # ## Simple example: Coin-flipping 21 | # Let's say that we flipped 100 coins and observed 70 heads. We would like to use these data to test the hypothesis that the true probability is 0.5. 22 | # First let's generate our data, simulating 100,000 sets of 100 flips. We use such a large number because it turns out that it's very rare to get 70 heads, so we need many attempts in order to get a reliable estimate of these probabilties. This will take a couple of minutes to complete. 
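# %% [markdown]
# As an aside, the same kind of simulated counts can be drawn in a single vectorized call to `np.random.binomial`, which is much faster than an explicit loop; this is just a sketch for reference, and the loop-based version below has the advantage of making each step explicit.

# %%
# sketch: vectorized equivalent of the loop below
import numpy as np

vectorized_heads = np.random.binomial(n=100, p=0.5, size=10000)
# proportion of simulated runs with 70 or more heads
np.mean(vectorized_heads >= 70)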
23 | 24 | # %% 25 | 26 | import numpy as np 27 | import pandas as pd 28 | 29 | num_runs = 10000 30 | 31 | 32 | def toss_coins_and_count_heads(num_coins=100, p_heads=0.5): 33 | """ 34 | flip a coin num_coins times and return number of heads 35 | """ 36 | 37 | flips = np.random.rand(num_coins) > (1 - p_heads) 38 | return(np.sum(flips)) 39 | 40 | 41 | flip_results_df = pd.DataFrame({'n_heads': np.zeros(num_runs)}) 42 | 43 | for run in range(num_runs): 44 | flip_results_df.loc[run, 'n_heads'] = toss_coins_and_count_heads() 45 | 46 | 47 | # %% [markdown] 48 | # Now we can compute the proportion of samples from the distribution observed when the true proportion of heads is 0.5. 49 | 50 | # %% 51 | import scipy.stats 52 | 53 | pvalue = 100 - scipy.stats.percentileofscore(flip_results_df, 70) 54 | pvalue 55 | 56 | # %% [markdown] 57 | # For comparison, we can also compute the p-value for 70 or more heads based on a null hypothesis of $P_{heads}=0.5$, using the binomial distribution. 58 | # 59 | # 60 | # compute the probability of 69 or fewer heads, when P(heads)=0.5 61 | 62 | # %% 63 | 64 | p_lt_70 = scipy.stats.binom.cdf(k=69, n=100, p=0.5) 65 | p_lt_70 66 | 67 | # %% [markdown] 68 | # the probability of 70 or more heads is simply the complement of p_lt_70 69 | # %% 70 | 71 | p_ge_70 = 1 - p_lt_70 72 | p_ge_70 73 | # 74 | # %% [markdown] 75 | # ## Simulating p-values 76 | # 77 | # In this exercise we will perform hypothesis testing many times in order to test whether the p-values provided by our statistical test are valid. We will sample data from a normal distribution with a mean of zero, and for each sample perform a t-test to determine whether the mean is different from zero. We will then count how often we reject the null hypothesis; since we know that the true mean is zero, these are by definition Type I errors. 78 | # 79 | # %% 80 | 81 | num_runs = 5000 82 | 83 | 84 | # create a function that will take a sample 85 | # and perform a one-sample t-test 86 | def sample_ttest(sampSize=32): 87 | """ 88 | perform a ttest on random data of n=sampSize 89 | """ 90 | 91 | ttresult = scipy.stats.ttest_1samp(np.random.normal(loc=0.0, scale=1.0, size=sampSize), 0) 92 | return(ttresult.pvalue) 93 | 94 | 95 | # create input data frame for the function 96 | sim_results_df = pd.DataFrame({'p_value': np.zeros(num_runs)}) 97 | 98 | # perform simulations 99 | for run in range(num_runs): 100 | sim_results_df.loc[run, 'p_value'] = sample_ttest() 101 | 102 | p_error = sim_results_df['p_value'] < 0.05 103 | p_error = p_error.mean(axis=0) 104 | p_error 105 | 106 | # %% [markdown] 107 | # We should see that the proportion of samples with p < .05 is about 5%. 108 | 109 | # %% 110 | -------------------------------------------------------------------------------- /notebooks/index.md: -------------------------------------------------------------------------------- 1 | # Preface 2 | 3 | **NOTE**: This book is a work in progress! Please check back regularly for updates. 4 | 5 | This book is a companion to [Statistical Thinking for the 21st Century](https://statsthinking21.org/), an open source statistical textbook. It focuses on the use of the Python statistical programming language for statistics and data analysis. 6 | 7 | 8 | ## Why Python? 9 | 10 | The original companion to *Statistical Thinking for the 21st Century* was written using the R programming language. R is very popular for statistical data analysis --- so why would we go to the trouble of creating a whole new guide for the Python language? 
The main reason is that Python is a serious *general-purpose* programming language, whereas R is much more tailored for data analysis and statistics. This means that if you learn to program in Python, you can do much more than you can using R. 11 | 12 | Most serious software engineers would agree with me that Python is a much better language for programming in general compared to R. One of the main reasons that I prefer Python is that it is much pickier than R; in some cases R will allow the programmer to do something wrong and quietly return a nonsensical result, whereas Python would alert the programmer that something is wrong by raising an error. 13 | 14 | Finally, one important benefit of using Python is that it doesn't prevent you from using R when you need it! There is a Python library called [rpy2](https://rpy2.github.io/) that allows one to call R functions directly from within Python. Thus, if there is a tool that is only available in R, you can use it from within Python. 15 | 16 | ## The golden age of data 17 | 18 | Throughout this book I have tried when possible to use examples from real data. This is now very easy because we are swimming in open datasets, as governments, scientists, and companies are increasingly making data freely available. I think that using real datasets is important because it prepares students to work with real data rather than toy datasets, which I think should be one of the major goals of statistical training. It also helps us realize (as we will see at various points throughout the book) that data don't always come to us ready to analyze, and often need *wrangling* to help get them into shape. Using real data also shows that the idealized statistical distributions often assumed in statistical methods don't always hold in the real world -- for example, as we will see in Chapter \@ref(summarizing-data), distributions of some real-world quantities (like the number of friends on Facebook) can have very long tails that can break many standard assumptions. 19 | 20 | I apologize up front that the datasets are heavily US-centric. This is primarily because the best dataset for many of the demonstrations is the National Health and Nutrition Examination Surveys (NHANES) dataset that is available as an R package, and because many of the other complex datasets included in R (such as those in the `fivethirtyeight` package) are also based in the US. If you have suggestions for datasets from other regions, please pass them along to me! 21 | 22 | ## An open source book 23 | 24 | This book is meant to be a living document, which is why its source is available online at [https://github.com/statsthinking21/statsthinking21-python](https://github.com/statsthinking21/statsthinking21-python). If you find any errors in the book or want to make a suggestion for how to improve it, please open an issue on the Github site. Even better, submit a pull request with your suggested change. 25 | 26 | This book is licensed using the [Creative Commons Attribution-NonCommercial 4.0 International (CC BY-NC 4.0) License](https://creativecommons.org/licenses/by-nc/4.0/). Please see the terms of that license for more details. 27 | 28 | ## Acknowledgements 29 | 30 | Thanks to everyone who has contributed to this project: John Butler, ... 
31 | 32 | -------------------------------------------------------------------------------- /notebooks/09-StatisticalPower.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # formats: ipynb,py:percent 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.15.2 10 | # kernelspec: 11 | # display_name: Python 3 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] 17 | # # Statistical Power Analysis in Python 18 | # In this chapter we focus specifically on statistical power. We will use the NHANES dataset, so let's first set that up. 19 | # %% 20 | 21 | import numpy as np 22 | import pandas as pd 23 | 24 | np.random.seed(12345) 25 | 26 | from nhanes.load import load_NHANES_data 27 | 28 | nhanes_data = load_NHANES_data() 29 | adult_nhanes_data = nhanes_data.query('AgeInYearsAtScreening > 18') 30 | adult_nhanes_data = adult_nhanes_data.dropna(subset=['WeightKg']).rename(columns={'WeightKg': 'Weight'}) 31 | 32 | 33 | # %% [markdown] 34 | # ## Power analysis 35 | # 36 | # We can compute a power analysis using functions from the `statsmodels.stats.power` package. Let's focus on the power for an independent samples t-test in order to determine a difference in the mean between two groups. Let's say that we think that an effect size of Cohen's d=0.5 is realistic for the study in question (based on previous research) and would be of scientific interest. We wish to have 80% power to find the effect if it exists. We can compute the sample size needed for adequate power using the `TTestIndPower()` function: 37 | 38 | # %% 39 | 40 | import scipy.stats 41 | import statsmodels.stats.power as smp 42 | import matplotlib.pyplot as plt 43 | 44 | power_analysis = smp.TTestIndPower() 45 | sample_size = power_analysis.solve_power(effect_size=0.5, power=0.8, alpha=0.05) 46 | sample_size 47 | 48 | # %% [markdown] 49 | # Thus, about 64 participants would be needed in each group in order to test the hypothesis with adequate power. 50 | # 51 | # ## Power curves 52 | # 53 | # We can also create plots that can show us how the power to find an effect varies as a function of effect size and sample size, at the alpha specified in the power analysis. We will use the `plot_power()` function. The x-axis is defined by the `dep_var` argument, while sample sizes (nobs) and effect sizes (effect_size) are provided as arrays. 54 | # %% 55 | #+ 56 | effect_sizes = np.array([0.2, 0.5, 0.8]) 57 | sample_sizes = np.array(range(10, 500, 10)) 58 | 59 | plt.style.use('seaborn') 60 | fig = plt.figure() 61 | ax = fig.add_subplot(1, 1, 1) 62 | fig = power_analysis.plot_power( 63 | dep_var='nobs', nobs=sample_sizes, 64 | effect_size=effect_sizes, alpha=0.05, ax=ax, 65 | title='Power of Independent Samples t-test\n$\\alpha = 0.05$') 66 | 67 | #- 68 | 69 | # %% [markdown] 70 | # ## Simulating statistical power 71 | # 72 | # We can also simulate data to see whether the power analysis actually gives the right answer. 73 | # We will sample data for two groups, with a difference of 0.5 standard deviations between their underlying distributions and a sample size based on power analysis, and we will then look at how often we reject the null hypothesis. 
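# %% [markdown]
# For comparison with the simulation, we can also ask `statsmodels` for the analytic power at the rounded-up sample size from the power analysis; this short sketch repeats the `solve_power()` call so that it can be run on its own.

# %%
# sketch: analytic power at the rounded-up sample size
import numpy as np
import statsmodels.stats.power as smp

analysis = smp.TTestIndPower()
n_per_group = int(np.ceil(analysis.solve_power(effect_size=0.5, power=0.8, alpha=0.05)))
# power of a two-sample t-test with equal group sizes
analysis.power(effect_size=0.5, nobs1=n_per_group, alpha=0.05, ratio=1.0)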
74 | # %% 75 | #+ 76 | num_runs = 5000 77 | effectSize = 0.5 78 | 79 | # perform power analysis to get sample size 80 | power_analysis = smp.TTestIndPower() 81 | sampleSize = power_analysis.solve_power( 82 | effect_size=effectSize, power=0.8, alpha=0.05) 83 | 84 | # round up from estimated sample size 85 | sampleSize = np.int(np.ceil(sampleSize)) 86 | 87 | # create a function that will generate samples and test for 88 | # a difference between groups using a two-sample t-test 89 | 90 | 91 | def get_t_result(sampleSize, effectSize): 92 | """ 93 | perform a ttest on random data of n=sampSize 94 | """ 95 | 96 | group1 = np.random.normal(loc=0.0, scale=1.0, size=sampleSize) 97 | group2 = np.random.normal(loc=effectSize, scale=1.0, size=sampleSize) 98 | ttresult = scipy.stats.ttest_ind(group1, group2) 99 | return(ttresult.pvalue) 100 | 101 | 102 | # create input data frame for output 103 | power_sim_results = pd.DataFrame({'p_value': np.zeros(num_runs)}) 104 | 105 | for run in range(num_runs): 106 | power_sim_results.loc[run, 'p_value'] = get_t_result(sampleSize, effectSize) 107 | 108 | 109 | p_reject = np.mean(power_sim_results['p_value'] < 0.05) 110 | p_reject 111 | #- 112 | 113 | 114 | # %% [markdown] 115 | # This should return a number very close to 0.8. 116 | -------------------------------------------------------------------------------- /notebooks/11-ModelingCategoricalRelationships.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Modeling categorical relationships in Python\n", 8 | "\n", 9 | "So far we have discussed the general concept of statistical modeling and hypothesis testing, and applied them to some simple analyses. In this chapter we will focus on the modeling of *categorical* relationships, by which we mean relationships between variables that are measured qualitatively. These data are usually expressed in terms of counts; that is, for each value of the variable (or combination of values of multiple variables), how many observations take that value? For example, when we count how many people from each major are in our class, we are fitting a categorical model to the data.\n", 10 | "As an example, we will use the NHANES dataset to ask whether there is a relationship between being a smoker and having ever had cancer (of any type)." 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": { 17 | "lines_to_next_cell": 2 18 | }, 19 | "outputs": [], 20 | "source": [ 21 | "from nhanes.load import load_NHANES_data\n", 22 | "nhanes_data = load_NHANES_data()\n", 23 | "adult_nhanes_data = nhanes_data.query('AgeInYearsAtScreening > 17')\n", 24 | "\n", 25 | "# clean up smoking variables\n", 26 | "adult_nhanes_data.loc[adult_nhanes_data['SmokedAtLeast100CigarettesInLife'] == 0, 'DoYouNowSmokeCigarettes'] = 'Not at all'\n", 27 | "adult_nhanes_data.loc[:, 'SmokeNow'] = (adult_nhanes_data['DoYouNowSmokeCigarettes'] != 'Not at all')\n", 28 | "\n", 29 | "categorical_df = adult_nhanes_data[['SmokeNow', 'EverToldYouHadCancerOrMalignancy']].dropna().astype('int').rename(columns={'EverToldYouHadCancerOrMalignancy': 'HadCancer'})" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "## The Pearson Chi-squared test\n", 37 | "The Pearson Chi-squared test is used to test for an association between two categorical variables, against the null hypothesis of independence. 
We will use the `statsmodels.stats.Table` function for this, which has a number of useful features." 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "import statsmodels.api as sm\n", 47 | "table = sm.stats.Table.from_data(categorical_df, shift_zeros=False)\n", 48 | "table.table_orig" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "We can also see the predicted frequencies under the null hypothesis of independence, which are stored in the `.fittedvalues` element:" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "table.fittedvalues" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "Using these, we can compute the chi-squared statistic:" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": { 78 | "lines_to_next_cell": 2 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "import numpy as np\n", 83 | "orig_vector = np.ravel(table.table_orig)\n", 84 | "independence_vector = np.ravel(table.fittedvalues)\n", 85 | "squared_resid = (orig_vector - independence_vector)**2\n", 86 | "chi2 = np.sum(squared_resid/independence_vector)\n", 87 | "chi2" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "We can confirm this by comparing it to the result from the built-in function to compute the association:\n", 95 | "\n", 96 | "chi2_result = table.test_nominal_association()\n", 97 | "print(chi2_result)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "We can also see the standardized residuals:" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": { 111 | "lines_to_next_cell": 2 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "table.standardized_resids" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "This shows that there is an unexpectedly large number of people who smoke but don't have cancer, and similarly an unexpectedly low number of smokers who report having had cancer before. Does this tell us that smoking results in lower rates of cancer?" 123 | ] 124 | } 125 | ], 126 | "metadata": { 127 | "jupytext": { 128 | "formats": "ipynb,py:percent" 129 | }, 130 | "kernelspec": { 131 | "display_name": "Python 3", 132 | "language": "python", 133 | "name": "python3" 134 | } 135 | }, 136 | "nbformat": 4, 137 | "nbformat_minor": 4 138 | } 139 | -------------------------------------------------------------------------------- /notebooks/05-Probability.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # formats: ipynb,py:percent 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.4.2 10 | # kernelspec: 11 | # display_name: Python 3 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] 17 | # # Probability 18 | # In this chapter we will go over how to perform probability computations in Python. 19 | # 20 | # ## Basic probability calculations 21 | # 22 | # Let's create a vector of outcomes from one to 6, using the `np.arange()` function to create such a sequence. 
This function takes the minimum and maximum values as its inputs, but note that the maximum is not included in the sequence; that is, the sequence goes up to but not including the maximum. Thus, we would have to give 1 and 7 as the minimum and maximum in order to get a sequence of numbers from 1 to 6: 23 | 24 | # %% 25 | import numpy as np 26 | outcomes = np.arange(1, 7) 27 | outcomes 28 | 29 | # %% [markdown] 30 | # Now let's create a vector of logical values based on whether the outcome in each position is equal to 1. Remember that `==` tests for equality of each element in a vector: 31 | 32 | # %% 33 | outcome1isTrue = outcomes == 1 34 | outcome1isTrue 35 | 36 | # %% [markdown] 37 | # Remember that the simple probability of an outcome is number of occurrences of the outcome divided by the total number of events. To compute a probability, we can take advantage of the fact that TRUE/FALSE are equivalent to 1/0 in Python. The formula for the mean (sum of values divided by the number of values) is thus exactly the same as the formula for the simple probability! So, we can compute the probability of the event by simply taking the mean of the logical vector. 38 | 39 | # %% 40 | p1isTrue = np.mean(outcome1isTrue) 41 | p1isTrue 42 | 43 | # %% [markdown] 44 | # ## Empirical frequency 45 | # Let's walk through how [we computed empirical frequency of rain in San Francisco](https://statsthinking21.github.io/statsthinking21-core-site/probability.html#empirical-frequency). 46 | # 47 | # First we load the data: 48 | 49 | # %% 50 | #+ 51 | import pandas as pd 52 | SFrain = pd.read_csv('https://raw.githubusercontent.com/statsthinking21/statsthinking21-python/master/notebooks/data/SanFranciscoRain.csv') 53 | 54 | # we will remove the STATION and NAME variables 55 | # since they are identical for all rows 56 | 57 | SFrain = SFrain.drop(columns=['STATION', 'NAME']) 58 | SFrain 59 | #- 60 | 61 | # %% [markdown] 62 | # We see that the data frame contains a variable called `PRCP` which denotes the amount of rain each day. Let's create a new variable called `rainToday` that denotes whether the amount of precipitation was above zero: 63 | 64 | # %% 65 | SFrain['rainToday'] = SFrain['PRCP'] > 0 66 | SFrain 67 | 68 | # %% [markdown] 69 | # Now we will summarize the data to compute the probability of rain: 70 | 71 | # %% 72 | pRainInSF = SFrain['rainToday'].mean() 73 | pRainInSF 74 | 75 | # %% [markdown] 76 | # ## Conditional probability 77 | # Let's determine the conditional probability of someone having hearing problems, given that they are over 70 years of age, using the NHANES dataset. First, let's create a new variable called `Over70` that denotes whether each individual is over 70 or not. 78 | 79 | # %% 80 | from nhanes.load import load_NHANES_data 81 | nhanes_data = load_NHANES_data() 82 | 83 | nhanes_data['Over70'] = nhanes_data['AgeInYearsAtScreening'] > 70 84 | 85 | # %% [markdown] 86 | # Now let's create a cleaned-up dataset that only includes the over70 variable along with the variable called `HaveSeriousDifficultyHearing` that denotes whether a person reports having serious hearing difficulty (coded as 1 for "yes" and 0 for "no"). 87 | 88 | # %% 89 | hearing_data = nhanes_data[['Over70', 'HaveSeriousDifficultyHearing']].dropna() 90 | hearing_data 91 | 92 | # %% [markdown] 93 | # First, what's the probability of being over 70? 94 | 95 | # %% 96 | p_over_70 = hearing_data['Over70'].mean() 97 | p_over_70 98 | 99 | # %% [markdown] 100 | # Second, what's the probability of having hearing problems? 
101 | 102 | # %% 103 | p_hearing_problem = hearing_data['HaveSeriousDifficultyHearing'].mean() 104 | p_hearing_problem 105 | 106 | # %% [markdown] 107 | # What's the probability for each combination of hearing problems/no problems and over 70/ not? We can create a table that finds the joint probability for each combination, using the `pd.crosstab()` function: 108 | 109 | # %% 110 | joint_table = pd.crosstab(hearing_data.Over70, hearing_data['HaveSeriousDifficultyHearing'], normalize=True) 111 | joint_table 112 | 113 | 114 | # %% [markdown] 115 | # Finally, what's the probability of someone having hearing problems, given that they are over 70 years of age? To do this, we limit the computation of the probability of having hearing problems to only include those people who are over 70: 116 | 117 | # %% 118 | p_hearingproblem_given_over_70 = hearing_data.query('Over70 == True')['HaveSeriousDifficultyHearing'].mean() 119 | p_hearingproblem_given_over_70 120 | 121 | # %% [markdown] 122 | # Now compute the opposite: What is the probability of being over 70 given that one has a hearing problem? 123 | 124 | # %% 125 | p_over_70_given_hearingproblem = hearing_data.query('HaveSeriousDifficultyHearing == True')['Over70'].mean() 126 | p_over_70_given_hearingproblem 127 | -------------------------------------------------------------------------------- /notebooks/07-ResamplingAndSimulation.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # formats: ipynb,py:percent 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.15.2 10 | # kernelspec: 11 | # display_name: Python 3 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] 17 | # # Resampling and simulation 18 | # 19 | # ## Generating random samples 20 | # Here we will generate random samples from a number of different distributions and plot their histograms. We could write out separate commands to plot each of our functions of interest, but that would involve repeating a lot of code, so instead we will take advantage of the fact that Python allows us to treat modules as variables. We will specify the module that creates each distribution, and then loop through them, each time incrementing the panel number. Some distributions also take specific parameters; for example, the Chi-squared distribution requires specifying the degrees of freedom. We will store those in a separate dictionary and use them as needed. 
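# %% [markdown]
# Note that the draws below will differ from run to run. If you want reproducible samples, the `rvs()` methods accept a `random_state` argument (a brief sketch; the seed value here is arbitrary):

# %%
# sketch: fixing the random state for reproducible draws
import scipy.stats

scipy.stats.norm.rvs(size=3, random_state=12345)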
21 | 22 | # %% 23 | import scipy.stats 24 | import matplotlib.pyplot as plt 25 | 26 | num_samples = 10000 27 | 28 | plt.figure(figsize=(8, 8)) 29 | 30 | generators = {'Uniform': scipy.stats.uniform, 31 | 'Normal': scipy.stats.norm, 32 | 'Exponential': scipy.stats.expon, 33 | 'Chi-squared': scipy.stats.chi2} 34 | 35 | generator_parameters = {'Chi-squared': 10} 36 | panel_num = 1 37 | for distribution in generators: 38 | plt.subplot(2, 2, panel_num) 39 | if distribution in generator_parameters: 40 | sample = generators[distribution].rvs( 41 | generator_parameters[distribution], size=num_samples) 42 | else: 43 | sample = generators[distribution].rvs(size=num_samples) 44 | plt.hist(sample, bins=100) 45 | plt.title(distribution) 46 | plt.xlabel('Value') 47 | plt.ylabel('Density') 48 | # the following function prevents the labels from overlapping 49 | plt.tight_layout() 50 | panel_num += 1 51 | 52 | 53 | # %% [markdown] 54 | # ## Simulating the maximum finishing time 55 | # Let's simulate 5000 samples of 150 observations, collecting the maximum value from each sample, and then plotting the distribution of maxima. 56 | 57 | # %% 58 | import numpy as np 59 | import pandas as pd 60 | 61 | num_runs = 5000 62 | sample_size = 150 63 | 64 | 65 | def sample_and_return_max(sample_size, 66 | distribution=None): 67 | """ 68 | function to sample from a distribution and return maximum 69 | """ 70 | 71 | # if distribution is not specified, then use the normal 72 | if distribution is None: 73 | distribution = scipy.stats.norm 74 | 75 | sample = distribution.rvs(size=sample_size) 76 | return(np.max(sample)) 77 | 78 | 79 | sample_max_df = pd.DataFrame({'max': np.zeros(num_runs)}) 80 | 81 | for i in range(num_runs): 82 | sample_max_df.loc[i, 'max'] = sample_and_return_max(sample_size) 83 | 84 | 85 | # %% [markdown] 86 | # Now let's find the 95th percentile of the maximum distriibution. There is a built-in function in the `scipy.stats` module, called `scoreatpercentile` that will do this for us: 87 | # 88 | 89 | 90 | # %% 91 | cutoff = scipy.stats.scoreatpercentile(sample_max_df['max'], 95) 92 | 93 | 94 | # %% [markdown] 95 | # Plot the histogram of the maximum values, along with a vertical line at the 95th percentile. 96 | 97 | # %% 98 | hist = plt.hist(sample_max_df['max'], bins=100) 99 | plt.ylabel('Count') 100 | plt.xlabel('Maximum value') 101 | _ = plt.axvline(x=cutoff, ymax=np.max(hist[0]), color='k') 102 | 103 | 104 | # %% [markdown] 105 | # ## The bootstrap 106 | # The bootstrap is useful for creating confidence intervals in cases where we don't have a parametric distribution. One example is for the median; let's look at how that works. We will start by implementing it by hand, to see more closely how it works. We will start by collecting a sample of individuals from the NHANES dataset, and the using the bootstrap to obtain confidence intervals on the median for the Height variable. 
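# %% [markdown]
# As a point of reference, recent versions of SciPy (1.7 and later) also provide a built-in `scipy.stats.bootstrap` function that computes such intervals directly; here is a minimal self-contained sketch on simulated (made-up) data, before we implement the procedure by hand.

# %%
# sketch, assuming SciPy >= 1.7: built-in percentile bootstrap on made-up data
import numpy as np
import scipy.stats

rng = np.random.default_rng(12345)
simulated_heights = rng.normal(loc=170, scale=10, size=100)  # made-up data, not NHANES
bootstrap_result = scipy.stats.bootstrap((simulated_heights,), np.median,
                                         confidence_level=0.95, method='percentile',
                                         random_state=rng)
bootstrap_result.confidence_interval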
107 | 108 | # %% 109 | #+ 110 | from nhanes.load import load_NHANES_data 111 | nhanes_data = load_NHANES_data() 112 | adult_nhanes_data = nhanes_data.query('AgeInYearsAtScreening > 17') 113 | adult_nhanes_data = adult_nhanes_data.dropna(subset=['StandingHeightCm']).rename(columns={'StandingHeightCm': 'Height'}) 114 | 115 | num_runs = 5000 116 | sample_size = 100 117 | 118 | # Take a sample for which we will perform the bootstrap 119 | 120 | nhanes_sample = adult_nhanes_data.sample(sample_size) 121 | 122 | # Perform the resampling 123 | 124 | bootstrap_df = pd.DataFrame({'mean': np.zeros(num_runs)}) 125 | for sampling_run in range(num_runs): 126 | bootstrap_sample = nhanes_sample.sample(sample_size, replace=True) 127 | bootstrap_df.loc[sampling_run, 'mean'] = bootstrap_sample['Height'].mean() 128 | 129 | # Compute the 2.5% and 97.5% percentiles of the distribution 130 | 131 | 132 | bootstrap_ci = [scipy.stats.scoreatpercentile(bootstrap_df['mean'], 2.5), 133 | scipy.stats.scoreatpercentile(bootstrap_df['mean'], 97.5)] 134 | 135 | #- 136 | 137 | # %% [markdown] 138 | # Let's compare the bootstrap distribution to the sampling distribution that we would expect given the sample mean and standard deviation: 139 | 140 | # %% 141 | # hist = plt.hist(bootstrap_df['mean'], 100, density=True) 142 | # 143 | # hist_bin_min = np.min(hist[1]) 144 | # hist_bin_max = np.max(hist[1]) 145 | # step_size = 0.01 146 | # x_values = np.arange(hist_bin_min, hist_bin_max, step_size) 147 | # normal_values = scipy.stats.norm.pdf( 148 | # x_values, 149 | # loc=nhanes_sample['Height'].mean(), 150 | # scale=nhanes_sample['Height'].std()/np.sqrt(sample_size)) 151 | # plt.plot(x_values, normal_values, color='r') 152 | # 153 | # 154 | 155 | # %% [markdown] 156 | # This shows that the bootstrap sampling distrbution does a good job of recapitulating the theoretical sampling distribution in this case. 157 | -------------------------------------------------------------------------------- /notebooks/09-StatisticalPower.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "lines_to_next_cell": 0 7 | }, 8 | "source": [ 9 | "# Statistical Power Analysis in Python\n", 10 | "In this chapter we focus specifically on statistical power. We will use the NHANES dataset, so let's first set that up." 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": { 17 | "lines_to_next_cell": 2 18 | }, 19 | "outputs": [], 20 | "source": [ 21 | "\n", 22 | "import numpy as np\n", 23 | "import pandas as pd\n", 24 | "\n", 25 | "np.random.seed(12345) \n", 26 | "\n", 27 | "from nhanes.load import load_NHANES_data\n", 28 | "\n", 29 | "nhanes_data = load_NHANES_data() \n", 30 | "adult_nhanes_data = nhanes_data.query('AgeInYearsAtScreening > 18')\n", 31 | "adult_nhanes_data = adult_nhanes_data.dropna(subset=['WeightKg']).rename(columns={'WeightKg': 'Weight'})" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "## Power analysis\n", 39 | "\n", 40 | "We can compute a power analysis using functions from the `statsmodels.stats.power` package. Let's focus on the power for an independent samples t-test in order to determine a difference in the mean between two groups. Let's say that we think that an effect size of Cohen's d=0.5 is realistic for the study in question (based on previous research) and would be of scientific interest. 
We wish to have 80% power to find the effect if it exists. We can compute the sample size needed for adequate power using the `TTestIndPower()` function:" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "\n", 50 | "import scipy.stats\n", 51 | "import statsmodels.stats.power as smp\n", 52 | "import matplotlib.pyplot as plt\n", 53 | "\n", 54 | "power_analysis = smp.TTestIndPower()\n", 55 | "sample_size = power_analysis.solve_power(effect_size=0.5, power=0.8, alpha=0.05)\n", 56 | "sample_size" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": { 62 | "lines_to_next_cell": 0 63 | }, 64 | "source": [ 65 | "Thus, about 64 participants would be needed in each group in order to test the hypothesis with adequate power.\n", 66 | "\n", 67 | "## Power curves\n", 68 | "\n", 69 | "We can also create plots that can show us how the power to find an effect varies as a function of effect size and sample size, at the alpha specified in the power analysis. We will use the `plot_power()` function. The x-axis is defined by the `dep_var` argument, while sample sizes (nobs) and effect sizes (effect_size) are provided as arrays. " 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "#+\n", 79 | "effect_sizes = np.array([0.2, 0.5, 0.8])\n", 80 | "sample_sizes = np.array(range(10, 500, 10))\n", 81 | "\n", 82 | "plt.style.use('seaborn')\n", 83 | "fig = plt.figure()\n", 84 | "ax = fig.add_subplot(1, 1, 1)\n", 85 | "fig = power_analysis.plot_power(\n", 86 | " dep_var='nobs', nobs=sample_sizes, \n", 87 | " effect_size=effect_sizes, alpha=0.05, ax=ax, \n", 88 | " title='Power of Independent Samples t-test\\n$\\\\alpha = 0.05$')\n", 89 | "\n", 90 | "#-" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": { 96 | "lines_to_next_cell": 0 97 | }, 98 | "source": [ 99 | "## Simulating statistical power\n", 100 | "\n", 101 | "We can also simulate data to see whether the power analysis actually gives the right answer.\n", 102 | "We will sample data for two groups, with a difference of 0.5 standard deviations between their underlying distributions and a sample size based on power analysis, and we will then look at how often we reject the null hypothesis." 
103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": { 109 | "lines_to_next_cell": 2 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "#+\n", 114 | "num_runs = 5000\n", 115 | "effectSize = 0.5\n", 116 | "\n", 117 | "# perform power analysis to get sample size\n", 118 | "power_analysis = smp.TTestIndPower()\n", 119 | "sampleSize = power_analysis.solve_power(\n", 120 | " effect_size=effectSize, power=0.8, alpha=0.05)\n", 121 | "\n", 122 | "# round up from estimated sample size\n", 123 | "sampleSize = np.int(np.ceil(sampleSize))\n", 124 | "\n", 125 | "# create a function that will generate samples and test for\n", 126 | "# a difference between groups using a two-sample t-test\n", 127 | "\n", 128 | "\n", 129 | "def get_t_result(sampleSize, effectSize):\n", 130 | " \"\"\"\n", 131 | " perform a ttest on random data of n=sampSize\n", 132 | " \"\"\"\n", 133 | " \n", 134 | " group1 = np.random.normal(loc=0.0, scale=1.0, size=sampleSize)\n", 135 | " group2 = np.random.normal(loc=effectSize, scale=1.0, size=sampleSize)\n", 136 | " ttresult = scipy.stats.ttest_ind(group1, group2)\n", 137 | " return(ttresult.pvalue)\n", 138 | "\n", 139 | "\n", 140 | "# create input data frame for output\n", 141 | "power_sim_results = pd.DataFrame({'p_value': np.zeros(num_runs)})\n", 142 | "\n", 143 | "for run in range(num_runs):\n", 144 | " power_sim_results.loc[run, 'p_value'] = get_t_result(sampleSize, effectSize)\n", 145 | "\n", 146 | "\n", 147 | "p_reject = np.mean(power_sim_results['p_value'] < 0.05)\n", 148 | "p_reject\n", 149 | "#-" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "This should return a number very close to 0.8." 157 | ] 158 | } 159 | ], 160 | "metadata": { 161 | "jupytext": { 162 | "formats": "ipynb,py:percent" 163 | }, 164 | "kernelspec": { 165 | "display_name": "Python 3", 166 | "language": "python", 167 | "name": "python3" 168 | } 169 | }, 170 | "nbformat": 4, 171 | "nbformat_minor": 4 172 | } 173 | -------------------------------------------------------------------------------- /notebooks/06-Sampling.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # formats: ipynb,py:percent 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.15.2 10 | # kernelspec: 11 | # display_name: Python 3 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] 17 | # # Sampling 18 | # In this chapter we will learn how to use Python to understand sampling and sampling error. 19 | # 20 | # ## Sampling error 21 | # Here we will repeatedly sample from the NHANES Height variable in order to obtain the sampling distribution of the mean. First let's load the data and clean them up. 22 | 23 | # %% 24 | 25 | from nhanes.load import load_NHANES_data 26 | nhanes_data = load_NHANES_data() 27 | adult_nhanes_data = nhanes_data.query('AgeInYearsAtScreening > 17') 28 | adult_nhanes_data = adult_nhanes_data.dropna(subset=['StandingHeightCm']).rename(columns={'StandingHeightCm': 'Height'}) 29 | 30 | 31 | # %% [markdown] 32 | # Now let's repeatedly sample 50 individuals from the dataset, compute the mean, and store the resulting values. For this we are going to use a *for loop*, which allows us to repeatedly perform a particular set of actions. 
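# %% [markdown]
# As a reminder, the standard deviation of the sampling distribution of the mean (the standard error of the mean) is predicted to be
#
# $$SEM = \frac{\sigma}{\sqrt{n}}$$
#
# where $\sigma$ is the population standard deviation and $n$ is the sample size. We can compute this reference value for samples of 50 before running the simulation (a short sketch using the Height variable loaded above):

# %%
# sketch: theoretical standard error of the mean for samples of n=50
import numpy as np

adult_nhanes_data['Height'].std() / np.sqrt(50)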
33 | 34 | # %% 35 | #+ 36 | sample_size = 50 37 | num_samples = 5000 38 | 39 | import pandas as pd 40 | import numpy as np 41 | 42 | # set up a variable to store the result 43 | sampling_results = pd.DataFrame({'mean': np.zeros(num_samples)}) 44 | 45 | for sample_num in range(num_samples): 46 | sample = adult_nhanes_data.sample(sample_size) 47 | sampling_results.loc[sample_num, 'mean'] = sample['Height'].mean() 48 | #- 49 | 50 | # %% [markdown] 51 | # Now let's plot the sampling distribution. We will also overlay the sampling distribution of the mean predicted on the basis of the population mean and standard deviation, to show that it properly describes the actual sampling distribution. We also place a vertical line at the population mean. 52 | 53 | # %% 54 | #+ 55 | import matplotlib.pyplot as plt 56 | import numpy as np 57 | import scipy.stats 58 | import seaborn as sns 59 | 60 | hist = plt.hist(sampling_results['mean'], 100, density=True) 61 | # hist[0] contains the histogram data 62 | # we need to use the maximum of those data to set 63 | # the height of the vertical line that shows the mean 64 | plt.axvline(x=adult_nhanes_data['Height'].mean(), 65 | ymax=1, color='k') 66 | 67 | # draw the normal distribution with same mean and standard deviation 68 | # as the sampling distribution 69 | hist_bin_min = np.min(hist[1]) 70 | hist_bin_max = np.max(hist[1]) 71 | step_size = 0.01 72 | x_values = np.arange(hist_bin_min, hist_bin_max, step_size) 73 | normal_values = scipy.stats.norm.pdf( 74 | x_values, 75 | loc=sampling_results['mean'].mean(), 76 | scale=sampling_results['mean'].std()) 77 | plt.plot(x_values, normal_values, color='r') 78 | #+ 79 | 80 | # %% [markdown] 81 | # ## Central limit theorem 82 | # The central limit theorem tells us that the sampling distribution of the mean becomes normal as the sample size grows. Let's test this by sampling a clearly non-normal variable and look at the normality of the results using a Q-Q plot. For example, let's look at the variable that represents annual family income. This variable is oddly distributed: 83 | 84 | # %% 85 | plt.hist(adult_nhanes_data['AnnualFamilyIncome']) 86 | 87 | 88 | # %% [markdown] 89 | # This odd distribution comes in part from the how the variable is coded, as shown [here](https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DEMO_J.htm#INDFMIN2). Let's resample this variable 5000 times, compute the mean, and examine the distribution. To do this, we will create a function that resamples and returns the mean: 90 | 91 | 92 | # %% 93 | def sample_and_return_mean(df, variable_name, 94 | sample_size=250, num_samples=5000): 95 | """ 96 | repeatedly take samples from a particular variable in a data frame 97 | and compute the mean 98 | 99 | Parameters: 100 | ----------- 101 | df: data frame containing the data 102 | variable_name: the name of the variable to be analyzed 103 | sample_size: the number of observations to sample each time 104 | num_samples: the number of samples to take 105 | 106 | Returns: 107 | -------- 108 | sampling_distribution: data frame containing the means 109 | """ 110 | sampling_distribution = pd.DataFrame({'mean': np.zeros(num_samples)}) 111 | for sample_number in range(num_samples): 112 | sample_df = df.sample(sample_size) 113 | sampling_distribution.loc[sample_number, 'mean'] = sample_df[variable_name].mean() 114 | return(sampling_distribution) 115 | 116 | 117 | # %% [markdown] 118 | # Now, using this function, let's compute the sampling distribution for the annual family income variable and plot its histogram. 
119 | 120 | 121 | # %% 122 | adult_income_data = adult_nhanes_data.dropna(subset=['AnnualFamilyIncome']) 123 | family_income_sampling_dist = sample_and_return_mean(adult_income_data, 'AnnualFamilyIncome') 124 | _ = plt.hist(family_income_sampling_dist['mean'], 100) 125 | 126 | 127 | # %% [markdown] 128 | # This distribution looks nearly normal. We can also use a quantile-quantile, or "Q-Q" plot, to examine this. We will plot two Q-Q plots; on the left we plot one for the original data, and on the right we plot one for the sampling distribution of the mean. 129 | 130 | # %% 131 | 132 | plt.figure(figsize=(12, 6)) 133 | plt.subplot(1, 2, 1) 134 | scipy.stats.probplot(adult_income_data['AnnualFamilyIncome'], plot=sns.mpl.pyplot) 135 | plt.title('Original data') 136 | 137 | plt.subplot(1, 2, 2) 138 | scipy.stats.probplot(family_income_sampling_dist['mean'], plot=sns.mpl.pyplot) 139 | plt.title('Sampling distribution') 140 | 141 | # %% [markdown] 142 | # We see that the raw data are highly non-normal, evidenced by the fact that the data values diverge greatly from the unit line. On the other hand, the sampling distribution looks much more normally distributed. 143 | # 144 | # ## Confidence intervals 145 | # 146 | # Remember that confidence intervals are intervals that will contain the population parameter in a certain proportion of samples from the population. In this example we will walk through [the simulation that was presented in the book](https://statsthinking21.github.io/statsthinking21-core-site/sampling.html#confidence-intervals) to show that this actually works properly. To do this, let's create a function that takes a sample from the NHANES population and returns the confidence interval for the mean of the `Height` variable within that sample. We will use the t distribution to obtain our confidence intervals. 147 | 148 | 149 | # %% 150 | def get_confidence_interval(df, variable_name, 151 | ci_percent=95, 152 | sample_size=50): 153 | sample_df = df.sample(sample_size) 154 | mean = sample_df[variable_name].mean() 155 | std = sample_df[variable_name].std() 156 | sem = std / np.sqrt(sample_size) 157 | t_tail_proportion = 1 - ((100 - ci_percent) / 100) / 2 158 | t_cutoff = scipy.stats.t.ppf(t_tail_proportion, sample_size - 1) 159 | upper_ci = mean + sem * t_cutoff 160 | lower_ci = mean - sem * t_cutoff 161 | return([lower_ci, upper_ci]) 162 | 163 | 164 | # %% [markdown] 165 | # Using this function, let's resample the data 1000 times and look how often the resulting interval contains the population mean. 166 | 167 | # %% 168 | 169 | num_runs = 1000 170 | 171 | ci_df = pd.DataFrame({'lower': np.zeros(num_runs), 172 | 'upper': np.zeros(num_runs)}) 173 | 174 | for i in range(num_runs): 175 | ci_df.iloc[i, :] = get_confidence_interval( 176 | adult_nhanes_data, 177 | 'Height' 178 | ) 179 | 180 | # %% [markdown] 181 | # Now we need to compute the proportion of confidence intervals that capture the population mean (which we know because we are treating the entire NHANES dataset as our population). Here we will use a trick that relies upon the fact that Python treat `True`/`False` identically to one and zero respectively. We will test for each of the confidence limits (upper and lower) whether it captures the population mean, and then we will multiply those two series of values together. This will create a new variable that is True only if both limits capture the population mean. 
We then simply take the mean of those truth values to compute the poportion of confidence intervals that capture the mean. 182 | 183 | # %% 184 | ci_df['captures_mean'] = (ci_df['lower'] < adult_nhanes_data['Height'].mean()) * (ci_df['upper'] > adult_nhanes_data['Height'].mean()) 185 | 186 | ci_df['captures_mean'].mean() 187 | 188 | # %% [markdown] 189 | # This number should be very close to 0.95. 190 | -------------------------------------------------------------------------------- /notebooks/08-HypothesisTesting.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "yQHZoAndW5zs" 7 | }, 8 | "source": [ 9 | "# Hypothesis testing in Python\n", 10 | "In this chapter we will present several examples of using Python to perform hypothesis testing.\n", 11 | "\n", 12 | "## Simple example: Coin-flipping\n", 13 | "Let's say that we flipped 100 coins and observed 70 heads. We would like to use these data to test the hypothesis that the true probability is 0.5.\n", 14 | "First let's generate our data, simulating 200,000 sets of 100 flips. We use such a large number because it turns out that it's very rare to get 70 heads, so we need many attempts in order to get a reliable estimate of these probabilties. This will take a couple of minutes to complete." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "lines_to_next_cell": 2, 22 | "id": "zGgG23i1W5zu" 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "\n", 27 | "import numpy as np\n", 28 | "import pandas as pd\n", 29 | "\n", 30 | "num_runs = 200000\n", 31 | "\n", 32 | "\n", 33 | "def toss_coins_and_count_heads(num_coins=100, p_heads=0.5):\n", 34 | " \"\"\"\n", 35 | " flip a coin num_coins times and return number of heads\n", 36 | " \"\"\"\n", 37 | "\n", 38 | " flips = np.random.rand(num_coins) > (1 - p_heads)\n", 39 | " return(np.sum(flips))\n", 40 | "\n", 41 | "\n", 42 | "flip_results_df = pd.DataFrame({'n_heads': np.zeros(num_runs)})\n", 43 | "\n", 44 | "for run in range(num_runs):\n", 45 | " flip_results_df.loc[run, 'n_heads'] = toss_coins_and_count_heads()" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": { 51 | "id": "Yddd8XBaW5zv" 52 | }, 53 | "source": [ 54 | "Now we can compute the proportion of samples from the distribution observed that landed on head for at least 70 times, when the true probability of heads is 0.5. 
" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": { 61 | "id": "iFKqf2rFW5zv" 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "import scipy.stats\n", 66 | "\n", 67 | "pvalue = 100 - scipy.stats.percentileofscore(flip_results_df['n_heads'], 70)\n", 68 | "print(pvalue)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "id": "5bEDSYZuW5zv" 75 | }, 76 | "source": [ 77 | "For comparison, we can also compute the p-value for 70 or more heads based on a null hypothesis of $P_{heads}=0.5$, using the binomial distribution.\n", 78 | "\n", 79 | "\n", 80 | "compute the probability of 69 or fewer heads, when P(heads)=0.5" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": { 87 | "id": "F_QCV9l7W5zv" 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "\n", 92 | "p_lt_70 = scipy.stats.binom.cdf(k=69, n=100, p=0.5)\n", 93 | "p_lt_70" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": { 99 | "lines_to_next_cell": 0, 100 | "id": "hQeCiqjHW5zv" 101 | }, 102 | "source": [ 103 | "the probability of 70 or more heads is simply the complement of p_lt_70" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": { 110 | "lines_to_next_cell": 0, 111 | "id": "xgO_h8PNW5zv" 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "\n", 116 | "p_ge_70 = 1 - p_lt_70\n", 117 | "p_ge_70\n", 118 | "#" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "source": [ 124 | "## Performing t-test with Python\n", 125 | "Let's draw a sample of 250 participants from the \"population\" who participated the NHANES study\n" 126 | ], 127 | "metadata": { 128 | "id": "Jb361YaoZQG4" 129 | } 130 | }, 131 | { 132 | "cell_type": "code", 133 | "source": [ 134 | "! pip install nhanes\n", 135 | "from nhanes.load import load_NHANES_data\n", 136 | "nhanes_data = load_NHANES_data()\n", 137 | "\n" 138 | ], 139 | "metadata": { 140 | "id": "oYoivOQeZ2nt" 141 | }, 142 | "execution_count": null, 143 | "outputs": [] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "source": [ 148 | "import numpy as np\n", 149 | "import seaborn as sns\n", 150 | "sample_size = 250\n", 151 | "nhanes_data['PhysActive'] = np.logical_or(nhanes_data['VigorousRecreationalActivities'], nhanes_data['ModerateRecreationalActivities'])\n", 152 | "print('Unique values in PhysActive:',nhanes_data['PhysActive'].unique())\n", 153 | "\n", 154 | "sample = nhanes_data.dropna(subset=['PhysActive', 'BodyMassIndexKgm2']).sample(sample_size)\n", 155 | "sns.boxplot(data=sample, x=\"PhysActive\", y=\"BodyMassIndexKgm2\")" 156 | ], 157 | "metadata": { 158 | "id": "BjiYATqLc_lA" 159 | }, 160 | "execution_count": null, 161 | "outputs": [] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "source": [ 166 | "We will use `scipy.stats.ttest_ind` to perform t-test between two independently drawn samples." 
167 | ], 168 | "metadata": { 169 | "id": "xtZofsU6qGU3" 170 | } 171 | }, 172 | { 173 | "cell_type": "code", 174 | "source": [ 175 | "from scipy.stats import ttest_ind\n", 176 | "# By default, ttest_ind assumes equal variance of the two samples\n", 177 | "print('assuming equal variance of the two population:')\n", 178 | "t, p = ttest_ind(sample.query('PhysActive==1.0')['BodyMassIndexKgm2'], sample.query('PhysActive==0.0')['BodyMassIndexKgm2'])\n", 179 | "print('t-statistic:', t)\n", 180 | "print('p-value:', p)\n", 181 | "\n", 182 | "# If we don't make the assumption, the result may be slightly different:\n", 183 | "print('without assuming equal variance of the two populations:')\n", 184 | "t, p = ttest_ind(sample.query('PhysActive==1.0')['BodyMassIndexKgm2'], sample.query('PhysActive==0.0')['BodyMassIndexKgm2'], equal_var=False)\n", 185 | "print('t-statistic:', t)\n", 186 | "print('p-value:', p)\n" 187 | ], 188 | "metadata": { 189 | "id": "gSmTuPZmlqsn" 190 | }, 191 | "execution_count": null, 192 | "outputs": [] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": { 197 | "lines_to_next_cell": 0, 198 | "id": "dhBkcLs9W5zw" 199 | }, 200 | "source": [ 201 | "## Simulating p-values\n", 202 | "\n", 203 | "In this exercise we will perform hypothesis testing many times in order to test whether the p-values provided by our statistical test are valid. We will sample data from a normal distribution with a mean of zero, and for each sample perform a t-test to determine whether the mean is different from zero. We will then count how often we reject the null hypothesis; since we know that the true mean is zero, these are by definition Type I errors.\n" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": { 210 | "id": "cUMidxAmW5zw" 211 | }, 212 | "outputs": [], 213 | "source": [ 214 | "\n", 215 | "num_runs = 5000\n", 216 | "\n", 217 | "\n", 218 | "# create a function that will take a sample\n", 219 | "# and perform a one-sample t-test\n", 220 | "def sample_ttest(sampSize=32):\n", 221 | " \"\"\"\n", 222 | " perform a ttest on random data of n=sampSize\n", 223 | " \"\"\"\n", 224 | "\n", 225 | " ttresult = scipy.stats.ttest_1samp(np.random.normal(loc=0.0, scale=1.0, size=sampSize), 0)\n", 226 | " return(ttresult.pvalue)\n", 227 | "\n", 228 | "\n", 229 | "# create input data frame for the function\n", 230 | "sim_results_df = pd.DataFrame({'p_value': np.zeros(num_runs)})\n", 231 | "\n", 232 | "# perform simulations\n", 233 | "for run in range(num_runs):\n", 234 | " sim_results_df.loc[run, 'p_value'] = sample_ttest()\n", 235 | "\n", 236 | "p_error = sim_results_df['p_value'] < 0.05\n", 237 | "p_error = p_error.mean(axis=0)\n", 238 | "p_error" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": { 244 | "id": "V6u-l5WJW5zw" 245 | }, 246 | "source": [ 247 | "We should see that the proportion of samples with p < .05 is about 5%." 
248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "metadata": { 254 | "id": "AhY-KrHcW5zw" 255 | }, 256 | "outputs": [], 257 | "source": [] 258 | } 259 | ], 260 | "metadata": { 261 | "jupytext": { 262 | "formats": "ipynb,py:percent" 263 | }, 264 | "kernelspec": { 265 | "display_name": "Python 3", 266 | "language": "python", 267 | "name": "python3" 268 | }, 269 | "colab": { 270 | "provenance": [] 271 | } 272 | }, 273 | "nbformat": 4, 274 | "nbformat_minor": 0 275 | } -------------------------------------------------------------------------------- /notebooks/07-ResamplingAndSimulation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "iYBJesVTuILl" 7 | }, 8 | "source": [ 9 | "# Resampling and simulation\n", 10 | "\n", 11 | "## Generating random samples\n", 12 | "Here we will generate random samples from a number of different distributions and plot their histograms. We could write out separate commands to plot each of our functions of interest, but that would involve repeating a lot of code, so instead we will take advantage of the fact that Python allows us to treat modules as variables. We will specify the module that creates each distribution, and then loop through them, each time incrementing the panel number. Some distributions also take specific parameters; for example, the Chi-squared distribution requires specifying the degrees of freedom. We will store those in a separate dictionary and use them as needed." 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": { 19 | "lines_to_next_cell": 2, 20 | "id": "SFat-Ll-uILm" 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "import scipy.stats\n", 25 | "import matplotlib.pyplot as plt\n", 26 | "\n", 27 | "num_samples = 20000\n", 28 | "\n", 29 | "plt.figure(figsize=(8, 8))\n", 30 | "\n", 31 | "generators = {'Uniform': scipy.stats.uniform,\n", 32 | " 'Normal': scipy.stats.norm,\n", 33 | " 'Exponential': scipy.stats.expon,\n", 34 | " 'Chi-squared': scipy.stats.chi2}\n", 35 | "\n", 36 | "generator_parameters = {'Chi-squared': 10}\n", 37 | "panel_num = 1\n", 38 | "for distribution in generators:\n", 39 | " plt.subplot(2, 2, panel_num)\n", 40 | " if distribution in generator_parameters:\n", 41 | " sample = generators[distribution].rvs(\n", 42 | " generator_parameters[distribution], size=num_samples)\n", 43 | " else:\n", 44 | " sample = generators[distribution].rvs(size=num_samples)\n", 45 | " plt.hist(sample, bins=100, density=True)\n", 46 | " plt.title(distribution)\n", 47 | " plt.xlabel('Value')\n", 48 | " plt.ylabel('Density')\n", 49 | " # the following function prevents the labels from overlapping\n", 50 | " plt.tight_layout()\n", 51 | " panel_num += 1\n", 52 | "plt.show()" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": { 58 | "id": "sXRGnsxvuILm" 59 | }, 60 | "source": [ 61 | "## Simulating the maximum finishing time\n", 62 | "Let's simulate 5000 samples of 150 observations, collecting the maximum value from each sample, and then plotting the distribution of maxima." 
63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": { 69 | "lines_to_next_cell": 2, 70 | "id": "ovEu_Lm_uILn" 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "import numpy as np\n", 75 | "import pandas as pd\n", 76 | "\n", 77 | "num_runs = 5000\n", 78 | "sample_size = 150\n", 79 | "\n", 80 | "\n", 81 | "def sample_and_return_max(sample_size,\n", 82 | " distribution=None):\n", 83 | " \"\"\"\n", 84 | " function to sample from a distribution and return maximum\n", 85 | " \"\"\"\n", 86 | "\n", 87 | " # if distribution is not specified, then use the normal\n", 88 | " if distribution is None:\n", 89 | " distribution = scipy.stats.norm\n", 90 | "\n", 91 | " sample = distribution.rvs(size=sample_size)\n", 92 | " return(np.max(sample))\n", 93 | "\n", 94 | "\n", 95 | "sample_max_df = pd.DataFrame({'max': np.zeros(num_runs)})\n", 96 | "\n", 97 | "for i in range(num_runs):\n", 98 | " sample_max_df.loc[i, 'max'] = sample_and_return_max(sample_size, distribution=scipy.stats.norm(loc=5,scale=1))" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": { 104 | "lines_to_next_cell": 2, 105 | "id": "Lxa9SHX7uILn" 106 | }, 107 | "source": [ 108 | "Now let's find the 99th percentile of the maximum distriibution. There is a built-in function in the `scipy.stats` module, called `scoreatpercentile` that will do this for us:\n" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": { 115 | "lines_to_next_cell": 2, 116 | "id": "YsMZgfXCuILn" 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "cutoff = scipy.stats.scoreatpercentile(sample_max_df['max'], 99)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": { 126 | "id": "MD96LFTWuILn" 127 | }, 128 | "source": [ 129 | "Plot the histogram of the maximum values, along with a vertical line at the 95th percentile." 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": { 136 | "lines_to_next_cell": 2, 137 | "id": "ekKmMab-uILo" 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "hist = plt.hist(sample_max_df['max'], bins=100)\n", 142 | "plt.ylabel('Count')\n", 143 | "plt.xlabel('Maximum value')\n", 144 | "_ = plt.axvline(x=cutoff, ymax=np.max(hist[0]), color='k')\n", 145 | "plt.show()" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": { 151 | "id": "TO7rg7DWuILo" 152 | }, 153 | "source": [ 154 | "## The bootstrap\n", 155 | "The bootstrap is useful for creating confidence intervals in cases where we don't have a parametric distribution. One example is for the median; let's look at how that works. We will start by implementing it by hand, to see more closely how it works. We will start by collecting a sample of individuals from the NHANES dataset, and the using the bootstrap to obtain confidence intervals on the median for the Height variable." 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": { 162 | "id": "AzTS0T5ruILo" 163 | }, 164 | "outputs": [], 165 | "source": [ 166 | "#+\n", 167 | "! 
pip install nhanes\n", 168 | "from nhanes.load import load_NHANES_data\n", 169 | "nhanes_data = load_NHANES_data()\n", 170 | "adult_nhanes_data = nhanes_data.query('AgeInYearsAtScreening > 17')\n", 171 | "adult_nhanes_data = adult_nhanes_data.dropna(subset=['StandingHeightCm']).rename(columns={'StandingHeightCm': 'Height'})\n", 172 | "\n", 173 | "num_runs = 5000\n", 174 | "sample_size = 100\n", 175 | "\n", 176 | "# Take a sample for which we will perform the bootstrap\n", 177 | "\n", 178 | "nhanes_sample = adult_nhanes_data.sample(sample_size)\n", 179 | "\n", 180 | "# Perform the resampling\n", 181 | "\n", 182 | "bootstrap_df = pd.DataFrame({'mean': np.zeros(num_runs)})\n", 183 | "for sampling_run in range(num_runs):\n", 184 | " bootstrap_sample = nhanes_sample.sample(sample_size, replace=True)\n", 185 | " bootstrap_df.loc[sampling_run, 'mean'] = bootstrap_sample['Height'].mean()\n", 186 | "\n", 187 | "# Compute the 2.5% and 97.5% percentiles of the distribution\n", 188 | "\n", 189 | "\n", 190 | "bootstrap_ci = [scipy.stats.scoreatpercentile(bootstrap_df['mean'], 2.5),\n", 191 | " scipy.stats.scoreatpercentile(bootstrap_df['mean'], 97.5)]\n", 192 | "\n", 193 | "#-" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": { 199 | "id": "MwlWGg6quILo" 200 | }, 201 | "source": [ 202 | "Let's compare the bootstrap distribution to the sampling distribution that we would expect given the sample mean and standard deviation:" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": { 209 | "id": "5LE9i0KyuILp" 210 | }, 211 | "outputs": [], 212 | "source": [ 213 | "hist = plt.hist(bootstrap_df['mean'], 100, density=True)\n", 214 | "\n", 215 | "hist_bin_min = np.min(hist[1])\n", 216 | "hist_bin_max = np.max(hist[1])\n", 217 | "step_size = 0.01\n", 218 | "x_values = np.arange(hist_bin_min, hist_bin_max, step_size)\n", 219 | "normal_values = scipy.stats.norm.pdf(\n", 220 | " x_values,\n", 221 | " loc=nhanes_sample['Height'].mean(),\n", 222 | " scale=nhanes_sample['Height'].std()/np.sqrt(sample_size))\n", 223 | "plt.plot(x_values, normal_values, color='r')\n", 224 | "plt.legend([' Normal distribution based on sample mean and SEM','Means of bootstrap samples'])\n", 225 | "plt.xlabel('Height (cm)')\n", 226 | "plt.show()" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": { 232 | "id": "AsrX_9NhuILp" 233 | }, 234 | "source": [ 235 | "This shows that the bootstrap sampling distrbution does a good job of recapitulating the theoretical sampling distribution in this case." 
236 | ]
237 | }
238 | ],
239 | "metadata": {
240 | "jupytext": {
241 | "formats": "ipynb,py:percent"
242 | },
243 | "kernelspec": {
244 | "display_name": "Python 3",
245 | "language": "python",
246 | "name": "python3"
247 | },
248 | "colab": {
249 | "provenance": []
250 | }
251 | },
252 | "nbformat": 4,
253 | "nbformat_minor": 0
254 | }
--------------------------------------------------------------------------------
/notebooks/10-BayesianStatistics.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | #   jupytext:
4 | #     formats: ipynb,py:percent
5 | #     text_representation:
6 | #       extension: .py
7 | #       format_name: percent
8 | #       format_version: '1.3'
9 | #     jupytext_version: 1.15.2
10 | #   kernelspec:
11 | #     display_name: Python 3
12 | #     language: python
13 | #     name: python3
14 | # ---
15 |
16 | # %% [markdown]
17 | # # Bayesian Statistics in Python
18 | # In this chapter we will introduce how to perform basic Bayesian computations using Python.
19 | #
20 | # ## Applying Bayes' theorem: A simple example
21 | # TBD: MOVE TO MULTIPLE TESTING EXAMPLE SO WE CAN USE BINOMIAL LIKELIHOOD
22 | # A person has a cough and flu-like symptoms, and gets a PCR test for COVID-19, which comes back positive. What is the likelihood that they actually have COVID-19, as opposed to a regular cold or flu? We can use Bayes' theorem to compute this. Let's say that the local rate of symptomatic individuals who actually are infected with COVID-19 is 7.4% (as [reported](https://twitter.com/Bob_Wachter/status/1281792549309386752/photo/1) on July 10, 2020 for San Francisco); thus, our prior probability that someone with symptoms actually has COVID-19 is .074. The RT-PCR test used to identify COVID-19 RNA is highly specific (that is, it very rarely reports the presence of the virus when it is not present); for our example, we will say that the specificity is 99%. Its sensitivity is not known, but probably is no higher than 90%.
23 | # First let's look at the probability of disease given a single positive test.
24 |
25 | # %%
26 |
27 | sensitivity = 0.90
28 | specificity = 0.99
29 | prior = 0.074
30 | likelihood = sensitivity  # p(test|disease present)
31 | marginal_likelihood = sensitivity * prior + (1 - specificity) * (1 - prior)
32 | posterior = (likelihood * prior) / marginal_likelihood
33 | posterior
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 | # %% [markdown]
42 | # The high specificity of the test, along with the relatively high base rate of the disease, means that most people who test positive actually have the disease.
43 | # Now let's plot the posterior as a function of the prior. Let's first create a function to compute the posterior, and then apply this with a range of values for the prior.
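# %% [markdown]
# (A brief aside before we do that, added for illustration: if the person went on to receive a
# *second*, independent positive test, we could simply feed the posterior from the first test back
# in as the prior. This sketch reuses the quantities defined above and assumes that the two test
# results are independent given the person's true disease status.)

# %%
# the posterior after the first test becomes the prior for the second test
prior_after_first_test = posterior
marginal_likelihood_second_test = (sensitivity * prior_after_first_test
                                   + (1 - specificity) * (1 - prior_after_first_test))
posterior_after_second_test = (sensitivity * prior_after_first_test) / marginal_likelihood_second_test
posterior_after_second_test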
44 | 45 | # %% 46 | 47 | import numpy as np 48 | import pandas as pd 49 | import scipy.stats 50 | import matplotlib.pyplot as plt 51 | 52 | 53 | def compute_posterior(prior, sensitivity, specificity): 54 | likelihood = sensitivity # p(test|disease present) 55 | marginal_likelihood = sensitivity * prior + (1 - specificity) * (1 - prior) 56 | posterior = (likelihood * prior) / marginal_likelihood 57 | return(posterior) 58 | 59 | 60 | prior_values = np.arange(0.001, 0.5, 0.001) 61 | posterior_values = compute_posterior(prior_values, sensitivity, specificity) 62 | 63 | plt.plot(prior_values, posterior_values) 64 | plt.xlabel('prior') 65 | _ = plt.ylabel('posterior') 66 | 67 | # %% [markdown] 68 | # This figure highlights a very important general point about diagnostic testing: Even when the test has high specificity, if a condition is rare then most positive test results will be false positives. 69 | # 70 | # ## Estimating posterior distributions 71 | # In this example we will look at how to estimate entire posterior distributions. 72 | # We will implement the [drug testing example](https://statsthinking21.github.io/statsthinking21-core-site/bayesian-statistics.html#estimating-posterior-distributions) from the book. In that example, we administered a drug to 100 people, and found that 64 of them responded positively to the drug. What we want to estimate is the probability distribution for the proportion of responders, given the data. For simplicity we started with a uniform prior; that is, all proprtions of responding are equally likely to begin with. In addition, we will use a discrete probability distribution; that is, we will estimate the posterior probabiilty for each particular proportion of responders, in steps of 0.01. This greatly simplifies the math and still retains the main idea. 73 | 74 | # %% 75 | #+ 76 | num_responders = 64 77 | num_tested = 100 78 | 79 | bayes_df = pd.DataFrame({'proportion': np.arange(0.0, 1.01, 0.01)}) 80 | 81 | # compute the binomial likelihood of the observed data for each 82 | # possible value of proportion 83 | bayes_df['likelihood'] = scipy.stats.binom.pmf(num_responders, 84 | num_tested, 85 | bayes_df['proportion']) 86 | # The prior is equal for all possible values 87 | bayes_df['prior'] = 1 / bayes_df.shape[0] 88 | 89 | # compute the marginal likelihood by adding up the likelihood of each possible proportion times its prior probability. 90 | 91 | marginal_likelihood = (bayes_df['likelihood'] * bayes_df['prior']).sum() 92 | 93 | bayes_df['posterior'] = (bayes_df['likelihood'] * bayes_df['prior']) / marginal_likelihood 94 | 95 | # plot the likelihood, prior, and posterior 96 | 97 | plt.plot(bayes_df['proportion'], bayes_df['likelihood'], label='likelihood') 98 | plt.plot(bayes_df['proportion'], bayes_df['prior'], label='prior') 99 | plt.plot(bayes_df['proportion'], bayes_df['posterior'], 100 | 'k--', label='posterior') 101 | 102 | plt.legend() 103 | 104 | #- 105 | 106 | # %% [markdown] 107 | # The plot shows that the posterior and likelihood are virtually identical, which is due to the fact that the prior is uniform across all possible values. Now let's look at a case where the prior is not uniform. Let's say that we now run a larger study of 1000 people with the same treatment, and we find that 312 of the 1000 individuals respond to the treatment. In this case, we can use the posterior from the earlier study of 100 people as the prior for our new study. This is what we sometimes refer to as *Bayesian updating*. 
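# %% [markdown]
# (Aside, added as a sanity check: because the analysis above used a uniform prior, the posterior
# also has a known analytic form -- a Beta distribution with parameters (number of responders + 1,
# number of non-responders + 1), i.e. Beta(65, 37) here. Since our grid uses steps of 0.01, the
# discrete posterior should be approximately equal to that Beta density multiplied by the step size.
# This sketch overlays the two before we move on to the updating example.)

# %%
# analytic Beta posterior implied by a uniform prior and a binomial likelihood
beta_density = scipy.stats.beta.pdf(bayes_df['proportion'],
                                    num_responders + 1,
                                    num_tested - num_responders + 1)
plt.plot(bayes_df['proportion'], bayes_df['posterior'], label='grid posterior')
plt.plot(bayes_df['proportion'], beta_density * 0.01, 'k--',
         label='Beta(65, 37) density x 0.01')
plt.xlabel('proportion')
plt.legend()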
108 | 109 | # %% 110 | #+ 111 | num_responders = 312 112 | num_tested = 1000 113 | 114 | # # copy the posterior from the previous analysis and rename it as the prior 115 | 116 | study2_df = bayes_df[['proportion', 'posterior']].rename(columns={'posterior': 'prior'}) 117 | 118 | # compute the binomial likelihood of the observed data for each 119 | # possible value of proportion 120 | 121 | study2_df['likelihood'] = scipy.stats.binom.pmf(num_responders, 122 | num_tested, 123 | study2_df['proportion']) 124 | 125 | # compute the marginal likelihood by adding up the likelihood of each possible proportion times its prior probability. 126 | 127 | marginal_likelihood = (study2_df['likelihood'] * study2_df['prior']).sum() 128 | 129 | study2_df['posterior'] = (study2_df['likelihood'] * study2_df['prior']) / marginal_likelihood 130 | 131 | # plot the likelihood, prior, and posterior 132 | 133 | plt.plot(study2_df['proportion'], study2_df['likelihood'], label='likelihood') 134 | plt.plot(study2_df['proportion'], study2_df['prior'], label='prior') 135 | plt.plot(study2_df['proportion'], study2_df['posterior'], 136 | 'k--', label='posterior') 137 | 138 | plt.legend() 139 | 140 | #- 141 | 142 | # %% [markdown] 143 | # Here we see two important things. First, we see that the prior is substantially wider than the likelihood, which occurs because there is much more data going into the likelihood (1000 data points) compared to the prior (100 data points), and more data reduces our uncertainty. Second, we see that the posterior is much closer to the value observed for the second study than for the first, which occurs for the same reason --- we put greater weight on the estimate that is more precise due to a larger sample. 144 | # 145 | # ## Bayes factors 146 | # There are no convenient off-the-shelf tools for estimating Bayes factors using Python, so we will use the `rpy2` package to access the `BayesFactor` library in R. Let's compute a Bayes factor for a T-test comparing the amount of reported alcohol computing between smokers versus non-smokers. First, let's set up the NHANES data and collect a sample of 150 smokers and 150 nonsmokers. 
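# %% [markdown]
# (Aside, added for illustration only: one rough pure-Python alternative is the BIC approximation to
# the Bayes factor described by Wagenmakers (2007), $BF_{10} \approx e^{(BIC_0 - BIC_1)/2}$, which can
# be computed from any two competing models fit to the same data that report a BIC value. The
# hypothetical helper below is only a sketch of that idea; in this chapter we will instead use the
# BayesFactor R package via rpy2, as described above.)

# %%
def approximate_bayes_factor_from_bic(bic_null, bic_alternative):
    """Rough BIC-based approximation of the Bayes factor favoring the alternative over the null."""
    return np.exp((bic_null - bic_alternative) / 2)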
147 | 148 | # %% 149 | #+ 150 | from nhanes.load import load_NHANES_data 151 | nhanes_data = load_NHANES_data() 152 | adult_nhanes_data = nhanes_data.query('AgeInYearsAtScreening > 17') 153 | rseed = 1 154 | 155 | # clean up smoking variables 156 | adult_nhanes_data.loc[adult_nhanes_data['SmokedAtLeast100CigarettesInLife'] == 0, 'DoYouNowSmokeCigarettes'] = 'Not at all' 157 | adult_nhanes_data.loc[:, 'SmokeNow'] = adult_nhanes_data['DoYouNowSmokeCigarettes'] != 'Not at all' 158 | 159 | # Create average alcohol consumption variable between the two dietary recalls 160 | adult_nhanes_data.loc[:, 'AvgAlcohol'] = adult_nhanes_data[['AlcoholGm_DR1TOT', 'AlcoholGm_DR2TOT']].mean(1) 161 | adult_nhanes_data = adult_nhanes_data.dropna(subset=['AvgAlcohol']) 162 | 163 | sample_size_per_group = 150 164 | 165 | nonsmoker_sample = adult_nhanes_data.query('SmokeNow == False').sample(sample_size_per_group, random_state=rseed)[['SmokeNow', 'AvgAlcohol']] 166 | smoker_sample = adult_nhanes_data.query('SmokeNow == True').sample(sample_size_per_group, random_state=rseed)[['SmokeNow', 'AvgAlcohol']] 167 | 168 | full_sample = pd.concat((nonsmoker_sample, smoker_sample)) 169 | full_sample.loc[:, 'SmokeNow'] = full_sample['SmokeNow'].astype('int') 170 | full_sample.groupby('SmokeNow').mean() 171 | #- 172 | 173 | 174 | # %% [markdown] 175 | # Now let's use functions from R to perform a standard t-test as well as compute a Bayes Factor for this comparison. 176 | 177 | # %% 178 | #+ 179 | 180 | # import the necessary functions from rpy2 181 | import rpy2.robjects as robjects 182 | from rpy2.robjects import r, pandas2ri 183 | from rpy2.robjects.packages import importr 184 | pandas2ri.activate() 185 | 186 | # import the BayesFactor package 187 | BayesFactor = importr('BayesFactor') 188 | 189 | # import the data frames into the R workspace 190 | robjects.globalenv["smoker_sample"] = smoker_sample 191 | robjects.globalenv["nonsmoker_sample"] = nonsmoker_sample 192 | 193 | # perform the standard t-test 194 | ttest_output = r('print(t.test(smoker_sample$AvgAlcohol, nonsmoker_sample$AvgAlcohol, alternative="greater"))') 195 | 196 | # compute the Bayes factor 197 | r('bf = ttestBF(y=nonsmoker_sample$AvgAlcohol, x=smoker_sample$AvgAlcohol, nullInterval = c(0, Inf))') 198 | r('print(bf[1]/bf[2])') 199 | 200 | #- 201 | # %% [markdown] 202 | # This shows that the difference between these groups is significant, and the Bayes factor suggests fairly strong evidence for a difference. 203 | -------------------------------------------------------------------------------- /notebooks/05-Probability.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "_FaOYtAF8lp3" 7 | }, 8 | "source": [ 9 | "# Probability\n", 10 | "In this chapter we will go over how to perform probability computations in Python.\n", 11 | "\n", 12 | "## Basic probability calculations\n", 13 | "\n", 14 | "Let's create a vector of outcomes from one to 6, using the `np.arange()` function to create such a sequence. This function takes the minimum and maximum values as its inputs, but note that the maximum is not included in the sequence; that is, the sequence goes up to but not including the maximum. 
Thus, we would have to give 1 and 7 as the minimum and maximum in order to get a sequence of numbers from 1 to 6:" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "id": "prt56R7U8lp5" 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import numpy as np\n", 26 | "outcomes = np.arange(1, 7)\n", 27 | "print(outcomes)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": { 33 | "id": "KNWTWGLL8lp5" 34 | }, 35 | "source": [ 36 | "Now let's create a vector of logical values based on whether the outcome in each position is equal to 1. Remember that `==` tests for equality of each element in a vector:" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": { 43 | "id": "vDryGcal8lp5" 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "outcome1isTrue = outcomes == 1\n", 48 | "print(outcome1isTrue)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": { 54 | "id": "M3B1-MPD8lp6" 55 | }, 56 | "source": [ 57 | "Remember that the simple probability of an outcome is number of occurrences of the outcome divided by the total number of events. To compute a probability, we can take advantage of the fact that TRUE/FALSE are equivalent to 1/0 in Python. The formula for the mean (sum of values divided by the number of values) is thus exactly the same as the formula for the simple probability! So, we can compute the probability of the event by simply taking the mean of the logical vector." 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": { 64 | "id": "u3_F8OpQ8lp6" 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "p1isTrue = np.mean(outcome1isTrue)\n", 69 | "print(p1isTrue)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": { 75 | "id": "lAZCaTvd8lp6" 76 | }, 77 | "source": [ 78 | "## Empirical frequency\n", 79 | "Let's walk through how [we computed empirical frequency of rain in San Francisco](https://statsthinking21.github.io/statsthinking21-core-site/probability.html#empirical-frequency).\n", 80 | "\n", 81 | "First we load the data:" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "id": "Jsi4szsT8lp6" 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "#+\n", 93 | "import pandas as pd\n", 94 | "SFrain = pd.read_csv('https://raw.githubusercontent.com/statsthinking21/statsthinking21-python/master/notebooks/data/SanFranciscoRain.csv')\n", 95 | "\n", 96 | "# we will remove the STATION and NAME variables\n", 97 | "# since they are identical for all rows\n", 98 | "\n", 99 | "SFrain = SFrain.drop(columns=['STATION', 'NAME'])\n", 100 | "print(SFrain)\n", 101 | "#-" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": { 107 | "id": "_MlEaKt38lp6" 108 | }, 109 | "source": [ 110 | "We see that the data frame contains a variable called `PRCP` which denotes the amount of rain each day. 
Let's create a new variable called `rainToday` that denotes whether the amount of precipitation was above zero:" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": { 117 | "id": "-RzbJtFp8lp6" 118 | }, 119 | "outputs": [], 120 | "source": [ 121 | "SFrain['rainToday'] = SFrain['PRCP'] > 0\n", 122 | "print(SFrain)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": { 128 | "id": "xX9JA59D8lp6" 129 | }, 130 | "source": [ 131 | "Now we will summarize the data to compute the probability of rain:" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": { 138 | "id": "z0nMoJCU8lp7" 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "pRainInSF = SFrain['rainToday'].mean()\n", 143 | "print(pRainInSF)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": { 149 | "id": "XIrFx09f8lp7" 150 | }, 151 | "source": [ 152 | "## Conditional probability\n", 153 | "Let's determine the conditional probability of someone having hearing problems, given that they are over 70 years of age, using the NHANES dataset. First, let's create a new variable called `Over70` that denotes whether each individual is over 70 or not." 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": { 160 | "id": "nScHP-x28lp7" 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "! pip install nhanes\n", 165 | "from nhanes.load import load_NHANES_data\n", 166 | "nhanes_data = load_NHANES_data()\n", 167 | "\n", 168 | "nhanes_data['Over70'] = nhanes_data['AgeInYearsAtScreening'] > 70" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": { 174 | "id": "jSFI9IYJ8lp7" 175 | }, 176 | "source": [ 177 | "Now let's create a cleaned-up dataset that only includes the over70 variable along with the variable called `HaveSeriousDifficultyHearing` that denotes whether a person reports having serious hearing difficulty (coded as 1 for \"yes\" and 0 for \"no\")." 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": { 184 | "id": "k_qa4BMJ8lp7" 185 | }, 186 | "outputs": [], 187 | "source": [ 188 | "hearing_data = nhanes_data[['Over70', 'HaveSeriousDifficultyHearing']].dropna()\n", 189 | "print(hearing_data)" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": { 195 | "id": "9_RsFSm68lp7" 196 | }, 197 | "source": [ 198 | "First, what's the probability of being over 70?" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": { 205 | "id": "Db3h91eL8lp7" 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | "p_over_70 = hearing_data['Over70'].mean()\n", 210 | "print(p_over_70)" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": { 216 | "id": "gTGdXcnZ8lp7" 217 | }, 218 | "source": [ 219 | "Second, what's the probability of having hearing problems?" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": { 226 | "id": "bHwHewwI8lp7" 227 | }, 228 | "outputs": [], 229 | "source": [ 230 | "p_hearing_problem = hearing_data['HaveSeriousDifficultyHearing'].mean()\n", 231 | "print(p_hearing_problem)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": { 237 | "id": "d6uHpfiB8lp8" 238 | }, 239 | "source": [ 240 | "What's the probability for each combination of hearing problems/no problems and over 70/ not? 
We can create a table that finds the joint probability for each combination, using the `pd.crosstab()` function:" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": { 247 | "lines_to_next_cell": 2, 248 | "id": "CZsdLo-V8lp8" 249 | }, 250 | "outputs": [], 251 | "source": [ 252 | "joint_table = pd.crosstab(hearing_data['Over70'], hearing_data['HaveSeriousDifficultyHearing'], normalize=True)\n", 253 | "print(joint_table)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": { 259 | "id": "Pa8eEh0L8lp8" 260 | }, 261 | "source": [ 262 | "Finally, what's the probability of someone having hearing problems, given that they are over 70 years of age? To do this, we limit the computation of the probability of having hearing problems to only include those people who are over 70:" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": { 269 | "id": "1KMwvBok8lp8" 270 | }, 271 | "outputs": [], 272 | "source": [ 273 | "p_hearingproblem_given_over_70 = hearing_data.query('Over70 == True')['HaveSeriousDifficultyHearing'].mean()\n", 274 | "print(p_hearingproblem_given_over_70)" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": { 280 | "id": "lCAN0XG38lp8" 281 | }, 282 | "source": [ 283 | "Now compute the opposite: What is the probability of being over 70 given that one has a hearing problem?" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": { 290 | "id": "8y6YdAqQ8lp9" 291 | }, 292 | "outputs": [], 293 | "source": [ 294 | "p_over_70_given_hearingproblem = hearing_data.query('HaveSeriousDifficultyHearing == True')['Over70'].mean()\n", 295 | "print(p_over_70_given_hearingproblem)" 296 | ] 297 | } 298 | ], 299 | "metadata": { 300 | "jupytext": { 301 | "formats": "ipynb,py:percent" 302 | }, 303 | "kernelspec": { 304 | "display_name": "Python 3", 305 | "language": "python", 306 | "name": "python3" 307 | }, 308 | "colab": { 309 | "provenance": [] 310 | } 311 | }, 312 | "nbformat": 4, 313 | "nbformat_minor": 0 314 | } -------------------------------------------------------------------------------- /notebooks/03-DataVisualization.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # formats: ipynb,py:percent 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.15.2 10 | # kernelspec: 11 | # display_name: Python 3 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] 17 | # # Data Visualization 18 | 19 | # %% [markdown] 20 | # There are two main packages that we will use for visualization in Python: [matplotlib](https://matplotlib.org/) and [seaborn](https://seaborn.pydata.org/), which is based on matplotlib. First, let's import these. 
It is customary to import the pyplot module from matplotlib, since it contains most of the important plotting functions: 21 | 22 | # %% 23 | import matplotlib.pyplot as plt 24 | import seaborn as sns 25 | import numpy as np 26 | import pandas as pd 27 | 28 | 29 | # %% [markdown] 30 | # ## Let's think through a visualization 31 | # 32 | # Principles we want to keep in mind: 33 | # 34 | # * Show the data without distortion 35 | # * Use color, shape, and location to encourage comparisons 36 | # * Minimize visual clutter (maximize your information to ink ratio) 37 | # 38 | # The two questions you want to ask yourself before getting started are: 39 | # 40 | # * What type of variable(s) am I plotting? 41 | # * What comparison do I want to make salient for the viewer (possibly myself)? 42 | # 43 | # Figuring out *how* to highlight a comparison and include relevant variables usually benefits from sketching the plot out first. 44 | 45 | 46 | # %% [markdown] 47 | # ## Plotting the distribution of a single variable 48 | # 49 | # One of the most common uses of plotting is to plot the *distribution* of the data --- which you can think of as the *shape* of the data. There are various ways to do this, but one of the most common is known as a *histogram*, which plots the number of observations that fall into specific bins. We can plot a histogram using the `plt.hist()` function from matplotlib. As an example, let's look at the distribution of ages in the NHANES dataset. First we need to load the data: 50 | 51 | 52 | # %% 53 | from nhanes.load import load_NHANES_data 54 | nhanes_data = load_NHANES_data() 55 | 56 | # %% [markdown] 57 | # Then we can plot the histogram: 58 | 59 | 60 | # %% 61 | age_histogram = plt.hist(nhanes_data['AgeInYearsAtScreening']) 62 | 63 | # %% [markdown] 64 | # You can see from this plot that the `plt.hist()` function has binned together individuals across several years; That's because we let it automatically determine the size of the bins. Let's say that instead we want to bin each year separately. We can do this using the `bins` argument to `plt.hist`. Because this argument takes a list of bins, we need to create a list that spans from the youngest to the oldest age. We can do this using the `np.arange()` function from numpy, which generates a list of numbers that span a particular range. In this case, we need to span from the youngest to the oldest value, which are equivalent to the minimum and maximum values which we can obtain using the `.min()` and `.max()` operators; because Python starts at zero, we need to add one to the maximum in order to get the bins to cover the entire range: 65 | 66 | # %% 67 | bins = np.arange(nhanes_data['AgeInYearsAtScreening'].min(), nhanes_data['AgeInYearsAtScreening'].max() + 1) 68 | age_histogram_1year_bins = plt.hist(nhanes_data['AgeInYearsAtScreening'], bins=bins) 69 | 70 | # %% [markdown] 71 | # Sometimes it's more useful to look at the density rather than the counts, which we can do by setting `density=True` in our call to the histogram function: 72 | 73 | # %% 74 | age_density_1year_bins = plt.hist(nhanes_data['AgeInYearsAtScreening'], bins=bins, density=True) 75 | 76 | # %% [markdown] 77 | # Now we see the proportion of individuals that fall into each age bin. Why do you think there are so many eighty-year-olds in the dataset? Have a look at the [documentation for the Age question](https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DEMO_J.htm#RIDAGEYR) and see if you can figure it out. 78 | 79 | # %% [markdown] 80 | # ### Bar vs. 
line plots
81 | #
82 | # The histograms above are an example of *bar plots* where each number is represented by a bar. We could also plot the distribution using a line instead. One reason to do this is that we can make the line a bit *smoother* than the actual data. For example, here are the histogram data from above, plotted as a line:
83 |
84 | # %%
85 | plt.plot(age_density_1year_bins[1][1:], age_density_1year_bins[0])
86 |
87 | # %% [markdown]
88 | # Here we have taken advantage of the fact that the output of our histogram command above contains both the bins (in its [1] position) and the histogram values (in its [0] position). Why do we include `[1:]` after the bins variable? This is because the bins include both the upper and lower edges of the bin, which means that there is one more bin value than there are average values. Adding `[1:]` is equivalent to saying "start with the second bin" which is equivalent to using the top edges of each bin for our X axis.
89 | #
90 | # Now let's plot a smoothed version of the histogram, using the `sns.distplot()` function from the seaborn library.
91 |
92 | # %%
93 | sns.distplot(nhanes_data['AgeInYearsAtScreening'], bins=bins)
94 |
95 | # %% [markdown]
96 | # You can see that the line is now much smoother (less bumpy) than the one above. It generally follows the overall shape of the data pretty closely, but you can also see that it mostly hides the large bump at 80 years. It's always important to keep in mind that anything we do to the data has the potential to distort their message.
97 |
98 | # %% [markdown]
99 | # ## Plots with two variables
100 | #
101 | # Another common use of visualization is to examine the relationship between two variables. For example, let's say that we wanted to plot average height as a function of age in the NHANES dataset. We would first summarize the data to obtain the average height for each age:
102 |
103 | # %%
104 | mean_height_by_age = nhanes_data.groupby('AgeInYearsAtScreening')['StandingHeightCm'].mean()
105 |
106 | # %% [markdown]
107 | # Here we use a method called `.groupby()` along with a built-in method for computing the average of a variable in a dataframe (`.mean()`). This returns a single average height value for all of the individuals in each age group, which we can then plot. While we are at it, we will add descriptive labels to the X and Y axes, which is always a good idea:
108 |
109 | # %%
110 | plt.plot(mean_height_by_age.index, mean_height_by_age)
111 | plt.xlabel('Age at screening')
112 | plt.ylabel('Standing Height (cm)')
113 |
114 | # %% [markdown]
115 | # As expected, people get taller up to about age 18, and then slowly shrink over time. Since we know that men and women differ in their height, we can also plot their average heights separately. We could do this using the matplotlib plotting functions, but it's actually easier to do using the `sns.lineplot()` function from the seaborn library that we imported above. We simply give it the X and Y variables that we want to plot as well as the variable that we want to separate (using different colors), and it does the work for us:
116 |
117 | # %%
118 | sns.lineplot(x='AgeInYearsAtScreening', y='StandingHeightCm', hue='Gender', data=nhanes_data)
119 |
120 |
121 | # %% [markdown]
122 | # You will notice that the lines have shaded areas around them; these are called *confidence intervals*, and you will learn about them later in the course. They basically tell us something about the uncertainty around our estimates of the average.
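# %% [markdown]
# (Aside, added for illustration: it can be useful to see roughly what `sns.lineplot()` is computing
# for us. The sketch below calculates the group averages directly using `.groupby()` and plots them
# with matplotlib; it shows only the mean lines, without the confidence bands that seaborn adds.)

# %%
# average height for each combination of gender and age
mean_height_by_age_gender = nhanes_data.groupby(
    ['Gender', 'AgeInYearsAtScreening'])['StandingHeightCm'].mean()
for gender_value in nhanes_data['Gender'].dropna().unique():
    gender_means = mean_height_by_age_gender.loc[gender_value]
    plt.plot(gender_means.index, gender_means, label=str(gender_value))
plt.xlabel('Age at screening')
plt.ylabel('Standing Height (cm)')
plt.legend()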
123 | 124 | # %% [markdown] 125 | # ## Plotting dispersion 126 | # 127 | # An important job of statistical visualization is to show us the variability, or *dispersion*, of our data. We have already see how to do this using histograms; now let's look at how we can compare distributions. 128 | # 129 | # Let's start with a simple example: Comparing the height of adult men and women in the NHANES sample. One commonly used plot is the *box plot* (sometimes known as a *box and whiskers plot*). This form of plot uses quartiles to give us a sense of spread. The thickest line, somewhere inside the box, represents the *median*. The upper and lower bounds of the box (the *hinges*) are the first and third quartiles (can you use them to approximate the interquartile range?). The lines extending from the hinges are the remaining data points, excluding **outliers**, which are plotted as individual points. 130 | 131 | # %% 132 | adult_nhanes_data = nhanes_data.query('AgeInYearsAtScreening > 17') 133 | sns.boxplot(x='Gender', y='StandingHeightCm', data=adult_nhanes_data) 134 | 135 | # %% [markdown] 136 | # This tells us that the median male is taller than 75% of all of the females in the sample. 137 | # 138 | # Another type of plot that is commonly used is the *violin plot*, which shows the shape of the entire distribution: 139 | 140 | # %% 141 | 142 | sns.violinplot(x='Gender', y='StandingHeightCm', data=adult_nhanes_data) 143 | 144 | # %% [markdown] 145 | # ### Scatter plot 146 | # 147 | # When we have multiple *continuous* variables, we can use points to plot each variable on an axis. This is known as a **scatter plot**. As an example, let's look at the blood pressure readings taken in the NHANES study. Each individual has their blood pressure taken three times. Here we will plot the first reading against the second reading, using a scatter plot. We will also add a line showing where the x axis is equal to the y axis, which makes it easier to see how the two variables are related to each other. 148 | 149 | # %% 150 | sns.scatterplot(x='SystolicBloodPres1StRdgMmHg', 151 | y='SystolicBloodPres2NdRdgMmHg', 152 | data=adult_nhanes_data) 153 | plt.plot([adult_nhanes_data['SystolicBloodPres1StRdgMmHg'].min(), 154 | adult_nhanes_data['SystolicBloodPres1StRdgMmHg'].max()], 155 | [adult_nhanes_data['SystolicBloodPres1StRdgMmHg'].min(), 156 | adult_nhanes_data['SystolicBloodPres1StRdgMmHg'].max()], 157 | color='k') 158 | plt.xlabel('Systolic BP - First reading') 159 | plt.ylabel('Systolic BP - Second reading') 160 | 161 | # %% [markdown] 162 | # Here we can see that the two variables are closely related to one another. We can also see that most of the blue points fall below the black line, which tells us that the second reading is generally somewhat lower than the first reading. 163 | 164 | 165 | # %% [markdown] 166 | # In this section we will recreate a version of [Figure 4.2](https://statsthinking21.github.io/statsthinking21-core-site/data-visualization.html#anatomy-of-a-plot) from Statistical Thinking in the 21st Century. 
167 | 168 | # %% 169 | 170 | oring_data = pd.read_csv('https://raw.githubusercontent.com/statsthinking21/statsthinking21-python/master/notebooks/data/orings.csv', index_col=0) 171 | 172 | ax = sns.lineplot(x='Temperature', y='DamageIndex', data=oring_data, ci=None) 173 | plt.xlabel('Temperature at time of launch') 174 | plt.ylabel('Damage Index') 175 | ax.fill_between([26, 29], 0, 12, 176 | facecolor='red', alpha=0.3) 177 | ax.text(27, 1, 'Forecasted temperature on Jan 28', rotation=90) 178 | plt.show() 179 | 180 | 181 | # %% 182 | -------------------------------------------------------------------------------- /notebooks/04-FittingSimpleModels.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # formats: ipynb,py:percent 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.4.2 10 | # kernelspec: 11 | # display_name: Python 3 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] 17 | # # Fitting simple models 18 | # 19 | # In this chapter we will focus on how to compute the measures of central tendency and variability that were covered in the previous chapter. Most of these can be computed using a built-in Python function, but we will show how to do them manually in order to give some intuition about how they work. First let's load the NHANES data that we will use for our examples. 20 | 21 | # %% 22 | from nhanes.load import load_NHANES_data 23 | nhanes_data = load_NHANES_data() 24 | adult_nhanes_data = nhanes_data.query('AgeInYearsAtScreening > 17') 25 | 26 | # %% [markdown] 27 | # Since we will be analyzing the `StandingHeightCm` variable, we should exclude any observations that are missing this measurement. We will also recode the variable to be called `Height` in order to simplify the coding later. 28 | 29 | # %% 30 | adult_nhanes_data = adult_nhanes_data.dropna(subset=['StandingHeightCm']).rename(columns={'StandingHeightCm': 'Height'}) 31 | 32 | 33 | # %% [markdown] 34 | # ## Mean 35 | # The mean is defined as the sum of values divided by the number of values being summed: 36 | # $$ 37 | # \bar{X} = \frac{\sum_{i=1}^{n}x_i}{n} 38 | # $$ 39 | # Let's say that we want to obtain the mean height for adults in the NHANES database (contained in the data `Height` that we generated above). We would sum the individual heights (using the `.sum()` operator) and then divide by the number of values: 40 | 41 | # %% 42 | adult_nhanes_data['Height'].sum() / adult_nhanes_data['Height'].shape[0] 43 | 44 | # %% [markdown] 45 | # There is, of course, a built-in operator for the data frame called `.mean()` that will compute the mean. 46 | 47 | # %% 48 | adult_nhanes_data['Height'].mean() 49 | 50 | 51 | # %% [markdown] 52 | # ## Median 53 | # The median is the middle value after sorting the entire set of values. First we sort the data in order of their values: 54 | 55 | # %% 56 | height_sorted = adult_nhanes_data['Height'].sort_values() 57 | 58 | 59 | # %% [markdown] 60 | # Next we find the median value. If there is an odd number of values in the list, then this is just the value in the middle, whereas if the number of values is even then we take the average of the two middle values. 
We can determine whether the number of items is even by dividing the length by two and seeing if there is a remainder; we do this using the `%` operator, which is known as the *modulus* and returns the remainder:
61 |
62 | # %%
63 | height_length_mod_2 = height_sorted.shape[0] % 2
64 |
65 | # %% [markdown]
66 | # Here we will test whether the remainder is equal to one; if it is, then we will take the middle value, otherwise we will take the average of the two middle values. We can do this using an if/else structure, which executes different processes depending on which of the arguments are true. Here is a simple example:
67 |
68 | # %%
69 | if 1 > 2:
70 |     print('1 > 2')
71 | else:
72 |     print('1 is not greater than two!')
73 |
74 | # %% [markdown]
75 | # For our example, we can use an if statement to determine how to compute the median, depending on whether there is an odd or even number of data points.
76 |
77 | # %%
78 | import numpy as np
79 | if height_length_mod_2 == 1:
80 |     # odd number of values - take the single midpoint (selected by position)
81 |     midpoint = int(np.floor(height_sorted.shape[0] / 2))
82 |     median = height_sorted.iloc[midpoint]
83 | else:
84 |     # even number of values - need to average the two middle points
85 |     midpoints = [int((height_sorted.shape[0] / 2) - 1),
86 |                  int(height_sorted.shape[0] / 2)]
87 |     median = height_sorted.iloc[midpoints].mean()
88 |
89 |
90 | # %% [markdown]
91 | # There is a lot going on there, so let's unpack it. The first line of the if statement asks whether the remainder is equal to one --- if so, then it executes the lines that are indented below it. Python uses indentation as part of its syntax, so you always need to be very careful about indentation. If the remainder is one, that means that the number of observations is odd, and thus that we can simply take the single middle point. We find its position by dividing the number of observations by two and rounding down (which is what the `np.floor()` function does); we round down rather than up because positions in Python start at zero. Finally, we have to convert this number into an integer using the `int()` function, since we can only use integers to index by position, which is what the `.iloc` operator does.
92 | # If the first test is false --- that is, if the remainder is zero --- then the second section of code (after the `else` statement) will be executed instead. Here we need to find the two midpoints and average them, so we create a new list containing those two points, and then use that list to index our data and take the mean.
93 |
94 |
95 | # %% [markdown]
96 | # ## Mode
97 | # The mode is the most frequent value that occurs in a variable. For example, let's say that we had the following data:
98 |
99 | # %%
100 | import pandas as pd
101 | toy_data = pd.DataFrame({'myvar': ['a', 'a', 'b', 'c']})
102 |
103 | # %% [markdown]
104 | # We can see by eye that the mode is "a" since it occurs more often than the others. 
To find it computationally, let's use the `.value_counts()` operator to find the frequency of each value: 105 | 106 | # %% 107 | myvar_frequencies = toy_data['myvar'].value_counts() 108 | myvar_frequencies 109 | 110 | # %% [markdown] 111 | # Now let's find the highest frequency, using the `.max()` operator: 112 | 113 | # %% 114 | max_frequency = myvar_frequencies.max() 115 | max_frequency 116 | 117 | # %% [markdown] 118 | # Now we can find the values that have the maximum frequency: 119 | 120 | # %% 121 | mode = myvar_frequencies.loc[myvar_frequencies == max_frequency].index.values 122 | 123 | # %% [markdown] 124 | # ## Creating functions 125 | # It is often useful to create our own custom *function* in order to perform a particular action. Let's do that for our mode function: 126 | 127 | 128 | # %% 129 | def my_mode_function(input): 130 | """ 131 | A function to compute the mode. 132 | 133 | Inputs: 134 | ------ 135 | input: a pandas Series 136 | 137 | Outputs: 138 | -------- 139 | mode: an array containing the mode values 140 | """ 141 | 142 | # make sure the input is a pandas series 143 | input = pd.Series(input) 144 | 145 | # compute the frequency distribution 146 | frequencies = input.value_counts() 147 | 148 | # compute the maximum frequency 149 | max_frequency = frequencies.max() 150 | 151 | # find the values matching the maximum frequency (i.e. the mode) 152 | mode = frequencies.loc[ 153 | frequencies == max_frequency].index.values 154 | 155 | return(mode) 156 | 157 | 158 | # %% [markdown] 159 | # Let's look at this one section at a time. 160 | # The first row tells Python to define a new function, called "my_mode_function", which takes in a single variable that will be called "input". This variable only exists inside the function; you can't access it from the outside. 161 | # The next section, surrounded by triple-quotes, is known as a *docstring*, and it provides documentation about our function. It's always a good idea to write a docstring that describes what the function does, what kinds of inputs it expects, and what kind of output it produces. 162 | # The next line converts the input to a particular kind of variable called a pandas *Series*; this is the same kind of variable as a column in a data frame. Including this command allows our function to take in various types of variables (including Series and lists) and treat them as if they were a Series, using the operators that are available such as `.value_counts()`. 163 | # The remaining lines perform the computations that we performed above to compute the mean. 164 | # The final line tells Python to return the value of the mode when the function is called. Let's see this in action: 165 | 166 | 167 | # %% 168 | my_mode_function(['a', 'a', 'b', 'c']) 169 | 170 | 171 | # %% [markdown] 172 | # Let's also make sure that it works properly if there are multiple modes: 173 | 174 | 175 | # %% 176 | my_mode_function(['a', 'a', 'b', 'c', 'c']) 177 | 178 | 179 | # %% [markdown] 180 | # ## Variability 181 | # Let's first compute the *variance*, which is the average squared difference between each value and the mean. 
Let's do this with our cleaned-up version of the height data, but instead of working with the entire dataset, let's take a random sample of 150 individuals: 182 | 183 | # %% 184 | sample_size = 150 185 | height_sample = adult_nhanes_data.sample(sample_size)['Height'] 186 | 187 | # %% [markdown] 188 | # We could have simply entered the number 150 into the sample function, but by first creating a new variable called `sample_size` and setting it to 150, we make it clearer to the reader of the code exactly what this number refers to. It's always good practice to create a new variable rather than typing a number directly into a formula. 189 | # 190 | # To compute the variance, we first need to compute the sum of squared errors from the mean. In Python, we can square a vector using `**2`: 191 | 192 | # %% 193 | sum_of_squared_errors = np.sum((height_sample - height_sample.mean())**2) 194 | 195 | 196 | # %% [markdown] 197 | # Then we divide by N - 1 to get the estimated variance: 198 | 199 | # %% 200 | variance_estimate = sum_of_squared_errors / (height_sample.shape[0] - 1) 201 | variance_estimate 202 | 203 | # %% [markdown] 204 | # We can compare this to the built-in `.var()` operator: 205 | 206 | # %% 207 | height_sample.var() 208 | 209 | # %% [markdown] 210 | # We can get the *standard deviation* by simply taking the square root of the variance: 211 | 212 | # %% 213 | std_dev_estimate = np.sqrt(variance_estimate) 214 | std_dev_estimate 215 | 216 | # %% [markdown] 217 | # Which is the same value obtained using the built-in `.std()` operator: 218 | 219 | # %% 220 | height_sample.std() 221 | 222 | 223 | # %% [markdown] 224 | # ## Z-scores 225 | # A Z-score is obtained by first subtracting the mean and then dividing by the standard deviation of a distribution. Let's do this for the `height_sample` data. 226 | 227 | # %% 228 | mean_height = height_sample.mean() 229 | sd_height = height_sample.std() 230 | 231 | z_height = (height_sample - mean_height) / sd_height 232 | 233 | 234 | # %% [markdown] 235 | # Now let's plot the histogram of Z-scores alongside the histogram for the original values. Matplotlib allows us to create a grid of figures using the `plt.subplot()` function. Let's see this in action: 236 | 237 | # %% 238 | import matplotlib.pyplot as plt 239 | 240 | plt.figure(figsize=(12, 6)) 241 | plt.subplot(1, 2, 1) 242 | plt.hist(height_sample) 243 | 244 | plt.subplot(1, 2, 2) 245 | plt.hist(z_height) 246 | 247 | # %% [markdown] 248 | # You will notice that the shapes of the histograms are exactly the same. We can also see this by plotting the two variables against one another in a scatterplot: 249 | 250 | # %% 251 | plt.scatter(height_sample, z_height) 252 | 253 | # %% [markdown] 254 | # You see here that they fall along a straight line, meaning that they are perfectly related to each other --- the only difference is where they are located on the number line. 
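# %% [markdown]
# As a quick check (a minimal sketch using the `z_height` variable we just created), we can also confirm that the Z-scored values have a mean of approximately zero and a standard deviation of one, which is exactly what the transformation is meant to produce:

# %%
# the mean should be essentially zero (up to floating-point error)
print(z_height.mean())
# the standard deviation should be one
print(z_height.std())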
255 | -------------------------------------------------------------------------------- /notebooks/02-SummarizingData.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # formats: ipynb,py:percent 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.4.2 10 | # kernelspec: 11 | # display_name: Python 3 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] 17 | # # Summarizing Data 18 | # 19 | 20 | # %% [markdown] 21 | # This chapter will introduce you to how to summarize data using data frames in Pandas. 22 | 23 | # %% [markdown] 24 | # ## Working with data frames 25 | 26 | # %% [markdown] 27 | # In the last chapter you were introduced to the concept of a *data frame*, which we will use throughout much of this book. 28 | # In particular, we will use a dataset known as [NHANES](https://www.cdc.gov/nchs/nhanes/index.htm) for several of our examples, so let's load the library that provides us access to the data. 29 | # This is a large dataset collected from a sample of individuals in the United States every two years, which measures many different aspects of their health and lifestyles. 30 | # To access the data, we will use a Python package called [nhanes](https://github.com/poldrack/nhanes) that contains a function to load a cleaned-up version of the dataset. 31 | 32 | # %% 33 | from nhanes.load import load_NHANES_data 34 | nhanes_data = load_NHANES_data() 35 | 36 | # %% [markdown] 37 | # We can use the ``.head()`` method to look inside the data frame: 38 | 39 | # %% 40 | nhanes_data.head() 41 | 42 | # %% [markdown] 43 | # Let's look at the structure of this dataset. We can see the *shape* of the data frame -- that is, the number of rows and columns -- using the ``.shape`` method: 44 | 45 | # %% 46 | nhanes_data.shape 47 | 48 | # %% [markdown] 49 | # We see that the dataset has many more rows than columns. Let's look more closely at what is contained in the rows and columns. To obtain the labels for the rows, we use the ``.index`` operator: 50 | 51 | # %% 52 | nhanes_data.index 53 | 54 | # %% [markdown] 55 | # The index contains a bunch of numbers, each of which refers to one of the individuals in the NHANES data set. In a data frame, the rows always refer to *observations*, by which mean that each row reflects an individual unit of data. In the case of a dataset like NHANES, the observations would usually refer to individual people, though as we will see later, we sometimes want the rows to be even more specific. 56 | # We can also look at the content of the columns, which we can access using the ``.columns`` operator: 57 | 58 | # %% 59 | nhanes_data.columns 60 | 61 | # %% [markdown] 62 | # Each of the columns contains a different *variable* -- that is, a different thing that is measured on each observation. 63 | 64 | 65 | # %% [markdown] 66 | # ### Selecting rows (observations) and columns (variables) 67 | 68 | # %% [markdown] 69 | # We often want to select a subset of rows from a data frame. You saw in the last chapter how we can access specific rows using the ``.loc`` operator. This operator requires us to refer to the row names (that is, the *index*) and column names. 
For example, if we wanted to know the value of the `GeneralHealthCondition` variable for the individual labeled 93707, we could use the following: 70 | 71 | # %% 72 | nhanes_data.loc[93707, 'GeneralHealthCondition'] 73 | 74 | # %% [markdown] 75 | # If there were several variables that we wanted to see for this individual, we could include the names of those variables in a *list*: 76 | 77 | # %% 78 | nhanes_data.loc[93707, ['GeneralHealthCondition', 'Gender', 'AgeInYearsAtScreening']] 79 | 80 | # %% [markdown] 81 | # This shows us the values for each of those variables for this individual. We could also do the same if there were several individuals that we were interested in: 82 | 83 | # %% 84 | nhanes_data.loc[[102951, 102955, 93707], ['GeneralHealthCondition', 'Gender', 'AgeInYearsAtScreening']] 85 | 86 | # %% [markdown] 87 | # ### Missing values 88 | 89 | # %% [markdown] 90 | # You will notice that the `GeneralHealthCondition` variable for the first individual in the previous cell contained *NaN*, which stands for "not a number". This is generally used to denote that the data are missing for this particular observation; perhaps they declined to answer the question, or the interviewer failed to properly record the answer. Missing data are common when we are working with real data. There are many sophisticated ways to deal with missing data in statistics, but for the moment we will just remove observations that have missing data on one of our variables of interest, which we can do using the `.dropna()` operator: 91 | 92 | # %% 93 | my_subset = nhanes_data.loc[[102951, 102955, 93707], ['GeneralHealthCondition', 'Gender', 'AgeInYearsAtScreening']] 94 | my_subset.dropna() 95 | 96 | # %% [markdown] 97 | # This operator removes any rows that have missing values for any of the variables in the data frame. 98 | 99 | 100 | # %% [markdown] 101 | # ### Selecting rows by value 102 | 103 | # %% [markdown] 104 | # Let's say that we want to analyze NHANES data, but only for those individuals who are 50 years of age or older. We can use the `query` operator on a data frame to find rows that match particular conditions: 105 | 106 | # %% 107 | over_50_df = nhanes_data.query('AgeInYearsAtScreening >= 50') 108 | over_50_df.shape 109 | 110 | # %% [markdown] 111 | # This shows that there were 2898 observations that matched our criterion. We can also search for specific values: For example, let's say that we want to find anyone who reported that their general health condition was "Good". This one is a bit tricky, because we are searching for a string of text, which we have to embed in our query, which is also a string of text. Fortunately, there are two different quotation marks that we can use (`'` or `"`) and Python treats them as distinct characters, so we can surround our text within double quotes, inside a string surrounded by single quotes: 112 | 113 | # %% 114 | good_health_df = nhanes_data.query('GeneralHealthCondition == "Good"') 115 | good_health_df.shape 116 | 117 | # %% [markdown] 118 | # ## Creating new variables 119 | 120 | # %% [markdown] 121 | # We can add a new variable to a data frame easily, by simply giving it a new name. Let's say that we wanted to convert the weight value in NHANES (stored in the `WeightKg` variable) from kilograms to pounds. 122 | 123 | # %% 124 | nhanes_data['WeightLbs'] = nhanes_data['WeightKg'] * 2.205 125 | 126 | # %% [markdown] 127 | # This shows another way to refer to a particular variable in a dataframe: simply put its name in square brackets. 
Pandas also has the ability to replace particular values in a variable. First, let's look at the values of the `Gender` variable in the data frame, to see what values it takes. We can see all of the unique values of a variable using the `.unique()` operator: 128 | 129 | # %% 130 | nhanes_data['Gender'].unique() 131 | 132 | # %% [markdown] 133 | # Now let's say that we wanted to recode the `Gender` variable so that instead of "Female" and "Male" its values were "F" and "M". One way to do this would be to use the `.replace()` method on that column, storing the result as a new variable in the data frame: 134 | 135 | 136 | # %% 137 | nhanes_data['GenderMF'] = nhanes_data['Gender'].replace({'Female': 'F', 'Male': 'M'}) 138 | nhanes_data['GenderMF'].unique() 139 | 140 | 141 | # %% [markdown] 142 | # ### Understanding your data 143 | 144 | # %% [markdown] 145 | # Let's say that we want to learn more about the variable labeled `GeneralHealthCondition` in the dataset. We can load some information about that variable using the `open_variable_page()` function from the `nhanes` package: 146 | 147 | # %% 148 | from nhanes.load import open_variable_page 149 | open_variable_page('GeneralHealthCondition') 150 | 151 | # %% [markdown] 152 | # This shows us the question that was asked ("Would you say {your/SP's} health in general is..."). 153 | 154 | # %% [markdown] 155 | # ## Summarizing data using a frequency distribution 156 | 157 | # %% [markdown] 158 | # Let's say that we want to know the frequencies of all of the different answers to the GeneralHealthCondition question. We can do this using the `.value_counts()` method of the data frame: 159 | 160 | # %% 161 | nhanes_data['GeneralHealthCondition'].value_counts() 162 | 163 | # %% [markdown] 164 | # It's usually more helpful to present a *relative frequency distribution*, which shows proportions rather than counts. We can obtain that by simply dividing the frequency distribution by the total number of cases, which we can obtain using the `.sum()` operator: 165 | 166 | # %% 167 | GeneralHealthCondition_frequency_dist = nhanes_data['GeneralHealthCondition'].value_counts() 168 | GeneralHealthCondition_frequency_dist / GeneralHealthCondition_frequency_dist.sum() 169 | 170 | 171 | # %% [markdown] 172 | # ### Data Cleaning 173 | 174 | # %% [markdown] 175 | # When we work with real data, they often have problems that we have to fix before we can analyze them properly. An example is the `GeneralHealthCondition` variable that we worked with in the previous example. You may have noticed that the values of the variable had some extraneous information, which was held over from the way that the question is worded (including "Fair or" and "Poor?"). We can clean these up by replacing the problematic values using the `.replace()` method: 176 | 177 | # %% 178 | nhanes_data['GeneralHealthConditionFixed'] = nhanes_data['GeneralHealthCondition'].replace({'Fair or': 'Fair', 'Poor?': 'Poor'}) 179 | nhanes_data['GeneralHealthConditionFixed'].unique() 180 | 181 | # %% [markdown] 182 | # Now let's look at a more complex example. Let's say that we want to know who is currently a smoker in the NHANES sample. 
If we look more closely at the [details of the smoking questionnaire](https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/SMQ_J.htm), we will see that not all individuals got the same questions; for example, if a person said that they had not smoked more than 100 cigarettes in their life (recorded in the `SmokedAtLeast100CigarettesInLife` variable), then they were not asked the question about whether they currently smoked cigarettes (stored as `DoYouNowSmokeCigarettes`). We can see this in the number of respondents to each question: 183 | 184 | # %% 185 | print(nhanes_data['SmokedAtLeast100CigarettesInLife'].value_counts()) 186 | 187 | # %% 188 | nhanes_data['DoYouNowSmokeCigarettes'].value_counts() 189 | 190 | # %% [markdown] 191 | # Looking at these two variables, it's plain to see that there are many more values included in the first question than in the second. Let's look at bit more closely at the data to see what's going on: 192 | 193 | # %% 194 | nhanes_data[['SmokedAtLeast100CigarettesInLife', 'DoYouNowSmokeCigarettes']].head(10) 195 | 196 | # %% [markdown] 197 | # Here we can see that anyone who said No to the question about having smoked at least 100 cigarettes in their life (which is coded as zero) has a missing value for the question about current smoking (since that question wasn't asked to these individuals). To clean up these data, we need to do two things. First, we need to remove the individuals who have NaN for both questions, then we need to recode the NaN's for the second question for those people who said no on the first question. To do this, let's first create a new data frame that contains just the variables we are interested in: 198 | 199 | # %% 200 | smoking_df = nhanes_data[['SmokedAtLeast100CigarettesInLife', 'DoYouNowSmokeCigarettes']] 201 | smoking_df.shape 202 | 203 | # %% [markdown] 204 | # First let's remove any rows that have NaN in both columns. We can do this using the `.dropna()` method, which has an option that allows us to specify that we only drop rows that are all NaN (by setting `how='all'`): 205 | 206 | # %% 207 | smoking_df = smoking_df.dropna(how='all') 208 | smoking_df.shape 209 | 210 | # %% [markdown] 211 | # Next we need to recode the NaN values for the second question, for those individuals who said no to the first question. We will replace them with the answer 'Not at all'. To do this, we can use the `.loc` operator with a test for the value of the first column: 212 | 213 | # %% 214 | smoking_df.loc[smoking_df['SmokedAtLeast100CigarettesInLife'] == 0, 'DoYouNowSmokeCigarettes'] = 'Not at all' 215 | smoking_df.head() 216 | 217 | # %% [markdown] 218 | # This replaced the NaN values in the second question, but only for those individuals who said no to the first question. Now we can summarize the frequency of smoking across the entire group: 219 | 220 | # %% 221 | smoking_df['DoYouNowSmokeCigarettes'].value_counts() / smoking_df['DoYouNowSmokeCigarettes'].value_counts().sum() 222 | 223 | # %% [markdown] 224 | # In the next chapter you will learn how to visualize data like these using statistical graphs. 
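# %% [markdown]
# As an optional sanity check on the recoding we just performed (this cell is only a sketch, and assumes that the `smoking_df` data frame created above is still in memory), we can cross-tabulate the two smoking questions using the `pd.crosstab()` function. Every person who answered "No" (coded as zero) to the question about smoking 100 cigarettes should now appear in the "Not at all" column:

# %%
import pandas as pd

# rows: answer to the 100-cigarettes question (0 = No, 1 = Yes)
# columns: current smoking status after our recoding
pd.crosstab(smoking_df['SmokedAtLeast100CigarettesInLife'],
            smoking_df['DoYouNowSmokeCigarettes'])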
225 | 226 | -------------------------------------------------------------------------------- /notebooks/10-BayesianStatistics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Bayesian Statistics in Python\n", 8 | "In this chapter we will introduce how to basic Bayesian computations using Python.\n", 9 | "\n", 10 | "## Applying Bayes' theorem: A simple example\n", 11 | "TBD: MOVE TO MULTIPLE TESTING EXAMPLE SO WE CAN USE BINOMIAL LIKELIHOOD\n", 12 | "A person has a cough and flu-like symptoms, and gets a PCR test for COVID-19, which comes back postiive. What is the likelihood that they actually have COVID-19, as opposed to a regular cold or flu? We can use Bayes' theorem to compute this. Let's say that the local rate of symptomatic individuals who actually are infected with COVID-19 is 7.4% (as [reported](https://twitter.com/Bob_Wachter/status/1281792549309386752/photo/1) on July 10, 2020 for San Francisco); thus, our prior probability that someone with symptoms actually has COVID-19 is .074. The RT-PCR test used to identify COVID-19 RNA is highly specific (that is, it very rarelly reports the presence of the virus when it is not present); for our example, we will say that the specificity is 99%. Its sensitivity is not known, but probably is no higher than 90%. \n", 13 | "First let's look at the probability of disease given a single positive test." 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "\n", 23 | "sensitivity = 0.90\n", 24 | "specificity = 0.99\n", 25 | "prior = 0.074\n", 26 | "likelihood = sensitivity # p(test|disease present) \n", 27 | "marginal_likelihood = sensitivity * prior + (1 - specificity) * (1 - prior)\n", 28 | "posterior = (likelihood * prior) / marginal_likelihood\n", 29 | "posterior\n", 30 | "\n", 31 | "\n", 32 | "\n", 33 | "\n", 34 | "\n" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "The high specificity of the test, along with the relatively high base rate of the disease, means that most people who test positive actually have the disease. \n", 42 | "Now let's plot the posterior as a function of the prior. Let's first create a function to compute the posterior, and then apply this with a range of values for the prior." 
43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "\n", 52 | "import numpy as np\n", 53 | "import pandas as pd\n", 54 | "import scipy.stats\n", 55 | "import matplotlib.pyplot as plt\n", 56 | "\n", 57 | "\n", 58 | "def compute_posterior(prior, sensitivity, specificity):\n", 59 | " likelihood = sensitivity # p(test|disease present) \n", 60 | " marginal_likelihood = sensitivity * prior + (1 - specificity) * (1 - prior)\n", 61 | " posterior = (likelihood * prior) / marginal_likelihood\n", 62 | " return(posterior)\n", 63 | "\n", 64 | "\n", 65 | "prior_values = np.arange(0.001, 0.5, 0.001)\n", 66 | "posterior_values = compute_posterior(prior_values, sensitivity, specificity)\n", 67 | "\n", 68 | "plt.plot(prior_values, posterior_values)\n", 69 | "plt.xlabel('prior')\n", 70 | "_ = plt.ylabel('posterior')" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "This figure highlights a very important general point about diagnostic testing: Even when the test has high specificity, if a condition is rare then most positive test results will be false positives.\n", 78 | "\n", 79 | "## Estimating posterior distributions \n", 80 | "In this example we will look at how to estimate entire posterior distributions. \n", 81 | "We will implement the [drug testing example](https://statsthinking21.github.io/statsthinking21-core-site/bayesian-statistics.html#estimating-posterior-distributions) from the book. In that example, we administered a drug to 100 people, and found that 64 of them responded positively to the drug. What we want to estimate is the probability distribution for the proportion of responders, given the data. For simplicity we started with a uniform prior; that is, all proprtions of responding are equally likely to begin with. In addition, we will use a discrete probability distribution; that is, we will estimate the posterior probabiilty for each particular proportion of responders, in steps of 0.01. This greatly simplifies the math and still retains the main idea." 
82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "#+\n", 91 | "num_responders = 64\n", 92 | "num_tested = 100\n", 93 | "\n", 94 | "bayes_df = pd.DataFrame({'proportion': np.arange(0.0, 1.01, 0.01)})\n", 95 | "\n", 96 | "# compute the binomial likelihood of the observed data for each\n", 97 | "# possible value of proportion\n", 98 | "bayes_df['likelihood'] = scipy.stats.binom.pmf(num_responders,\n", 99 | " num_tested,\n", 100 | " bayes_df['proportion'])\n", 101 | "# The prior is equal for all possible values\n", 102 | "bayes_df['prior'] = 1 / bayes_df.shape[0]\n", 103 | "\n", 104 | "# compute the marginal likelihood by adding up the likelihood of each possible proportion times its prior probability.\n", 105 | "\n", 106 | "marginal_likelihood = (bayes_df['likelihood'] * bayes_df['prior']).sum()\n", 107 | "\n", 108 | "bayes_df['posterior'] = (bayes_df['likelihood'] * bayes_df['prior']) / marginal_likelihood\n", 109 | "\n", 110 | "# plot the likelihood, prior, and posterior\n", 111 | "\n", 112 | "plt.plot(bayes_df['proportion'], bayes_df['likelihood'], label='likelihood')\n", 113 | "plt.plot(bayes_df['proportion'], bayes_df['prior'], label='prior')\n", 114 | "plt.plot(bayes_df['proportion'], bayes_df['posterior'],\n", 115 | " 'k--', label='posterior')\n", 116 | "\n", 117 | "plt.legend()\n", 118 | "\n", 119 | "#-" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "The plot shows that the posterior and likelihood are virtually identical, which is due to the fact that the prior is uniform across all possible values. Now let's look at a case where the prior is not uniform. Let's say that we now run a larger study of 1000 people with the same treatment, and we find that 312 of the 1000 individuals respond to the treatment. In this case, we can use the posterior from the earlier study of 100 people as the prior for our new study. This is what we sometimes refer to as *Bayesian updating*." 
127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "#+\n", 136 | "num_responders = 312\n", 137 | "num_tested = 1000\n", 138 | "\n", 139 | "# copy the posterior from the previous analysis and rename it as the prior\n", 140 | "\n", 141 | "study2_df = bayes_df[['proportion', 'posterior']].rename(columns={'posterior': 'prior'})\n", 142 | "\n", 143 | "# compute the binomial likelihood of the observed data for each\n", 144 | "# possible value of proportion\n", 145 | "\n", 146 | "study2_df['likelihood'] = scipy.stats.binom.pmf(num_responders,\n", 147 | " num_tested,\n", 148 | " study2_df['proportion'])\n", 149 | "\n", 150 | "# compute the marginal likelihood by adding up the likelihood of each possible proportion times its prior probability.\n", 151 | "\n", 152 | "marginal_likelihood = (study2_df['likelihood'] * study2_df['prior']).sum()\n", 153 | "\n", 154 | "study2_df['posterior'] = (study2_df['likelihood'] * study2_df['prior']) / marginal_likelihood\n", 155 | "\n", 156 | "# plot the likelihood, prior, and posterior\n", 157 | "\n", 158 | "plt.plot(study2_df['proportion'], study2_df['likelihood'], label='likelihood')\n", 159 | "plt.plot(study2_df['proportion'], study2_df['prior'], label='prior')\n", 160 | "plt.plot(study2_df['proportion'], study2_df['posterior'],\n", 161 | " 'k--', label='posterior')\n", 162 | "\n", 163 | "plt.legend()\n", 164 | "\n", 165 | "#-" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "Here we see two important things. First, we see that the prior is substantially wider than the likelihood, which occurs because there is much more data going into the likelihood (1000 data points) compared to the prior (100 data points), and more data reduces our uncertainty. Second, we see that the posterior is much closer to the value observed for the second study than for the first, which occurs for the same reason --- we put greater weight on the estimate that is more precise due to a larger sample. \n", 173 | "\n", 174 | "## Bayes factors\n", 175 | "There are no convenient off-the-shelf tools for estimating Bayes factors using Python, so we will use the `rpy2` package to access the `BayesFactor` library in R. Let's compute a Bayes factor for a T-test comparing the amount of reported alcohol computing between smokers versus non-smokers. First, let's set up the NHANES data and collect a sample of 150 smokers and 150 nonsmokers." 
176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": { 182 | "lines_to_next_cell": 2 183 | }, 184 | "outputs": [], 185 | "source": [ 186 | "#+\n", 187 | "from nhanes.load import load_NHANES_data\n", 188 | "nhanes_data = load_NHANES_data()\n", 189 | "adult_nhanes_data = nhanes_data.query('AgeInYearsAtScreening > 17')\n", 190 | "rseed = 1\n", 191 | "\n", 192 | "# clean up smoking variables\n", 193 | "adult_nhanes_data.loc[adult_nhanes_data['SmokedAtLeast100CigarettesInLife'] == 0, 'DoYouNowSmokeCigarettes'] = 'Not at all'\n", 194 | "adult_nhanes_data.loc[:, 'SmokeNow'] = adult_nhanes_data['DoYouNowSmokeCigarettes'] != 'Not at all'\n", 195 | "\n", 196 | "# Create average alcohol consumption variable between the two dietary recalls\n", 197 | "adult_nhanes_data.loc[:, 'AvgAlcohol'] = adult_nhanes_data[['AlcoholGm_DR1TOT', 'AlcoholGm_DR2TOT']].mean(1)\n", 198 | "adult_nhanes_data = adult_nhanes_data.dropna(subset=['AvgAlcohol'])\n", 199 | "\n", 200 | "sample_size_per_group = 150\n", 201 | "\n", 202 | "nonsmoker_sample = adult_nhanes_data.query('SmokeNow == False').sample(sample_size_per_group, random_state=rseed)[['SmokeNow', 'AvgAlcohol']]\n", 203 | "smoker_sample = adult_nhanes_data.query('SmokeNow == True').sample(sample_size_per_group, random_state=rseed)[['SmokeNow', 'AvgAlcohol']]\n", 204 | "\n", 205 | "full_sample = pd.concat((nonsmoker_sample, smoker_sample))\n", 206 | "full_sample.loc[:, 'SmokeNow'] = full_sample['SmokeNow'].astype('int')\n", 207 | "full_sample.groupby('SmokeNow').mean()\n", 208 | "#-" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "Now let's use functions from R to perform a standard t-test as well as compute a Bayes Factor for this comparison. " 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": { 222 | "lines_to_next_cell": 0 223 | }, 224 | "outputs": [], 225 | "source": [ 226 | "#+\n", 227 | "\n", 228 | "# import the necessary functions from rpy2\n", 229 | "import rpy2.robjects as robjects\n", 230 | "from rpy2.robjects import r, pandas2ri\n", 231 | "from rpy2.robjects.packages import importr\n", 232 | "pandas2ri.activate()\n", 233 | "\n", 234 | "# import the BayesFactor package\n", 235 | "BayesFactor = importr('BayesFactor')\n", 236 | "\n", 237 | "# import the data frames into the R workspace\n", 238 | "robjects.globalenv[\"smoker_sample\"] = smoker_sample\n", 239 | "robjects.globalenv[\"nonsmoker_sample\"] = nonsmoker_sample\n", 240 | "\n", 241 | "# perform the standard t-test\n", 242 | "ttest_output = r('print(t.test(smoker_sample$AvgAlcohol, nonsmoker_sample$AvgAlcohol, alternative=\"greater\"))')\n", 243 | "\n", 244 | "# compute the Bayes factor\n", 245 | "r('bf = ttestBF(y=nonsmoker_sample$AvgAlcohol, x=smoker_sample$AvgAlcohol, nullInterval = c(0, Inf))')\n", 246 | "r('print(bf[1]/bf[2])')\n", 247 | "\n", 248 | "#-" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "This shows that the difference between these groups is significant, and the Bayes factor suggests fairly strong evidence for a difference." 
256 | ] 257 | } 258 | ], 259 | "metadata": { 260 | "jupytext": { 261 | "formats": "ipynb,py:percent" 262 | }, 263 | "kernelspec": { 264 | "display_name": "Python 3", 265 | "language": "python", 266 | "name": "python3" 267 | } 268 | }, 269 | "nbformat": 4, 270 | "nbformat_minor": 4 271 | } 272 | -------------------------------------------------------------------------------- /notebooks/13-GeneralLinearModel.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # formats: ipynb,py:percent 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.15.2 10 | # kernelspec: 11 | # display_name: Python 3 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] 17 | # # The General Linear Model 18 | # In this chapter we will explore how to fit general linear models in Python. We will focus on the tools provided by the `statsmodels` package. 19 | 20 | # %% 21 | from nhanes.load import load_NHANES_data 22 | nhanes_data = load_NHANES_data() 23 | adult_nhanes_data = nhanes_data.query('AgeInYearsAtScreening > 17') 24 | 25 | 26 | # %% [markdown] 27 | # ## Linear regression 28 | # To perform linear regression in Python, we use the `OLS()` function (which stands for *ordinary least squares*) from the `statsmodels` package. Let's generate some simulated data and use this function to compute the linear regression solution. 29 | 30 | # %% 31 | 32 | import numpy as np 33 | import pandas as pd 34 | import matplotlib.pyplot as plt 35 | 36 | 37 | def generate_linear_data(slope, intercept, 38 | noise_sd=1, x=None, 39 | npoints=100, seed=None): 40 | """ 41 | generate data with a given slope and intercept 42 | and add normally distributed noise 43 | 44 | if x is passed as an argument then a given x will be used, 45 | otherwise it will be generated randomly 46 | 47 | Returns: 48 | -------- 49 | a pandas data frame with variables x and y 50 | """ 51 | if seed is not None: 52 | np.random.seed(seed) 53 | if x is None: 54 | x = np.random.randn(npoints) 55 | 56 | y = x * slope + intercept + np.random.randn(x.shape[0]) * noise_sd 57 | return(pd.DataFrame({'x': x, 'y': y})) 58 | 59 | 60 | slope = 1 61 | intercept = 10 62 | noise_sd = 1 63 | simulated_data = generate_linear_data(slope, intercept, noise_sd, seed=1) 64 | 65 | plt.scatter(simulated_data['x'], simulated_data['y']) 66 | 67 | 68 | # %% [markdown] 69 | # We can then perform linear regression on these data using the `ols` function. This function doesn't automatically include an intercept in its model, so we need to add one to the design. Fitting the model using this function is a two-step process. First, we set up the model and store it to a variable (which we will call `ols_model`). Then, we actually fit the model, which generates the results that we store to a different variable called `ols_results`, and view a summary using the `.summary()` method of the results variable. 
70 | 71 | # %% 72 | from statsmodels.formula.api import ols 73 | 74 | ols_model = ols(formula='y ~ x + 1', data=simulated_data) 75 | ols_result = ols_model.fit() 76 | ols_result.summary() 77 | 78 | # %% [markdown] 79 | # We should see three things in these results: 80 | # 81 | # * The estimate of the Intercept in the model should be very close to the intercept that we specified 82 | # * The estimate for the x parameter should be very close to the slope that we specified 83 | # * The residual standard deviation should be roughly similar to the noise standard deviation that we specified. The summary doesn't report the residual standard deviation directly but we can compute it using the residuals that are stored in the `.resid` element in the result output: 84 | 85 | # %% 86 | ols_result.resid.std() 87 | 88 | 89 | # %% [markdown] 90 | # ## Model criticism and diagnostics 91 | # Once we have fitted the model, we want to look at some diagnostics to determine whether the model is actually fitting properly. 92 | # The first thing to examine is whether the residuals are (at least roughly) normally distributed. We can do this using a Q-Q plot: 93 | 94 | # %% 95 | import seaborn as sns 96 | import scipy.stats 97 | 98 | _ = scipy.stats.probplot(ols_result.resid, plot=sns.mpl.pyplot) 99 | 100 | # %% [markdown] 101 | # This looks pretty good, in the sense that the residual data points fall very close to the unit line. This is not surprising, since we generated the data with normally distributed noise. We should also plot the predicted (or *fitted*) values against the residuals, to make sure that the model doesn't work systematically better for some predicted values than others. 102 | 103 | # %% 104 | plt.scatter(ols_result.fittedvalues, ols_result.resid) 105 | plt.xlabel('Fitted value') 106 | plt.ylabel('Residual') 107 | 108 | # %% [markdown] 109 | # As expected, we see no clear relationship. 110 | # 111 | # ## Examples of problematic model fit 112 | # Let's say that there was another variable at play in this dataset, which we were not aware of. This variable causes some of the cases to have much larger values than others, in a way that is unrelated to the X variable. We create this variable by testing whether each observation's position in the index falls in the first half of the dataset (which gives us ones for the first half of the cases and zeros for the rest), and then multiply it by the desired effect size: 113 | 114 | # %% 115 | simulated_data.loc[:, 'x2'] = (simulated_data.index < (simulated_data.shape[0] / 2)).astype('int') 116 | hidden_effect_size = 10 117 | simulated_data.loc[:, 'y2'] = simulated_data['y'] + simulated_data['x2'] * hidden_effect_size 118 | 119 | # %% [markdown] 120 | # Now we fit the model again, and examine the residuals: 121 | # 122 | 123 | # %% 124 | ols_model2 = ols(formula='y2 ~ x + 1', data=simulated_data) 125 | ols_result2 = ols_model2.fit() 126 | 127 | plt.figure(figsize=(12, 6)) 128 | plt.subplot(1, 2, 1) 129 | scipy.stats.probplot(ols_result2.resid, plot=sns.mpl.pyplot) 130 | 131 | plt.subplot(1, 2, 2) 132 | plt.scatter(ols_result2.fittedvalues, ols_result2.resid) 133 | plt.xlabel('Fitted value') 134 | plt.ylabel('Residual') 135 | 136 | # %% [markdown] 137 | # The lack of normality is clear from the Q-Q plot, and we can also see that there is obvious structure in the residuals. 138 | # 139 | # Let's look at another potential problem, in which the y variable is nonlinearly related to the X variable. 
We can create these data by squaring the X variable when we generate the Y variable: 140 | 141 | # %% 142 | noise_sd = 0.1 143 | simulated_data['y3'] = (simulated_data['x']**2) * slope + intercept + np.random.randn(simulated_data.shape[0]) * noise_sd 144 | 145 | plt.scatter(simulated_data['x'], simulated_data['y3']) 146 | 147 | # %% 148 | ols_model3 = ols(formula='y3 ~ x + 1', data=simulated_data) 149 | ols_result3 = ols_model3.fit() 150 | ols_result3.summary() 151 | 152 | # %% [markdown] 153 | # Now we see that there is no significant linear relationship between $X$ and Y. But if we look at the residuals the problem with the model becomes clear: 154 | 155 | # %% 156 | plt.figure(figsize=(12, 6)) 157 | plt.subplot(1, 2, 1) 158 | scipy.stats.probplot(ols_result3.resid, plot=sns.mpl.pyplot) 159 | plt.subplot(1, 2, 2) 160 | plt.scatter(ols_result3.fittedvalues, ols_result3.resid) 161 | plt.xlabel('Fitted value') 162 | plt.ylabel('Residual') 163 | 164 | # %% [markdown] 165 | # In this case we can see the clearly nonlinear relationship between the predicted and residual values, as well as the clear lack of normality in the residuals. 166 | # 167 | # As we noted in the previous chapter, the "linear" in the general linear model doesn't refer to the shape of the response, but instead refers to the fact that the model is linear in its parameters --- that is, the predictors in the model only get multiplied by the parameters (e.g., rather than being raised to a power of the parameter). Here is how we would build a model that could account for the nonlinear relationship, by using `x**2` in the model: 168 | 169 | # %% 170 | simulated_data.loc[:, 'x_squared'] = simulated_data['x'] ** 2 171 | ols_model4 = ols(formula='y3 ~ x_squared + 1', data=simulated_data) 172 | ols_result4 = ols_model4.fit() 173 | ols_result4.summary() 174 | 175 | # %% [markdown] 176 | # Now we see that the effect of $X^2$ is significant, and if we look at the residual plot we should see that things look much better: 177 | 178 | # %% 179 | 180 | plt.figure(figsize=(12, 6)) 181 | plt.subplot(1, 2, 1) 182 | scipy.stats.probplot(ols_result4.resid, plot=sns.mpl.pyplot) 183 | 184 | plt.subplot(1, 2, 2) 185 | plt.scatter(ols_result4.fittedvalues, ols_result4.resid) 186 | plt.xlabel('Fitted value') 187 | plt.ylabel('Residual') 188 | 189 | # %% [markdown] 190 | # Not perfect, but much better than before! 191 | # 192 | # ## Extending regression to binary outcomes. 193 | # 194 | 195 | # %% 196 | from statsmodels.formula.api import logit 197 | 198 | diabetes_df = adult_nhanes_data.query( 199 | 'DoctorToldYouHaveDiabetes != "Borderline"').dropna( 200 | subset=['DoctorToldYouHaveDiabetes', 'AgeInYearsAtScreening', 'BodyMassIndexKgm2']).rename( 201 | columns={'DoctorToldYouHaveDiabetes': 'Diabetes', 'AgeInYearsAtScreening': 'Age', 'BodyMassIndexKgm2': 'BMI'}) 202 | diabetes_df.loc[:, 'Diabetes'] = diabetes_df['Diabetes'].astype('int') 203 | 204 | 205 | # %% [markdown] 206 | # Now we would like to build a model that allows us to predict who has diabetes, based on their age and Body Mass Index (BMI). However, you may have noticed that the Diabetes variable is a binary variable; because linear regression assumes that the residuals from the model will be normally distributed, and the binary nature of the data will violate this, we instead need to use a different kind of model, known as a *logistic regression* model, which is built to deal with binary outcomes. 
We can fit this model using the `logit()` function: 207 | 208 | # %% 209 | 210 | logitfit = logit(formula = 'Diabetes ~ Age + BMI', data = diabetes_df).fit(disp=0) 211 | 212 | logitfit.summary() 213 | 214 | 215 | # %% [markdown] 216 | # This looks very similar to the output from the `ols()` function, and it shows us that there is a significant relationship between the age, weight, and diabetes. The model provides us with a predicted probability that each individual will have diabetes; if this is greater than 0.5, then that means that the model predicts that the individual is more likely than not to have diabetes. 217 | # We can start by simply comparing those predictions to the actual outcomes. 218 | 219 | # %% 220 | 221 | diabetes_df.loc[:, 'LogitPrediction'] = (logitfit.predict() > 0.5).astype('int') 222 | 223 | pd.crosstab(diabetes_df['Diabetes'], diabetes_df['LogitPrediction']) 224 | 225 | # %% [markdown] 226 | # This table shows that the model did somewhat well, in that it labeled most non-diabetic people as non-diabetic, and most diabetic people as diabetic. However, it also made a lot of mistakes, mislabeling nearly half of all diabetic people as non-diabetic. 227 | # 228 | # We would often like a single number that tells us how good our prediction is. We could simply ask how many of our predictions are correct on average: 229 | 230 | # %% 231 | (diabetes_df['LogitPrediction'] == diabetes_df['Diabetes']).mean() 232 | 233 | # %% [markdown] 234 | # This tells us that we are doing fairly well at prediction, with over 80% accuracy. However, this measure is problematic, because most people in the sample don't have diabetes. This means that we could get relatively high accuracy if we simply said that no one has diabetes: 235 | 236 | # %% 237 | (np.zeros(diabetes_df.shape[0]) == diabetes_df['Diabetes']).mean() 238 | 239 | # %% [markdown] 240 | # One commonly used value when we have a graded prediction (as we do here, with the probabiilty that is predicted by the model) is called the *area under the receiver operating characteristic* or *AUROC*. This is a number that ranges from zero to one, where 0.5 means that we are guessing, and one means that our predictions are perfect. Let's see what that comes out to for this dataset, using the `roc_auc_score` from the [scikit-learn](https://scikit-learn.org/stable/) package: 241 | 242 | # %% 243 | from sklearn.metrics import roc_auc_score 244 | rocscore = roc_auc_score(diabetes_df['Diabetes'], logitfit.predict()) 245 | rocscore 246 | 247 | # %% [markdown] 248 | # Our model performs relatively well according to this score. What if we wanted to know whether this is better than chance? One option would be to create a null model, in which we purposely break the relationship between our variables. We could then ask how likely our observed score would be if there is no true relationship. 
249 | 250 | # %% 251 | from sklearn.utils import shuffle 252 | 253 | shuffled_df = diabetes_df.copy() 254 | 255 | num_runs = 1000 256 | 257 | roc_scores = pd.DataFrame({'auc': np.zeros(num_runs)}) 258 | 259 | for simulation_run in range(num_runs): 260 | # shuffle the diabetes labels in order to break the relationship 261 | shuffled_df.loc[:, 'Diabetes'] = shuffle(shuffled_df['Diabetes'].values) 262 | randomfit = logit(formula = 'Diabetes ~ Age + BMI', data = shuffled_df).fit(disp=0) 263 | roc_scores.loc[simulation_run, 'auc'] = roc_auc_score(shuffled_df['Diabetes'], randomfit.predict()) 264 | 265 | pvalue = (100 - scipy.stats.percentileofscore(roc_scores['auc'], rocscore))/100 266 | pvalue 267 | 268 | # %% [markdown] 269 | # This shows us that our observed score is higher than all of 1000 scores obtained using random permutations. Thus, we can conclude that our accuracy is greater than chance. However, this doesn't tell us how well we can predict whether a *new* individual will have diabetes. This is what we turn to next. 270 | # 271 | # ## Cross-validation 272 | # 273 | # Cross-validation is a powerful technique that allows us to estimate how well our results will generalize to a new dataset. Here we will build our own crossvalidation code to see how it works, continuing the logistic regression example from the previous section. 274 | # In cross-validation, we want to split the data into several subsets and then iteratively train the model while leaving out each subset (which we usually call *folds*) and then test the model on that held-out fold. 275 | # We can use one of the tools from the [scikit-learn](https://scikit-learn.org/stable/) package to create our cross-validation folds for us. Let's start by using 10-fold crossvalidation, in which we split the data into 10 parts, and the fit the model while holding out one of those parts and then testing it on the held-out data. 276 | 277 | # %% 278 | from sklearn.model_selection import KFold 279 | 280 | kf = KFold(n_splits=10, shuffle=True) 281 | 282 | diabetes_df['Predicted'] = np.nan 283 | 284 | for train_index, test_index in kf.split(diabetes_df): 285 | train_data = diabetes_df.iloc[train_index, :] 286 | test_data = diabetes_df.iloc[test_index, :] 287 | model = logit(formula = 'Diabetes ~ Age + BMI', data = train_data) 288 | trainfit = model.fit(disp=0) 289 | diabetes_df['Predicted'].iloc[list(test_index)] = trainfit.predict( 290 | test_data[['Age', 'BMI']]) 291 | 292 | print(pd.crosstab(diabetes_df['Diabetes'], diabetes_df['Predicted']>0.5)) 293 | 294 | roc_auc_score(diabetes_df['Diabetes'], diabetes_df['Predicted']) 295 | 296 | 297 | # %% [markdown] 298 | # This result shows that our model is able to generalize to new individuals relatively well --- in fact, almost as well as the original model. This is because our sample size is very large; with smaller samples, the generalization performance is usually much less using crossvalidation than using the full sample. 
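# %% [markdown]
# As an aside, scikit-learn also provides built-in helpers that wrap this whole procedure. The following cell is just a sketch of an alternative approach rather than part of the original analysis: it refits the same Age + BMI model using scikit-learn's `LogisticRegression` and obtains cross-validated predicted probabilities with `cross_val_predict`, which should yield an AUROC similar to the one we computed by hand above. Note that scikit-learn applies a small amount of regularization by default, so its coefficients will not be exactly identical to the statsmodels fit.

# %%
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict

# predictors and outcome from the data frame created above
X = diabetes_df[['Age', 'BMI']]
y = diabetes_df['Diabetes'].astype('int')

# 10-fold cross-validated probabilities for the positive class
cv_probs = cross_val_predict(
    LogisticRegression(max_iter=1000), X, y,
    cv=10, method='predict_proba')[:, 1]

roc_auc_score(y, cv_probs)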
299 | -------------------------------------------------------------------------------- /notebooks/06-Sampling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "MOg8ZuPL7SjT" 7 | }, 8 | "source": [ 9 | "# Sampling\n", 10 | "In this chapter we will learn how to use Python to understand sampling and sampling error.\n", 11 | "\n", 12 | "## Sampling error\n", 13 | "Here we will repeatedly sample from the NHANES Height variable in order to obtain the sampling distribution of the mean. First let's load the data and clean them up." 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": { 20 | "lines_to_next_cell": 2, 21 | "id": "FEFht0qo7SjV" 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "! pip install nhanes\n", 26 | "from nhanes.load import load_NHANES_data\n", 27 | "nhanes_data = load_NHANES_data()\n", 28 | "adult_nhanes_data = nhanes_data.query('AgeInYearsAtScreening > 17')\n", 29 | "adult_nhanes_data = adult_nhanes_data.dropna(subset=['StandingHeightCm']).rename(columns={'StandingHeightCm': 'Height'})" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "source": [ 35 | "Now let's draw a sample of 50 individuals from the dataset, and calculate its mean.\n", 36 | "Try to execude the next cell repeatedly. What do you see?" 37 | ], 38 | "metadata": { 39 | "id": "t_pKb6uq7qsX" 40 | } 41 | }, 42 | { 43 | "cell_type": "code", 44 | "source": [ 45 | "sample_size = 50\n", 46 | "sample = adult_nhanes_data.sample(sample_size)\n", 47 | "print('Sample mean:', sample['Height'].mean())\n", 48 | "print('Sample standard deviation:', sample['Height'].std())" 49 | ], 50 | "metadata": { 51 | "id": "FN_DN2Lo7qCb" 52 | }, 53 | "execution_count": null, 54 | "outputs": [] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": { 59 | "id": "Rak5pDws7SjW" 60 | }, 61 | "source": [ 62 | "Now let's repeatedly sample 50 individuals from the dataset, compute the mean, and store the resulting values. For this we are going to use a *for loop*, which allows us to repeatedly perform a particular set of actions." 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": { 69 | "id": "s_gDIauW7SjW" 70 | }, 71 | "outputs": [], 72 | "source": [ 73 | "#+\n", 74 | "sample_size = 50\n", 75 | "num_samples = 5000\n", 76 | "\n", 77 | "import pandas as pd\n", 78 | "import numpy as np\n", 79 | "\n", 80 | "# set up a variable to store the result\n", 81 | "sampling_results = pd.DataFrame({'mean': np.zeros(num_samples)})\n", 82 | "print('An empty data frame to be filled with sampling means:')\n", 83 | "print(sampling_results)\n", 84 | "for sample_num in range(num_samples):\n", 85 | " sample = adult_nhanes_data.sample(sample_size)\n", 86 | " sampling_results.loc[sample_num, 'mean'] = sample['Height'].mean()\n", 87 | "#-\n", 88 | "print('Means of 5000 samples:')\n", 89 | "print(sampling_results)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": { 95 | "id": "kNLpg7uL7SjW" 96 | }, 97 | "source": [ 98 | "Now let's plot the sampling distribution. We will also overlay the sampling distribution of the mean predicted on the basis of the population mean and standard deviation, to show that it properly describes the actual sampling distribution. We also place a vertical line at the population mean." 
99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": { 105 | "id": "cfWIzi1j7SjW" 106 | }, 107 | "outputs": [], 108 | "source": [ 109 | "#+\n", 110 | "import matplotlib.pyplot as plt\n", 111 | "import numpy as np\n", 112 | "import scipy.stats\n", 113 | "import seaborn as sns\n", 114 | "\n", 115 | "hist = plt.hist(sampling_results['mean'], 100, density=True)\n", 116 | "# hist[0] contains the histogram data\n", 117 | "# we need to use the maximum of those data to set\n", 118 | "# the height of the vertical line that shows the mean\n", 119 | "plt.axvline(x=adult_nhanes_data['Height'].mean(),\n", 120 | " ymax=1, color='k')\n", 121 | "\n", 122 | "# draw the normal distribution with same mean and standard deviation\n", 123 | "# as the sampling distribution\n", 124 | "hist_bin_min = np.min(hist[1])\n", 125 | "hist_bin_max = np.max(hist[1])\n", 126 | "step_size = 0.01\n", 127 | "x_values = np.arange(hist_bin_min, hist_bin_max, step_size)\n", 128 | "normal_values = scipy.stats.norm.pdf(\n", 129 | " x_values,\n", 130 | " loc=sampling_results['mean'].mean(),\n", 131 | " scale=sampling_results['mean'].std())\n", 132 | "plt.plot(x_values, normal_values, color='r')\n", 133 | "#+\n", 134 | "print('standard deviation of the sample means:', sampling_results['mean'].std())" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "source": [ 140 | "Now, can you redo the simulation of sampling above, but make the following changes each time?\n", 141 | "\n", 142 | "- Changing the sample size to 5 or 500. What difference do you observe in the distribution of sample means?\n", 143 | "\n", 144 | "- Changing the number of times to draw the samples to 50,000. Does the histogram appear closer to a normal distribution?" 145 | ], 146 | "metadata": { 147 | "id": "p5J5iklPDqhu" 148 | } 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": { 153 | "id": "NCHSLEpH7SjW" 154 | }, 155 | "source": [ 156 | "## Central limit theorem\n", 157 | "The central limit theorem tells us that the sampling distribution of the mean becomes normal as the sample size grows. Let's test this by sampling a clearly non-normal variable and look at the normality of the results using a Q-Q plot. For example, let's look at the variable that represents annual family income. This variable is oddly distributed:" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": { 164 | "lines_to_next_cell": 2, 165 | "id": "IWfLWmCo7SjX" 166 | }, 167 | "outputs": [], 168 | "source": [ 169 | "plt.hist(adult_nhanes_data['AnnualFamilyIncome'])\n", 170 | "plt.show()" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": { 176 | "lines_to_next_cell": 2, 177 | "id": "MpwtW2qq7SjX" 178 | }, 179 | "source": [ 180 | "This odd distribution comes in part from the how the variable is coded, as shown [here](https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DEMO_J.htm#INDFMIN2). Let's resample this variable 5000 times, compute the mean, and examine the distribution. 
To do this, we will create a function that resamples and returns the mean:" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": { 187 | "lines_to_next_cell": 2, 188 | "id": "TsMEGZm07SjX" 189 | }, 190 | "outputs": [], 191 | "source": [ 192 | "def sample_and_return_mean(df, variable_name,\n", 193 | " sample_size=250, num_samples=5000):\n", 194 | " \"\"\"\n", 195 | " repeatedly take samples from a particular variable in a data frame\n", 196 | " and compute the mean\n", 197 | "\n", 198 | " Parameters:\n", 199 | " -----------\n", 200 | " df: data frame containing the data\n", 201 | " variable_name: the name of the variable to be analyzed\n", 202 | " sample_size: the number of observations to sample each time\n", 203 | " num_samples: the number of samples to take\n", 204 | "\n", 205 | " Returns:\n", 206 | " --------\n", 207 | " sampling_distribution: data frame containing the means\n", 208 | " \"\"\"\n", 209 | " sampling_distribution = pd.DataFrame({'mean': np.zeros(num_samples)})\n", 210 | " for sample_number in range(num_samples):\n", 211 | " sample_df = df.sample(sample_size)\n", 212 | " sampling_distribution.loc[sample_number, 'mean'] = sample_df[variable_name].mean()\n", 213 | " return(sampling_distribution)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": { 219 | "lines_to_next_cell": 2, 220 | "id": "os---mao7SjX" 221 | }, 222 | "source": [ 223 | "Now, using this function, let's compute the sampling distribution for the annual family income variable and plot its histogram." 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": { 230 | "lines_to_next_cell": 2, 231 | "id": "VYUT9Zo97SjX" 232 | }, 233 | "outputs": [], 234 | "source": [ 235 | "adult_income_data = adult_nhanes_data.dropna(subset=['AnnualFamilyIncome'])\n", 236 | "family_income_sampling_dist = sample_and_return_mean(adult_income_data, 'AnnualFamilyIncome')\n", 237 | "_ = plt.hist(family_income_sampling_dist['mean'], 100)\n", 238 | "plt.show()" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": { 244 | "id": "O3FH7bGx7SjX" 245 | }, 246 | "source": [ 247 | "This distribution looks nearly normal. We can also use a quantile-quantile, or \"Q-Q\" plot, to examine this. \n", 248 | "\n", 249 | "Quantile means the value below which certain percentage of all the scores are distributed. 5 percentile means 5% of the score is below this value. If two distributions are of the same shape, then their corresponding percentiles should form a linear relationship.\n", 250 | "\n", 251 | "We will plot two Q-Q plots; on the left we plot one for the original data, and on the right we plot one for the sampling distribution of the mean." 
252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": { 258 | "id": "UtXksTcP7SjX" 259 | }, 260 | "outputs": [], 261 | "source": [ 262 | "\n", 263 | "plt.figure(figsize=(12, 6))\n", 264 | "plt.subplot(1, 2, 1)\n", 265 | "scipy.stats.probplot(adult_income_data['AnnualFamilyIncome'], plot=sns.mpl.pyplot)\n", 266 | "plt.title('Original data')\n", 267 | "\n", 268 | "plt.subplot(1, 2, 2)\n", 269 | "scipy.stats.probplot(family_income_sampling_dist['mean'], plot=sns.mpl.pyplot)\n", 270 | "plt.title('Sampling distribution')" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": { 276 | "lines_to_next_cell": 2, 277 | "id": "3A8vxnHh7SjX" 278 | }, 279 | "source": [ 280 | "We see that the raw data are highly non-normal, evidenced by the fact that the data values diverge greatly from the unit line. On the other hand, the sampling distribution looks much more normally distributed.\n", 281 | "\n", 282 | "## Confidence intervals\n", 283 | "\n", 284 | "Remember that confidence intervals are intervals that will contain the population parameter in a certain proportion of samples from the population. In this example we will walk through [the simulation that was presented in the book](https://statsthinking21.github.io/statsthinking21-core-site/sampling.html#confidence-intervals) to show that this actually works properly. To do this, let's create a function that takes a sample from the NHANES population and returns the confidence interval for the mean of the `Height` variable within that sample. We will use the t distribution to obtain our confidence intervals." 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": { 291 | "id": "7JI7ZczG7SjY" 292 | }, 293 | "outputs": [], 294 | "source": [ 295 | "def get_confidence_interval(df, variable_name,\n", 296 | " ci_percent=95,\n", 297 | " sample_size=50):\n", 298 | " sample_df = df.sample(sample_size)\n", 299 | " mean = sample_df[variable_name].mean()\n", 300 | " std = sample_df[variable_name].std()\n", 301 | " sem = std / np.sqrt(sample_size)\n", 302 | " t_tail_proportion = 1 - ((100 - ci_percent) / 100) / 2\n", 303 | " t_cutoff = scipy.stats.t.ppf(t_tail_proportion, sample_size - 1)\n", 304 | " upper_ci = mean + sem * t_cutoff\n", 305 | " lower_ci = mean - sem * t_cutoff\n", 306 | " return([lower_ci, upper_ci])" 307 | ] 308 | }, 309 | { 310 | "cell_type": "markdown", 311 | "metadata": { 312 | "id": "tUwfOvxi7SjY" 313 | }, 314 | "source": [ 315 | "Using this function, let's resample the data 1000 times and look how often the resulting interval contains the population mean." 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": { 322 | "id": "PsmQDGkJ7SjY" 323 | }, 324 | "outputs": [], 325 | "source": [ 326 | "\n", 327 | "num_runs = 1000\n", 328 | "\n", 329 | "ci_df = pd.DataFrame({'lower': np.zeros(num_runs),\n", 330 | " 'upper': np.zeros(num_runs)})\n", 331 | "\n", 332 | "for i in range(num_runs):\n", 333 | " ci_df.iloc[i, :] = get_confidence_interval(\n", 334 | " adult_nhanes_data,\n", 335 | " 'Height'\n", 336 | " )" 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "metadata": { 342 | "id": "0KH1rOfq7SjY" 343 | }, 344 | "source": [ 345 | "Now we need to compute the proportion of confidence intervals that capture the population mean (which we know because we are treating the entire NHANES dataset as our population). 
Here we will use a trick that relies upon the fact that Python treat `True`/`False` identically to one and zero respectively. We will test for each of the confidence limits (upper and lower) whether it captures the population mean, and then we will multiply those two series of values together. This will create a new variable that is True only if both limits capture the population mean. We then simply take the mean of those truth values to compute the poportion of confidence intervals that capture the mean." 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "metadata": { 352 | "id": "Xei96nBr7SjY" 353 | }, 354 | "outputs": [], 355 | "source": [ 356 | "ci_df['captures_mean'] = (ci_df['lower'] < adult_nhanes_data['Height'].mean()) * (ci_df['upper'] > adult_nhanes_data['Height'].mean())\n", 357 | "\n", 358 | "ci_df['captures_mean'].mean()" 359 | ] 360 | }, 361 | { 362 | "cell_type": "markdown", 363 | "metadata": { 364 | "id": "WQQ_1csO7SjY" 365 | }, 366 | "source": [ 367 | "This number should be very close to 0.95." 368 | ] 369 | } 370 | ], 371 | "metadata": { 372 | "jupytext": { 373 | "formats": "ipynb,py:percent" 374 | }, 375 | "kernelspec": { 376 | "display_name": "Python 3", 377 | "language": "python", 378 | "name": "python3" 379 | }, 380 | "colab": { 381 | "provenance": [] 382 | } 383 | }, 384 | "nbformat": 4, 385 | "nbformat_minor": 0 386 | } -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | ## creative commons 2 | 3 | # Attribution-NonCommercial 4.0 International 4 | 5 | Creative Commons Corporation (“Creative Commons”) is not a law firm and does not provide legal services or legal advice. Distribution of Creative Commons public licenses does not create a lawyer-client or other relationship. Creative Commons makes its licenses and related information available on an “as-is” basis. Creative Commons gives no warranties regarding its licenses, any material licensed under their terms and conditions, or any related information. Creative Commons disclaims all liability for damages resulting from their use to the fullest extent possible. 6 | 7 | ### Using Creative Commons Public Licenses 8 | 9 | Creative Commons public licenses provide a standard set of terms and conditions that creators and other rights holders may use to share original works of authorship and other material subject to copyright and certain other rights specified in the public license below. The following considerations are for informational purposes only, are not exhaustive, and do not form part of our licenses. 10 | 11 | * __Considerations for licensors:__ Our public licenses are intended for use by those authorized to give the public permission to use material in ways otherwise restricted by copyright and certain other rights. Our licenses are irrevocable. Licensors should read and understand the terms and conditions of the license they choose before applying it. Licensors should also secure all rights necessary before applying our licenses so that the public can reuse the material as expected. Licensors should clearly mark any material not subject to the license. This includes other CC-licensed material, or material used under an exception or limitation to copyright. [More considerations for licensors](http://wiki.creativecommons.org/Considerations_for_licensors_and_licensees#Considerations_for_licensors). 
12 | 13 | * __Considerations for the public:__ By using one of our public licenses, a licensor grants the public permission to use the licensed material under specified terms and conditions. If the licensor’s permission is not necessary for any reason–for example, because of any applicable exception or limitation to copyright–then that use is not regulated by the license. Our licenses grant only permissions under copyright and certain other rights that a licensor has authority to grant. Use of the licensed material may still be restricted for other reasons, including because others have copyright or other rights in the material. A licensor may make special requests, such as asking that all changes be marked or described. Although not required by our licenses, you are encouraged to respect those requests where reasonable. [More considerations for the public](http://wiki.creativecommons.org/Considerations_for_licensors_and_licensees#Considerations_for_licensees). 14 | 15 | ## Creative Commons Attribution-NonCommercial 4.0 International Public License 16 | 17 | By exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution-NonCommercial 4.0 International Public License ("Public License"). To the extent this Public License may be interpreted as a contract, You are granted the Licensed Rights in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions. 18 | 19 | ### Section 1 – Definitions. 20 | 21 | a. __Adapted Material__ means material subject to Copyright and Similar Rights that is derived from or based upon the Licensed Material and in which the Licensed Material is translated, altered, arranged, transformed, or otherwise modified in a manner requiring permission under the Copyright and Similar Rights held by the Licensor. For purposes of this Public License, where the Licensed Material is a musical work, performance, or sound recording, Adapted Material is always produced where the Licensed Material is synched in timed relation with a moving image. 22 | 23 | b. __Adapter's License__ means the license You apply to Your Copyright and Similar Rights in Your contributions to Adapted Material in accordance with the terms and conditions of this Public License. 24 | 25 | c. __Copyright and Similar Rights__ means copyright and/or similar rights closely related to copyright including, without limitation, performance, broadcast, sound recording, and Sui Generis Database Rights, without regard to how the rights are labeled or categorized. For purposes of this Public License, the rights specified in Section 2(b)(1)-(2) are not Copyright and Similar Rights. 26 | 27 | d. __Effective Technological Measures__ means those measures that, in the absence of proper authority, may not be circumvented under laws fulfilling obligations under Article 11 of the WIPO Copyright Treaty adopted on December 20, 1996, and/or similar international agreements. 28 | 29 | e. __Exceptions and Limitations__ means fair use, fair dealing, and/or any other exception or limitation to Copyright and Similar Rights that applies to Your use of the Licensed Material. 30 | 31 | f. __Licensed Material__ means the artistic or literary work, database, or other material to which the Licensor applied this Public License. 32 | 33 | g. 
__Licensed Rights__ means the rights granted to You subject to the terms and conditions of this Public License, which are limited to all Copyright and Similar Rights that apply to Your use of the Licensed Material and that the Licensor has authority to license. 34 | 35 | h. __Licensor__ means the individual(s) or entity(ies) granting rights under this Public License. 36 | 37 | i. __NonCommercial__ means not primarily intended for or directed towards commercial advantage or monetary compensation. For purposes of this Public License, the exchange of the Licensed Material for other material subject to Copyright and Similar Rights by digital file-sharing or similar means is NonCommercial provided there is no payment of monetary compensation in connection with the exchange. 38 | 39 | j. __Share__ means to provide material to the public by any means or process that requires permission under the Licensed Rights, such as reproduction, public display, public performance, distribution, dissemination, communication, or importation, and to make material available to the public including in ways that members of the public may access the material from a place and at a time individually chosen by them. 40 | 41 | k. __Sui Generis Database Rights__ means rights other than copyright resulting from Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, as amended and/or succeeded, as well as other essentially equivalent rights anywhere in the world. 42 | 43 | l. __You__ means the individual or entity exercising the Licensed Rights under this Public License. Your has a corresponding meaning. 44 | 45 | ### Section 2 – Scope. 46 | 47 | a. ___License grant.___ 48 | 49 | 1. Subject to the terms and conditions of this Public License, the Licensor hereby grants You a worldwide, royalty-free, non-sublicensable, non-exclusive, irrevocable license to exercise the Licensed Rights in the Licensed Material to: 50 | 51 | A. reproduce and Share the Licensed Material, in whole or in part, for NonCommercial purposes only; and 52 | 53 | B. produce, reproduce, and Share Adapted Material for NonCommercial purposes only. 54 | 55 | 2. __Exceptions and Limitations.__ For the avoidance of doubt, where Exceptions and Limitations apply to Your use, this Public License does not apply, and You do not need to comply with its terms and conditions. 56 | 57 | 3. __Term.__ The term of this Public License is specified in Section 6(a). 58 | 59 | 4. __Media and formats; technical modifications allowed.__ The Licensor authorizes You to exercise the Licensed Rights in all media and formats whether now known or hereafter created, and to make technical modifications necessary to do so. The Licensor waives and/or agrees not to assert any right or authority to forbid You from making technical modifications necessary to exercise the Licensed Rights, including technical modifications necessary to circumvent Effective Technological Measures. For purposes of this Public License, simply making modifications authorized by this Section 2(a)(4) never produces Adapted Material. 60 | 61 | 5. __Downstream recipients.__ 62 | 63 | A. __Offer from the Licensor – Licensed Material.__ Every recipient of the Licensed Material automatically receives an offer from the Licensor to exercise the Licensed Rights under the terms and conditions of this Public License. 64 | 65 | B. 
__No downstream restrictions.__ You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, the Licensed Material if doing so restricts exercise of the Licensed Rights by any recipient of the Licensed Material. 66 | 67 | 6. __No endorsement.__ Nothing in this Public License constitutes or may be construed as permission to assert or imply that You are, or that Your use of the Licensed Material is, connected with, or sponsored, endorsed, or granted official status by, the Licensor or others designated to receive attribution as provided in Section 3(a)(1)(A)(i). 68 | 69 | b. ___Other rights.___ 70 | 71 | 1. Moral rights, such as the right of integrity, are not licensed under this Public License, nor are publicity, privacy, and/or other similar personality rights; however, to the extent possible, the Licensor waives and/or agrees not to assert any such rights held by the Licensor to the limited extent necessary to allow You to exercise the Licensed Rights, but not otherwise. 72 | 73 | 2. Patent and trademark rights are not licensed under this Public License. 74 | 75 | 3. To the extent possible, the Licensor waives any right to collect royalties from You for the exercise of the Licensed Rights, whether directly or through a collecting society under any voluntary or waivable statutory or compulsory licensing scheme. In all other cases the Licensor expressly reserves any right to collect such royalties, including when the Licensed Material is used other than for NonCommercial purposes. 76 | 77 | ### Section 3 – License Conditions. 78 | 79 | Your exercise of the Licensed Rights is expressly made subject to the following conditions. 80 | 81 | a. ___Attribution.___ 82 | 83 | 1. If You Share the Licensed Material (including in modified form), You must: 84 | 85 | A. retain the following if it is supplied by the Licensor with the Licensed Material: 86 | 87 | i. identification of the creator(s) of the Licensed Material and any others designated to receive attribution, in any reasonable manner requested by the Licensor (including by pseudonym if designated); 88 | 89 | ii. a copyright notice; 90 | 91 | iii. a notice that refers to this Public License; 92 | 93 | iv. a notice that refers to the disclaimer of warranties; 94 | 95 | v. a URI or hyperlink to the Licensed Material to the extent reasonably practicable; 96 | 97 | B. indicate if You modified the Licensed Material and retain an indication of any previous modifications; and 98 | 99 | C. indicate the Licensed Material is licensed under this Public License, and include the text of, or the URI or hyperlink to, this Public License. 100 | 101 | 2. You may satisfy the conditions in Section 3(a)(1) in any reasonable manner based on the medium, means, and context in which You Share the Licensed Material. For example, it may be reasonable to satisfy the conditions by providing a URI or hyperlink to a resource that includes the required information. 102 | 103 | 3. If requested by the Licensor, You must remove any of the information required by Section 3(a)(1)(A) to the extent reasonably practicable. 104 | 105 | 4. If You Share Adapted Material You produce, the Adapter's License You apply must not prevent recipients of the Adapted Material from complying with this Public License. 106 | 107 | ### Section 4 – Sui Generis Database Rights. 108 | 109 | Where the Licensed Rights include Sui Generis Database Rights that apply to Your use of the Licensed Material: 110 | 111 | a. 
for the avoidance of doubt, Section 2(a)(1) grants You the right to extract, reuse, reproduce, and Share all or a substantial portion of the contents of the database for NonCommercial purposes only; 112 | 113 | b. if You include all or a substantial portion of the database contents in a database in which You have Sui Generis Database Rights, then the database in which You have Sui Generis Database Rights (but not its individual contents) is Adapted Material; and 114 | 115 | c. You must comply with the conditions in Section 3(a) if You Share all or a substantial portion of the contents of the database. 116 | 117 | For the avoidance of doubt, this Section 4 supplements and does not replace Your obligations under this Public License where the Licensed Rights include other Copyright and Similar Rights. 118 | 119 | ### Section 5 – Disclaimer of Warranties and Limitation of Liability. 120 | 121 | a. __Unless otherwise separately undertaken by the Licensor, to the extent possible, the Licensor offers the Licensed Material as-is and as-available, and makes no representations or warranties of any kind concerning the Licensed Material, whether express, implied, statutory, or other. This includes, without limitation, warranties of title, merchantability, fitness for a particular purpose, non-infringement, absence of latent or other defects, accuracy, or the presence or absence of errors, whether or not known or discoverable. Where disclaimers of warranties are not allowed in full or in part, this disclaimer may not apply to You.__ 122 | 123 | b. __To the extent possible, in no event will the Licensor be liable to You on any legal theory (including, without limitation, negligence) or otherwise for any direct, special, indirect, incidental, consequential, punitive, exemplary, or other losses, costs, expenses, or damages arising out of this Public License or use of the Licensed Material, even if the Licensor has been advised of the possibility of such losses, costs, expenses, or damages. Where a limitation of liability is not allowed in full or in part, this limitation may not apply to You.__ 124 | 125 | c. The disclaimer of warranties and limitation of liability provided above shall be interpreted in a manner that, to the extent possible, most closely approximates an absolute disclaimer and waiver of all liability. 126 | 127 | ### Section 6 – Term and Termination. 128 | 129 | a. This Public License applies for the term of the Copyright and Similar Rights licensed here. However, if You fail to comply with this Public License, then Your rights under this Public License terminate automatically. 130 | 131 | b. Where Your right to use the Licensed Material has terminated under Section 6(a), it reinstates: 132 | 133 | 1. automatically as of the date the violation is cured, provided it is cured within 30 days of Your discovery of the violation; or 134 | 135 | 2. upon express reinstatement by the Licensor. 136 | 137 | For the avoidance of doubt, this Section 6(b) does not affect any right the Licensor may have to seek remedies for Your violations of this Public License. 138 | 139 | c. For the avoidance of doubt, the Licensor may also offer the Licensed Material under separate terms or conditions or stop distributing the Licensed Material at any time; however, doing so will not terminate this Public License. 140 | 141 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public License. 142 | 143 | ### Section 7 – Other Terms and Conditions. 144 | 145 | a. 
The Licensor shall not be bound by any additional or different terms or conditions communicated by You unless expressly agreed. 146 | 147 | b. Any arrangements, understandings, or agreements regarding the Licensed Material not stated herein are separate from and independent of the terms and conditions of this Public License. 148 | 149 | ### Section 8 – Interpretation. 150 | 151 | a. For the avoidance of doubt, this Public License does not, and shall not be interpreted to, reduce, limit, restrict, or impose conditions on any use of the Licensed Material that could lawfully be made without permission under this Public License. 152 | 153 | b. To the extent possible, if any provision of this Public License is deemed unenforceable, it shall be automatically reformed to the minimum extent necessary to make it enforceable. If the provision cannot be reformed, it shall be severed from this Public License without affecting the enforceability of the remaining terms and conditions. 154 | 155 | c. No term or condition of this Public License will be waived and no failure to comply consented to unless expressly agreed to by the Licensor. 156 | 157 | d. Nothing in this Public License constitutes or may be interpreted as a limitation upon, or waiver of, any privileges and immunities that apply to the Licensor or You, including from the legal processes of any jurisdiction or authority. 158 | 159 | > Creative Commons is not a party to its public licenses. Notwithstanding, Creative Commons may elect to apply one of its public licenses to material it publishes and in those instances will be considered the “Licensor.” Except for the limited purpose of indicating that material is shared under a Creative Commons public license or as otherwise permitted by the Creative Commons policies published at [creativecommons.org/policies](http://creativecommons.org/policies), Creative Commons does not authorize the use of the trademark “Creative Commons” or any other trademark or logo of Creative Commons without its prior written consent including, without limitation, in connection with any unauthorized modifications to any of its public licenses or any other arrangements, understandings, or agreements concerning use of licensed material. For the avoidance of doubt, this paragraph does not form part of the public licenses. 160 | > 161 | > Creative Commons may be contacted at creativecommons.org 162 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributors Guide for the Python companion to *Statistical Thinking for the 21st Century* 2 | 3 | Welcome to the this project! 4 | We're excited you're here and want to contribute. 5 | 6 | The goal of this project is to develop a fully Pythonic companion to the [core statistical text](https://statsthinking21.github.io/statsthinking21-core-site/), in parallel with the in-progress [R companion](https://github.com/statsthinking21/statsthinking21-R). 7 | 8 | Any contributions to this project are welcome, from flagging minor typos to developing entirely new chapters. 9 | 10 | If you are experienced with the use of Git/Github for collaborative projects, you can jump to the [Technical Guidelines](#Technical-guidelines) section below. 11 | 12 | ## Practical guide to submitting your contribution 13 | 14 | These guidelines are designed to make it as easy as possible to get involved. 
15 | If you have any questions that aren't discussed below, 16 | please let us know by opening an [issue][link_issues]! 17 | 18 | Before you start, you'll need to set up a free [GitHub][link_github] account and sign in. 19 | Here are some [instructions][link_signupinstructions]. 20 | 21 | Already know what you're looking for in this guide? Jump to the following sections: 22 | 23 | * [Joining the conversation](#joining-the-conversation) 24 | * [Contributing through GitHub](#contributing-through-github) 25 | * [Understanding issues](#understanding-issues) 26 | * [Making a change](#making-a-change) 27 | * [Technical guidelines](#technical-guidelines) 28 | * [Licensing](#licensing) 29 | * [Recognizing contributions](#recognizing-contributions) 30 | 31 | ## Joining the conversation 32 | 33 | Discussions regarding the content and structure of the book take place via GitHub [issues][link_issues]. We actively monitor this space and look forward to hearing from you with any questions or suggestions. 34 | 35 | ## Contributing through GitHub 36 | 37 | [git][link_git] is a really useful tool for version control. 38 | [GitHub][link_github] sits on top of git and supports collaborative and distributed working. 39 | 40 | If you're not yet familiar with `git`, there are lots of great resources to help you *git* started! 41 | Some of our favorites include the [git Handbook][link_handbook] and 42 | the [Software Carpentry introduction to git][link_swc_intro]. 43 | 44 | On GitHub, you'll use [Markdown][markdown] to chat in issues and pull requests. 45 | You can think of Markdown as a few little symbols around your text that will allow GitHub 46 | to render the text with a little bit of formatting. 47 | For example, you could write words as bold (`**bold**`), or in italics (`*italics*`), 48 | or as a [link][rick_roll] (`[link](https://youtu.be/dQw4w9WgXcQ)`) to another webpage. 49 | 50 | GitHub has a really helpful page for getting started with 51 | [writing and formatting Markdown on GitHub][writing_formatting_github]. 52 | 53 | ## Understanding issues 54 | 55 | Every project on GitHub uses [issues][link_issues] slightly differently. The following outlines how the *statsthinking21* developers think about these tools. 56 | 57 | **Issues** are individual pieces of work that need to be completed to move the project forward. 58 | A general guideline: if you find yourself tempted to write a great big issue that 59 | is difficult to describe as one unit of work, please consider splitting it into two or more issues. 60 | 61 | Issues are assigned [labels](#issue-labels) which explain how they relate to the overall project's goals and immediate next steps. 62 | 63 | ### Issue Labels 64 | 65 | The current list of issue labels is [here][link_labels] and includes: 66 | 67 | * [![Good first issue](https://img.shields.io/github/labels/statsthinking21/statsthinking21-python/good%20first%20issue)][link_firstissue] *These issues contain a task that is amenable to new contributors because it doesn't entail a steep learning curve.* 68 | 69 | If you feel that you can contribute to one of these issues, we especially encourage you to do so! 70 | 71 | * [![Bug](https://img.shields.io/github/labels/statsthinking21/statsthinking21-python/bug)][link_bugs] *These issues point to problems in the code.* 72 | 73 | If you find a new bug, please give as much detail as possible in your issue, 74 | including steps to recreate the error.
75 | If you experience the same bug as one already listed, 76 | please add any additional information that you have as a comment. 77 | 78 | * [![Invalid](https://img.shields.io/github/labels/statsthinking21/statsthinking21-python/invalid)][link_bugs] *These issues point to conceptual or statistical errors in the text.* 79 | 80 | If you find a conceptual or statistical problem with the text, please note its line number, describe the rationale for your report, and suggest a fix if possible. 81 | 82 | * [![Typo](https://img.shields.io/github/labels/statsthinking21/statsthinking21-python/typo)][link_bugs] *These issues point to typographic errors in the text.* 83 | 84 | If you find a new typo, please note its line number, and also note the recommended correction. 85 | 86 | * [![New chapter](https://img.shields.io/github/labels/statsthinking21/statsthinking21-python/new%20chapter)][link_enhancement] *These issues are proposing a new chapter.* 87 | 88 | If you wish to propose a new chapter, please describe your rationale, and how it would fit with the existing chapters. If possible, provide an outline of chapter subtopics. 89 | 90 | * [![New section](https://img.shields.io/github/labels/statsthinking21/statsthinking21-python/new%20section)][link_enhancement] *These issues are proposing a new section to an existing chapter.* 91 | 92 | If you wish to propose a new section for an existing chapter, please describe your rationale, the topics that you think it should address, and how it would fit into the existing chapter. 93 | 94 | ## Making a change 95 | 96 | We appreciate all contributions to this book, 97 | but those accepted fastest will follow a workflow similar to the following: 98 | 99 | 1. **Comment on an existing issue or open a new issue referencing your addition.**
100 | This allows other members of the development team to confirm that you aren't 101 | overlapping with work that's currently underway and that everyone is on the same page 102 | with the goal of the work you're going to carry out.
103 | [This blog][link_pushpullblog] is a nice explanation of why putting this work in up front 104 | is so useful to everyone involved. 105 | 106 | 1. **[Fork][link_fork] the [book repository][link_pybookrepo] to your profile.**
107 | This is now your own unique copy of the book source. 108 | Changes here won't affect anyone else's work, so it's a safe space to explore edits to the code! 109 | 110 | 1. **[Clone][link_clone] your forked book repository to your machine/computer.**
111 | While you can edit files [directly on github][link_githubedit], sometimes the changes 112 | you want to make will be complex and you will want to use a [text editor][link_texteditor] 113 | that you have installed on your local machine/computer. 114 | (One great text editor is [vscode][link_vscode]).
115 | In order to work on the code locally, you must clone your forked repository.
116 | To keep up with changes in the main book repository, 117 | add the ["upstream" book repository as a remote][link_addremote] 118 | to your locally cloned repository. 119 | ```Shell 120 | git remote add upstream https://github.com/statsthinking21/statsthinking21-python.git 121 | ``` 122 | Make sure to [keep your fork up to date][link_updateupstreamwiki] with the upstream repository.
123 | For example, to update your master branch on your local cloned repository: 124 | ```Shell 125 | git fetch upstream 126 | git checkout master 127 | git merge upstream/master 128 | ``` 129 | 130 | 1. **Create a [new branch][link_branches] to develop and maintain the proposed code changes.**
131 | For example: 132 | ```Shell 133 | git fetch upstream # Always start with an updated upstream 134 | git checkout -b fix/bug-1222 upstream/master 135 | ``` 136 | Please consider using appropriate branch names, such as those listed below, and mind that some of them 137 | are special (e.g., `doc/` and `docs/`): 138 | * `fix/`: for bugfixes 139 | * `enh/`: for new features 140 | 141 | 1. **Make the changes you've discussed, following the [style guide for Python code](https://www.python.org/dev/peps/pep-0008/).**
142 | Try to keep your changes focused: it is generally easier to review changes that address one new section or bug fix at a time. 143 | Once you are satisfied with your local changes, [add/commit/push them][link_add_commit_push] 144 | to the branch on your forked repository. 145 | 146 | 1. **Submit a [pull request][link_pullrequest].**
147 | A member of the development team will review your changes to confirm 148 | that they can be merged into the main code base.
149 | Pull request titles should begin with a descriptive prefix 150 | (for example, `FIX: Correct error in computation of standard deviation`): 151 | * `ENH`: enhancements, such as new text or code ([example][enh_ex]) 152 | * `FIX`: bug or typo fixes ([example][fix_ex]) 153 | * `TST`: new or updated tests ([example][tst_ex]) 154 | * `STY`: style changes ([example][sty_ex]) 155 | * `REF`: refactoring existing code ([example][ref_ex]) 156 | * `CI`: updates to continuous integration infrastructure ([example][ci_ex]) 157 | * `MAINT`: general maintenance ([example][maint_ex]) 158 | * For works-in-progress, add the `WIP` tag in addition to the descriptive prefix. 159 | Pull requests tagged with `WIP:` will not be merged until the tag is removed. 160 | 161 | 1. **Have your PR reviewed by the development team, and update your changes accordingly in your branch.**
162 | The reviewers will take special care to assist you in addressing their comments, as well as in dealing with conflicts 163 | and other tricky situations that could emerge from distributed development. 164 | 165 | ## Technical guidelines 166 | 167 | The (currently proposed) technical plan for the book is as follows. 168 | 169 | 1. The code should be written in pure Python, targeting version 3.7 or greater. All code should follow the [Python code style guide [PEP8]](https://www.python.org/dev/peps/pep-0008/), and should pass the [flake8](https://flake8.pycqa.org/en/latest/) style checker before submission. 170 | 171 | 1. The Python file for each chapter will be named *chapter-<*topic*>.py*. In principle, the chapters should coordinate with those in the [core text](https://statsthinking21.github.io/statsthinking21-core-site/); if one wishes to break this rule, then please raise an issue for discussion. Any additional files (e.g. those defining utility functions) should be placed in the *utils* directory, and preferably named with the chapter topic in the name (e.g. "chapter-topic-utils.py"). 172 | 173 | 1. The chapter files should be written using [Jupytext](https://github.com/mwouts/jupytext), which allows one to generate a Jupyter notebook from a pure Python file, using the *percent* format in which cells are delimited by `# %%` comment markers. This decision was made in order to simplify the use of version control on the code; when using plain Jupyter notebooks, the metadata is saved in the file such that the file contents change every time the notebook is executed, making it very difficult to determine the relevant changes. 174 | 175 | 1. The chapter files will be automatically converted to standard Jupyter notebooks by Jupytext as part of continuous integration. 176 | 177 | 1. The book will be generated using [jupyter-book](https://jupyterbook.org/intro.html), which renders the Jupyter notebooks to HTML. 178 | 179 | *TBD*: Identify additional style issues regarding the structure of the notebooks. 180 | 181 | ## Recognizing contributions 182 | 183 | We welcome and recognize all contributions regardless of their size, content, or scope: 184 | from documentation to testing and code development. 185 | You can see a list of current developers and contributors in our [zenodo file][link_zenodo]. 186 | Before every release, a new [zenodo file][link_zenodo] will be generated. 187 | After the first draft is complete, we will create an update script that will also sort creators and contributors by 188 | the relative size of their contributions, as provided by the `git-line-summary` utility 189 | distributed with the `git-extras` package. 190 | The last positions in both the *creators* and *contributors* lists will be reserved for 191 | the project leaders. 192 | These special positions can be revised to add names upon specific request, and revised for 193 | removal and reordering in a scheduled manner every two years. 194 | All of the authors listed as *creators* participate in the review of such modifications. 195 | 196 | ### Creators 197 | 198 | Creators are members of the team who have been responsible for _establishing and/or driving the project_.
199 | Names and contacts of all creators are included in the 200 | [``.maint/creators.json`` file](https://github.com/statsthinking21/statsthinking21-python/blob/master/.maint/creators.json). 201 | Examples of steering activities that _drive the project_ are: actively participating in the development of new content, helping with the design of the project, and providing resources (in the broad sense, including funding). 202 | 203 | ### Contributors 204 | 205 | Contributors listed in the 206 | [``.maint/contributors.json`` file](https://github.com/statsthinking21/statsthinking21-python/blob/master/.maint/contributors.json) 207 | actively help or have previously helped the project in a broad sense: writing code or text, 208 | proposing new features, and finding bugs. 209 | If you are new to the project, don't forget to add your name and affiliation to the list 210 | of contributors there! 211 | 212 | Contributors who have contributed at some point to the project but wish to drop out of the project are listed in the [``.maint/former.json`` file](https://github.com/poldracklab/fmriprep/blob/master/.maint/former.json). 213 | 214 | ## Licensing 215 | 216 | This companion is licensed under the Creative Commons Attribution-NonCommercial 4.0 International (CC-BY-NC). While we are generally opposed to the use of non-commercial licenses, in this case it is necessary. The core statistical text will be published by a commercial publisher as a low-cost paperback book, and this publisher reasonably requires that the open-source version be licensed to prevent other commercial reuse. Because the companion texts may end up incorporating text from the core text, we must also license the companions under CC-BY-NC. A benefit of this is that contributors do not need to worry about "contaminating" the companions with text from the core; in fact, it's perfectly OK to do so, as long as the license terms are upheld. 217 | 218 | By contributing to this book, 219 | you acknowledge that any contributions will be licensed under the same terms. 220 | 221 | ## Thank you! 222 | 223 | You're awesome. :wave::smiley: 224 | 225 |
226 | 227 | *— Based on contributing guidelines from the [STEMMRoleModels][link_stemmrolemodels] and [fMRIprep][link_fmriprep] projects.* 228 | 229 | [link_github]: https://github.com/ 230 | [link_fMRIPrep]: https://github.com/poldracklab/fmriprep 231 | [link_pybookrepo]: https://github.com/statsthinking21/statsthinking21-python/ 232 | [link_signupinstructions]: https://help.github.com/articles/signing-up-for-a-new-github-account 233 | 234 | [link_git]: https://git-scm.com/ 235 | [link_handbook]: https://guides.github.com/introduction/git-handbook/ 236 | [link_swc_intro]: http://swcarpentry.github.io/git-novice/ 237 | 238 | [writing_formatting_github]: https://help.github.com/articles/getting-started-with-writing-and-formatting-on-github 239 | [markdown]: https://daringfireball.net/projects/markdown 240 | [rick_roll]: https://www.youtube.com/watch?v=dQw4w9WgXcQ 241 | 242 | [link_issues]: https://github.com/statsthinking21/statsthinking21-python/issues 243 | [link_labels]: https://github.com/statsthinking21/statsthinking21-python/labels 244 | [link_discussingissues]: https://help.github.com/articles/discussing-projects-in-issues-and-pull-requests 245 | 246 | [link_bugs]: https://github.com/statsthinking21/statsthinking21-python/labels/bug 247 | [link_firstissue]: https://github.com/statsthinking21/statsthinking21-python/labels/good%20first%20issue 248 | [link_enhancement]: https://github.com/statsthinking21/statsthinking21-python/labels/enhancement 249 | 250 | [link_pullrequest]: https://help.github.com/articles/creating-a-pull-request-from-a-fork 251 | [link_fork]: https://help.github.com/articles/fork-a-repo/ 252 | [link_clone]: https://help.github.com/articles/cloning-a-repository 253 | [link_githubedit]: https://help.github.com/articles/editing-files-in-your-repository 254 | [link_texteditor]: https://en.wikipedia.org/wiki/Text_editor 255 | [link_vscode]: https://code.visualstudio.com/ 256 | [link_addremote]: https://help.github.com/articles/configuring-a-remote-for-a-fork 257 | [link_pushpullblog]: https://www.igvita.com/2011/12/19/dont-push-your-pull-requests/ 258 | [link_branches]: https://help.github.com/articles/creating-and-deleting-branches-within-your-repository/ 259 | [link_add_commit_push]: https://help.github.com/articles/adding-a-file-to-a-repository-using-the-command-line 260 | [link_updateupstreamwiki]: https://help.github.com/articles/syncing-a-fork/ 261 | [link_stemmrolemodels]: https://github.com/KirstieJane/STEMMRoleModels 262 | [link_zenodo]: https://github.com/statsthinking21/statsthinking21-python//blob/master/.zenodo.json 263 | [link_update_script]: https://github.com/poldracklab/fmriprep/blob/master/.maintenance/update_zenodo.py 264 | [link_devel]: https://fmriprep.readthedocs.io/en/latest/contributors.html 265 | [link_fmriprep]: http://fmriprep.org 266 | [link_bidsapps]: https://bids-apps.neuroimaging.io 267 | [link_mattermost]: https://mattermost.brainhack.org/brainhack/channels/fmriprep 268 | [link_aroma]: https://fmriprep.readthedocs.io/en/stable/workflows.html#ica-aroma 269 | 270 | [enh_ex]: https://github.com/poldracklab/fmriprep/pull/1508 271 | [fix_ex]: https://github.com/poldracklab/fmriprep/pull/1378 272 | [tst_ex]: https://github.com/poldracklab/fmriprep/pull/1098 273 | [doc_ex]: https://github.com/poldracklab/fmriprep/pull/1515 274 | [sty_ex]: https://github.com/poldracklab/fmriprep/pull/675 275 | [ref_ex]: https://github.com/poldracklab/fmriprep/pull/816 276 | [ci_ex]: https://github.com/poldracklab/fmriprep/pull/1048 277 | [maint_ex]: 
https://github.com/poldracklab/fmriprep/pull/1239 -------------------------------------------------------------------------------- /notebooks/03-DataVisualization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "7zZ2q4nYLyCS" 7 | }, 8 | "source": [ 9 | "# Data Visualization" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": { 15 | "id": "r8TdljPGLyCT" 16 | }, 17 | "source": [ 18 | "There are two main packages that we will use for visualization in Python: [matplotlib](https://matplotlib.org/) and [seaborn](https://seaborn.pydata.org/), which is based on matplotlib. First, let's import these. It is customary to import the pyplot module from matplotlib, since it contains most of the important plotting functions:" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "lines_to_next_cell": 2, 26 | "id": "H8vkTL-ELyCT" 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "import matplotlib.pyplot as plt\n", 31 | "import seaborn as sns\n", 32 | "import numpy as np\n", 33 | "import pandas as pd" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": { 39 | "lines_to_next_cell": 2, 40 | "id": "yewDfcHLLyCT" 41 | }, 42 | "source": [ 43 | "## Let's think through a visualization\n", 44 | "\n", 45 | "Principles we want to keep in mind:\n", 46 | "\n", 47 | "* Show the data without distortion\n", 48 | "* Use color, shape, and location to encourage comparisons\n", 49 | "* Minimize visual clutter (maximize your information to ink ratio)\n", 50 | "\n", 51 | "The two questions you want to ask yourself before getting started are:\n", 52 | "\n", 53 | "* What type of variable(s) am I plotting?\n", 54 | "* What comparison do I want to make salient for the viewer (possibly myself)?\n", 55 | "\n", 56 | "Figuring out *how* to highlight a comparison and include relevant variables usually benefits from sketching the plot out first." 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": { 62 | "lines_to_next_cell": 2, 63 | "id": "bq7cbi_bLyCU" 64 | }, 65 | "source": [ 66 | "## Plotting the distribution of a single variable\n", 67 | "\n", 68 | "One of the most common uses of plotting is to plot the *distribution* of the data --- which you can think of as the *shape* of the data. There are various ways to do this, but one of the most common is known as a *histogram*, which plots the number of observations that fall into specific bins. We can plot a histogram using the `plt.hist()` function from matplotlib. As an example, let's look at the distribution of ages in the NHANES dataset. First we need to load the data:" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "source": [ 74 | "! 
pip install nhanes" 75 | ], 76 | "metadata": { 77 | "id": "YVFh6cuJL_c_" 78 | }, 79 | "execution_count": null, 80 | "outputs": [] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": { 86 | "id": "PdkNcsWkLyCU" 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "from nhanes.load import load_NHANES_data\n", 91 | "nhanes_data = load_NHANES_data()" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": { 97 | "lines_to_next_cell": 2, 98 | "id": "6nhPy3KNLyCV" 99 | }, 100 | "source": [ 101 | "Then we can plot the histogram:" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": { 108 | "id": "NhTGc7YFLyCV" 109 | }, 110 | "outputs": [], 111 | "source": [ 112 | "age_histogram = plt.hist(nhanes_data['AgeInYearsAtScreening'])" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": { 118 | "id": "2GGjHfk1LyCV" 119 | }, 120 | "source": [ 121 | "You can see from this plot that the `plt.hist()` function has binned together individuals across several years; That's because we let it automatically determine the size of the bins. Let's say that instead we want to bin each year separately. We can do this using the `bins` argument to `plt.hist`. Because this argument takes a list of bins, we need to create a list that spans from the youngest to the oldest age. We can do this using the `np.arange()` function from numpy, which generates a list of numbers that span a particular range. In this case, we need to span from the youngest to the oldest value, which are equivalent to the minimum and maximum values which we can obtain using the `.min()` and `.max()` operators. The addition of 2 is to ensure that every age group is placed into one single bin (the reason is related to how `plt.hist` decide whether the values at the boundary of each bin is counted into that bin and how `np.arange()` defines the boundary, which go too much into details)." 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": { 128 | "id": "T19ag0nFLyCV" 129 | }, 130 | "outputs": [], 131 | "source": [ 132 | "print(np.sort(nhanes_data['AgeInYearsAtScreening'].unique()))\n", 133 | "\n", 134 | "bins = np.arange(nhanes_data['AgeInYearsAtScreening'].min(), nhanes_data['AgeInYearsAtScreening'].max() + 2)\n", 135 | "age_histogram_1year_bins = plt.hist(nhanes_data['AgeInYearsAtScreening'], bins=bins)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": { 141 | "id": "IiERBzQPLyCW" 142 | }, 143 | "source": [ 144 | "Sometimes it's more useful to look at the density rather than the counts, which we can do by setting `density=True` in our call to the histogram function:" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": { 151 | "id": "olscS_unLyCW" 152 | }, 153 | "outputs": [], 154 | "source": [ 155 | "age_density_1year_bins = plt.hist(nhanes_data['AgeInYearsAtScreening'], bins=bins, density=True)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": { 161 | "id": "ITbFJp7PLyCW" 162 | }, 163 | "source": [ 164 | "Now we see the proportion of individuals that fall into each age bin. Why do you think there are so many eighty-year-olds in the dataset? Have a look at the [documentation for the Age question](https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DEMO_J.htm#RIDAGEYR) and see if you can figure it out." 
165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": { 170 | "id": "7Ic4a2PzLyCW" 171 | }, 172 | "source": [ 173 | "### Bar vs. line plots\n", 174 | "\n", 175 | "The histograms above are an example of *bar plots* where each number is represented by a bar. We could also plot the distribution using a line instead. One reason to do this is that we can make the line a bit *smoother* than the actual data. For example, here are the histogram data from above, plotted as a line:" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": { 182 | "id": "2SV_R9NRLyCW" 183 | }, 184 | "outputs": [], 185 | "source": [ 186 | "plt.plot(age_density_1year_bins[1][1:], age_density_1year_bins[0])\n", 187 | "plt.show()" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": { 193 | "id": "ZdujRtB7LyCW" 194 | }, 195 | "source": [ 196 | "Here we have taken advantage of the fact that the output of our histogram command above contains both the bins (in its [1] position) and the histogram values (in its [0]) position. Why do we include `[1:]` after the bins variable? This is because the bins include both the upper and lower edges of the bin, which means that there is one more bin value than there are average values. Adding `[1:]` is equivalent to saying \"start with the second bin\" which is equivalent to using the top edges of each bin for our X axis.\n", 197 | "\n", 198 | "Now let's plot a smoothed version of the histogram, using the `sns.histplot()` function from the seaborn library by including an argument of `kde=True`. " 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": { 205 | "id": "97MYjMqrLyCW" 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | "sns.histplot(nhanes_data['AgeInYearsAtScreening'], bins=bins, kde=True)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": { 215 | "id": "g0WT7rNbLyCW" 216 | }, 217 | "source": [ 218 | "You can see that the line is now much smoother (less bumpy) than the one above. It generally follows the overall shape of the data pretty closely, but you can also see that it mostly hides the large bump at 80 years. It's always important to keep in mind that anything we do to the data has the potential to distort their message." 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": { 224 | "id": "OxZb4Ef7LyCW" 225 | }, 226 | "source": [ 227 | "## Plots with two variables\n", 228 | "\n", 229 | "Another common use of visualization is to examine the relationship betwen two variables. For example, let's say that we wanted to plot average height as a function of age in the NHANES dataset. We would first summarize the data to obtain the average height for each age:" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": { 236 | "id": "Aj0Yf0kZLyCW" 237 | }, 238 | "outputs": [], 239 | "source": [ 240 | "mean_height_by_age = nhanes_data.groupby('AgeInYearsAtScreening')['StandingHeightCm'].mean()\n", 241 | "print(mean_height_by_age)" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": { 247 | "id": "ClDM9fTdLyCW" 248 | }, 249 | "source": [ 250 | "Here we use a method called `.groupby()` along with a builtin in method for computing the average of a variable in a dataframe (`.mean()`). This returns a single average height value for all of the individuals in each age group, which we can then plot. 
While we are at it, we will add descriptive labels to the X and Y axes, which is always a good idea:" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": { 257 | "id": "ZsAKHwlQLyCW" 258 | }, 259 | "outputs": [], 260 | "source": [ 261 | "plt.plot(mean_height_by_age.index, mean_height_by_age)\n", 262 | "plt.xlabel('Age at screening')\n", 263 | "plt.ylabel('Standing Height (cm)')\n", 264 | "plt.show()" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": { 270 | "id": "mTlqHfcALyCX" 271 | }, 272 | "source": [ 273 | "As expected, people get taller up to about age 18, and then slowly shrink over time. Since we know that men and women differ in their height, we can also plot their average heights separately. We could do this using the matplotlib plotting functions, but it's actually easier to do using the `sns.lineplot()` function from the seaborn library that we imported above. We simply give it the X and Y variables that we want to plot as well as the variable that we want to separate (using different colors), and it does the work for us:" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": { 280 | "lines_to_next_cell": 2, 281 | "id": "rP6OuhiVLyCX" 282 | }, 283 | "outputs": [], 284 | "source": [ 285 | "sns.lineplot(x='AgeInYearsAtScreening', y='StandingHeightCm', hue='Gender', data=nhanes_data)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": { 291 | "id": "jOYKs86zLyCX" 292 | }, 293 | "source": [ 294 | "You will notice that the lines have shaded areas around them; these are called *confidence intervals*, and you will learn about them later in the course. They basically tell us something about the uncertainty around our estimates of the average." 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": { 300 | "id": "QtB-ORaaLyCX" 301 | }, 302 | "source": [ 303 | "## Plotting dispersion\n", 304 | "\n", 305 | "An important job of statistical visualization is to show us the variability, or *dispersion*, of our data. We have already seen how to do this using histograms; now let's look at how we can compare distributions.\n", 306 | "\n", 307 | "Let's start with a simple example: comparing the height of adult men and women in the NHANES sample. One commonly used plot is the *box plot* (sometimes known as a *box and whiskers plot*). This form of plot uses quartiles to give us a sense of spread. The thickest line, somewhere inside the box, represents the *median*. The upper and lower bounds of the box (the *hinges*) are the first and third quartiles (can you use them to approximate the interquartile range?). The lines extending from the hinges are the remaining data points, excluding **outliers**, which are plotted as individual points."
308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": { 314 | "id": "y0vj7VaCLyCX" 315 | }, 316 | "outputs": [], 317 | "source": [ 318 | "adult_nhanes_data = nhanes_data.query('AgeInYearsAtScreening > 17')\n", 319 | "sns.boxplot(x='Gender', y='StandingHeightCm', data=adult_nhanes_data)" 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": { 325 | "id": "k6SNl6GuLyCX" 326 | }, 327 | "source": [ 328 | "This tells us that the median male is taller than 75% of all of the females in the sample.\n", 329 | "\n", 330 | "Another type of plot that is commonly used is the *violin plot*, which shows the shape of the entire distribution:" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": { 337 | "id": "0qHuk6pfLyCX" 338 | }, 339 | "outputs": [], 340 | "source": [ 341 | "\n", 342 | "sns.violinplot(x='Gender', y='StandingHeightCm', data=adult_nhanes_data)" 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "metadata": { 348 | "id": "acCJu4YvLyCX" 349 | }, 350 | "source": [ 351 | "### Scatter plot\n", 352 | "\n", 353 | "When we have multiple *continuous* variables, we can use points to plot each variable on an axis. This is known as a **scatter plot**. As an example, let's look at the blood pressure readings taken in the NHANES study. Each individual has their blood pressure taken three times. Here we will plot the first reading against the second reading, using a scatter plot. We will also add a line showing where the x axis is equal to the y axis, which makes it easier to see how the two variables are related to each other." 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": { 360 | "id": "HMJHGcbALyCX" 361 | }, 362 | "outputs": [], 363 | "source": [ 364 | "sns.scatterplot(x='SystolicBloodPres1StRdgMmHg',\n", 365 | " y='SystolicBloodPres2NdRdgMmHg',\n", 366 | " data=adult_nhanes_data)\n", 367 | "plt.plot([adult_nhanes_data['SystolicBloodPres1StRdgMmHg'].min(),\n", 368 | " adult_nhanes_data['SystolicBloodPres1StRdgMmHg'].max()],\n", 369 | " [adult_nhanes_data['SystolicBloodPres1StRdgMmHg'].min(),\n", 370 | " adult_nhanes_data['SystolicBloodPres1StRdgMmHg'].max()],\n", 371 | " color='k')\n", 372 | "plt.xlabel('Systolic BP - First reading')\n", 373 | "plt.ylabel('Systolic BP - Second reading')" 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "metadata": { 379 | "lines_to_next_cell": 2, 380 | "id": "nSdvasQVLyCX" 381 | }, 382 | "source": [ 383 | "Here we can see that the two variables are closely related to one another. We can also see that most of the blue points fall below the black line, which tells us that the second reading is generally somewhat lower than the first reading." 384 | ] 385 | }, 386 | { 387 | "cell_type": "markdown", 388 | "metadata": { 389 | "id": "4VGw0LYhLyCX" 390 | }, 391 | "source": [ 392 | "In this section we will recreate a version of [Figure 4.2](https://statsthinking21.github.io/statsthinking21-core-site/data-visualization.html#anatomy-of-a-plot) from Statistical Thinking in the 21st Century. 
" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": null, 398 | "metadata": { 399 | "lines_to_next_cell": 2, 400 | "id": "0pEsgG6cLyCX" 401 | }, 402 | "outputs": [], 403 | "source": [ 404 | "\n", 405 | "oring_data = pd.read_csv('https://raw.githubusercontent.com/statsthinking21/statsthinking21-python/master/notebooks/data/orings.csv', index_col=0)\n", 406 | "\n", 407 | "ax = sns.lineplot(x='Temperature', y='DamageIndex', data=oring_data, ci=None)\n", 408 | "plt.xlabel('Temperature at time of launch')\n", 409 | "plt.ylabel('Damage Index')\n", 410 | "ax.fill_between([26, 29], 0, 12,\n", 411 | " facecolor='red', alpha=0.3)\n", 412 | "ax.text(27, 1, 'Forecasted temperature on Jan 28', rotation=90)\n", 413 | "plt.show()" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": null, 419 | "metadata": { 420 | "id": "CgI1Cw0eLyCX" 421 | }, 422 | "outputs": [], 423 | "source": [] 424 | } 425 | ], 426 | "metadata": { 427 | "jupytext": { 428 | "formats": "ipynb,py:percent" 429 | }, 430 | "kernelspec": { 431 | "display_name": "Python 3", 432 | "language": "python", 433 | "name": "python3" 434 | }, 435 | "language_info": { 436 | "codemirror_mode": { 437 | "name": "ipython", 438 | "version": 3 439 | }, 440 | "file_extension": ".py", 441 | "mimetype": "text/x-python", 442 | "name": "python", 443 | "nbconvert_exporter": "python", 444 | "pygments_lexer": "ipython3", 445 | "version": "3.8.3" 446 | }, 447 | "colab": { 448 | "provenance": [] 449 | } 450 | }, 451 | "nbformat": 4, 452 | "nbformat_minor": 0 453 | } --------------------------------------------------------------------------------