├── .gitignore ├── LICENSE ├── README.md ├── Versions.ipynb ├── chapter_1 ├── 1_Probability_Sampling.ipynb └── 2_Stratified_Sampling .ipynb ├── chapter_10 ├── 3_Autocorrelation_mean_variance_ccf.ipynb ├── 4_noise.ipynb ├── 5_Stationary.ipynb └── airline-passengers.csv ├── chapter_11 ├── 1_arma.ipynb ├── 2_arima.ipynb ├── 3_more_on_model_evaluation.ipynb └── COCO COLA.csv ├── chapter_12 ├── 1_cross_correlation.ipynb ├── 2_arimax.ipynb └── 3_var.ipynb ├── chapter_14 ├── 1_kaplan_meier.ipynb ├── 2_Exponential_Models.ipynb └── 3_Cox_Proportional_Hazards.ipynb ├── chapter_2 ├── 1_types_of_data.ipynb ├── 2_distributions.ipynb ├── 3_Bootstrapping.ipynb ├── 4_1_Permutations_and_Combinations.ipynb ├── 4_2_Permutation_Testing.ipynb └── 5_Transformation.ipynb ├── chapter_3 ├── 3_1_Zscore_and _Zstatistics.ipynb ├── 3_2_Ztest_for_Means.ipynb ├── 3_3_Ztest_for_Proportions.ipynb └── 4_statistical_power.ipynb ├── chapter_4 ├── 1_assumptions.ipynb ├── 2_t-tests.ipynb ├── 3_multiple_tests_and_anova.ipynb └── 4_pearson_correlation.ipynb ├── chapter_5 ├── 1_permutation_test.ipynb ├── 2_rank-sum_test.ipynb ├── 3_Signed-Rank_Test.ipynb ├── 4_Kruskal-Wallis_Test.ipynb └── 5_6_Chi_Square_and_Spearman_Corr.ipynb ├── chapter_6 ├── 1_Ordinary_Least_Squares.ipynb ├── 3_linear_model_assumptions.ipynb └── 4_5_ModelValidation_RegressionVariations.ipynb ├── chapter_7 ├── 1_multiple_linear_regression.ipynb ├── 2_feature_selection.ipynb ├── 3_Shrinkage.ipynb ├── 4_Dimension_Reduction.ipynb └── Data │ └── Hitters.csv ├── chapter_8 ├── 1_Probit_and_Logit_Models.ipynb ├── 2_Multinomial_Logit_Model.ipynb ├── 3_poisson_model.ipynb └── 4_Negative_Binomial_Regression.ipynb ├── chapter_9 ├── 2_Linear_Discriminant_Analysis.ipynb └── 3_Quadratic_discriminant_analysis.ipynb └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | 3 | # standard python git ignore from 4 | # https://github.com/github/gitignore/blob/main/Python.gitignore 
5 | 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | cover/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | db.sqlite3 67 | db.sqlite3-journal 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | .pybuilder/ 81 | target/ 82 | 83 | # Jupyter Notebook 84 | .ipynb_checkpoints 85 | 86 | # IPython 87 | profile_default/ 88 | ipython_config.py 89 | 90 | # pyenv 91 | # For a library or package, you might want to ignore these files since the code is 92 | # intended to run in multiple environments; otherwise, check them in: 93 | # .python-version 94 | 95 | # pipenv 96 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 97 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 98 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 99 | # install all needed dependencies. 
100 | #Pipfile.lock 101 | 102 | # poetry 103 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 104 | # This is especially recommended for binary packages to ensure reproducibility, and is more 105 | # commonly ignored for libraries. 106 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 107 | #poetry.lock 108 | 109 | # pdm 110 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 111 | #pdm.lock 112 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 113 | # in version control. 114 | # https://pdm.fming.dev/#use-with-ide 115 | .pdm.toml 116 | 117 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 118 | __pypackages__/ 119 | 120 | # Celery stuff 121 | celerybeat-schedule 122 | celerybeat.pid 123 | 124 | # SageMath parsed files 125 | *.sage.py 126 | 127 | # Environments 128 | .env 129 | .venv 130 | env/ 131 | venv/ 132 | ENV/ 133 | env.bak/ 134 | venv.bak/ 135 | 136 | # Spyder project settings 137 | .spyderproject 138 | .spyproject 139 | 140 | # Rope project settings 141 | .ropeproject 142 | 143 | # mkdocs documentation 144 | /site 145 | 146 | # mypy 147 | .mypy_cache/ 148 | .dmypy.json 149 | dmypy.json 150 | 151 | # Pyre type checker 152 | .pyre/ 153 | 154 | # pytype static type analyzer 155 | .pytype/ 156 | 157 | # Cython debug symbols 158 | cython_debug/ 159 | 160 | # PyCharm 161 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 162 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 163 | # and can be added to the global gitignore or merged into this file. For a more nuclear 164 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
165 | #.idea/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

Machine Learning Summit 2025

2 | 3 | ## Machine Learning Summit 2025 4 | **Bridging Theory and Practice: ML Solutions for Today’s Challenges** 5 | 6 | 3 days, 20+ experts, and 25+ tech sessions and talks covering critical aspects of: 7 | - **Agentic and Generative AI** 8 | - **Applied Machine Learning in the Real World** 9 | - **ML Engineering and Optimization** 10 | 11 | 👉 [Book your ticket now >>](https://packt.link/mlsumgh) 12 | 13 | --- 14 | 15 | ## Join Our Newsletters 📬 16 | 17 | ### DataPro 18 | *The future of AI is unfolding. Don’t fall behind.* 19 | 20 |

DataPro QR

21 | 22 | Stay ahead with [**DataPro**](https://landing.packtpub.com/subscribe-datapronewsletter/?link_from_packtlink=yes), the free weekly newsletter for data scientists, AI/ML researchers, and data engineers. 23 | From trending tools like **PyTorch**, **scikit-learn**, **XGBoost**, and **BentoML** to hands-on insights on **database optimization** and real-world **ML workflows**, you’ll get what matters, fast. 24 | 25 | > Stay sharp with [DataPro](https://landing.packtpub.com/subscribe-datapronewsletter/?link_from_packtlink=yes). Join **115K+ data professionals** who never miss a beat. 26 | 27 | --- 28 | 29 | ### BIPro 30 | *Business runs on data. Make sure yours tells the right story.* 31 | 32 |

BIPro QR

33 | 34 | [**BIPro**](https://landing.packtpub.com/subscribe-bipro-newsletter/?link_from_packtlink=yes) is your free weekly newsletter for BI professionals, analysts, and data leaders. 35 | Get practical tips on **dashboarding**, **data visualization**, and **analytics strategy** with tools like **Power BI**, **Tableau**, **Looker**, **SQL**, and **dbt**. 36 | 37 | > Get smarter with [BIPro](https://landing.packtpub.com/subscribe-bipro-newsletter/?link_from_packtlink=yes). Trusted by **35K+ BI professionals**, see what you’re missing. 38 | 39 | 40 | ### [Packt Conference : Put Generative AI to work on Oct 11-13 (Virtual)](https://packt.link/JGIEY) 41 | 42 |

[![Packt Conference](https://hub.packtpub.com/wp-content/uploads/2023/08/put-generative-ai-to-work-packt.png)](https://packt.link/JGIEY)

43 | 3 Days, 20+ AI Experts, 25+ Workshops and Power Talks 44 | 45 | Code: USD75OFF 46 | 47 | # Building Statistical Models in Python 48 | 49 | Building Statistical Models in Python 50 | 51 | This is the code repository for [Building Statistical Models in Python](https://www.packtpub.com/product/building-statistical-models-in-python/9781804614280), published by Packt. 52 | 53 | **Develop useful models for regression, classification, time series, and survival analysis** 54 | 55 | ## What is this book about? 56 | 57 | The ability to proficiently perform statistical modeling is a fundamental skill for data scientists and essential for businesses reliant on data insights. Building Statistical Models in Python is a comprehensive guide that will empower you to leverage mathematical and statistical principles in data assessment, understanding, and inference generation. 58 | 59 | 60 | This book covers the following exciting features: 61 | * Explore the use of statistics to make decisions under uncertainty 62 | * Answer questions about data using hypothesis tests 63 | * Understand the difference between regression and classification models 64 | * Build models with statsmodels in Python 65 | * Analyze time series data and provide forecasts 66 | * Discover Survival Analysis and the problems it can solve 67 | 68 | If you feel this book is for you, get your [copy](https://www.amazon.in/Building-Statistical-Models-Python-classification-ebook/dp/B0C7GV7FNN/ref=monarch_sidesheet) today! 69 | 70 | https://www.packtpub.com/ 71 | 72 | ## Instructions and Navigation 73 | All of the code is organized into folders. 74 | 75 | The code will look like the following: 76 | ``` 77 | A = [3,5,4] 78 | B = [43,41,56,78,54] 79 | permutation_testing(A,B,n_iter=10000) 80 | ``` 81 | **Following is what you need for this book:** 82 | 83 | If you are looking to get started with building statistical models for your data sets, this book is for you! 
Building Statistical Models in Python bridges the gap between statistical theory and practical application of Python. Since you’ll take a comprehensive journey through theory and application, no previous knowledge of statistics is required, but some experience with Python will be useful. 84 | 85 | With the following software and hardware list you can run all code files present in the book (Chapters 1-14). 86 | 87 | ### Software and Hardware List 88 | 89 | | Chapter | Software required | OS required | 90 | | -------- | -------------------------------------------------------------------------------------| -----------------------------------| 91 | | 1-14 | Python version ≥ 3.8 | Any OS | 92 | | 1-14 | Statsmodels 0.13.2 | Any OS | 93 | | 1-14 | SciPy 1.8.1 | Any OS | 94 | | 1-14 | lifelines 0.27.4 | Any OS | 95 | | 1-14 | scikit-learn 1.1.1 | Any OS | 96 | | 1-14 | pmdarima 2.0.2 | Any OS | 97 | | 1-14 | Sktime 0.15.0 | Any OS | 98 | | 1-14 | Pandas 1.4.3 | Any OS | 99 | | 1-14 | Matplotlib 3.5.2 | Any OS | 100 | | 1-14 | Numpy 1.23.0 | Any OS | 101 | 102 | 103 | ### Related products 104 | * Hands-On Simulation Modeling with Python - Second Edition [[Packt]](https://www.packtpub.com/product/hands-on-simulation-modeling-with-python-second-edition/9781804616888) [[Amazon]](https://www.amazon.in/Hands-Simulation-Modeling-Python-decision-making/dp/1804616885/ref=sr_1_4?keywords=Hands-On+Simulation+Modeling+with+Python&sr=8-4) 105 | 106 | * Modern Time Series Forecasting with Python [[Packt]](https://www.packtpub.com/product/modern-time-series-forecasting-with-python/9781803246802) [[Amazon]](https://www.amazon.in/Modern-Time-Forecasting-Python-industry-ready/dp/1803246804/ref=sr_1_1?keywords=Modern+Time+Series+Forecasting+with+Python&sr=8-1) 107 | 108 | ## Get to Know the Author(s) 109 | **Huy Hoang Nguyen** is a mathematician and data scientist with extensive experience in advanced mathematics, strategic leadership, and applied machine learning research. 
He holds a PhD in Mathematics, as well as two Master’s degrees in Applied Mathematics and Data Science. His previous work focused on Partial Differential Equations, Functional Analysis, and their applications in Fluid Mechanics. After transitioning from academia to the healthcare industry, he has undertaken a variety of data science projects, ranging from traditional machine learning to deep learning. 110 | 111 | **Paul Adams** is a Data Scientist with a background primarily in the healthcare industry, on both the provider and insurance sides of business. Paul applies statistics and machine learning in multiple areas of industry, focusing on projects in process engineering, process improvement, metrics and business rules development, anomaly detection, forecasting, clustering and classification. Paul holds a Master of Science in Data Science from Southern Methodist University. 112 | 113 | **Stuart Miller** is a Machine Learning Engineer with degrees in Data Science, Electrical Engineering, and Engineering Physics. Stuart has worked at several Fortune 500 companies, including Texas Instruments and State Farm, where he built software that utilized statistical and machine learning techniques. Stuart is currently an engineer at Toyota Connected helping to build a more modern cockpit experience for drivers using machine learning. 114 | ## Setup Instructions 115 | 116 | To install the packages required for this book, 117 | enter the following commands in a terminal. 118 | 119 | ```bash 120 | # clone repo 121 | git clone git@github.com:PacktPublishing/Implementing-Statistical-Modeling-with-Python.git 122 | 123 | # navigate to the folder 124 | cd Implementing-Statistical-Modeling-with-Python 125 | 126 | # setup python environment 127 | # (this installs the packages used in the book) 128 | python3 -m venv venv 129 | source venv/bin/activate 130 | pip install -r requirements.txt 131 | ``` 132 | 133 | *Note:* Installation instructions for Anaconda are provided in Chapter 1. 
134 | -------------------------------------------------------------------------------- /Versions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "07c418a4-15aa-47f9-879f-6f16f0541466", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import scipy as sp\n", 12 | "import pandas as pd\n", 13 | "import statsmodels as sm\n", 14 | "import matplotlib\n", 15 | "import sklearn" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 2, 21 | "id": "fb043b27-88b6-4cda-97c6-5e99d6304393", 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "numpy version: 1.23.0\n", 29 | "scipy version: 1.8.1\n", 30 | "pandas version: 1.4.3\n", 31 | "statsmodels version: 0.13.2\n", 32 | "matplotlib version: 3.5.2\n", 33 | "sklearn version: 1.1.1\n" 34 | ] 35 | } 36 | ], 37 | "source": [ 38 | "print(f\"numpy version: {np.__version__}\")\n", 39 | "print(f\"scipy version: {sp.__version__}\")\n", 40 | "print(f\"pandas version: {pd.__version__}\")\n", 41 | "print(f\"statsmodels version: {sm.__version__}\")\n", 42 | "print(f\"matplotlib version: {matplotlib.__version__}\")\n", 43 | "print(f\"sklearn version: {sklearn.__version__}\")" 44 | ] 45 | } 46 | ], 47 | "metadata": { 48 | "kernelspec": { 49 | "display_name": "Python 3 (ipykernel)", 50 | "language": "python", 51 | "name": "python3" 52 | }, 53 | "language_info": { 54 | "codemirror_mode": { 55 | "name": "ipython", 56 | "version": 3 57 | }, 58 | "file_extension": ".py", 59 | "mimetype": "text/x-python", 60 | "name": "python", 61 | "nbconvert_exporter": "python", 62 | "pygments_lexer": "ipython3", 63 | "version": "3.8.0" 64 | } 65 | }, 66 | "nbformat": 4, 67 | "nbformat_minor": 5 68 | } 69 | -------------------------------------------------------------------------------- 
/chapter_1/1_Probability_Sampling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "fbdd15f6-13a4-4a10-85d4-91f1eb43ff97", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np " 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "id": "e13e95dd-8962-4726-9a28-e62edec570d3", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# setup generator for reproducibility \n", 21 | "random_generator = np.random.default_rng(2020) " 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 3, 27 | "id": "449621a5-a5ab-4be8-805d-c3aa853cd531", 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "name": "stdout", 32 | "output_type": "stream", 33 | "text": [ 34 | "[1 8 5]\n" 35 | ] 36 | } 37 | ], 38 | "source": [ 39 | "population = np.arange(1, 10 + 1) \n", 40 | "sample = random_generator.choice( \n", 41 | " population, # sample from population \n", 42 | " size=3, # number of samples to take \n", 43 | " replace=False # only allow to sample individuals once \n", 44 | ") \n", 45 | "print(sample) \n", 46 | "# array([1, 8, 5]) " 47 | ] 48 | } 49 | ], 50 | "metadata": { 51 | "kernelspec": { 52 | "display_name": "Python 3 (ipykernel)", 53 | "language": "python", 54 | "name": "python3" 55 | }, 56 | "language_info": { 57 | "codemirror_mode": { 58 | "name": "ipython", 59 | "version": 3 60 | }, 61 | "file_extension": ".py", 62 | "mimetype": "text/x-python", 63 | "name": "python", 64 | "nbconvert_exporter": "python", 65 | "pygments_lexer": "ipython3", 66 | "version": "3.9.7" 67 | } 68 | }, 69 | "nbformat": 4, 70 | "nbformat_minor": 5 71 | } 72 | -------------------------------------------------------------------------------- /chapter_1/2_Stratified_Sampling .ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | 
"cell_type": "code", 5 | "execution_count": 1, 6 | "id": "b6096342-9963-495e-a8df-95a396e4a59e", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np " 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "id": "5a4b7682-9534-4c70-8fe6-024a1b7946a8", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# setup generator for reproducibility \n", 21 | "random_generator = np.random.default_rng(2020)\n", 22 | " " 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 3, 28 | "id": "ff34a34d-70a9-4868-b4e5-7b42717ae2f8", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | " \n", 33 | "population = [ \n", 34 | " 1, \"A\", 3, 4, \n", 35 | " 5, 2, \"D\", 8, \n", 36 | " \"C\", 7, 6, \"B\" \n", 37 | "] \n", 38 | "# group strata \n", 39 | "strata = { \n", 40 | " 'number' : [], \n", 41 | " 'string' : [], \n", 42 | "} \n", 43 | " " 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 4, 49 | "id": "27f6d1c6-dcc9-49c8-bbc4-9f964470710b", 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | "{'number': array([4, 1, 2, 8]), 'string': array(['D', 'C'], dtype='" 36 | ] 37 | }, 38 | "metadata": { 39 | "needs_background": "light" 40 | }, 41 | "output_type": "display_data" 42 | } 43 | ], 44 | "source": [ 45 | "label = ['model A', 'model B']\n", 46 | "counts = [3, 5]\n", 47 | "\n", 48 | "edu_label = ['BS', 'MS', 'PhD']\n", 49 | "edu_counts = [10, 5, 2]\n", 50 | "\n", 51 | "fig, ax = plt.subplots(1,2, figsize=(12, 5))\n", 52 | "\n", 53 | "ax[0].bar(label, counts)\n", 54 | "ax[0].set_title('Counts of Machine Models')\n", 55 | "ax[0].set_ylabel('Count')\n", 56 | "ax[0].set_xlabel('Machine Model')\n", 57 | "\n", 58 | "ax[1].bar(edu_label, edu_counts)\n", 59 | "ax[1].set_title('Counts of Education Level')\n", 60 | "ax[1].set_ylabel('Count')\n", 61 | "ax[1].set_xlabel('Education Level')\n", 62 | "\n", 63 | 
"fig.savefig('figure2.2.png', dpi=300);" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "id": "d65b37de-a8c6-4bee-b59b-c60740289677", 69 | "metadata": {}, 70 | "source": [ 71 | "Wait time example histogram." 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 5, 77 | "id": "af15610e-1465-4821-85de-7d0c43a4402e", 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "image/png": "", 83 | "text/plain": [ 84 | "
" 85 | ] 86 | }, 87 | "metadata": { 88 | "needs_background": "light" 89 | }, 90 | "output_type": "display_data" 91 | } 92 | ], 93 | "source": [ 94 | "a = 4\n", 95 | "x = skewnorm.rvs(a, size=3000) + 0.5\n", 96 | "x = x[x > 0]\n", 97 | "\n", 98 | "# dfw highs for april and may 2022\n", 99 | "# retrieved from https://www.iweathernet.com/texas-dfw-weather-records\n", 100 | "dfw_highs = [\n", 101 | " 85, 87, 75, 88, 80, 86, 90, 94 , 93, 92, 90, 92, 94,\n", 102 | " 93, 97, 90, 95, 96, 96, 95, 92, 70, 79, 73, 88, 92,\n", 103 | " 94, 93, 95, 76, 78, 86, 81, 95, 77, 71, 69, 88, 86,\n", 104 | " 89, 84, 82, 77, 84, 81, 79, 75, 75, 91, 86, 86, 84,\n", 105 | " 82, 68, 75, 78, 82, 83, 85\n", 106 | "]\n", 107 | "\n", 108 | "fig, ax = plt.subplots(1,2, figsize=(12, 5))\n", 109 | "\n", 110 | "ax[0].hist(x, bins=30)\n", 111 | "ax[0].set_xlabel('Wait Time (hr)')\n", 112 | "ax[0].set_ylabel('Frequency')\n", 113 | "ax[0].set_title('Wait Times');\n", 114 | "\n", 115 | "ax[1].hist(dfw_highs, bins=7)\n", 116 | "ax[1].set_title('High Temperatures for DFW (4/2022-5/2022)')\n", 117 | "ax[1].set_ylabel('Frequency')\n", 118 | "ax[1].set_xlabel('Temperature (F)');\n", 119 | "\n", 120 | "fig.savefig('figure2.3.png', dpi=300);" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "id": "8d4b06ff", 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [] 130 | } 131 | ], 132 | "metadata": { 133 | "kernelspec": { 134 | "display_name": "Python 3 (ipykernel)", 135 | "language": "python", 136 | "name": "python3" 137 | }, 138 | "language_info": { 139 | "codemirror_mode": { 140 | "name": "ipython", 141 | "version": 3 142 | }, 143 | "file_extension": ".py", 144 | "mimetype": "text/x-python", 145 | "name": "python", 146 | "nbconvert_exporter": "python", 147 | "pygments_lexer": "ipython3", 148 | "version": "3.8.0" 149 | }, 150 | "vscode": { 151 | "interpreter": { 152 | "hash": "c98c9082bd23a1abe952c03d7cf1f940136afe77006f0c5f53f05fb13ab2661a" 153 | } 154 | } 155 | 
}, 156 | "nbformat": 4, 157 | "nbformat_minor": 5 158 | } 159 | -------------------------------------------------------------------------------- /chapter_2/4_1_Permutations_and_Combinations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "f831728b-1ef5-46e9-b0e9-8f985ecc21d3", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Permutations\n", 11 | "\n", 12 | "from itertools import permutations " 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "id": "7756e167-5876-40c1-a48a-070bb5793acc", 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "There are 720 ways to distribute the prizes!\n" 26 | ] 27 | } 28 | ], 29 | "source": [ 30 | "# list of 10 people in the party \n", 31 | "people = ['P1','P2','P3','P4','P5','P6','P7','P8','P9','P10'] \n", 32 | "# all the ways that the 3 prizes are distributed \n", 33 | "perm = permutations(people, 3) \n", 34 | "list_perm = list(perm) \n", 35 | "print(f\"There are {len(list_perm)} ways to distribute the prizes!\")" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "id": "a528fa69-6239-4ea2-bdda-8c04a168e8e1", 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "name": "stdout", 46 | "output_type": "stream", 47 | "text": [ 48 | "There are 3628800 ways to distribute the prizes!\n" 49 | ] 50 | } 51 | ], 52 | "source": [ 53 | "# list of 10 people in the party \n", 54 | "people = ['P1','P2','P3','P4','P5','P6','P7','P8','P9','P10'] \n", 55 | "# all the ways that the 10 different gifts are distributed \n", 56 | "perm = permutations(people) \n", 57 | "list_perm = list(perm) \n", 58 | "print(f\"There are {len(list_perm)} ways to distribute the prizes!\")" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "id": 
"4ec9ff2d-1abb-4ec5-8509-ecdbe5fba469", 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 4, 72 | "id": "efbbcfe3-fcd0-4ce8-8464-6f77538db7ae", 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "# Combinations\n", 77 | "from itertools import combinations" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 5, 83 | "id": "d92604c5-4005-4e3c-9353-cdc73d42157f", 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "There are 120 ways to distribute the prizes!\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "# list of 10 people in the party \n", 96 | "people = ['P1','P2','P3','P4','P5','P6','P7','P8','P9','P10'] \n", 97 | "# all the ways that the 3 prizes are distributed \n", 98 | "comb = combinations(people, 3) \n", 99 | "list_comb = list(comb) \n", 100 | "print(f\"There are {len(list_comb)} ways to distribute the prizes!\") " 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "id": "362da4e7-e3ec-47f2-9458-6663ea0a66f3", 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [] 110 | } 111 | ], 112 | "metadata": { 113 | "kernelspec": { 114 | "display_name": "Python 3 (ipykernel)", 115 | "language": "python", 116 | "name": "python3" 117 | }, 118 | "language_info": { 119 | "codemirror_mode": { 120 | "name": "ipython", 121 | "version": 3 122 | }, 123 | "file_extension": ".py", 124 | "mimetype": "text/x-python", 125 | "name": "python", 126 | "nbconvert_exporter": "python", 127 | "pygments_lexer": "ipython3", 128 | "version": "3.9.7" 129 | } 130 | }, 131 | "nbformat": 4, 132 | "nbformat_minor": 5 133 | } 134 | -------------------------------------------------------------------------------- /chapter_2/4_2_Permutation_Testing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 
5 | "execution_count": 1, 6 | "id": "e17c8e22-7fc5-437b-9e3f-ae0e643a7089", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "id": "c1c628e4-f2fc-466f-9a25-c8b63fabd0cd", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# create permutation testing function \n", 21 | "def permutation_testing(A,B,n_iter=1000):\n", 22 | "#A, B are 2 lists of samples to test the hypothesis,  \n", 23 | "#n_iter is number of iterations with the default is 1000  \n", 24 | " differences = []\n", 25 | " P = np.array(A+B)\n", 26 | " original_mean = np.array(A).mean()-np.array(B).mean()\n", 27 | " for i in range(n_iter):\n", 28 | " np.random.shuffle(P) # create a random permutation of P\n", 29 | " A_new = P[:len(A)] # having the same size of A \n", 30 | " B_new = P[-len(B):] # having the same size of B \n", 31 | " differences.append(A_new.mean()-B_new.mean()) \n", 32 | " #Calculate p_value \n", 33 | " p_value = round(1-(float(len(np.where(differences<=original_mean)[0]))/float(n_iter)),2) \n", 34 | " return p_value " 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 3, 40 | "id": "e96b6297-e200-435f-8e6c-ff44bd431445", 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/plain": [ 46 | "0.98" 47 | ] 48 | }, 49 | "execution_count": 3, 50 | "metadata": {}, 51 | "output_type": "execute_result" 52 | } 53 | ], 54 | "source": [ 55 | "A = [3,5,4]\n", 56 | "B = [43,41,56,78,54]\n", 57 | "\n", 58 | "permutation_testing(A,B,n_iter=10000)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "id": "e530312a-2b4f-498e-b386-535e9768842d", 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [] 68 | } 69 | ], 70 | "metadata": { 71 | "kernelspec": { 72 | "display_name": "Python 3 (ipykernel)", 73 | "language": "python", 74 | "name": "python3" 75 | }, 76 | "language_info": { 77 | "codemirror_mode": { 78 | 
"name": "ipython", 79 | "version": 3 80 | }, 81 | "file_extension": ".py", 82 | "mimetype": "text/x-python", 83 | "name": "python", 84 | "nbconvert_exporter": "python", 85 | "pygments_lexer": "ipython3", 86 | "version": "3.9.7" 87 | } 88 | }, 89 | "nbformat": 4, 90 | "nbformat_minor": 5 91 | } 92 | -------------------------------------------------------------------------------- /chapter_2/5_Transformation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "3c9672c4-507e-4b42-99d3-3804e2b44906", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import matplotlib.pyplot as plt" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "id": "6549417b-8adc-4860-ab39-9aa7a4a91b0e", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "np.random.seed(42) # for reproducible purpose \n", 22 | "\n", 23 | "# create a random data  \n", 24 | "df = np.random.beta(a=1,b=10,size=10000)\n", 25 | "df_log = np.log(df) #log transformation\n", 26 | "df_sqrt = np.sqrt(df) # Square Root transformation \n", 27 | "df_cbrt = np.cbrt(df) # Cube Root transformation " 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 3, 33 | "id": "cdd5ec94-7342-4690-b050-11b9385a3544", 34 | "metadata": {}, 35 | "outputs": [ 36 | { 37 | "data": { 38 | "image/png": "\n", 39 | "text/plain": [ 40 | "
" 41 | ] 42 | }, 43 | "metadata": { 44 | "needs_background": "light" 45 | }, 46 | "output_type": "display_data" 47 | } 48 | ], 49 | "source": [ 50 | "#Plots \n", 51 | "\n", 52 | "plt.figure(figsize=(10,10))\n", 53 | "plt.subplot(2,2,1)\n", 54 | "plt.hist(df)\n", 55 | "plt.title(\"Original Data\")\n", 56 | "plt.subplot(2,2,2) \n", 57 | "plt.hist(df_log) \n", 58 | "plt.title(\"Log Transformation\") \n", 59 | "plt.subplot(2,2,3) \n", 60 | "plt.hist(df_sqrt) \n", 61 | "plt.title(\"Square Root Transformation\") \n", 62 | "plt.subplot(2,2,4) \n", 63 | "plt.hist(df_cbrt) \n", 64 | "plt.title(\"Cube Root Transformation\") \n", 65 | "plt.show()" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "id": "acd71a93-36e5-4efb-b86e-877b9101f7d6", 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [] 75 | } 76 | ], 77 | "metadata": { 78 | "kernelspec": { 79 | "display_name": "Python 3 (ipykernel)", 80 | "language": "python", 81 | "name": "python3" 82 | }, 83 | "language_info": { 84 | "codemirror_mode": { 85 | "name": "ipython", 86 | "version": 3 87 | }, 88 | "file_extension": ".py", 89 | "mimetype": "text/x-python", 90 | "name": "python", 91 | "nbconvert_exporter": "python", 92 | "pygments_lexer": "ipython3", 93 | "version": "3.9.7" 94 | } 95 | }, 96 | "nbformat": 4, 97 | "nbformat_minor": 5 98 | } 99 | -------------------------------------------------------------------------------- /chapter_3/3_1_Zscore_and _Zstatistics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "aa4f724c-1bcb-46f8-b555-f63e09128118", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd \n", 11 | "import numpy as np \n", 12 | "import scipy.stats as stats \n", 13 | "import math " 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "id": "fb4c9ca9-1110-4b2e-8292-fb92f213bbe7", 20 | "metadata": 
{}, 21 | "outputs": [ 22 | { 23 | "data": { 24 | "text/html": [ 25 | "
\n", 26 | "\n", 39 | "\n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | "
IQ scorez-score
090-0.860663
178-1.721326
21100.573775
31100.573775
499-0.215166
51150.932385
61302.008214
7100-0.143444
895-0.502053
993-0.645497
\n", 100 | "
" 101 | ], 102 | "text/plain": [ 103 | " IQ score z-score\n", 104 | "0 90 -0.860663\n", 105 | "1 78 -1.721326\n", 106 | "2 110 0.573775\n", 107 | "3 110 0.573775\n", 108 | "4 99 -0.215166\n", 109 | "5 115 0.932385\n", 110 | "6 130 2.008214\n", 111 | "7 100 -0.143444\n", 112 | "8 95 -0.502053\n", 113 | "9 93 -0.645497" 114 | ] 115 | }, 116 | "execution_count": 2, 117 | "metadata": {}, 118 | "output_type": "execute_result" 119 | } 120 | ], 121 | "source": [ 122 | "IQ = np.array([90, 78,110, 110, 99, 115,130, 100, 95, 93]) \n", 123 | "z_score = stats.zscore(IQ) \n", 124 | " \n", 125 | "# Create dataframe \n", 126 | " \n", 127 | "data_zscore = { \n", 128 | " \"IQ score\": IQ, \n", 129 | " \"z-score\": z_score \n", 130 | "} \n", 131 | " \n", 132 | "IQ_zscore = pd.DataFrame(data_zscore) \n", 133 | "IQ_zscore " 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 3, 139 | "id": "4c556684-f3a1-49d0-bb71-b405193cb49d", 140 | "metadata": {}, 141 | "outputs": [ 142 | { 143 | "name": "stdout", 144 | "output_type": "stream", 145 | "text": [ 146 | "The probability that the taken score between 95 and 104 is 29.02%!\n" 147 | ] 148 | } 149 | ], 150 | "source": [ 151 | "#calculate z scores at x=95 and 104 \n", 152 | "zscore_95 = round((95-98)/12,2) \n", 153 | "zscore_104 = round((104-98)/12,2) \n", 154 | " \n", 155 | "#calculate cdf and probability \n", 156 | "cdf_95 = stats.norm.cdf(zscore_95) \n", 157 | "cdf_104 = stats.norm.cdf(zscore_104) \n", 158 | "prob = abs(cdf_95-cdf_104) \n", 159 | " \n", 160 | "#print the probability \n", 161 | "print(f\"The probability that the taken score between 95 and 104 is {round(prob*100,2)}%!\") " 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 4, 167 | "id": "032b6685-ef70-4bbf-9d70-a42660e3c3c6", 168 | "metadata": {}, 169 | "outputs": [ 170 | { 171 | "name": "stdout", 172 | "output_type": "stream", 173 | "text": [ 174 | "The probability that the taken score between 95 and 104 is 53.28%!\n" 175 
| ] 176 | } 177 | ], 178 | "source": [ 179 | " # standard error \n", 180 | "n= 4 \n", 181 | "sigma = 12 \n", 182 | "se = sigma/math.sqrt(n) \n", 183 | " \n", 184 | "#calculate z scores at x=95 and 104 \n", 185 | "zscore_95 = round((95-98)/se,2) \n", 186 | "zscore_104 = round((104-98)/se,2) \n", 187 | " \n", 188 | "#calculate cdf and probability \n", 189 | "cdf_95 = stats.norm.cdf(zscore_95) \n", 190 | "cdf_104 = stats.norm.cdf(zscore_104) \n", 191 | "prob = abs(cdf_95-cdf_104) \n", 192 | " \n", 193 | "#print the probability \n", 194 | "print(f\"The probability that the taken score between 95 and 104 is {round(prob*100,2)}%!\") " 195 | ] 196 | } 197 | ], 198 | "metadata": { 199 | "kernelspec": { 200 | "display_name": "Python 3 (ipykernel)", 201 | "language": "python", 202 | "name": "python3" 203 | }, 204 | "language_info": { 205 | "codemirror_mode": { 206 | "name": "ipython", 207 | "version": 3 208 | }, 209 | "file_extension": ".py", 210 | "mimetype": "text/x-python", 211 | "name": "python", 212 | "nbconvert_exporter": "python", 213 | "pygments_lexer": "ipython3", 214 | "version": "3.9.7" 215 | } 216 | }, 217 | "nbformat": 4, 218 | "nbformat_minor": 5 219 | } 220 | -------------------------------------------------------------------------------- /chapter_3/3_2_Ztest_for_Means.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "ae1383f5-2299-49bc-ae1f-f17393b852bc", 6 | "metadata": {}, 7 | "source": [ 8 | "## One-sample Z-test" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "id": "5e1e577e-a737-4196-859d-577273d5357e", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import scipy.stats \n", 19 | "import numpy as np\n", 20 | "import math\n", 21 | "from statsmodels.stats.weightstats import ztest as ztest " 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "id": 
"0c48d248-4ce4-4820-8be6-92b7ee90b1ad", 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "data": { 32 | "text/plain": [ 33 | "0.0038" 34 | ] 35 | }, 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "output_type": "execute_result" 39 | } 40 | ], 41 | "source": [ 42 | "#find p-value associated with a z-score of -2.67 in a left-tailed test. \n", 43 | "round(scipy.stats.norm.sf(abs(-2.67)),4)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 3, 49 | "id": "2643b01a-7689-47ae-87aa-2e9afada8bf7", 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "data": { 54 | "text/plain": [ 55 | "0.007585124695370977" 56 | ] 57 | }, 58 | "execution_count": 3, 59 | "metadata": {}, 60 | "output_type": "execute_result" 61 | } 62 | ], 63 | "source": [ 64 | "#find p-value for two-tailed test \n", 65 | "scipy.stats.norm.sf(abs(2.67))*2 " 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 4, 71 | "id": "2c2119f8-27f3-476c-acab-cbb8fb4001e9", 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "name": "stdout", 76 | "output_type": "stream", 77 | "text": [ 78 | " The critical value is -1.6448536269514729\n", 79 | " The critical value is 1.6448536269514722\n", 80 | " The critical values are -1.959963984540054 and 1.959963984540054\n" 81 | ] 82 | } 83 | ], 84 | "source": [ 85 | "# To compute the critical value \n", 86 | "#The following is the implementation of the code in Python for left-tailed, right-tailed and two-tailed tests. 
\n", 87 | "\n", 88 | "alpha = 0.05 # level of significance \n", 89 | " \n", 90 | "#find Z critical value for left-tailed test \n", 91 | "print(f\" The critical value is {scipy.stats.norm.ppf(alpha)}\") \n", 92 | " \n", 93 | "#find Z critical value for right-tailed test \n", 94 | "print(f\" The critical value is {scipy.stats.norm.ppf(1-alpha)}\") \n", 95 | " \n", 96 | "##find Z critical value for two-tailed test \n", 97 | "print(f\" The critical values are {-scipy.stats.norm.ppf(1-alpha/2)} and {scipy.stats.norm.ppf(1-alpha/2)}\") " 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 5, 103 | "id": "467617c0-0a3e-4174-8e03-79903e78ff3a", 104 | "metadata": {}, 105 | "outputs": [ 106 | { 107 | "data": { 108 | "text/plain": [ 109 | "104.16666666666667" 110 | ] 111 | }, 112 | "execution_count": 5, 113 | "metadata": {}, 114 | "output_type": "execute_result" 115 | } 116 | ], 117 | "source": [ 118 | "# Example of IQ test scores again in the high school in Dallas \n", 119 | "IQscores = [95,110, 105, 120, 125, 110, 98, 90, 99, 100, \n", 120 | " 110, 112, 106, 92, 108, 97, 95, 99, 100, 100, \n", 121 | " 103, 125, 122, 110, 112, 102, 92, 97, 89, 102] \n", 122 | " \n", 123 | "IQmean = np.array(IQscores).mean() \n", 124 | "IQmean" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 6, 130 | "id": "de560509-8959-4264-b348-d2e0fecdbbaa", 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "# with level of significance 0.05\n", 135 | "\n", 136 | "n=30 #number of students \n", 137 | "sigma =12 #population standard deviation \n", 138 | "IQmean = 104.17 # IQ mean of 30 students after the training \n", 139 | "mu = 98 # population mean \n", 140 | " \n", 141 | "z = (IQmean-mu)/(sigma/math.sqrt(n)) #z = 2.81620681650573\n", 142 | "\n", 143 | "# Since the test statistic value 2.8162 > 1.64485, we reject the null hypothesis. 
\n", 144 | "# This means that the training does affect the IQ levels of these students and helps them improve their IQ scores. " 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 7, 150 | "id": "6ffbb49e-5bcc-4fa4-bb83-594aa7765555", 151 | "metadata": {}, 152 | "outputs": [ 153 | { 154 | "name": "stdout", 155 | "output_type": "stream", 156 | "text": [ 157 | "The test statistic is 3.397499328379722 and the corresponding p-value is 0.00034002377451735716.\n" 158 | ] 159 | } 160 | ], 161 | "source": [ 162 | "# Example of IQ test scores again in the high school in Dallas\n", 163 | "\n", 164 | " \n", 165 | "#IQ scores after training sections \n", 166 | "IQscores = [95,110, 105, 120, 125, 110, 98, 90, 99, 100, \n", 167 | " 110, 112, 106, 92, 108, 97, 95, 99, 100, 100, \n", 168 | " 103, 125, 122, 110, 112, 102, 92, 97, 89, 102] \n", 169 | " \n", 170 | "#perform one sample z-test \n", 171 | "z_statistic, p_value = ztest(IQscores, value=98, alternative = 'larger') \n", 172 | " \n", 173 | "print(f\"The test statistic is {z_statistic} and the corresponding p-value is {p_value}.\") " 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "id": "ddaa8d82-63d7-4131-9478-d84417f6f704", 179 | "metadata": {}, 180 | "source": [ 181 | "## Two-sample Z-test" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 8, 187 | "id": "b1943883-f085-4f99-a3fb-2ceb5ca5b6e7", 188 | "metadata": {}, 189 | "outputs": [ 190 | { 191 | "name": "stdout", 192 | "output_type": "stream", 193 | "text": [ 194 | "The test statistic is 1.7572688143960962 and the corresponding p-value is 0.07887200072419194.\n" 195 | ] 196 | } 197 | ], 198 | "source": [ 199 | "#IQ score \n", 200 | "A= [95,110, 105, 120, 125, 110, 98, 90, 99, 100, \n", 201 | " 110, 112, 106, 92, 108, 97, 95, 99, 100, 100, \n", 202 | " 103, 125, 122, 110, 112, 102, 92, 97, 89, 102] # school A \n", 203 | "B = [98, 90, 100, 93, 91, 79, 90, 100, 121, 89, \n", 204 | " 101, 98, 75, 90, 95, 99, 
100, 120, 121, 95, \n", 205 | " 96, 89, 115, 99, 95, 121, 122, 98, 97, 97] # school B \n", 206 | " \n", 207 | "#perform two- sample z-test \n", 208 | "z_statistic, p_value = ztest(A, B, value=0, alternative = 'two-sided') \n", 209 | " \n", 210 | "print(f\"The test statistic is {z_statistic} and the corresponding p-value is {p_value}.\") " 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "id": "5b276a6e-8d5c-4378-8a0a-3deee6f01662", 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [] 220 | } 221 | ], 222 | "metadata": { 223 | "kernelspec": { 224 | "display_name": "Python 3 (ipykernel)", 225 | "language": "python", 226 | "name": "python3" 227 | }, 228 | "language_info": { 229 | "codemirror_mode": { 230 | "name": "ipython", 231 | "version": 3 232 | }, 233 | "file_extension": ".py", 234 | "mimetype": "text/x-python", 235 | "name": "python", 236 | "nbconvert_exporter": "python", 237 | "pygments_lexer": "ipython3", 238 | "version": "3.9.7" 239 | } 240 | }, 241 | "nbformat": 4, 242 | "nbformat_minor": 5 243 | } 244 | -------------------------------------------------------------------------------- /chapter_3/3_3_Ztest_for_Proportions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "37dabfbf-b6cd-408b-b1ec-e901f4e64d07", 6 | "metadata": {}, 7 | "source": [ 8 | "## One-proportion Z-test" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "id": "dccd35bc-d8f0-43c0-b615-1dd8f1edcda7", 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "The test statistic is -2.236067977499786 and the corresponding p-value is 0.0253473186774685.\n" 22 | ] 23 | } 24 | ], 25 | "source": [ 26 | "#import proportions_ztest function \n", 27 | "from statsmodels.stats.proportion import proportions_ztest \n", 28 | " \n", 29 | "count = 0.8*500 \n", 30 | 
"nobs = 500 \n", 31 | "value = 0.84 \n", 32 | " \n", 33 | "#perform one proportion two-tailed z-test \n", 34 | "z_statistic, p_value = proportions_ztest(count, nobs, value, alternative = 'two-sided') \n", 35 | " \n", 36 | "print(f\"The test statistic is {z_statistic} and the corresponding p-value is {p_value}.\") " 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "id": "6171c637-ef73-440e-b56d-e112d239db24", 42 | "metadata": {}, 43 | "source": [ 44 | "## Two-proportion Z-test" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 2, 50 | "id": "a00f60dd-a51e-4b9b-9289-7578e1b0d520", 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "name": "stdout", 55 | "output_type": "stream", 56 | "text": [ 57 | "The test statistic is 1.6329931618554536 and the p-value for two tailed test is 0.1024704348597491.\n" 58 | ] 59 | } 60 | ], 61 | "source": [ 62 | "import math \n", 63 | "import scipy \n", 64 | "p_1bar = 0.8 \n", 65 | "p_2bar = 0.7 \n", 66 | "n1 = 100.0 \n", 67 | "n2 = 100.0 \n", 68 | " \n", 69 | "p= (p_1bar*n1 + p_2bar*n2)/(n1+n2) # the total pooled proportion \n", 70 | " \n", 71 | "z = (p_1bar-p_2bar)/math.sqrt(p*(1-p)*(1/n1+1/n2)) \n", 72 | "pval = scipy.stats.norm.sf(abs(z))*2 \n", 73 | " \n", 74 | "print(f\"The test statistic is {z} and the p-value for two tailed test is {pval}.\") " 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "id": "b890aadc-d4b3-4350-88bc-1d08e8be32c4", 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [] 84 | } 85 | ], 86 | "metadata": { 87 | "kernelspec": { 88 | "display_name": "Python 3 (ipykernel)", 89 | "language": "python", 90 | "name": "python3" 91 | }, 92 | "language_info": { 93 | "codemirror_mode": { 94 | "name": "ipython", 95 | "version": 3 96 | }, 97 | "file_extension": ".py", 98 | "mimetype": "text/x-python", 99 | "name": "python", 100 | "nbconvert_exporter": "python", 101 | "pygments_lexer": "ipython3", 102 | "version": "3.9.7" 103 | } 104 | }, 105 | "nbformat": 
4, 106 | "nbformat_minor": 5 107 | } 108 | -------------------------------------------------------------------------------- /chapter_5/3_Signed-Rank_Test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "4fc0559a-59b5-42a6-a910-75dd3be64047", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import scipy.stats as stats \n", 11 | "import numpy as np \n" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "id": "3bded326-b845-4fce-bf70-17bbb4f10d2d", 18 | "metadata": {}, 19 | "outputs": [ 20 | { 21 | "data": { 22 | "text/plain": [ 23 | "WilcoxonResult(statistic=41.5, pvalue=0.013671875)" 24 | ] 25 | }, 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "before_treatment = np.array([37, 14, 22, 12, 24, 35, 35, 51,39]) \n", 33 | "after_treatment = np.array([38,17, 19, 7, 15, 25, 24, 38,19]) \n", 34 | " \n", 35 | "# Signed Rank Test \n", 36 | "stats.wilcoxon(before_treatment, after_treatment, alternative = 'greater') " 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "id": "75ebbeac-5da0-4731-ba91-5aaf51e4c0b7", 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [] 46 | } 47 | ], 48 | "metadata": { 49 | "kernelspec": { 50 | "display_name": "Python 3 (ipykernel)", 51 | "language": "python", 52 | "name": "python3" 53 | }, 54 | "language_info": { 55 | "codemirror_mode": { 56 | "name": "ipython", 57 | "version": 3 58 | }, 59 | "file_extension": ".py", 60 | "mimetype": "text/x-python", 61 | "name": "python", 62 | "nbconvert_exporter": "python", 63 | "pygments_lexer": "ipython3", 64 | "version": "3.9.7" 65 | } 66 | }, 67 | "nbformat": 4, 68 | "nbformat_minor": 5 69 | } 70 | -------------------------------------------------------------------------------- /chapter_5/4_Kruskal-Wallis_Test.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "d4eddd39-c814-4b43-bfdd-87559b1e649d", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from scipy import stats " 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "id": "707949a6-8144-4dd1-b4f6-f4848dc70e7c", 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/plain": [ 22 | "KruskalResult(statistic=5.7342701722574905, pvalue=0.056861597028239855)" 23 | ] 24 | }, 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "output_type": "execute_result" 28 | } 29 | ], 30 | "source": [ 31 | "group1 = [8, 13, 13, 15, 12, 10, 6, 15, 13, 9] \n", 32 | "group2 = [16, 17, 14, 14, 15, 12, 9, 12, 11, 9] \n", 33 | "group3 = [7, 8, 9, 9, 4, 15, 13, 9, 11, 9] \n", 34 | "#Kruskal-Wallis Test \n", 35 | "stats.kruskal(group1, group2, group3) " 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "id": "fd24c334-fd3d-4ea5-aa65-8961680ae39b", 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [] 45 | } 46 | ], 47 | "metadata": { 48 | "kernelspec": { 49 | "display_name": "Python 3 (ipykernel)", 50 | "language": "python", 51 | "name": "python3" 52 | }, 53 | "language_info": { 54 | "codemirror_mode": { 55 | "name": "ipython", 56 | "version": 3 57 | }, 58 | "file_extension": ".py", 59 | "mimetype": "text/x-python", 60 | "name": "python", 61 | "nbconvert_exporter": "python", 62 | "pygments_lexer": "ipython3", 63 | "version": "3.9.7" 64 | } 65 | }, 66 | "nbformat": 4, 67 | "nbformat_minor": 5 68 | } 69 | -------------------------------------------------------------------------------- /chapter_6/1_Ordinary_Least_Squares.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "id": "2382f06f-bab5-4a22-a303-826328f945cb", 
7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import matplotlib.pyplot as plt" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 3, 17 | "id": "07e759ca-10e3-4486-bc10-67624e60ec95", 18 | "metadata": {}, 19 | "outputs": [ 20 | { 21 | "data": { 22 | "image/png": "\n", 23 | "text/plain": [ 24 | "
" 25 | ] 26 | }, 27 | "metadata": { 28 | "needs_background": "light" 29 | }, 30 | "output_type": "display_data" 31 | } 32 | ], 33 | "source": [ 34 | "# Plot - The relationship between x and y\n", 35 | "\n", 36 | "error = np.random.normal(0,1,50)\n", 37 | "x = np.linspace(0,10,50)\n", 38 | "y = 2*x+3 +error\n", 39 | "\n", 40 | "# plot the results\n", 41 | "plt.figure(figsize = (10,8))\n", 42 | "plt.scatter(x,y,color='blue',label='Actual Values')\n", 43 | "plt.plot(x, 2*x + 3, color='red',label='Line of Best Fit')\n", 44 | "plt.xlabel('x')\n", 45 | "plt.ylabel('y')\n", 46 | "plt.title('The relationship between x and y')\n", 47 | "plt.legend()\n", 48 | "plt.tight_layout()\n", 49 | "plt.show()\n" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 4, 55 | "id": "3b9e347a-a464-423b-bff0-5c2631597bac", 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "# Function to find beta_0_hat and beta_1_hat\n", 60 | "\n", 61 | "def least_squares_method(x,y): \n", 62 | " x_mean=x.mean() \n", 63 | " y_mean=y.mean() \n", 64 | " beta1 = ((x-x_mean)*(y-y_mean)).sum(axis=0)/ ((x-x.mean())**2).sum(axis=0) \n", 65 | " beta0 = y_mean-(beta1*x_mean) \n", 66 | " return beta0, beta1 " 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "id": "fb898304-d686-4a78-8ece-dd5cebcd062e", 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [] 76 | } 77 | ], 78 | "metadata": { 79 | "kernelspec": { 80 | "display_name": "Python 3 (ipykernel)", 81 | "language": "python", 82 | "name": "python3" 83 | }, 84 | "language_info": { 85 | "codemirror_mode": { 86 | "name": "ipython", 87 | "version": 3 88 | }, 89 | "file_extension": ".py", 90 | "mimetype": "text/x-python", 91 | "name": "python", 92 | "nbconvert_exporter": "python", 93 | "pygments_lexer": "ipython3", 94 | "version": "3.9.7" 95 | } 96 | }, 97 | "nbformat": 4, 98 | "nbformat_minor": 5 99 | } 100 | -------------------------------------------------------------------------------- 
/chapter_7/3_Shrinkage.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "fb29a9cd-e1be-4e12-93c4-09cf274855ee", 6 | "metadata": {}, 7 | "source": [ 8 | "# Prep Data" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 5, 14 | "id": "f0c53df5-4deb-42b6-90e2-fe45ebad939e", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "from sklearn.metrics import mean_squared_error as MSE\n", 19 | "from sklearn.model_selection import train_test_split\n", 20 | "from sklearn.preprocessing import StandardScaler\n", 21 | "from sklearn.datasets import fetch_california_housing\n", 22 | "import statsmodels.api as sm\n", 23 | "import pandas as pd\n", 24 | "\n", 25 | "california_housing = fetch_california_housing()\n", 26 | "df_california = pd.DataFrame(california_housing.data, columns = california_housing.feature_names)\n", 27 | "df_california['PRICE'] = california_housing.target\n", 28 | "df_california = sm.add_constant(df_california, prepend=False)" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 6, 34 | "id": "c5cc3dba-1695-48fe-8fa8-1e12bac2a12b", 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "df_california.head().to_clipboard()" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 7, 44 | "id": "ee667fa7-81cf-4be4-8c0e-8c5b2bf18321", 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "X = df_california.drop('PRICE', axis=1)\n", 49 | "y = df_california['PRICE']" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 8, 55 | "id": "56b54779-ed92-47f6-b205-ac8476cf4d63", 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "sc = StandardScaler()\n", 60 | "X_scaled = sc.fit_transform(X)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 9, 66 | "id": "18ae04e9-cb2b-45a1-8dcb-00f24b022f30", 67 | "metadata": {}, 68 | "outputs": [], 69 
| "source": [ 70 | "X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.25, shuffle=True)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "id": "35991f94-52c3-4822-8617-518f13653096", 76 | "metadata": {}, 77 | "source": [ 78 | "# Ridge Regression" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 10, 84 | "id": "775860e6-ef88-406d-9d21-92d21a5f06e4", 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "ols_model = sm.OLS(y_train, X_train)\n", 89 | "compiled_model = ols_model.fit()\n", 90 | "compiled_model_ridge = ols_model.fit_regularized(method='elastic_net', L1_wt=0, alpha=0.1,refit=True)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 11, 96 | "id": "00095601-be50-4238-a586-f288405c5016", 97 | "metadata": {}, 98 | "outputs": [ 99 | { 100 | "name": "stdout", 101 | "output_type": "stream", 102 | "text": [ 103 | "OLS Error: 4.806871781767541\n", 104 | "Ridge Regression Error: 4.862847117128481\n" 105 | ] 106 | } 107 | ], 108 | "source": [ 109 | "print('OLS Error: ', MSE(y_train, compiled_model.predict(X_train)) )\n", 110 | "print('Ridge Regression Error: ', MSE(y_train, compiled_model_ridge.predict(X_train)))" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 12, 116 | "id": "33f7c116-739c-4c22-9201-12c0741694d4", 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "OLS Error: 4.799965081300563\n", 124 | "Ridge Regression Error: 4.802637071118009\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "print('OLS Error: ', MSE(y_test, compiled_model.predict(X_test)) )\n", 130 | "print('Ridge Regression Error: ', MSE(y_test, compiled_model_ridge.predict(X_test)))" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 14, 136 | "id": "9a10cd1d-3d79-419d-a240-cae500d60ace", 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "df_compare = 
pd.DataFrame({'Before Ridge Regression':compiled_model.params,\n", 141 | " 'After Ridge Regression':compiled_model_ridge.params})\n", 142 | "df_compare.index=list(X.columns)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 15, 148 | "id": "7409eb33-4c0a-4a62-9c34-2cbf11036a76", 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "data": { 153 | "text/html": [ 154 | "
\n", 155 | "\n", 168 | "\n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | "
MedIncHouseAgeAveRoomsAveBedrmsPopulationAveOccupLatitudeLongitudeconst
Before Ridge Regression0.8294970.118072-0.2611750.331459-0.010918-0.027147-0.993947-0.9506860.0
After Ridge Regression0.7366160.159327-0.0744730.0924430.004344-0.027914-0.438714-0.3884920.0
\n", 210 | "
" 211 | ], 212 | "text/plain": [ 213 | " MedInc HouseAge AveRooms AveBedrms Population \\\n", 214 | "Before Ridge Regression 0.829497 0.118072 -0.261175 0.331459 -0.010918 \n", 215 | "After Ridge Regression 0.736616 0.159327 -0.074473 0.092443 0.004344 \n", 216 | "\n", 217 | " AveOccup Latitude Longitude const \n", 218 | "Before Ridge Regression -0.027147 -0.993947 -0.950686 0.0 \n", 219 | "After Ridge Regression -0.027914 -0.438714 -0.388492 0.0 " 220 | ] 221 | }, 222 | "execution_count": 15, 223 | "metadata": {}, 224 | "output_type": "execute_result" 225 | } 226 | ], 227 | "source": [ 228 | "df_compare.T" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 500, 234 | "id": "a77527aa-22e9-416d-8ea7-764124ea62e4", 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "df_compare.T.to_clipboard()" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "id": "3e871245-d817-4047-90ad-0a7d0e36615f", 244 | "metadata": {}, 245 | "source": [ 246 | "# LASSO Regression" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 16, 252 | "id": "a2e5c9bc-ab39-4a0f-8edc-90aa96a78e17", 253 | "metadata": {}, 254 | "outputs": [ 255 | { 256 | "name": "stdout", 257 | "output_type": "stream", 258 | "text": [ 259 | "OLS Error: 4.806871781767541\n", 260 | "LASSO Regression Error: 4.93937023995957\n" 261 | ] 262 | } 263 | ], 264 | "source": [ 265 | "ols_model = sm.OLS(y_train, X_train)\n", 266 | "compiled_model = ols_model.fit()\n", 267 | "compiled_model_lasso = ols_model.fit_regularized(method='elastic_net', L1_wt=1, alpha=0.1,refit=True)\n", 268 | "\n", 269 | "print('OLS Error: ', MSE(y_train, compiled_model.predict(X_train)) )\n", 270 | "print('LASSO Regression Error: ', MSE(y_train, compiled_model_lasso.predict(X_train)))" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 17, 276 | "id": "6217d1b6-6d66-45e5-a051-b31a31e0a6af", 277 | "metadata": {}, 278 | "outputs": [ 279 | { 280 | "name": 
"stdout", 281 | "output_type": "stream", 282 | "text": [ 283 | "OLS Error: 4.799965081300563\n", 284 | "LASSO Regression Error: 4.870307535710372\n" 285 | ] 286 | } 287 | ], 288 | "source": [ 289 | "print('OLS Error: ', MSE(y_test, compiled_model.predict(X_test)) )\n", 290 | "print('LASSO Regression Error: ', MSE(y_test, compiled_model_lasso.predict(X_test)))" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 18, 296 | "id": "69e05a1c-53c7-491b-ba0f-a62e72810828", 297 | "metadata": {}, 298 | "outputs": [], 299 | "source": [ 300 | "df_compare = pd.DataFrame({'Before LASSO Regression':compiled_model.params,\n", 301 | " 'After LASSO Regression':compiled_model_ridge.params})\n", 302 | "df_compare.index=list(X.columns)" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 19, 308 | "id": "07d0e90d-ca22-4624-91b3-81ddcd027fd6", 309 | "metadata": {}, 310 | "outputs": [ 311 | { 312 | "data": { 313 | "text/html": [ 314 | "
\n", 315 | "\n", 328 | "\n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | "
MedIncHouseAgeAveRoomsAveBedrmsPopulationAveOccupLatitudeLongitudeconst
Before LASSO Regression0.8294970.118072-0.2611750.331459-0.010918-0.027147-0.993947-0.9506860.0
After LASSO Regression0.7366160.159327-0.0744730.0924430.004344-0.027914-0.438714-0.3884920.0
\n", 370 | "
" 371 | ], 372 | "text/plain": [ 373 | " MedInc HouseAge AveRooms AveBedrms Population \\\n", 374 | "Before LASSO Regression 0.829497 0.118072 -0.261175 0.331459 -0.010918 \n", 375 | "After LASSO Regression 0.736616 0.159327 -0.074473 0.092443 0.004344 \n", 376 | "\n", 377 | " AveOccup Latitude Longitude const \n", 378 | "Before LASSO Regression -0.027147 -0.993947 -0.950686 0.0 \n", 379 | "After LASSO Regression -0.027914 -0.438714 -0.388492 0.0 " 380 | ] 381 | }, 382 | "execution_count": 19, 383 | "metadata": {}, 384 | "output_type": "execute_result" 385 | } 386 | ], 387 | "source": [ 388 | "df_compare.T" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": 20, 394 | "id": "843d4b10-b8a2-4a1b-ad94-be4a61643b35", 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [ 398 | "df_compare.T.to_clipboard()" 399 | ] 400 | }, 401 | { 402 | "cell_type": "markdown", 403 | "id": "0630faa9-da96-4fbc-9d07-61a4f1cb4925", 404 | "metadata": {}, 405 | "source": [ 406 | "# Elastic Net Regression" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 21, 412 | "id": "1257fbd2-d5e0-445b-8012-088da3358879", 413 | "metadata": {}, 414 | "outputs": [], 415 | "source": [ 416 | "ols_model = sm.OLS(y_train, X_train)\n", 417 | "compiled_model = ols_model.fit()\n", 418 | "compiled_model_elastic = ols_model.fit_regularized(method='elastic_net', L1_wt=0.5, alpha=8,refit=True)" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": 22, 424 | "id": "20103009-464c-41c8-a889-9f0cc2a4a456", 425 | "metadata": {}, 426 | "outputs": [ 427 | { 428 | "name": "stdout", 429 | "output_type": "stream", 430 | "text": [ 431 | "OLS Error: 4.806871781767541\n", 432 | "Elastic Net Regression Error: 5.646749280108061\n" 433 | ] 434 | } 435 | ], 436 | "source": [ 437 | "print('OLS Error: ', MSE(y_train, compiled_model.predict(X_train)) )\n", 438 | "print('Elastic Net Regression Error: ', MSE(y_train, 
compiled_model_elastic.predict(X_train)))" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": 23, 444 | "id": "f3348a4c-53b6-4688-8c52-5a525e04a764", 445 | "metadata": {}, 446 | "outputs": [ 447 | { 448 | "name": "stdout", 449 | "output_type": "stream", 450 | "text": [ 451 | "OLS Error: 4.799965081300563\n", 452 | "Elastic Net Regression Error: 5.501684955624825\n" 453 | ] 454 | } 455 | ], 456 | "source": [ 457 | "print('OLS Error: ', MSE(y_test, compiled_model.predict(X_test)) )\n", 458 | "print('Elastic Net Regression Error: ', MSE(y_test, compiled_model_elastic.predict(X_test)))" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": 24, 464 | "id": "d43c9e38-9720-4105-90a0-a8194464f99d", 465 | "metadata": {}, 466 | "outputs": [], 467 | "source": [ 468 | "df_compare = pd.DataFrame({'Before Elastic Net Regression':compiled_model.params,\n", 469 | " 'After Elastic Net Regression':compiled_model_ridge.params})\n", 470 | "df_compare.index=list(X.columns)" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": 25, 476 | "id": "a4bc06c9-5c52-4f7b-b155-0a4a0b4089bf", 477 | "metadata": {}, 478 | "outputs": [ 479 | { 480 | "data": { 481 | "text/html": [ 482 | "
\n", 483 | "\n", 496 | "\n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | "
MedIncHouseAgeAveRoomsAveBedrmsPopulationAveOccupLatitudeLongitudeconst
Before Elastic Net Regression0.8294970.118072-0.2611750.331459-0.010918-0.027147-0.993947-0.9506860.0
After Elastic Net Regression0.7366160.159327-0.0744730.0924430.004344-0.027914-0.438714-0.3884920.0
\n", 538 | "
" 539 | ], 540 | "text/plain": [ 541 | " MedInc HouseAge AveRooms AveBedrms \\\n", 542 | "Before Elastic Net Regression 0.829497 0.118072 -0.261175 0.331459 \n", 543 | "After Elastic Net Regression 0.736616 0.159327 -0.074473 0.092443 \n", 544 | "\n", 545 | " Population AveOccup Latitude Longitude \\\n", 546 | "Before Elastic Net Regression -0.010918 -0.027147 -0.993947 -0.950686 \n", 547 | "After Elastic Net Regression 0.004344 -0.027914 -0.438714 -0.388492 \n", 548 | "\n", 549 | " const \n", 550 | "Before Elastic Net Regression 0.0 \n", 551 | "After Elastic Net Regression 0.0 " 552 | ] 553 | }, 554 | "execution_count": 25, 555 | "metadata": {}, 556 | "output_type": "execute_result" 557 | } 558 | ], 559 | "source": [ 560 | "df_compare.T" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": 26, 566 | "id": "50ad6baa-0bd0-4417-9399-d07926d6065f", 567 | "metadata": {}, 568 | "outputs": [], 569 | "source": [ 570 | "df_compare.T.to_clipboard()" 571 | ] 572 | } 573 | ], 574 | "metadata": { 575 | "kernelspec": { 576 | "display_name": "Python 3", 577 | "language": "python", 578 | "name": "python3" 579 | }, 580 | "language_info": { 581 | "codemirror_mode": { 582 | "name": "ipython", 583 | "version": 3 584 | }, 585 | "file_extension": ".py", 586 | "mimetype": "text/x-python", 587 | "name": "python", 588 | "nbconvert_exporter": "python", 589 | "pygments_lexer": "ipython3", 590 | "version": "3.8.3" 591 | } 592 | }, 593 | "nbformat": 4, 594 | "nbformat_minor": 5 595 | } 596 | -------------------------------------------------------------------------------- /chapter_7/Data/Hitters.csv: -------------------------------------------------------------------------------- 1 | AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague 2 | 293,66,1,30,29,14,1,293,66,1,30,29,14,A,E,446,33,20,NA,A 3 | 315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475,N 4 | 
479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480,A 5 | 496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500,N 6 | 321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N 7 | 594,169,4,74,51,35,11,4408,1133,19,501,336,194,A,W,282,421,25,750,A 8 | 185,37,1,23,8,21,2,214,42,1,30,9,24,N,E,76,127,7,70,A 9 | 298,73,0,24,24,7,3,509,108,0,41,37,12,A,W,121,283,9,100,A 10 | 323,81,6,26,32,8,2,341,86,6,32,34,8,N,W,143,290,19,75,N 11 | 401,92,17,49,66,65,13,5206,1332,253,784,890,866,A,E,0,0,0,1100,A 12 | 574,159,21,107,75,59,10,4631,1300,90,702,504,488,A,E,238,445,22,517.143,A 13 | 202,53,4,31,26,27,9,1876,467,15,192,186,161,N,W,304,45,11,512.5,N 14 | 418,113,13,48,61,47,4,1512,392,41,205,204,203,N,E,211,11,7,550,N 15 | 239,60,0,30,11,22,6,1941,510,4,309,103,207,A,E,121,151,6,700,A 16 | 196,43,7,29,27,30,13,3231,825,36,376,290,238,N,E,80,45,8,240,N 17 | 183,39,3,20,15,11,3,201,42,3,20,16,11,A,W,118,0,0,NA,A 18 | 568,158,20,89,75,73,15,8068,2273,177,1045,993,732,N,W,105,290,10,775,N 19 | 190,46,2,24,8,15,5,479,102,5,65,23,39,A,W,102,177,16,175,A 20 | 407,104,6,57,43,65,12,5233,1478,100,643,658,653,A,W,912,88,9,NA,A 21 | 127,32,8,16,22,14,8,727,180,24,67,82,56,N,W,202,22,2,135,N 22 | 413,92,16,72,48,65,1,413,92,16,72,48,65,N,E,280,9,5,100,N 23 | 426,109,3,55,43,62,1,426,109,3,55,43,62,A,W,361,22,2,115,N 24 | 22,10,1,4,2,1,6,84,26,2,9,9,3,A,W,812,84,11,NA,A 25 | 472,116,16,60,62,74,6,1924,489,67,242,251,240,N,W,518,55,3,600,N 26 | 629,168,18,73,102,40,18,8424,2464,164,1008,1072,402,A,E,1067,157,14,776.667,A 27 | 587,163,4,92,51,70,6,2695,747,17,442,198,317,A,E,434,9,3,765,A 28 | 324,73,4,32,18,22,7,1931,491,13,291,108,180,N,E,222,3,3,708.333,N 29 | 474,129,10,50,56,40,10,2331,604,61,246,327,166,N,W,732,83,13,750,N 30 | 550,152,6,92,37,81,5,2308,633,32,349,182,308,N,W,262,329,16,625,N 31 | 513,137,20,90,95,90,14,5201,1382,166,763,734,784,A,W,267,5,3,900,A 32 | 313,84,9,42,30,39,17,6890,1833,224,1033,864,1087,A,W,127,221,7,NA,A 33 | 
419,108,6,55,36,22,3,591,149,8,80,46,31,N,W,226,7,4,110,N 34 | 517,141,27,70,87,52,9,3571,994,215,545,652,337,N,W,1378,102,8,NA,N 35 | 583,168,17,83,80,56,5,1646,452,44,219,208,136,A,E,109,292,25,612.5,A 36 | 204,49,6,23,25,12,7,1309,308,27,126,132,66,A,W,419,46,5,300,A 37 | 379,106,10,38,60,30,14,6207,1906,146,859,803,571,N,W,72,170,24,850,N 38 | 161,36,0,19,10,17,4,1053,244,3,156,86,107,A,E,70,149,12,NA,A 39 | 268,60,5,24,25,15,2,350,78,5,34,29,18,N,W,442,59,6,90,N 40 | 346,98,5,31,53,30,16,5913,1615,235,784,901,560,A,E,0,0,0,NA,A 41 | 241,61,1,34,12,14,1,241,61,1,34,12,14,N,W,166,172,10,NA,N 42 | 181,41,1,15,21,33,2,232,50,4,20,29,45,A,E,326,29,5,67.5,A 43 | 216,54,0,21,18,15,18,7318,1926,46,796,627,483,N,W,103,84,5,NA,N 44 | 200,57,6,23,14,14,9,2516,684,46,371,230,195,N,W,69,1,1,NA,N 45 | 217,46,7,32,19,9,4,694,160,32,86,76,32,A,E,307,25,1,180,A 46 | 194,40,7,19,29,30,11,4183,1069,64,486,493,608,A,E,325,22,2,NA,A 47 | 254,68,2,28,26,22,6,999,236,21,108,117,118,A,E,359,30,4,305,A 48 | 416,132,7,57,49,33,3,932,273,24,113,121,80,N,W,73,177,18,215,N 49 | 205,57,8,34,32,9,5,756,192,32,117,107,51,A,E,58,4,4,247.5,A 50 | 542,140,12,46,75,41,16,7099,2130,235,987,1089,431,A,E,697,61,9,NA,A 51 | 526,146,13,71,70,84,6,2648,715,77,352,342,289,N,W,303,9,9,815,N 52 | 457,101,14,42,63,22,17,6521,1767,281,1003,977,619,A,W,389,39,4,875,A 53 | 214,53,2,30,29,23,2,226,59,2,32,32,27,N,E,109,7,3,70,N 54 | 19,7,0,1,2,1,4,41,13,1,3,4,4,A,E,0,0,0,NA,A 55 | 591,168,19,80,72,39,9,4478,1307,113,634,563,319,A,W,67,147,4,1200,A 56 | 403,101,12,45,53,39,12,5150,1429,166,747,666,526,A,E,316,6,5,675,A 57 | 405,102,18,49,85,20,6,950,231,29,99,138,64,N,W,161,10,3,415,N 58 | 244,58,9,28,25,35,4,1335,333,49,164,179,194,N,W,142,14,2,340,N 59 | 235,61,3,24,39,21,14,3926,1029,35,441,401,333,A,E,425,43,4,NA,A 60 | 313,78,6,32,41,12,12,3742,968,35,409,321,170,N,W,106,206,7,416.667,N 61 | 627,177,25,98,81,70,6,3210,927,133,529,472,313,A,E,240,482,13,1350,A 62 | 
416,113,24,58,69,16,1,416,113,24,58,69,16,A,E,203,70,10,90,A 63 | 155,44,6,21,23,15,16,6631,1634,98,698,661,777,N,E,53,88,3,275,N 64 | 236,56,0,27,15,11,4,1115,270,1,116,64,57,A,W,125,199,13,230,A 65 | 216,53,1,31,15,22,4,926,210,9,118,69,114,N,W,73,152,11,225,N 66 | 24,3,0,1,0,2,3,159,28,0,20,12,9,A,W,80,4,0,NA,A 67 | 585,139,31,93,94,62,17,7546,1982,315,1141,1179,727,A,E,0,0,0,950,A 68 | 191,37,4,12,17,14,4,773,163,16,61,74,52,N,E,391,38,8,NA,N 69 | 199,53,5,29,22,21,3,514,120,8,57,40,39,A,W,152,3,5,75,A 70 | 521,142,20,67,86,45,4,815,205,22,99,103,78,A,E,107,242,23,105,A 71 | 419,113,1,44,27,44,12,4484,1231,32,612,344,422,A,E,211,2,1,NA,A 72 | 311,81,3,42,30,26,17,8247,2198,100,950,909,690,N,W,153,223,10,320,N 73 | 138,31,8,18,21,38,3,244,53,12,33,32,55,N,E,244,21,4,NA,N 74 | 512,131,26,69,96,52,14,5347,1397,221,712,815,548,A,W,119,216,12,850,A 75 | 507,122,29,78,85,91,18,7761,1947,347,1175,1152,1380,A,E,808,108,2,535,A 76 | 529,137,26,86,97,97,15,6661,1785,291,1082,949,989,A,E,280,10,5,933.333,A 77 | 424,119,6,57,46,13,9,3651,1046,32,461,301,112,A,E,224,286,8,850,N 78 | 351,97,4,55,29,39,4,1258,353,16,196,110,117,N,W,226,7,3,210,A 79 | 195,55,5,24,33,30,8,1313,338,25,144,149,153,N,E,83,2,1,NA,N 80 | 388,103,15,59,47,39,6,2174,555,80,285,274,186,A,W,182,9,4,325,A 81 | 339,96,4,37,29,23,4,1064,290,11,123,108,55,A,W,104,213,9,275,A 82 | 561,118,35,70,94,33,16,6677,1575,442,901,1210,608,A,W,463,32,8,NA,A 83 | 255,70,7,49,35,43,15,6311,1661,154,1019,608,820,N,E,51,54,8,450,N 84 | 677,238,31,117,113,53,5,2223,737,93,349,401,171,A,E,1377,100,6,1975,A 85 | 227,46,7,23,20,12,5,1325,324,44,156,158,67,A,W,92,2,2,NA,A 86 | 614,163,29,89,83,75,11,5017,1388,266,813,822,617,N,W,303,6,6,1900,N 87 | 329,83,9,50,39,56,9,3828,948,145,575,528,635,A,W,276,6,2,600,A 88 | 637,174,31,89,116,56,14,6727,2024,247,978,1093,495,N,W,278,9,9,1041.667,N 89 | 280,82,16,44,45,47,2,428,113,25,61,70,63,A,E,148,4,2,110,A 90 | 155,41,12,21,29,22,16,5409,1338,181,746,805,875,A,W,165,9,1,260,A 91 | 
458,114,13,67,57,48,4,1350,298,28,160,123,122,A,W,246,389,18,475,A 92 | 314,83,13,39,46,16,5,1457,405,28,156,159,76,A,W,533,40,4,431.5,A 93 | 475,123,27,76,93,72,4,1810,471,108,292,343,267,N,E,226,10,6,1220,N 94 | 317,78,7,35,35,32,1,317,78,7,35,35,32,A,E,45,122,26,70,A 95 | 511,138,25,76,96,61,3,592,164,28,87,110,71,A,W,157,7,8,145,A 96 | 278,69,3,24,21,29,8,2079,565,32,258,192,162,N,W,142,210,10,NA,N 97 | 382,119,13,54,58,36,12,2133,594,41,287,294,227,N,W,59,156,9,595,N 98 | 565,148,24,90,104,77,14,7287,2083,305,1135,1234,791,A,E,292,9,5,1861.46,A 99 | 277,71,2,27,29,14,15,5952,1647,60,753,596,259,N,W,360,32,5,NA,N 100 | 415,115,27,97,71,68,3,711,184,45,156,119,99,N,W,274,2,7,300,N 101 | 424,110,15,70,47,36,7,2130,544,38,335,174,258,N,W,292,6,3,490,N 102 | 495,151,17,61,84,78,10,5624,1679,275,884,1015,709,A,E,1045,88,13,2460,A 103 | 524,132,9,69,47,54,2,972,260,14,123,92,90,A,E,212,327,20,NA,A 104 | 233,49,2,41,23,18,8,1350,336,7,166,122,106,A,E,102,132,10,375,A 105 | 395,106,16,48,56,35,10,2303,571,86,266,323,248,A,E,709,41,7,NA,A 106 | 397,114,23,67,67,53,13,5589,1632,241,906,926,716,A,E,244,2,4,NA,A 107 | 210,37,8,15,19,15,6,994,244,36,107,114,53,A,E,40,115,15,NA,A 108 | 420,95,23,55,58,37,3,646,139,31,77,77,61,N,W,206,10,7,NA,N 109 | 566,154,22,76,84,43,14,6100,1583,131,743,693,300,A,W,316,439,10,750,A 110 | 641,198,31,101,108,41,5,2129,610,92,297,319,117,A,E,269,17,10,1175,A 111 | 215,51,4,19,18,11,1,215,51,4,19,18,11,A,E,116,5,12,70,A 112 | 441,128,16,70,73,80,14,6675,2095,209,1072,1050,695,A,W,97,218,16,1500,A 113 | 325,76,16,33,52,37,5,1506,351,71,195,219,214,N,W,726,87,3,385,A 114 | 490,125,24,81,105,62,13,6063,1646,271,847,999,680,N,E,869,62,8,1925.571,N 115 | 574,152,31,91,101,64,3,985,260,53,148,173,95,N,W,1253,111,11,215,N 116 | 284,64,14,30,42,24,18,7023,1925,348,986,1239,666,N,E,96,4,4,NA,N 117 | 596,171,34,91,108,52,6,2862,728,107,361,401,224,A,W,118,334,21,900,A 118 | 472,118,12,63,54,30,4,793,187,14,102,80,50,A,W,228,377,26,155,A 119 | 
283,77,14,45,47,26,16,6840,1910,259,915,1067,546,A,W,144,6,5,700,A 120 | 408,94,4,42,36,66,9,3573,866,59,429,365,410,N,W,282,487,19,535,N 121 | 327,85,3,30,44,20,8,2140,568,16,216,208,93,A,E,91,185,12,362.5,A 122 | 370,96,21,49,46,60,15,6986,1972,231,1070,955,921,N,E,137,5,9,733.333,N 123 | 354,77,16,36,55,41,20,8716,2172,384,1172,1267,1057,N,W,83,174,16,200,N 124 | 539,139,5,93,58,69,5,1469,369,12,247,126,198,A,W,462,9,7,400,A 125 | 340,84,11,62,33,47,5,1516,376,42,284,141,219,N,E,185,8,4,400,A 126 | 510,126,2,42,44,35,11,5562,1578,44,703,519,256,N,W,207,358,20,737.5,N 127 | 315,59,16,45,36,58,13,4677,1051,268,681,782,697,A,W,0,0,0,NA,A 128 | 282,78,13,37,51,29,5,1649,453,73,211,280,138,A,W,670,57,5,500,A 129 | 380,120,5,54,51,31,8,3118,900,92,444,419,240,A,W,237,8,1,600,A 130 | 584,158,15,70,84,42,5,2358,636,58,265,316,134,N,E,331,20,4,662.5,N 131 | 570,169,21,72,88,38,7,3754,1077,140,492,589,263,A,W,295,15,5,950,A 132 | 306,104,14,50,58,25,7,2954,822,55,313,377,187,N,E,116,222,15,750,N 133 | 220,54,10,30,39,31,5,1185,299,40,145,154,128,N,E,50,136,20,297.5,N 134 | 278,70,7,22,37,18,18,7186,2081,190,935,1088,643,A,W,0,0,0,325,A 135 | 445,99,1,46,24,29,4,618,129,1,72,31,48,A,W,278,415,16,87.5,A 136 | 143,39,5,18,30,15,9,639,151,16,80,97,61,N,W,138,15,1,175,N 137 | 185,40,4,23,11,18,3,524,125,7,58,37,47,N,E,97,2,2,90,N 138 | 589,170,40,107,108,69,6,2325,634,128,371,376,238,A,E,368,20,3,1237.5,A 139 | 343,103,6,48,36,40,15,4338,1193,70,581,421,325,A,E,211,56,13,430,A 140 | 284,69,1,33,18,25,5,1407,361,6,139,98,111,A,E,122,140,5,NA,N 141 | 438,103,2,65,32,71,2,440,103,2,67,32,71,A,W,276,7,9,100,N 142 | 600,144,33,85,117,65,2,696,173,38,101,130,69,A,W,319,4,14,165,A 143 | 663,200,29,108,121,32,4,1447,404,57,210,222,68,A,E,241,8,6,250,A 144 | 232,55,9,34,23,45,12,4405,1213,194,702,705,625,N,E,623,35,3,1300,N 145 | 479,133,10,48,72,55,17,7472,2147,153,980,1032,854,N,W,237,5,4,773.333,N 146 | 209,45,0,38,19,42,10,3859,916,23,557,279,478,A,W,132,205,5,NA,A 147 | 
528,132,21,61,74,41,6,2641,671,97,273,383,226,N,E,885,105,8,1008.333,N 148 | 160,39,8,18,31,22,14,2128,543,56,304,268,298,A,E,33,3,0,275,A 149 | 599,183,10,80,74,32,5,2482,715,27,330,326,158,A,E,231,374,18,775,A 150 | 497,136,7,58,38,26,11,3871,1066,40,450,367,241,A,E,304,347,10,850,A 151 | 210,70,13,32,51,28,15,4040,1130,97,544,462,551,A,E,0,0,0,365,A 152 | 225,61,5,32,26,26,11,1568,408,25,202,185,257,A,W,132,9,0,NA,A 153 | 151,41,4,26,21,19,2,288,68,9,45,39,35,A,W,28,56,2,95,A 154 | 278,86,4,33,38,45,1,278,86,4,33,38,45,N,W,102,4,2,110,N 155 | 341,95,6,48,42,20,10,2964,808,81,379,428,221,N,W,158,4,5,100,N 156 | 537,147,23,58,88,47,10,2744,730,97,302,351,174,N,E,92,257,20,277.5,N 157 | 399,102,3,56,34,34,5,670,167,4,89,48,54,A,W,211,9,3,80,A 158 | 309,94,5,37,32,26,13,4618,1330,57,616,522,436,N,E,161,3,3,600,N 159 | 401,100,2,60,19,28,4,876,238,2,126,44,55,N,E,193,11,4,NA,N 160 | 336,93,9,35,46,23,15,5779,1610,128,730,741,497,A,W,0,0,0,NA,A 161 | 616,163,27,83,107,32,3,1437,377,65,181,227,82,A,W,110,308,15,200,A 162 | 219,47,8,24,26,17,12,1188,286,23,100,125,63,A,W,260,58,4,NA,A 163 | 579,174,7,67,78,58,6,3053,880,32,366,337,218,N,E,280,479,5,657,N 164 | 165,39,2,13,9,16,3,196,44,2,18,10,18,A,W,332,19,2,75,N 165 | 618,200,20,98,110,62,13,7127,2163,351,1104,1289,564,A,E,330,16,8,2412.5,A 166 | 257,66,5,31,26,32,14,3910,979,33,518,324,382,N,W,87,166,14,250,A 167 | 315,76,13,35,60,25,3,630,151,24,68,94,55,N,E,498,39,13,155,N 168 | 591,157,16,90,78,26,4,2020,541,52,310,226,91,N,E,290,440,25,640,N 169 | 404,92,11,54,49,18,6,1354,325,30,188,135,63,A,E,222,5,5,300,A 170 | 315,73,5,23,37,16,4,450,108,6,38,46,28,A,W,227,15,3,110,A 171 | 249,69,6,32,19,20,4,702,209,10,97,48,44,N,E,103,8,2,NA,N 172 | 429,91,12,41,42,57,13,5590,1397,83,578,579,644,A,W,686,46,4,825,N 173 | 212,54,13,28,44,18,2,233,59,13,31,46,20,A,E,243,23,5,NA,A 174 | 453,101,3,46,43,61,3,948,218,6,96,72,91,N,W,249,444,16,195,N 175 | 161,43,4,17,26,22,3,707,179,21,77,99,76,A,W,300,12,2,NA,A 176 | 
184,47,5,20,28,18,11,3327,890,74,419,382,304,N,W,49,2,0,450,N 177 | 591,184,20,83,79,38,5,1689,462,40,219,195,82,N,W,303,12,5,630,N 178 | 181,58,6,34,23,22,1,181,58,6,34,23,22,N,W,88,0,3,86.5,N 179 | 441,118,28,84,86,68,8,2723,750,126,433,420,309,A,E,190,2,2,1300,A 180 | 490,150,21,69,58,35,14,6126,1839,121,983,707,600,A,E,96,5,3,1000,N 181 | 551,171,13,94,83,94,13,6090,1840,128,969,900,917,N,E,1199,149,5,1800,N 182 | 550,147,29,85,91,71,6,2816,815,117,405,474,319,A,W,1218,104,10,1310,A 183 | 283,74,4,34,29,22,10,3919,1062,85,505,456,283,N,W,145,5,7,737.5,N 184 | 560,161,26,89,96,66,4,1789,470,65,233,260,155,N,W,332,9,8,625,N 185 | 328,91,12,51,43,33,2,342,94,12,51,44,33,N,E,145,59,8,125,N 186 | 586,159,12,72,79,53,9,3082,880,83,363,477,295,N,E,181,13,4,1043.333,N 187 | 503,136,5,62,48,83,10,3423,970,20,408,303,414,N,W,65,258,8,725,N 188 | 344,85,24,69,64,88,7,911,214,64,150,156,187,A,W,0,0,0,300,A 189 | 680,223,31,119,96,34,3,1928,587,35,262,201,91,A,W,429,8,6,365,A 190 | 279,64,0,31,26,30,1,279,64,0,31,26,30,N,W,107,205,16,75,N 191 | 484,127,20,66,65,67,7,3006,844,116,436,458,377,N,E,1231,80,7,1183.333,N 192 | 431,127,8,77,45,58,2,667,187,9,117,64,88,N,E,283,8,3,202.5,N 193 | 283,70,8,33,37,27,12,4479,1222,94,557,483,307,A,E,156,2,2,225,A 194 | 491,141,11,77,47,37,15,4291,1240,84,615,430,340,A,E,239,8,2,525,A 195 | 199,52,9,26,28,21,6,805,191,30,113,119,87,N,W,235,22,5,265,N 196 | 589,149,21,89,86,64,7,3558,928,102,513,471,351,A,E,371,6,6,787.5,A 197 | 327,84,22,53,62,38,10,4273,1123,212,577,700,334,A,E,483,48,6,800,N 198 | 464,128,28,67,94,52,13,5829,1552,210,740,840,452,A,W,0,0,0,587.5,A 199 | 166,34,0,20,13,17,1,166,34,0,20,13,17,N,E,64,119,9,NA,N 200 | 338,92,18,42,60,21,3,682,185,36,88,112,50,A,E,0,0,0,145,A 201 | 508,146,8,80,44,46,9,3148,915,41,571,289,326,A,W,245,5,9,NA,A 202 | 584,157,20,95,73,63,10,4704,1320,93,724,522,576,A,E,276,421,11,420,A 203 | 216,54,2,27,25,33,1,216,54,2,27,25,33,N,W,317,36,1,75,N 204 | 
625,179,4,94,60,65,5,1696,476,12,216,163,166,A,E,303,450,14,575,A 205 | 243,53,4,18,26,27,4,853,228,23,101,110,76,N,E,107,3,3,NA,N 206 | 489,131,19,77,55,34,7,2051,549,62,300,263,153,A,W,310,9,9,780,A 207 | 209,56,12,22,36,19,2,216,58,12,24,37,19,N,E,201,6,3,90,N 208 | 407,93,8,47,30,30,2,969,230,14,121,69,68,N,W,172,317,25,150,N 209 | 490,148,14,64,78,49,13,3400,1000,113,445,491,301,A,E,0,0,0,700,N 210 | 209,59,6,20,37,27,4,884,209,14,66,106,92,N,E,415,35,3,NA,N 211 | 442,131,18,68,77,33,6,1416,398,47,210,203,136,A,E,233,7,7,550,A 212 | 317,88,3,40,32,19,8,2543,715,28,269,270,118,A,W,220,16,4,NA,A 213 | 288,65,8,30,36,27,9,2815,698,55,315,325,189,N,E,259,30,10,650,A 214 | 209,54,3,25,14,12,1,209,54,3,25,14,12,A,W,102,6,3,68,A 215 | 303,71,3,18,30,36,3,344,76,3,20,36,45,N,E,468,47,6,100,N 216 | 330,77,19,47,53,27,6,1928,516,90,247,288,161,N,W,149,8,6,670,N 217 | 504,120,28,71,71,54,3,1085,259,54,150,167,114,A,E,103,283,19,175,A 218 | 258,60,8,28,33,18,3,638,170,17,80,75,36,A,W,358,32,8,137,A 219 | 20,1,0,0,0,0,2,41,9,2,6,7,4,N,E,78,220,6,2127.333,N 220 | 374,94,5,36,26,62,7,1968,519,26,181,199,288,N,W,756,64,15,875,N 221 | 211,43,10,26,35,39,3,498,116,14,59,55,78,A,W,463,32,8,120,A 222 | 299,75,6,38,23,26,3,580,160,8,71,33,44,N,E,212,1,2,140,N 223 | 576,167,8,89,49,57,4,822,232,19,132,83,79,N,E,325,12,8,210,N 224 | 381,110,9,61,45,32,7,3015,834,40,451,249,168,N,E,228,7,5,800,N 225 | 288,76,7,34,37,15,4,1644,408,16,198,120,113,N,W,203,3,3,240,N 226 | 369,93,9,43,42,49,5,1258,323,54,181,177,157,A,E,149,1,6,350,A 227 | 330,76,12,35,41,47,4,1367,326,55,167,198,167,N,W,512,30,5,NA,N 228 | 547,137,2,58,47,12,2,1038,271,3,129,80,24,A,W,261,459,22,175,A 229 | 572,152,18,105,49,65,2,978,249,36,168,91,101,A,W,325,13,3,200,A 230 | 359,84,4,46,27,21,12,4992,1257,37,699,386,387,N,W,151,8,5,NA,N 231 | 514,144,0,67,54,79,9,4739,1169,13,583,374,528,N,E,229,453,15,1940,N 232 | 359,80,15,45,48,63,7,1493,359,61,176,202,175,N,W,682,93,13,700,N 233 | 
526,163,12,88,50,77,4,1556,470,38,245,167,174,A,W,250,11,1,750,A 234 | 313,83,9,43,41,30,14,5885,1543,104,751,714,535,N,W,58,141,23,450,N 235 | 540,135,30,82,88,55,1,540,135,30,82,88,55,A,W,157,6,14,172,A 236 | 437,123,9,62,55,40,9,4139,1203,79,676,390,364,A,E,82,170,15,1260,A 237 | 551,160,23,86,90,87,5,2235,602,75,278,328,273,A,W,1224,115,11,NA,A 238 | 237,52,0,15,25,30,24,14053,4256,160,2165,1314,1566,N,W,523,43,6,750,N 239 | 236,56,6,41,19,21,5,1257,329,24,166,125,105,A,E,172,1,4,190,A 240 | 473,154,6,61,48,29,6,1966,566,29,250,252,178,A,E,846,84,9,580,A 241 | 309,72,0,33,31,26,5,354,82,0,41,32,26,N,E,117,269,12,130,N 242 | 271,77,5,35,29,33,12,4933,1358,48,630,435,403,A,W,62,90,3,450,A 243 | 357,96,7,50,45,39,5,1394,344,43,178,192,136,A,W,167,2,4,300,A 244 | 216,56,4,22,18,15,12,2796,665,43,266,304,198,A,E,391,44,4,250,A 245 | 256,70,13,42,36,44,16,7058,1845,312,965,1128,990,N,E,41,118,8,1050,A 246 | 466,108,33,75,86,72,3,652,142,44,102,109,102,A,E,286,8,8,215,A 247 | 327,68,13,42,29,45,18,3949,939,78,438,380,466,A,E,659,53,7,400,A 248 | 462,119,16,49,65,37,7,2131,583,69,244,288,150,A,E,866,65,6,NA,A 249 | 341,110,9,45,49,46,9,2331,658,50,249,322,274,A,E,251,9,4,560,A 250 | 608,160,28,130,74,89,8,4071,1182,103,862,417,708,A,E,426,4,6,1670,A 251 | 419,101,18,65,58,92,20,9528,2510,548,1509,1659,1342,A,W,0,0,0,487.5,A 252 | 33,6,0,2,4,7,1,33,6,0,2,4,7,A,W,205,5,4,NA,A 253 | 376,82,21,42,60,35,5,1770,408,115,238,299,157,A,W,0,0,0,425,A 254 | 486,145,11,51,76,40,11,3967,1102,67,410,497,284,N,E,88,204,16,500,A 255 | 186,44,7,28,16,11,1,186,44,7,28,16,11,N,W,99,3,1,NA,N 256 | 307,80,1,42,36,29,7,2421,656,18,379,198,184,A,W,145,2,2,NA,A 257 | 246,76,5,35,39,13,6,912,234,12,102,96,80,A,E,44,0,1,250,A 258 | 205,52,8,31,27,17,12,5134,1323,56,643,445,459,A,E,155,3,2,400,A 259 | 348,90,11,50,45,43,10,2288,614,43,295,273,269,A,E,60,176,6,450,A 260 | 523,135,8,52,44,52,9,3368,895,39,377,284,296,N,W,367,475,19,750,N 261 | 
312,68,2,32,22,24,1,312,68,2,32,22,24,A,E,86,150,15,70,A 262 | 496,119,8,57,33,21,7,3358,882,36,365,280,165,N,W,155,371,29,875,N 263 | 126,27,3,8,10,5,4,239,49,3,16,13,14,N,E,190,2,9,190,N 264 | 275,68,5,42,42,61,6,961,238,16,128,104,172,N,E,181,3,2,191,N 265 | 627,178,14,68,76,46,6,3146,902,74,494,345,242,N,E,309,492,5,740,N 266 | 394,86,1,38,28,36,4,1089,267,3,94,71,76,N,E,203,369,16,250,N 267 | 208,57,8,32,25,18,3,653,170,17,98,54,62,N,E,42,94,13,140,N 268 | 382,101,16,50,55,22,1,382,101,16,50,55,22,A,W,200,7,6,97.5,A 269 | 459,113,20,59,57,68,12,5348,1369,155,713,660,735,A,W,0,0,0,740,A 270 | 549,149,7,73,47,42,1,549,149,7,73,47,42,N,W,255,450,17,140,N 271 | 288,63,3,25,33,16,10,2682,667,38,315,259,204,A,W,135,257,7,341.667,A 272 | 303,84,4,35,32,23,2,312,87,4,39,32,23,N,W,179,5,3,NA,N 273 | 522,163,9,82,46,62,13,7037,2019,153,1043,827,535,A,E,352,9,1,1000,A 274 | 512,117,29,54,88,43,6,1750,412,100,204,276,155,A,W,1236,98,18,100,A 275 | 220,66,5,20,28,13,3,290,80,5,27,31,15,A,W,281,21,3,90,A 276 | 522,140,16,73,77,60,4,730,185,22,93,106,86,N,E,1320,166,17,200,N 277 | 461,112,18,54,54,35,2,680,160,24,76,75,49,A,W,111,226,11,135,A 278 | 581,145,17,66,68,21,2,831,210,21,106,86,40,N,E,320,465,32,155,N 279 | 530,159,3,82,50,47,6,1619,426,11,218,149,163,A,W,196,354,15,475,A 280 | 557,142,21,58,81,23,18,8759,2583,271,1138,1299,478,N,W,1160,53,7,1450,N 281 | 439,96,0,44,36,65,4,711,148,1,68,56,99,N,E,229,406,22,150,N 282 | 453,103,8,53,33,52,2,507,123,8,63,39,58,A,W,289,407,6,105,A 283 | 528,122,1,67,45,51,4,1716,403,12,211,146,155,A,W,209,372,17,350,A 284 | 633,210,6,91,56,59,6,3070,872,19,420,230,274,N,W,367,432,16,90,N 285 | 16,2,0,1,0,0,2,28,4,0,1,0,0,A,E,247,4,8,NA,A 286 | 562,169,17,88,73,53,8,3181,841,61,450,342,373,A,E,351,442,17,530,A 287 | 281,76,3,42,25,20,8,2658,657,48,324,300,179,A,E,106,144,7,341.667,A 288 | 593,152,23,69,75,53,6,2765,686,133,369,384,321,A,W,315,10,6,940,A 289 | 687,213,10,91,65,27,4,1518,448,15,196,137,89,A,E,294,445,13,350,A 290 | 
368,103,3,48,28,54,8,1897,493,9,207,162,198,N,W,209,246,3,326.667,N 291 | 263,70,1,26,23,30,4,888,220,9,83,82,86,N,E,81,147,4,250,N 292 | 642,211,14,107,59,52,5,2364,770,27,352,230,193,N,W,337,19,4,740,N 293 | 265,68,8,26,30,29,7,1337,339,32,135,163,128,N,W,92,5,3,425,A 294 | 289,63,7,36,41,44,17,7402,1954,195,1115,919,1153,A,W,166,211,7,NA,A 295 | 559,141,2,48,61,73,8,3162,874,16,421,349,359,N,E,352,414,9,925,N 296 | 520,120,17,53,44,21,4,927,227,22,106,80,52,A,W,70,144,11,185,A 297 | 19,4,1,2,3,1,1,19,4,1,2,3,1,N,W,692,70,8,920,A 298 | 205,43,2,24,17,20,7,854,219,12,105,99,71,N,E,131,6,1,286.667,N 299 | 193,47,10,21,29,24,6,1136,256,42,129,139,106,A,W,299,13,5,245,A 300 | 181,46,1,19,18,17,5,937,238,9,88,95,104,A,E,37,98,9,NA,A 301 | 213,61,4,17,22,3,17,4061,1145,83,488,491,244,A,W,178,45,4,235,A 302 | 510,147,10,56,52,53,7,2872,821,63,307,340,174,N,E,810,99,18,1150,N 303 | 578,138,1,56,59,34,3,1399,357,7,149,161,87,N,E,133,371,20,160,N 304 | 200,51,2,14,29,25,23,9778,2732,379,1272,1652,925,N,W,398,29,7,NA,N 305 | 441,113,5,76,52,76,5,1546,397,17,226,149,191,A,W,160,290,11,425,A 306 | 172,42,3,17,14,15,10,4086,1150,57,579,363,406,N,W,65,0,0,900,N 307 | 580,194,9,91,62,78,8,3372,1028,48,604,314,469,N,E,270,13,6,NA,N 308 | 127,32,4,14,25,12,19,8396,2402,242,1048,1348,819,N,W,167,18,6,500,N 309 | 279,69,4,35,31,32,4,1359,355,31,180,148,158,N,E,133,173,9,277.5,N 310 | 480,112,18,50,71,44,7,3031,771,110,338,406,239,N,E,94,270,16,750,N 311 | 600,139,0,94,29,60,2,1236,309,1,201,69,110,N,E,300,12,9,160,N 312 | 610,186,19,107,98,74,6,2728,753,69,399,366,286,N,E,1182,96,13,1300,N 313 | 360,81,5,37,44,37,7,2268,566,41,279,257,246,N,E,170,284,3,525,N 314 | 387,124,1,67,27,36,7,1775,506,6,272,125,194,N,E,186,290,17,550,N 315 | 580,207,8,107,71,105,5,2778,978,32,474,322,417,A,E,121,267,19,1600,A 316 | 408,117,11,66,41,34,1,408,117,11,66,41,34,N,W,942,72,11,120,N 317 | 593,172,22,82,100,57,1,593,172,22,82,100,57,A,W,1222,139,15,165,A 318 | 
221,53,2,21,23,22,8,1063,283,15,107,124,106,N,E,325,58,6,NA,N 319 | 497,127,7,65,48,37,5,2703,806,32,379,311,138,N,E,325,9,3,700,N 320 | 492,136,5,76,50,94,12,5511,1511,39,897,451,875,A,E,313,381,20,875,A 321 | 475,126,3,61,43,52,6,1700,433,7,217,93,146,A,W,37,113,7,385,A 322 | 573,144,9,85,60,78,8,3198,857,97,470,420,332,A,E,1314,131,12,960,A 323 | 631,170,9,77,44,31,11,4908,1457,30,775,357,249,A,W,408,4,3,1000,A 324 | -------------------------------------------------------------------------------- /chapter_8/1_Probit_and_Logit_Models.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "id": "8c161d5e-877e-41eb-9ba1-09251398f4bb", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "data": { 11 | "text/html": [ 12 | "
\n", 13 | "\n", 26 | "\n", 27 | " \n", 28 | " \n", 29 | " \n", 30 | " \n", 31 | " \n", 32 | " \n", 33 | " \n", 34 | " \n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | "
AdmittedGPAExp
012.88
113.36
213.75
313.75
413.76
\n", 68 | "
" 69 | ], 70 | "text/plain": [ 71 | " Admitted GPA Exp\n", 72 | "0 1 2.8 8\n", 73 | "1 1 3.3 6\n", 74 | "2 1 3.7 5\n", 75 | "3 1 3.7 5\n", 76 | "4 1 3.7 6" 77 | ] 78 | }, 79 | "execution_count": 2, 80 | "metadata": {}, 81 | "output_type": "execute_result" 82 | } 83 | ], 84 | "source": [ 85 | "import pandas as pd \n", 86 | "\n", 87 | "# create gpa train data \n", 88 | "train = pd.DataFrame({'Admitted': [1, 1, 1,1, 1, 0, 1, 1, 0, 1,1,1, 1,1,0, 1, 0, 0, 0, 0, 0, 0, 0, 0 ,0 ,0, 1,1,1,1, 0], \n", 89 | " 'GPA': [2.8, 3.3, 3.7, 3.7, 3.7, 3.3, 3.7, 3, 1.7, 3.6, 3.3, 4, 3.2, 3.4, 2.8, 4, 1.5, 2.7, 2.3, 2.3, 2.7, 2.2, 3.3,3.3, 4, 2.3, 3.6, 3.4, 4, 3.7, 2.3], \n", 90 | " 'Exp': [8, 6, 5, 5, 6, 3, 4, 2, 1, 5, 5, 3, 6,5, 4, 4, 4, 1, 1, 2, 2, 2, 1, 4, 4, 4, 5, 2, 4, 6, 3]}) \n", 91 | "train.head() " 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 3, 97 | "id": "b4b0c9a8-abc6-415a-aade-0458fc3cee07", 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "data": { 102 | "text/html": [ 103 | "
\n", 104 | "\n", 117 | "\n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | "
AdmittedGPAExp
012.99
102.41
213.86
303.01
413.34
\n", 159 | "
" 160 | ], 161 | "text/plain": [ 162 | " Admitted GPA Exp\n", 163 | "0 1 2.9 9\n", 164 | "1 0 2.4 1\n", 165 | "2 1 3.8 6\n", 166 | "3 0 3.0 1\n", 167 | "4 1 3.3 4" 168 | ] 169 | }, 170 | "execution_count": 3, 171 | "metadata": {}, 172 | "output_type": "execute_result" 173 | } 174 | ], 175 | "source": [ 176 | "# create a testing dataset for this model. \n", 177 | "test = pd.DataFrame({'Admitted': [1, 0, 1, 0, 1], \n", 178 | " 'GPA': [2.9, 2.4, 3.8, 3, 3.3], \n", 179 | " 'Exp': [9, 1, 6, 1,4 ]}) \n", 180 | "test.head() " 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 4, 186 | "id": "362805e2-f7e2-45d5-ad48-766935ed307e", 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "name": "stdout", 191 | "output_type": "stream", 192 | "text": [ 193 | "Optimization terminated successfully.\n", 194 | " Current function value: 0.316480\n", 195 | " Iterations 7\n" 196 | ] 197 | }, 198 | { 199 | "data": { 200 | "text/html": [ 201 | "\n", 202 | "\n", 203 | "\n", 204 | " \n", 205 | "\n", 206 | "\n", 207 | " \n", 208 | "\n", 209 | "\n", 210 | " \n", 211 | "\n", 212 | "\n", 213 | " \n", 214 | "\n", 215 | "\n", 216 | " \n", 217 | "\n", 218 | "\n", 219 | " \n", 220 | "\n", 221 | "\n", 222 | " \n", 223 | "\n", 224 | "
Logit Regression Results
Dep. Variable: Admitted No. Observations: 31
Model: Logit Df Residuals: 28
Method: MLE Df Model: 2
Date: Fri, 07 Jul 2023 Pseudo R-squ.: 0.5403
Time: 21:16:53 Log-Likelihood: -9.8109
converged: True LL-Null: -21.342
Covariance Type: nonrobust LLR p-value: 9.818e-06
\n", 225 | "\n", 226 | "\n", 227 | " \n", 228 | "\n", 229 | "\n", 230 | " \n", 231 | "\n", 232 | "\n", 233 | " \n", 234 | "\n", 235 | "\n", 236 | " \n", 237 | "\n", 238 | "
coef std err z P>|z| [0.025 0.975]
Intercept -11.4485 4.320 -2.650 0.008 -19.915 -2.982
GPA 2.7606 1.291 2.139 0.032 0.231 5.290
Exp 0.7569 0.383 1.977 0.048 0.006 1.507
" 239 | ], 240 | "text/plain": [ 241 | "\n", 242 | "\"\"\"\n", 243 | " Logit Regression Results \n", 244 | "==============================================================================\n", 245 | "Dep. Variable: Admitted No. Observations: 31\n", 246 | "Model: Logit Df Residuals: 28\n", 247 | "Method: MLE Df Model: 2\n", 248 | "Date: Fri, 07 Jul 2023 Pseudo R-squ.: 0.5403\n", 249 | "Time: 21:16:53 Log-Likelihood: -9.8109\n", 250 | "converged: True LL-Null: -21.342\n", 251 | "Covariance Type: nonrobust LLR p-value: 9.818e-06\n", 252 | "==============================================================================\n", 253 | " coef std err z P>|z| [0.025 0.975]\n", 254 | "------------------------------------------------------------------------------\n", 255 | "Intercept -11.4485 4.320 -2.650 0.008 -19.915 -2.982\n", 256 | "GPA 2.7606 1.291 2.139 0.032 0.231 5.290\n", 257 | "Exp 0.7569 0.383 1.977 0.048 0.006 1.507\n", 258 | "==============================================================================\n", 259 | "\"\"\"" 260 | ] 261 | }, 262 | "execution_count": 4, 263 | "metadata": {}, 264 | "output_type": "execute_result" 265 | } 266 | ], 267 | "source": [ 268 | "# We will use Logit from statsmodels. \n", 269 | "\n", 270 | "import statsmodels.formula.api as smf \n", 271 | "#fit logistic regression \n", 272 | "model = smf.logit('Admitted ~ GPA + Exp', data =train).fit() \n", 273 | "#summary \n", 274 | "model.summary() " 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 5, 280 | "id": "1fd280f6-6411-4ee0-a270-5fe9add8b30a", 281 | "metadata": {}, 282 | "outputs": [ 283 | { 284 | "name": "stdout", 285 | "output_type": "stream", 286 | "text": [ 287 | "Test accuracy = 1.0\n" 288 | ] 289 | }, 290 | { 291 | "data": { 292 | "image/png": "\n", 293 | "text/plain": [ 294 | "
" 295 | ] 296 | }, 297 | "metadata": { 298 | "needs_background": "light" 299 | }, 300 | "output_type": "display_data" 301 | } 302 | ], 303 | "source": [ 304 | "# confusion_matrix and accuracy_score to compute the accuracy of the model on the test set\n", 305 | "\n", 306 | "from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay \n", 307 | "# X_test and y_test \n", 308 | "X_test = test[['GPA', 'Exp']] \n", 309 | "y_test = test['Admitted'] \n", 310 | "# \n", 311 | "y_hat = model.predict(X_test) \n", 312 | "pred = list(map(round, y_hat)) \n", 313 | "# confusion matrix \n", 314 | "cm = confusion_matrix(y_test, pred) \n", 315 | "ConfusionMatrixDisplay(cm).plot() \n", 316 | "\n", 317 | "# Accuracy \n", 318 | "print('Test accuracy = ', accuracy_score(y_test, pred))" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "id": "dc9d0684-3cee-4771-8f31-e3a90fde0089", 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [] 328 | } 329 | ], 330 | "metadata": { 331 | "kernelspec": { 332 | "display_name": "Python 3 (ipykernel)", 333 | "language": "python", 334 | "name": "python3" 335 | }, 336 | "language_info": { 337 | "codemirror_mode": { 338 | "name": "ipython", 339 | "version": 3 340 | }, 341 | "file_extension": ".py", 342 | "mimetype": "text/x-python", 343 | "name": "python", 344 | "nbconvert_exporter": "python", 345 | "pygments_lexer": "ipython3", 346 | "version": "3.9.7" 347 | } 348 | }, 349 | "nbformat": 4, 350 | "nbformat_minor": 5 351 | } 352 | -------------------------------------------------------------------------------- /chapter_8/2_Multinomial_Logit_Model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "51670ec2-6846-406e-a022-a421870a0417", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# import packages \n", 11 | "\n", 12 | "import numpy as np \n", 13 | 
"import pandas as pd \n", 14 | "from sklearn import datasets \n", 15 | "from sklearn.model_selection import train_test_split \n", 16 | "from sklearn.linear_model import LogisticRegression \n", 17 | "from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay \n", 18 | "import statsmodels.discrete.discrete_model as sm " 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "id": "73b145a2-43dc-4c08-98e6-bbb4c6222b75", 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "name": "stdout", 29 | "output_type": "stream", 30 | "text": [ 31 | "['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']\n", 32 | "['setosa' 'versicolor' 'virginica']\n" 33 | ] 34 | } 35 | ], 36 | "source": [ 37 | "# import Iris data \n", 38 | "\n", 39 | "iris = datasets.load_iris() \n", 40 | "print(iris.feature_names) \n", 41 | "print(iris.target_names) " 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 3, 47 | "id": "3b8a03c5-454e-4a97-b6fc-121dcd1b19ad", 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "data": { 52 | "text/html": [ 53 | "
\n", 54 | "\n", 67 | "\n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | "
sepal_lengthsepal_widthpetal_lengthpetal_widthtarget
05.13.51.40.20
14.93.01.40.20
24.73.21.30.20
34.63.11.50.20
45.03.61.40.20
\n", 121 | "
" 122 | ], 123 | "text/plain": [ 124 | " sepal_length sepal_width petal_length petal_width target\n", 125 | "0 5.1 3.5 1.4 0.2 0\n", 126 | "1 4.9 3.0 1.4 0.2 0\n", 127 | "2 4.7 3.2 1.3 0.2 0\n", 128 | "3 4.6 3.1 1.5 0.2 0\n", 129 | "4 5.0 3.6 1.4 0.2 0" 130 | ] 131 | }, 132 | "execution_count": 3, 133 | "metadata": {}, 134 | "output_type": "execute_result" 135 | } 136 | ], 137 | "source": [ 138 | "#create dataframe \n", 139 | "\n", 140 | "df = pd.DataFrame(iris.data, columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']) \n", 141 | "df['target'] = iris.target \n", 142 | "df.head() \n", 143 | " " 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 4, 149 | "id": "46084f27-5ab8-4b2a-a8ae-d5935845fb7d", 150 | "metadata": {}, 151 | "outputs": [ 152 | { 153 | "data": { 154 | "text/plain": [ 155 | "sepal_length 0\n", 156 | "sepal_width 0\n", 157 | "petal_length 0\n", 158 | "petal_width 0\n", 159 | "target 0\n", 160 | "dtype: int64" 161 | ] 162 | }, 163 | "execution_count": 4, 164 | "metadata": {}, 165 | "output_type": "execute_result" 166 | } 167 | ], 168 | "source": [ 169 | "# check missing values \n", 170 | "\n", 171 | "df.isna().sum() " 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 5, 177 | "id": "bd487617-2c6e-42af-b6f2-be479f6ddab3", 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "# create train and test data \n", 182 | "\n", 183 | "X = df.drop('target', axis=1) \n", 184 | "y = df['target'] \n", 185 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.2, random_state =1) " 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 6, 191 | "id": "87541aa8-b7a9-4b49-ac4f-9885fe24f62c", 192 | "metadata": {}, 193 | "outputs": [ 194 | { 195 | "name": "stdout", 196 | "output_type": "stream", 197 | "text": [ 198 | "Test accuracy = 0.9666666666666667\n" 199 | ] 200 | }, 201 | { 202 | "data": { 203 | "image/png": "\n", 204 | "text/plain": [ 
205 | "
" 206 | ] 207 | }, 208 | "metadata": { 209 | "needs_background": "light" 210 | }, 211 | "output_type": "display_data" 212 | } 213 | ], 214 | "source": [ 215 | "# fit a multinomial logistic regression using sklearn \n", 216 | "\n", 217 | "model_sk = LogisticRegression(solver = 'newton-cg', multi_class = 'multinomial') \n", 218 | "model_sk.fit(X_train, y_train) \n", 219 | "y_hat_sk = model_sk.predict(X_test) \n", 220 | "pred_sk = list(map(round, y_hat_sk)) \n", 221 | "# confusion matrix \n", 222 | "cm_sk = confusion_matrix(y_test, pred_sk) \n", 223 | "ConfusionMatrixDisplay(cm_sk).plot() \n", 224 | "# Accuracy \n", 225 | "print('Test accuracy = ', accuracy_score(y_test, pred_sk)) " 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 7, 231 | "id": "8ca9004b-8403-45fc-8918-0a6ab6bee307", 232 | "metadata": {}, 233 | "outputs": [ 234 | { 235 | "name": "stdout", 236 | "output_type": "stream", 237 | "text": [ 238 | "Warning: Maximum number of iterations has been exceeded.\n", 239 | " Current function value: 0.073466\n", 240 | " Iterations: 35\n", 241 | " Function evaluations: 37\n", 242 | " Gradient evaluations: 37\n", 243 | "Test accuracy = 0.9666666666666667\n" 244 | ] 245 | }, 246 | { 247 | "name": "stderr", 248 | "output_type": "stream", 249 | "text": [ 250 | "/Users/hoangnguyen/opt/anaconda3/lib/python3.9/site-packages/statsmodels/base/model.py:566: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals\n", 251 | " warnings.warn(\"Maximum Likelihood optimization failed to \"\n" 252 | ] 253 | }, 254 | { 255 | "data": { 256 | "image/png": "\n", 257 | "text/plain": [ 258 | "
" 259 | ] 260 | }, 261 | "metadata": { 262 | "needs_background": "light" 263 | }, 264 | "output_type": "display_data" 265 | } 266 | ], 267 | "source": [ 268 | "# fit the multinomial logit model using statsmodels (X_train is passed as-is, so no intercept column is included; the BFGS fit below stops at the iteration limit, hence the ConvergenceWarning in the output) \n", 269 | "\n", 270 | "model_stat = sm.MNLogit(y_train, X_train).fit(method='bfgs') \n", 271 | "model_stat.summary() \n", 272 | "y_hat_stat = model_stat.predict(X_test) \n", 273 | "pred_stat = np.asarray(y_hat_stat).argmax(1) \n", 274 | "# confusion matrix \n", 275 | "cm_stat = confusion_matrix(y_test, pred_stat) \n", 276 | "ConfusionMatrixDisplay(cm_stat).plot() \n", 277 | "# Accuracy \n", 278 | "print('Test accuracy = ', accuracy_score(y_test, pred_stat)) " 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "id": "a533d9e7-6197-4c0d-b505-b1e7e49e973b", 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [] 288 | } 289 | ], 290 | "metadata": { 291 | "kernelspec": { 292 | "display_name": "Python 3 (ipykernel)", 293 | "language": "python", 294 | "name": "python3" 295 | }, 296 | "language_info": { 297 | "codemirror_mode": { 298 | "name": "ipython", 299 | "version": 3 300 | }, 301 | "file_extension": ".py", 302 | "mimetype": "text/x-python", 303 | "name": "python", 304 | "nbconvert_exporter": "python", 305 | "pygments_lexer": "ipython3", 306 | "version": "3.9.7" 307 | } 308 | }, 309 | "nbformat": 4, 310 | "nbformat_minor": 5 311 | } 312 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # core packages 2 | numpy==1.23.0 3 | scipy==1.8.1 4 | statsmodels==0.13.2 5 | pandas==1.4.3 6 | scikit-learn==1.1.1 7 | matplotlib==3.5.2 8 | # specialized 9 | seaborn==0.12.1 10 | sktime==0.15.0 11 | pmdarima==2.0.2 12 | lifelines==0.27.4 13 | pyreadr==0.4.6 14 | # jupyter 15 | jupyter 16 | notebook 17 | jupyterlab --------------------------------------------------------------------------------