├── .gitignore
├── LICENSE
├── README.md
├── data
│   ├── auto-mpg.csv
│   ├── auto-mpg.names
│   ├── diamonds.csv
│   ├── heart_2020_cleaned_sample.csv
│   ├── penguins.csv
│   ├── penguins_X_test.csv
│   ├── penguins_X_train.csv
│   ├── penguins_y_test.csv
│   ├── penguins_y_train.csv
│   ├── spotify_features.csv
│   ├── telco_churn.csv
│   └── world_happiness.csv
├── images
│   ├── Confusion_Matrix.png
│   ├── KNN.png
│   ├── linear_regression_hyperplane.jpeg
│   ├── linear_regression_line.png
│   ├── overfitting.png
│   └── validation.png
├── lessons
│   ├── 00_introduction.md
│   ├── 01_regression.ipynb
│   ├── 02_regularization.ipynb
│   ├── 03_preprocessing.ipynb
│   ├── 04_classification.ipynb
│   └── future
│       ├── 05_walkthrough.ipynb
│       ├── 06_clustering.ipynb
│       └── 07_dimensionality_reduction.ipynb
├── requirements.txt
└── solutions
    ├── 01_regression_solutions.ipynb
    ├── 02_regularization_solutions.ipynb
    ├── 03_preprocessing_solutions.ipynb
    └── 04_classification_solutions.ipynb

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Created by https://www.toptal.com/developers/gitignore/api/windows,macos,python,visualstudiocode,pycharm,jupyternotebooks,pydev
# Edit at https://www.toptal.com/developers/gitignore?templates=windows,macos,python,visualstudiocode,pycharm,jupyternotebooks,pydev

### JupyterNotebooks ###
# gitignore template for Jupyter Notebooks
# website: http://jupyter.org/

.ipynb_checkpoints
*/.ipynb_checkpoints/*

# IPython
profile_default/
ipython_config.py

# Remove previous ipynb_checkpoints
# git rm -r .ipynb_checkpoints/

### macOS ###
# General
.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two \r
Icon


# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk

### macOS Patch ###
# iCloud generated files
*.icloud

### PyCharm ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf

# AWS User-specific
.idea/**/aws.xml

# Generated files
.idea/**/contentModel.xml

# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml

# Gradle
.idea/**/gradle.xml
.idea/**/libraries

# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/artifacts
# .idea/compiler.xml
# .idea/jarRepositories.xml
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr

# CMake
cmake-build-*/

# Mongo Explorer plugin
.idea/**/mongoSettings.xml

# File-based project format
*.iws

# IntelliJ
out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Cursive Clojure plugin
.idea/replstate.xml

# SonarLint plugin
.idea/sonarlint/

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

# Editor-based Rest Client
.idea/httpRequests

# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser

### PyCharm Patch ###
# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721

# *.iml
# modules.xml
# .idea/misc.xml
# *.ipr

# Sonarlint plugin
# https://plugins.jetbrains.com/plugin/7973-sonarlint
.idea/**/sonarlint/

# SonarQube Plugin
# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
.idea/**/sonarIssues.xml

# Markdown Navigator plugin
# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
.idea/**/markdown-navigator.xml
.idea/**/markdown-navigator-enh.xml
.idea/**/markdown-navigator/

# Cache file creation bug
# See https://youtrack.jetbrains.com/issue/JBR-2257
.idea/$CACHE_FILE$

# CodeStream plugin
# https://plugins.jetbrains.com/plugin/12206-codestream
.idea/codestream.xml

# Azure Toolkit for IntelliJ plugin
# https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij
.idea/**/azureSettings.xml

### pydev ###
.pydevproject

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook

# IPython

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

### VisualStudioCode ###
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
!.vscode/*.code-snippets

# Local History for Visual Studio Code
.history/

# Built Visual Studio Code Extensions
*.vsix

### VisualStudioCode Patch ###
# Ignore all local history of files
.history
.ionide

# Support for Project snippet scope
.vscode/*.code-snippets

# Ignore code-workspaces
*.code-workspace

### Windows ###
# Windows thumbnail cache files
Thumbs.db
Thumbs.db:encryptable
ehthumbs.db
ehthumbs_vista.db

# Dump file
*.stackdump

# Folder config file
[Dd]esktop.ini

# Recycle Bin used on file shares
$RECYCLE.BIN/

# Windows Installer files
*.cab
*.msi
*.msix
*.msm
*.msp

# Windows shortcuts
*.lnk

# End of https://www.toptal.com/developers/gitignore/api/windows,macos,python,visualstudiocode,pycharm,jupyternotebooks,pydev

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------

Creative Commons Attribution-NonCommercial 4.0 International Public License

By exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution-NonCommercial 4.0 International Public License ("Public License"). To the extent this Public License may be interpreted as a contract, You are granted the Licensed Rights in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions.

Section 1 – Definitions.

Adapted Material means material subject to Copyright and Similar Rights that is derived from or based upon the Licensed Material and in which the Licensed Material is translated, altered, arranged, transformed, or otherwise modified in a manner requiring permission under the Copyright and Similar Rights held by the Licensor. For purposes of this Public License, where the Licensed Material is a musical work, performance, or sound recording, Adapted Material is always produced where the Licensed Material is synched in timed relation with a moving image.
Adapter's License means the license You apply to Your Copyright and Similar Rights in Your contributions to Adapted Material in accordance with the terms and conditions of this Public License.
Copyright and Similar Rights means copyright and/or similar rights closely related to copyright including, without limitation, performance, broadcast, sound recording, and Sui Generis Database Rights, without regard to how the rights are labeled or categorized. For purposes of this Public License, the rights specified in Section 2(b)(1)-(2) are not Copyright and Similar Rights.
Effective Technological Measures means those measures that, in the absence of proper authority, may not be circumvented under laws fulfilling obligations under Article 11 of the WIPO Copyright Treaty adopted on December 20, 1996, and/or similar international agreements.
Exceptions and Limitations means fair use, fair dealing, and/or any other exception or limitation to Copyright and Similar Rights that applies to Your use of the Licensed Material.
Licensed Material means the artistic or literary work, database, or other material to which the Licensor applied this Public License.
Licensed Rights means the rights granted to You subject to the terms and conditions of this Public License, which are limited to all Copyright and Similar Rights that apply to Your use of the Licensed Material and that the Licensor has authority to license.
Licensor means the individual(s) or entity(ies) granting rights under this Public License.
NonCommercial means not primarily intended for or directed towards commercial advantage or monetary compensation. For purposes of this Public License, the exchange of the Licensed Material for other material subject to Copyright and Similar Rights by digital file-sharing or similar means is NonCommercial provided there is no payment of monetary compensation in connection with the exchange.
Share means to provide material to the public by any means or process that requires permission under the Licensed Rights, such as reproduction, public display, public performance, distribution, dissemination, communication, or importation, and to make material available to the public including in ways that members of the public may access the material from a place and at a time individually chosen by them.
Sui Generis Database Rights means rights other than copyright resulting from Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, as amended and/or succeeded, as well as other essentially equivalent rights anywhere in the world.
You means the individual or entity exercising the Licensed Rights under this Public License. Your has a corresponding meaning.
Section 2 – Scope.

License grant.
Subject to the terms and conditions of this Public License, the Licensor hereby grants You a worldwide, royalty-free, non-sublicensable, non-exclusive, irrevocable license to exercise the Licensed Rights in the Licensed Material to:
reproduce and Share the Licensed Material, in whole or in part, for NonCommercial purposes only; and
produce, reproduce, and Share Adapted Material for NonCommercial purposes only.
Exceptions and Limitations. For the avoidance of doubt, where Exceptions and Limitations apply to Your use, this Public License does not apply, and You do not need to comply with its terms and conditions.
Term. The term of this Public License is specified in Section 6(a).
Media and formats; technical modifications allowed. The Licensor authorizes You to exercise the Licensed Rights in all media and formats whether now known or hereafter created, and to make technical modifications necessary to do so. The Licensor waives and/or agrees not to assert any right or authority to forbid You from making technical modifications necessary to exercise the Licensed Rights, including technical modifications necessary to circumvent Effective Technological Measures. For purposes of this Public License, simply making modifications authorized by this Section 2(a)(4) never produces Adapted Material.
Downstream recipients.
Offer from the Licensor – Licensed Material.
Every recipient of the Licensed Material automatically receives an offer from the Licensor to exercise the Licensed Rights under the terms and conditions of this Public License.
No downstream restrictions. You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, the Licensed Material if doing so restricts exercise of the Licensed Rights by any recipient of the Licensed Material.
No endorsement. Nothing in this Public License constitutes or may be construed as permission to assert or imply that You are, or that Your use of the Licensed Material is, connected with, or sponsored, endorsed, or granted official status by, the Licensor or others designated to receive attribution as provided in Section 3(a)(1)(A)(i).
Other rights.

Moral rights, such as the right of integrity, are not licensed under this Public License, nor are publicity, privacy, and/or other similar personality rights; however, to the extent possible, the Licensor waives and/or agrees not to assert any such rights held by the Licensor to the limited extent necessary to allow You to exercise the Licensed Rights, but not otherwise.
Patent and trademark rights are not licensed under this Public License.
To the extent possible, the Licensor waives any right to collect royalties from You for the exercise of the Licensed Rights, whether directly or through a collecting society under any voluntary or waivable statutory or compulsory licensing scheme. In all other cases the Licensor expressly reserves any right to collect such royalties, including when the Licensed Material is used other than for NonCommercial purposes.
Section 3 – License Conditions.

Your exercise of the Licensed Rights is expressly made subject to the following conditions.

Attribution.

If You Share the Licensed Material (including in modified form), You must:

retain the following if it is supplied by the Licensor with the Licensed Material:
identification of the creator(s) of the Licensed Material and any others designated to receive attribution, in any reasonable manner requested by the Licensor (including by pseudonym if designated);
a copyright notice;
a notice that refers to this Public License;
a notice that refers to the disclaimer of warranties;
a URI or hyperlink to the Licensed Material to the extent reasonably practicable;
indicate if You modified the Licensed Material and retain an indication of any previous modifications; and
indicate the Licensed Material is licensed under this Public License, and include the text of, or the URI or hyperlink to, this Public License.
You may satisfy the conditions in Section 3(a)(1) in any reasonable manner based on the medium, means, and context in which You Share the Licensed Material. For example, it may be reasonable to satisfy the conditions by providing a URI or hyperlink to a resource that includes the required information.
If requested by the Licensor, You must remove any of the information required by Section 3(a)(1)(A) to the extent reasonably practicable.
If You Share Adapted Material You produce, the Adapter's License You apply must not prevent recipients of the Adapted Material from complying with this Public License.
Section 4 – Sui Generis Database Rights.

Where the Licensed Rights include Sui Generis Database Rights that apply to Your use of the Licensed Material:

for the avoidance of doubt, Section 2(a)(1) grants You the right to extract, reuse, reproduce, and Share all or a substantial portion of the contents of the database for NonCommercial purposes only;
if You include all or a substantial portion of the database contents in a database in which You have Sui Generis Database Rights, then the database in which You have Sui Generis Database Rights (but not its individual contents) is Adapted Material; and
You must comply with the conditions in Section 3(a) if You Share all or a substantial portion of the contents of the database.
For the avoidance of doubt, this Section 4 supplements and does not replace Your obligations under this Public License where the Licensed Rights include other Copyright and Similar Rights.
Section 5 – Disclaimer of Warranties and Limitation of Liability.

Unless otherwise separately undertaken by the Licensor, to the extent possible, the Licensor offers the Licensed Material as-is and as-available, and makes no representations or warranties of any kind concerning the Licensed Material, whether express, implied, statutory, or other. This includes, without limitation, warranties of title, merchantability, fitness for a particular purpose, non-infringement, absence of latent or other defects, accuracy, or the presence or absence of errors, whether or not known or discoverable. Where disclaimers of warranties are not allowed in full or in part, this disclaimer may not apply to You.
To the extent possible, in no event will the Licensor be liable to You on any legal theory (including, without limitation, negligence) or otherwise for any direct, special, indirect, incidental, consequential, punitive, exemplary, or other losses, costs, expenses, or damages arising out of this Public License or use of the Licensed Material, even if the Licensor has been advised of the possibility of such losses, costs, expenses, or damages. Where a limitation of liability is not allowed in full or in part, this limitation may not apply to You.
The disclaimer of warranties and limitation of liability provided above shall be interpreted in a manner that, to the extent possible, most closely approximates an absolute disclaimer and waiver of all liability.
Section 6 – Term and Termination.

This Public License applies for the term of the Copyright and Similar Rights licensed here. However, if You fail to comply with this Public License, then Your rights under this Public License terminate automatically.
Where Your right to use the Licensed Material has terminated under Section 6(a), it reinstates:

automatically as of the date the violation is cured, provided it is cured within 30 days of Your discovery of the violation; or
upon express reinstatement by the Licensor.
For the avoidance of doubt, this Section 6(b) does not affect any right the Licensor may have to seek remedies for Your violations of this Public License.
For the avoidance of doubt, the Licensor may also offer the Licensed Material under separate terms or conditions or stop distributing the Licensed Material at any time; however, doing so will not terminate this Public License.
Sections 1, 5, 6, 7, and 8 survive termination of this Public License.
Section 7 – Other Terms and Conditions.

The Licensor shall not be bound by any additional or different terms or conditions communicated by You unless expressly agreed.
Any arrangements, understandings, or agreements regarding the Licensed Material not stated herein are separate from and independent of the terms and conditions of this Public License.
Section 8 – Interpretation.

For the avoidance of doubt, this Public License does not, and shall not be interpreted to, reduce, limit, restrict, or impose conditions on any use of the Licensed Material that could lawfully be made without permission under this Public License.
To the extent possible, if any provision of this Public License is deemed unenforceable, it shall be automatically reformed to the minimum extent necessary to make it enforceable. If the provision cannot be reformed, it shall be severed from this Public License without affecting the enforceability of the remaining terms and conditions.
No term or condition of this Public License will be waived and no failure to comply consented to unless expressly agreed to by the Licensor.
Nothing in this Public License constitutes or may be interpreted as a limitation upon, or waiver of, any privileges and immunities that apply to the Licensor or You, including from the legal processes of any jurisdiction or authority.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# D-Lab's Python Machine Learning Workshop

[![Datahub](https://img.shields.io/badge/launch-datahub-blue)](https://dlab.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2Fdlab-berkeley%2FPython-Machine-Learning&urlpath=lab%2Ftree%2FPython-Machine-Learning%2F&branch=main) [![Binder](http://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/dlab-berkeley/Python-Machine-Learning/HEAD)

This repository contains the materials for D-Lab’s Python Machine Learning workshop.

### Prerequisites
Prior experience with [Python Fundamentals](https://github.com/dlab-berkeley/Python-Fundamentals), [Python Data Visualization](https://github.com/dlab-berkeley/Python-Data-Visualization), and [Python Data Wrangling](https://github.com/dlab-berkeley/Python-Data-Wrangling) is assumed.

Check D-Lab's [Learning Pathways](https://dlab-berkeley.github.io/dlab-workshops/python_path.html) to figure out which of our workshops to take!

## Workshop Goals

In this workshop, we provide an introduction to machine learning in Python. First, we'll cover some machine learning basics, including its foundational principles. Then, we'll dive into code to understand how to perform regression, regularization, preprocessing, and classification. Additional components of the workshop explore building machine learning pipelines and unsupervised learning. We'll demonstrate how to perform these tasks using `scikit-learn`, the main package used for machine learning in Python.

This workshop is divided into the following parts:

1. **Part 1: Regression and Regularization.** How can we use linear models to predict continuous outputs, and how can we prevent them from overfitting?
2. **Part 2: Preprocessing and Classification.** What preprocessing steps do we need to take before fitting models? Then, how do we perform classification?
3. **Part 3: Machine Learning Pipeline.** We'll walk through a machine learning task, from exploratory data analysis to building an entire machine learning pipeline.

The first two parts are taught as a joint series. Part 3 can be attended on its own, but prior knowledge of Parts 1 and 2 is assumed. A sketch of the kind of workflow the workshop builds up to is shown below.
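
As a preview of where Parts 1 and 2 are headed, here is a minimal sketch of a `scikit-learn` workflow on the `data/auto-mpg.csv` file included in this repository. It is illustrative rather than official workshop code, and it assumes the handful of missing `horsepower` values noted in `data/auto-mpg.names` are encoded as `?`:

```python
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Load the data; assume missing horsepower values are encoded as "?" and
# drop those rows (the lessons treat missing data more carefully).
mpg = pd.read_csv("data/auto-mpg.csv", na_values="?").dropna()

# Predict fuel efficiency from a few numeric features.
X = mpg[["cylinders", "displacement", "horsepower", "weight", "acceleration"]]
y = mpg["mpg"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Scale the features, then fit a regularized (ridge) linear regression.
model = make_pipeline(StandardScaler(), Ridge(alpha=1.0))
model.fit(X_train, y_train)
print("Held-out R^2:", model.score(X_test, y_test))
```

The lessons build up each of these steps (train/test splits, scaling, choosing the regularization strength) one at a time rather than jumping straight to a pipeline.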

## Installation Instructions

Anaconda is package management software that makes it easy to run Python and Jupyter notebooks. Installing Anaconda is the easiest way to make sure you have all the necessary software to run the materials for this workshop. Complete the following steps:

1. [Download and install Anaconda (Python 3.9 distribution)](https://www.anaconda.com/products/individual). Click "Download" and then click the 64-bit "Graphical Installer" for your current operating system.

2. Download the [Python-Machine-Learning workshop materials](https://github.com/dlab-berkeley/Python-Machine-Learning):

    * Click the green "Code" button in the top right of the repository information.
    * Click "Download ZIP".
    * Extract this file to a folder on your computer where you can easily access it (we recommend Desktop).

3. Optional: if you're familiar with `git`, you can instead clone this repository by opening a terminal and entering `git clone git@github.com:dlab-berkeley/Python-Machine-Learning.git`.

## Run the code

Now that you have all the required software and materials, you need to run the code:

1. Open the Anaconda Navigator application. You should see the green snake logo appear on your screen. Note that this can take a few minutes to load the first time.

2. Click the "Launch" button under "Jupyter Notebook" and navigate through your file system to the `Python-Machine-Learning` folder you downloaded above.

3. Click `00_introduction.md` to begin.

4. Press Shift + Enter (or Ctrl + Enter) to run a cell.
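
To check that the core libraries are available in your environment, you can run a quick sanity check in a notebook cell. This is a minimal sketch; `requirements.txt` in this repository is the authoritative list of dependencies:

```python
# Confirm the main libraries used throughout the lessons are importable.
import numpy as np
import pandas as pd
import sklearn

print("numpy:", np.__version__)
print("pandas:", pd.__version__)
print("scikit-learn:", sklearn.__version__)
```

If any of these imports fail, try reinstalling Anaconda or installing the workshop dependencies with `pip install -r requirements.txt`.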

## Is Python not working on your computer?

If you have a Berkeley CalNet ID, you can run these lessons on UC Berkeley's DataHub by clicking this button:

[![Datahub](https://img.shields.io/badge/launch-datahub-blue)](https://dlab.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2Fdlab-berkeley%2FPython-Machine-Learning&urlpath=lab%2Ftree%2FPython-Machine-Learning%2F&branch=main)

By using this link, you can save your work and come back to it at any time. When you want to return to your saved work, just go straight to DataHub ([https://datahub.berkeley.edu](https://datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2Fdlab-berkeley%2FPython-Machine-Learning&urlpath=tree%2FPython-Machine-Learning%2F&branch=main)), sign in, and click on the `Python-Machine-Learning` folder.

If you don't have a Berkeley CalNet ID, you can still run these lessons in the cloud by clicking this button:

[![Binder](http://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/dlab-berkeley/Python-Machine-Learning/main?urlpath=tree)

Note that Binder, unfortunately, does not save your work.

# Additional Resources

Check out the following resources to learn more about machine learning:

* [scikit-learn Tutorials](https://scikit-learn.org/stable/tutorial/index.html).
* [Stanford's CS229 course materials](https://cs229.stanford.edu/syllabus.html).
* [IBM's free course on machine learning in Python](https://www.edx.org/course/machine-learning-with-python-a-practical-introduct).
* The [Elements of AI course](https://course.elementsofai.com/).

# About the UC Berkeley D-Lab

D-Lab works with Berkeley faculty, research staff, and students to advance data-intensive social science and humanities research. Our goal at D-Lab is to provide practical training, staff support, resources, and space to enable you to use Python for your own research applications. Our services cater to all skill levels, and no programming, statistical, or computer science background is necessary. We offer these services in the form of workshops, one-to-one consulting, and working groups that cover a variety of research topics, digital tools, and programming languages.

Visit the [D-Lab homepage](https://dlab.berkeley.edu/) to learn more about us. You can view our [calendar](https://dlab.berkeley.edu/events/calendar) for upcoming events, learn about how to utilize our [consulting](https://dlab.berkeley.edu/consulting) and [data](https://dlab.berkeley.edu/data) services, and check out upcoming [workshops](https://dlab.berkeley.edu/events/workshops).

# Other D-Lab Python Workshops

Here are other Python workshops offered by the D-Lab:

## Basic competency

* [Python Fundamentals](https://github.com/dlab-berkeley/python-fundamentals)
* [Introduction to Pandas](https://github.com/dlab-berkeley/introduction-to-pandas)
* [Geospatial Fundamentals in Python](https://github.com/dlab-berkeley/Geospatial-Fundamentals-in-Python)
* [Python Visualization](https://github.com/dlab-berkeley/Python-Data-Visualization)

## Intermediate/advanced competency

* [Python Text Analysis](https://github.com/dlab-berkeley/Python-Text-Analysis)
* [Python Deep Learning](https://github.com/dlab-berkeley/Python-Deep-Learning)
* [Fairness and Bias in Machine Learning](https://github.com/dlab-berkeley/fairML)

# Contributors

* Pratik Sachdeva
* Emily Grabowski
* George McIntire
* Sam Temlock
* Samy Abdel-Ghaffar
* Sean Perez
* Christopher Hench

--------------------------------------------------------------------------------
/data/auto-mpg.csv:
--------------------------------------------------------------------------------
1 | car name,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
2 | chevrolet chevelle malibu,18.0,8,307.0,130,3504,12.0,70,1
3 | buick skylark 320,15.0,8,350.0,165,3693,11.5,70,1
4 | plymouth satellite,18.0,8,318.0,150,3436,11.0,70,1
5 | amc rebel sst,16.0,8,304.0,150,3433,12.0,70,1
6 | ford torino,17.0,8,302.0,140,3449,10.5,70,1
7 | ford galaxie 500,15.0,8,429.0,198,4341,10.0,70,1
8 | chevrolet impala,14.0,8,454.0,220,4354,9.0,70,1
9 | plymouth fury iii,14.0,8,440.0,215,4312,8.5,70,1
10 | pontiac catalina,14.0,8,455.0,225,4425,10.0,70,1
11 | amc ambassador dpl,15.0,8,390.0,190,3850,8.5,70,1
12 | dodge challenger se,15.0,8,383.0,170,3563,10.0,70,1
13 | plymouth 'cuda 340,14.0,8,340.0,160,3609,8.0,70,1
14 | chevrolet monte carlo,15.0,8,400.0,150,3761,9.5,70,1
15 | buick estate wagon (sw),14.0,8,455.0,225,3086,10.0,70,1
16 | toyota corona mark ii,24.0,4,113.0,95,2372,15.0,70,3
17 | plymouth duster,22.0,6,198.0,95,2833,15.5,70,1
18 | amc hornet,18.0,6,199.0,97,2774,15.5,70,1
19 | ford maverick,21.0,6,200.0,85,2587,16.0,70,1
20 | datsun pl510,27.0,4,97.0,88,2130,14.5,70,3
21 | volkswagen 1131
deluxe sedan,26.0,4,97.0,46,1835,20.5,70,2 22 | peugeot 504,25.0,4,110.0,87,2672,17.5,70,2 23 | audi 100 ls,24.0,4,107.0,90,2430,14.5,70,2 24 | saab 99e,25.0,4,104.0,95,2375,17.5,70,2 25 | bmw 2002,26.0,4,121.0,113,2234,12.5,70,2 26 | amc gremlin,21.0,6,199.0,90,2648,15.0,70,1 27 | ford f250,10.0,8,360.0,215,4615,14.0,70,1 28 | chevy c20,10.0,8,307.0,200,4376,15.0,70,1 29 | dodge d200,11.0,8,318.0,210,4382,13.5,70,1 30 | hi 1200d,9.0,8,304.0,193,4732,18.5,70,1 31 | datsun pl510,27.0,4,97.0,88,2130,14.5,71,3 32 | chevrolet vega 2300,28.0,4,140.0,90,2264,15.5,71,1 33 | toyota corona,25.0,4,113.0,95,2228,14.0,71,3 34 | amc gremlin,19.0,6,232.0,100,2634,13.0,71,1 35 | plymouth satellite custom,16.0,6,225.0,105,3439,15.5,71,1 36 | chevrolet chevelle malibu,17.0,6,250.0,100,3329,15.5,71,1 37 | ford torino 500,19.0,6,250.0,88,3302,15.5,71,1 38 | amc matador,18.0,6,232.0,100,3288,15.5,71,1 39 | chevrolet impala,14.0,8,350.0,165,4209,12.0,71,1 40 | pontiac catalina brougham,14.0,8,400.0,175,4464,11.5,71,1 41 | ford galaxie 500,14.0,8,351.0,153,4154,13.5,71,1 42 | plymouth fury iii,14.0,8,318.0,150,4096,13.0,71,1 43 | dodge monaco (sw),12.0,8,383.0,180,4955,11.5,71,1 44 | ford country squire (sw),13.0,8,400.0,170,4746,12.0,71,1 45 | pontiac safari (sw),13.0,8,400.0,175,5140,12.0,71,1 46 | amc hornet sportabout (sw),18.0,6,258.0,110,2962,13.5,71,1 47 | chevrolet vega (sw),22.0,4,140.0,72,2408,19.0,71,1 48 | pontiac firebird,19.0,6,250.0,100,3282,15.0,71,1 49 | ford mustang,18.0,6,250.0,88,3139,14.5,71,1 50 | mercury capri 2000,23.0,4,122.0,86,2220,14.0,71,1 51 | opel 1900,28.0,4,116.0,90,2123,14.0,71,2 52 | peugeot 304,30.0,4,79.0,70,2074,19.5,71,2 53 | fiat 124b,30.0,4,88.0,76,2065,14.5,71,2 54 | toyota corolla 1200,31.0,4,71.0,65,1773,19.0,71,3 55 | datsun 1200,35.0,4,72.0,69,1613,18.0,71,3 56 | volkswagen model 111,27.0,4,97.0,60,1834,19.0,71,2 57 | plymouth cricket,26.0,4,91.0,70,1955,20.5,71,1 58 | toyota corona hardtop,24.0,4,113.0,95,2278,15.5,72,3 59 | dodge colt hardtop,25.0,4,97.5,80,2126,17.0,72,1 60 | volkswagen type 3,23.0,4,97.0,54,2254,23.5,72,2 61 | chevrolet vega,20.0,4,140.0,90,2408,19.5,72,1 62 | ford pinto runabout,21.0,4,122.0,86,2226,16.5,72,1 63 | chevrolet impala,13.0,8,350.0,165,4274,12.0,72,1 64 | pontiac catalina,14.0,8,400.0,175,4385,12.0,72,1 65 | plymouth fury iii,15.0,8,318.0,150,4135,13.5,72,1 66 | ford galaxie 500,14.0,8,351.0,153,4129,13.0,72,1 67 | amc ambassador sst,17.0,8,304.0,150,3672,11.5,72,1 68 | mercury marquis,11.0,8,429.0,208,4633,11.0,72,1 69 | buick lesabre custom,13.0,8,350.0,155,4502,13.5,72,1 70 | oldsmobile delta 88 royale,12.0,8,350.0,160,4456,13.5,72,1 71 | chrysler newport royal,13.0,8,400.0,190,4422,12.5,72,1 72 | mazda rx2 coupe,19.0,3,70.0,97,2330,13.5,72,3 73 | amc matador (sw),15.0,8,304.0,150,3892,12.5,72,1 74 | chevrolet chevelle concours (sw),13.0,8,307.0,130,4098,14.0,72,1 75 | ford gran torino (sw),13.0,8,302.0,140,4294,16.0,72,1 76 | plymouth satellite custom (sw),14.0,8,318.0,150,4077,14.0,72,1 77 | volvo 145e (sw),18.0,4,121.0,112,2933,14.5,72,2 78 | volkswagen 411 (sw),22.0,4,121.0,76,2511,18.0,72,2 79 | peugeot 504 (sw),21.0,4,120.0,87,2979,19.5,72,2 80 | renault 12 (sw),26.0,4,96.0,69,2189,18.0,72,2 81 | ford pinto (sw),22.0,4,122.0,86,2395,16.0,72,1 82 | datsun 510 (sw),28.0,4,97.0,92,2288,17.0,72,3 83 | toyouta corona mark ii (sw),23.0,4,120.0,97,2506,14.5,72,3 84 | dodge colt (sw),28.0,4,98.0,80,2164,15.0,72,1 85 | toyota corolla 1600 (sw),27.0,4,97.0,88,2100,16.5,72,3 86 | buick century 350,13.0,8,350.0,175,4100,13.0,73,1 87 | 
amc matador,14.0,8,304.0,150,3672,11.5,73,1 88 | chevrolet malibu,13.0,8,350.0,145,3988,13.0,73,1 89 | ford gran torino,14.0,8,302.0,137,4042,14.5,73,1 90 | dodge coronet custom,15.0,8,318.0,150,3777,12.5,73,1 91 | mercury marquis brougham,12.0,8,429.0,198,4952,11.5,73,1 92 | chevrolet caprice classic,13.0,8,400.0,150,4464,12.0,73,1 93 | ford ltd,13.0,8,351.0,158,4363,13.0,73,1 94 | plymouth fury gran sedan,14.0,8,318.0,150,4237,14.5,73,1 95 | chrysler new yorker brougham,13.0,8,440.0,215,4735,11.0,73,1 96 | buick electra 225 custom,12.0,8,455.0,225,4951,11.0,73,1 97 | amc ambassador brougham,13.0,8,360.0,175,3821,11.0,73,1 98 | plymouth valiant,18.0,6,225.0,105,3121,16.5,73,1 99 | chevrolet nova custom,16.0,6,250.0,100,3278,18.0,73,1 100 | amc hornet,18.0,6,232.0,100,2945,16.0,73,1 101 | ford maverick,18.0,6,250.0,88,3021,16.5,73,1 102 | plymouth duster,23.0,6,198.0,95,2904,16.0,73,1 103 | volkswagen super beetle,26.0,4,97.0,46,1950,21.0,73,2 104 | chevrolet impala,11.0,8,400.0,150,4997,14.0,73,1 105 | ford country,12.0,8,400.0,167,4906,12.5,73,1 106 | plymouth custom suburb,13.0,8,360.0,170,4654,13.0,73,1 107 | oldsmobile vista cruiser,12.0,8,350.0,180,4499,12.5,73,1 108 | amc gremlin,18.0,6,232.0,100,2789,15.0,73,1 109 | toyota carina,20.0,4,97.0,88,2279,19.0,73,3 110 | chevrolet vega,21.0,4,140.0,72,2401,19.5,73,1 111 | datsun 610,22.0,4,108.0,94,2379,16.5,73,3 112 | maxda rx3,18.0,3,70.0,90,2124,13.5,73,3 113 | ford pinto,19.0,4,122.0,85,2310,18.5,73,1 114 | mercury capri v6,21.0,6,155.0,107,2472,14.0,73,1 115 | fiat 124 sport coupe,26.0,4,98.0,90,2265,15.5,73,2 116 | chevrolet monte carlo s,15.0,8,350.0,145,4082,13.0,73,1 117 | pontiac grand prix,16.0,8,400.0,230,4278,9.5,73,1 118 | fiat 128,29.0,4,68.0,49,1867,19.5,73,2 119 | opel manta,24.0,4,116.0,75,2158,15.5,73,2 120 | audi 100ls,20.0,4,114.0,91,2582,14.0,73,2 121 | volvo 144ea,19.0,4,121.0,112,2868,15.5,73,2 122 | dodge dart custom,15.0,8,318.0,150,3399,11.0,73,1 123 | saab 99le,24.0,4,121.0,110,2660,14.0,73,2 124 | toyota mark ii,20.0,6,156.0,122,2807,13.5,73,3 125 | oldsmobile omega,11.0,8,350.0,180,3664,11.0,73,1 126 | plymouth duster,20.0,6,198.0,95,3102,16.5,74,1 127 | amc hornet,19.0,6,232.0,100,2901,16.0,74,1 128 | chevrolet nova,15.0,6,250.0,100,3336,17.0,74,1 129 | datsun b210,31.0,4,79.0,67,1950,19.0,74,3 130 | ford pinto,26.0,4,122.0,80,2451,16.5,74,1 131 | toyota corolla 1200,32.0,4,71.0,65,1836,21.0,74,3 132 | chevrolet vega,25.0,4,140.0,75,2542,17.0,74,1 133 | chevrolet chevelle malibu classic,16.0,6,250.0,100,3781,17.0,74,1 134 | amc matador,16.0,6,258.0,110,3632,18.0,74,1 135 | plymouth satellite sebring,18.0,6,225.0,105,3613,16.5,74,1 136 | ford gran torino,16.0,8,302.0,140,4141,14.0,74,1 137 | buick century luxus (sw),13.0,8,350.0,150,4699,14.5,74,1 138 | dodge coronet custom (sw),14.0,8,318.0,150,4457,13.5,74,1 139 | ford gran torino (sw),14.0,8,302.0,140,4638,16.0,74,1 140 | amc matador (sw),14.0,8,304.0,150,4257,15.5,74,1 141 | audi fox,29.0,4,98.0,83,2219,16.5,74,2 142 | volkswagen dasher,26.0,4,79.0,67,1963,15.5,74,2 143 | opel manta,26.0,4,97.0,78,2300,14.5,74,2 144 | toyota corona,31.0,4,76.0,52,1649,16.5,74,3 145 | datsun 710,32.0,4,83.0,61,2003,19.0,74,3 146 | dodge colt,28.0,4,90.0,75,2125,14.5,74,1 147 | fiat 128,24.0,4,90.0,75,2108,15.5,74,2 148 | fiat 124 tc,26.0,4,116.0,75,2246,14.0,74,2 149 | honda civic,24.0,4,120.0,97,2489,15.0,74,3 150 | subaru,26.0,4,108.0,93,2391,15.5,74,3 151 | fiat x1.9,31.0,4,79.0,67,2000,16.0,74,2 152 | plymouth valiant custom,19.0,6,225.0,95,3264,16.0,75,1 153 | 
chevrolet nova,18.0,6,250.0,105,3459,16.0,75,1 154 | mercury monarch,15.0,6,250.0,72,3432,21.0,75,1 155 | ford maverick,15.0,6,250.0,72,3158,19.5,75,1 156 | pontiac catalina,16.0,8,400.0,170,4668,11.5,75,1 157 | chevrolet bel air,15.0,8,350.0,145,4440,14.0,75,1 158 | plymouth grand fury,16.0,8,318.0,150,4498,14.5,75,1 159 | ford ltd,14.0,8,351.0,148,4657,13.5,75,1 160 | buick century,17.0,6,231.0,110,3907,21.0,75,1 161 | chevroelt chevelle malibu,16.0,6,250.0,105,3897,18.5,75,1 162 | amc matador,15.0,6,258.0,110,3730,19.0,75,1 163 | plymouth fury,18.0,6,225.0,95,3785,19.0,75,1 164 | buick skyhawk,21.0,6,231.0,110,3039,15.0,75,1 165 | chevrolet monza 2+2,20.0,8,262.0,110,3221,13.5,75,1 166 | ford mustang ii,13.0,8,302.0,129,3169,12.0,75,1 167 | toyota corolla,29.0,4,97.0,75,2171,16.0,75,3 168 | ford pinto,23.0,4,140.0,83,2639,17.0,75,1 169 | amc gremlin,20.0,6,232.0,100,2914,16.0,75,1 170 | pontiac astro,23.0,4,140.0,78,2592,18.5,75,1 171 | toyota corona,24.0,4,134.0,96,2702,13.5,75,3 172 | volkswagen dasher,25.0,4,90.0,71,2223,16.5,75,2 173 | datsun 710,24.0,4,119.0,97,2545,17.0,75,3 174 | ford pinto,18.0,6,171.0,97,2984,14.5,75,1 175 | volkswagen rabbit,29.0,4,90.0,70,1937,14.0,75,2 176 | amc pacer,19.0,6,232.0,90,3211,17.0,75,1 177 | audi 100ls,23.0,4,115.0,95,2694,15.0,75,2 178 | peugeot 504,23.0,4,120.0,88,2957,17.0,75,2 179 | volvo 244dl,22.0,4,121.0,98,2945,14.5,75,2 180 | saab 99le,25.0,4,121.0,115,2671,13.5,75,2 181 | honda civic cvcc,33.0,4,91.0,53,1795,17.5,75,3 182 | fiat 131,28.0,4,107.0,86,2464,15.5,76,2 183 | opel 1900,25.0,4,116.0,81,2220,16.9,76,2 184 | capri ii,25.0,4,140.0,92,2572,14.9,76,1 185 | dodge colt,26.0,4,98.0,79,2255,17.7,76,1 186 | renault 12tl,27.0,4,101.0,83,2202,15.3,76,2 187 | chevrolet chevelle malibu classic,17.5,8,305.0,140,4215,13.0,76,1 188 | dodge coronet brougham,16.0,8,318.0,150,4190,13.0,76,1 189 | amc matador,15.5,8,304.0,120,3962,13.9,76,1 190 | ford gran torino,14.5,8,351.0,152,4215,12.8,76,1 191 | plymouth valiant,22.0,6,225.0,100,3233,15.4,76,1 192 | chevrolet nova,22.0,6,250.0,105,3353,14.5,76,1 193 | ford maverick,24.0,6,200.0,81,3012,17.6,76,1 194 | amc hornet,22.5,6,232.0,90,3085,17.6,76,1 195 | chevrolet chevette,29.0,4,85.0,52,2035,22.2,76,1 196 | chevrolet woody,24.5,4,98.0,60,2164,22.1,76,1 197 | vw rabbit,29.0,4,90.0,70,1937,14.2,76,2 198 | honda civic,33.0,4,91.0,53,1795,17.4,76,3 199 | dodge aspen se,20.0,6,225.0,100,3651,17.7,76,1 200 | ford granada ghia,18.0,6,250.0,78,3574,21.0,76,1 201 | pontiac ventura sj,18.5,6,250.0,110,3645,16.2,76,1 202 | amc pacer d/l,17.5,6,258.0,95,3193,17.8,76,1 203 | volkswagen rabbit,29.5,4,97.0,71,1825,12.2,76,2 204 | datsun b-210,32.0,4,85.0,70,1990,17.0,76,3 205 | toyota corolla,28.0,4,97.0,75,2155,16.4,76,3 206 | ford pinto,26.5,4,140.0,72,2565,13.6,76,1 207 | volvo 245,20.0,4,130.0,102,3150,15.7,76,2 208 | plymouth volare premier v8,13.0,8,318.0,150,3940,13.2,76,1 209 | peugeot 504,19.0,4,120.0,88,3270,21.9,76,2 210 | toyota mark ii,19.0,6,156.0,108,2930,15.5,76,3 211 | mercedes-benz 280s,16.5,6,168.0,120,3820,16.7,76,2 212 | cadillac seville,16.5,8,350.0,180,4380,12.1,76,1 213 | chevy c10,13.0,8,350.0,145,4055,12.0,76,1 214 | ford f108,13.0,8,302.0,130,3870,15.0,76,1 215 | dodge d100,13.0,8,318.0,150,3755,14.0,76,1 216 | honda accord cvcc,31.5,4,98.0,68,2045,18.5,77,3 217 | buick opel isuzu deluxe,30.0,4,111.0,80,2155,14.8,77,1 218 | renault 5 gtl,36.0,4,79.0,58,1825,18.6,77,2 219 | plymouth arrow gs,25.5,4,122.0,96,2300,15.5,77,1 220 | datsun f-10 hatchback,33.5,4,85.0,70,1945,16.8,77,3 221 | 
chevrolet caprice classic,17.5,8,305.0,145,3880,12.5,77,1 222 | oldsmobile cutlass supreme,17.0,8,260.0,110,4060,19.0,77,1 223 | dodge monaco brougham,15.5,8,318.0,145,4140,13.7,77,1 224 | mercury cougar brougham,15.0,8,302.0,130,4295,14.9,77,1 225 | chevrolet concours,17.5,6,250.0,110,3520,16.4,77,1 226 | buick skylark,20.5,6,231.0,105,3425,16.9,77,1 227 | plymouth volare custom,19.0,6,225.0,100,3630,17.7,77,1 228 | ford granada,18.5,6,250.0,98,3525,19.0,77,1 229 | pontiac grand prix lj,16.0,8,400.0,180,4220,11.1,77,1 230 | chevrolet monte carlo landau,15.5,8,350.0,170,4165,11.4,77,1 231 | chrysler cordoba,15.5,8,400.0,190,4325,12.2,77,1 232 | ford thunderbird,16.0,8,351.0,149,4335,14.5,77,1 233 | volkswagen rabbit custom,29.0,4,97.0,78,1940,14.5,77,2 234 | pontiac sunbird coupe,24.5,4,151.0,88,2740,16.0,77,1 235 | toyota corolla liftback,26.0,4,97.0,75,2265,18.2,77,3 236 | ford mustang ii 2+2,25.5,4,140.0,89,2755,15.8,77,1 237 | chevrolet chevette,30.5,4,98.0,63,2051,17.0,77,1 238 | dodge colt m/m,33.5,4,98.0,83,2075,15.9,77,1 239 | subaru dl,30.0,4,97.0,67,1985,16.4,77,3 240 | volkswagen dasher,30.5,4,97.0,78,2190,14.1,77,2 241 | datsun 810,22.0,6,146.0,97,2815,14.5,77,3 242 | bmw 320i,21.5,4,121.0,110,2600,12.8,77,2 243 | mazda rx-4,21.5,3,80.0,110,2720,13.5,77,3 244 | volkswagen rabbit custom diesel,43.1,4,90.0,48,1985,21.5,78,2 245 | ford fiesta,36.1,4,98.0,66,1800,14.4,78,1 246 | mazda glc deluxe,32.8,4,78.0,52,1985,19.4,78,3 247 | datsun b210 gx,39.4,4,85.0,70,2070,18.6,78,3 248 | honda civic cvcc,36.1,4,91.0,60,1800,16.4,78,3 249 | oldsmobile cutlass salon brougham,19.9,8,260.0,110,3365,15.5,78,1 250 | dodge diplomat,19.4,8,318.0,140,3735,13.2,78,1 251 | mercury monarch ghia,20.2,8,302.0,139,3570,12.8,78,1 252 | pontiac phoenix lj,19.2,6,231.0,105,3535,19.2,78,1 253 | chevrolet malibu,20.5,6,200.0,95,3155,18.2,78,1 254 | ford fairmont (auto),20.2,6,200.0,85,2965,15.8,78,1 255 | ford fairmont (man),25.1,4,140.0,88,2720,15.4,78,1 256 | plymouth volare,20.5,6,225.0,100,3430,17.2,78,1 257 | amc concord,19.4,6,232.0,90,3210,17.2,78,1 258 | buick century special,20.6,6,231.0,105,3380,15.8,78,1 259 | mercury zephyr,20.8,6,200.0,85,3070,16.7,78,1 260 | dodge aspen,18.6,6,225.0,110,3620,18.7,78,1 261 | amc concord d/l,18.1,6,258.0,120,3410,15.1,78,1 262 | chevrolet monte carlo landau,19.2,8,305.0,145,3425,13.2,78,1 263 | buick regal sport coupe (turbo),17.7,6,231.0,165,3445,13.4,78,1 264 | ford futura,18.1,8,302.0,139,3205,11.2,78,1 265 | dodge magnum xe,17.5,8,318.0,140,4080,13.7,78,1 266 | chevrolet chevette,30.0,4,98.0,68,2155,16.5,78,1 267 | toyota corona,27.5,4,134.0,95,2560,14.2,78,3 268 | datsun 510,27.2,4,119.0,97,2300,14.7,78,3 269 | dodge omni,30.9,4,105.0,75,2230,14.5,78,1 270 | toyota celica gt liftback,21.1,4,134.0,95,2515,14.8,78,3 271 | plymouth sapporo,23.2,4,156.0,105,2745,16.7,78,1 272 | oldsmobile starfire sx,23.8,4,151.0,85,2855,17.6,78,1 273 | datsun 200-sx,23.9,4,119.0,97,2405,14.9,78,3 274 | audi 5000,20.3,5,131.0,103,2830,15.9,78,2 275 | volvo 264gl,17.0,6,163.0,125,3140,13.6,78,2 276 | saab 99gle,21.6,4,121.0,115,2795,15.7,78,2 277 | peugeot 604sl,16.2,6,163.0,133,3410,15.8,78,2 278 | volkswagen scirocco,31.5,4,89.0,71,1990,14.9,78,2 279 | honda accord lx,29.5,4,98.0,68,2135,16.6,78,3 280 | pontiac lemans v6,21.5,6,231.0,115,3245,15.4,79,1 281 | mercury zephyr 6,19.8,6,200.0,85,2990,18.2,79,1 282 | ford fairmont 4,22.3,4,140.0,88,2890,17.3,79,1 283 | amc concord dl 6,20.2,6,232.0,90,3265,18.2,79,1 284 | dodge aspen 6,20.6,6,225.0,110,3360,16.6,79,1 285 | chevrolet 
caprice classic,17.0,8,305.0,130,3840,15.4,79,1 286 | ford ltd landau,17.6,8,302.0,129,3725,13.4,79,1 287 | mercury grand marquis,16.5,8,351.0,138,3955,13.2,79,1 288 | dodge st. regis,18.2,8,318.0,135,3830,15.2,79,1 289 | buick estate wagon (sw),16.9,8,350.0,155,4360,14.9,79,1 290 | ford country squire (sw),15.5,8,351.0,142,4054,14.3,79,1 291 | chevrolet malibu classic (sw),19.2,8,267.0,125,3605,15.0,79,1 292 | chrysler lebaron town @ country (sw),18.5,8,360.0,150,3940,13.0,79,1 293 | vw rabbit custom,31.9,4,89.0,71,1925,14.0,79,2 294 | maxda glc deluxe,34.1,4,86.0,65,1975,15.2,79,3 295 | dodge colt hatchback custom,35.7,4,98.0,80,1915,14.4,79,1 296 | amc spirit dl,27.4,4,121.0,80,2670,15.0,79,1 297 | mercedes benz 300d,25.4,5,183.0,77,3530,20.1,79,2 298 | cadillac eldorado,23.0,8,350.0,125,3900,17.4,79,1 299 | peugeot 504,27.2,4,141.0,71,3190,24.8,79,2 300 | oldsmobile cutlass salon brougham,23.9,8,260.0,90,3420,22.2,79,1 301 | plymouth horizon,34.2,4,105.0,70,2200,13.2,79,1 302 | plymouth horizon tc3,34.5,4,105.0,70,2150,14.9,79,1 303 | datsun 210,31.8,4,85.0,65,2020,19.2,79,3 304 | fiat strada custom,37.3,4,91.0,69,2130,14.7,79,2 305 | buick skylark limited,28.4,4,151.0,90,2670,16.0,79,1 306 | chevrolet citation,28.8,6,173.0,115,2595,11.3,79,1 307 | oldsmobile omega brougham,26.8,6,173.0,115,2700,12.9,79,1 308 | pontiac phoenix,33.5,4,151.0,90,2556,13.2,79,1 309 | vw rabbit,41.5,4,98.0,76,2144,14.7,80,2 310 | toyota corolla tercel,38.1,4,89.0,60,1968,18.8,80,3 311 | chevrolet chevette,32.1,4,98.0,70,2120,15.5,80,1 312 | datsun 310,37.2,4,86.0,65,2019,16.4,80,3 313 | chevrolet citation,28.0,4,151.0,90,2678,16.5,80,1 314 | ford fairmont,26.4,4,140.0,88,2870,18.1,80,1 315 | amc concord,24.3,4,151.0,90,3003,20.1,80,1 316 | dodge aspen,19.1,6,225.0,90,3381,18.7,80,1 317 | audi 4000,34.3,4,97.0,78,2188,15.8,80,2 318 | toyota corona liftback,29.8,4,134.0,90,2711,15.5,80,3 319 | mazda 626,31.3,4,120.0,75,2542,17.5,80,3 320 | datsun 510 hatchback,37.0,4,119.0,92,2434,15.0,80,3 321 | toyota corolla,32.2,4,108.0,75,2265,15.2,80,3 322 | mazda glc,46.6,4,86.0,65,2110,17.9,80,3 323 | dodge colt,27.9,4,156.0,105,2800,14.4,80,1 324 | datsun 210,40.8,4,85.0,65,2110,19.2,80,3 325 | vw rabbit c (diesel),44.3,4,90.0,48,2085,21.7,80,2 326 | vw dasher (diesel),43.4,4,90.0,48,2335,23.7,80,2 327 | audi 5000s (diesel),36.4,5,121.0,67,2950,19.9,80,2 328 | mercedes-benz 240d,30.0,4,146.0,67,3250,21.8,80,2 329 | honda civic 1500 gl,44.6,4,91.0,67,1850,13.8,80,3 330 | subaru dl,33.8,4,97.0,67,2145,18.0,80,3 331 | vokswagen rabbit,29.8,4,89.0,62,1845,15.3,80,2 332 | datsun 280-zx,32.7,6,168.0,132,2910,11.4,80,3 333 | mazda rx-7 gs,23.7,3,70.0,100,2420,12.5,80,3 334 | triumph tr7 coupe,35.0,4,122.0,88,2500,15.1,80,2 335 | honda accord,32.4,4,107.0,72,2290,17.0,80,3 336 | plymouth reliant,27.2,4,135.0,84,2490,15.7,81,1 337 | buick skylark,26.6,4,151.0,84,2635,16.4,81,1 338 | dodge aries wagon (sw),25.8,4,156.0,92,2620,14.4,81,1 339 | chevrolet citation,23.5,6,173.0,110,2725,12.6,81,1 340 | plymouth reliant,30.0,4,135.0,84,2385,12.9,81,1 341 | toyota starlet,39.1,4,79.0,58,1755,16.9,81,3 342 | plymouth champ,39.0,4,86.0,64,1875,16.4,81,1 343 | honda civic 1300,35.1,4,81.0,60,1760,16.1,81,3 344 | subaru,32.3,4,97.0,67,2065,17.8,81,3 345 | datsun 210 mpg,37.0,4,85.0,65,1975,19.4,81,3 346 | toyota tercel,37.7,4,89.0,62,2050,17.3,81,3 347 | mazda glc 4,34.1,4,91.0,68,1985,16.0,81,3 348 | plymouth horizon 4,34.7,4,105.0,63,2215,14.9,81,1 349 | ford escort 4w,34.4,4,98.0,65,2045,16.2,81,1 350 | ford escort 
2h,29.9,4,98.0,65,2380,20.7,81,1 351 | volkswagen jetta,33.0,4,105.0,74,2190,14.2,81,2 352 | honda prelude,33.7,4,107.0,75,2210,14.4,81,3 353 | toyota corolla,32.4,4,108.0,75,2350,16.8,81,3 354 | datsun 200sx,32.9,4,119.0,100,2615,14.8,81,3 355 | mazda 626,31.6,4,120.0,74,2635,18.3,81,3 356 | peugeot 505s turbo diesel,28.1,4,141.0,80,3230,20.4,81,2 357 | volvo diesel,30.7,6,145.0,76,3160,19.6,81,2 358 | toyota cressida,25.4,6,168.0,116,2900,12.6,81,3 359 | datsun 810 maxima,24.2,6,146.0,120,2930,13.8,81,3 360 | buick century,22.4,6,231.0,110,3415,15.8,81,1 361 | oldsmobile cutlass ls,26.6,8,350.0,105,3725,19.0,81,1 362 | ford granada gl,20.2,6,200.0,88,3060,17.1,81,1 363 | chrysler lebaron salon,17.6,6,225.0,85,3465,16.6,81,1 364 | chevrolet cavalier,28.0,4,112.0,88,2605,19.6,82,1 365 | chevrolet cavalier wagon,27.0,4,112.0,88,2640,18.6,82,1 366 | chevrolet cavalier 2-door,34.0,4,112.0,88,2395,18.0,82,1 367 | pontiac j2000 se hatchback,31.0,4,112.0,85,2575,16.2,82,1 368 | dodge aries se,29.0,4,135.0,84,2525,16.0,82,1 369 | pontiac phoenix,27.0,4,151.0,90,2735,18.0,82,1 370 | ford fairmont futura,24.0,4,140.0,92,2865,16.4,82,1 371 | volkswagen rabbit l,36.0,4,105.0,74,1980,15.3,82,2 372 | mazda glc custom l,37.0,4,91.0,68,2025,18.2,82,3 373 | mazda glc custom,31.0,4,91.0,68,1970,17.6,82,3 374 | plymouth horizon miser,38.0,4,105.0,63,2125,14.7,82,1 375 | mercury lynx l,36.0,4,98.0,70,2125,17.3,82,1 376 | nissan stanza xe,36.0,4,120.0,88,2160,14.5,82,3 377 | honda accord,36.0,4,107.0,75,2205,14.5,82,3 378 | toyota corolla,34.0,4,108.0,70,2245,16.9,82,3 379 | honda civic,38.0,4,91.0,67,1965,15.0,82,3 380 | honda civic (auto),32.0,4,91.0,67,1965,15.7,82,3 381 | datsun 310 gx,38.0,4,91.0,67,1995,16.2,82,3 382 | buick century limited,25.0,6,181.0,110,2945,16.4,82,1 383 | oldsmobile cutlass ciera (diesel),38.0,6,262.0,85,3015,17.0,82,1 384 | chrysler lebaron medallion,26.0,4,156.0,92,2585,14.5,82,1 385 | ford granada l,22.0,6,232.0,112,2835,14.7,82,1 386 | toyota celica gt,32.0,4,144.0,96,2665,13.9,82,3 387 | dodge charger 2.2,36.0,4,135.0,84,2370,13.0,82,1 388 | chevrolet camaro,27.0,4,151.0,90,2950,17.3,82,1 389 | ford mustang gl,27.0,4,140.0,86,2790,15.6,82,1 390 | vw pickup,44.0,4,97.0,52,2130,24.6,82,2 391 | dodge rampage,32.0,4,135.0,84,2295,11.6,82,1 392 | ford ranger,28.0,4,120.0,79,2625,18.6,82,1 393 | chevy s-10,31.0,4,119.0,82,2720,19.4,82,1 394 | -------------------------------------------------------------------------------- /data/auto-mpg.names: -------------------------------------------------------------------------------- 1 | 1. Title: Auto-Mpg Data 2 | 3 | 2. Sources: 4 | (a) Origin: This dataset was taken from the StatLib library which is 5 | maintained at Carnegie Mellon University. The dataset was 6 | used in the 1983 American Statistical Association Exposition. 7 | (c) Date: July 7, 1993 8 | 9 | 3. Past Usage: 10 | - See 2b (above) 11 | - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. 12 | In Proceedings on the Tenth International Conference of Machine 13 | Learning, 236-243, University of Massachusetts, Amherst. Morgan 14 | Kaufmann. 15 | 16 | 4. Relevant Information: 17 | 18 | This dataset is a slightly modified version of the dataset provided in 19 | the StatLib library. In line with the use by Ross Quinlan (1993) in 20 | predicting the attribute "mpg", 8 of the original instances were removed 21 | because they had unknown values for the "mpg" attribute. The original 22 | dataset is available in the file "auto-mpg.data-original". 
23 | 24 | "The data concerns city-cycle fuel consumption in miles per gallon, 25 | to be predicted in terms of 3 multivalued discrete and 5 continuous 26 | attributes." (Quinlan, 1993) 27 | 28 | 5. Number of Instances: 398 29 | 30 | 6. Number of Attributes: 9 including the class attribute 31 | 32 | 7. Attribute Information: 33 | 34 | 1. mpg: continuous 35 | 2. cylinders: multi-valued discrete 36 | 3. displacement: continuous 37 | 4. horsepower: continuous 38 | 5. weight: continuous 39 | 6. acceleration: continuous 40 | 7. model year: multi-valued discrete 41 | 8. origin: multi-valued discrete 42 | 9. car name: string (unique for each instance) 43 | 44 | 8. Missing Attribute Values: horsepower has 6 missing values 45 | 46 | -------------------------------------------------------------------------------- /data/penguins.csv: -------------------------------------------------------------------------------- 1 | species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex 2 | Adelie,Torgersen,39.1,18.7,181,3750,MALE 3 | Adelie,Torgersen,39.5,17.4,186,3800,FEMALE 4 | Adelie,Torgersen,40.3,18,195,3250,FEMALE 5 | Adelie,Torgersen,NA,NA,NA,NA,NA 6 | Adelie,Torgersen,36.7,19.3,193,3450,FEMALE 7 | Adelie,Torgersen,39.3,20.6,190,3650,MALE 8 | Adelie,Torgersen,38.9,17.8,181,3625,FEMALE 9 | Adelie,Torgersen,39.2,19.6,195,4675,MALE 10 | Adelie,Torgersen,34.1,18.1,193,3475,NA 11 | Adelie,Torgersen,42,20.2,190,4250,NA 12 | Adelie,Torgersen,37.8,17.1,186,3300,NA 13 | Adelie,Torgersen,37.8,17.3,180,3700,NA 14 | Adelie,Torgersen,41.1,17.6,182,3200,FEMALE 15 | Adelie,Torgersen,38.6,21.2,191,3800,MALE 16 | Adelie,Torgersen,34.6,21.1,198,4400,MALE 17 | Adelie,Torgersen,36.6,17.8,185,3700,FEMALE 18 | Adelie,Torgersen,38.7,19,195,3450,FEMALE 19 | Adelie,Torgersen,42.5,20.7,197,4500,MALE 20 | Adelie,Torgersen,34.4,18.4,184,3325,FEMALE 21 | Adelie,Torgersen,46,21.5,194,4200,MALE 22 | Adelie,Biscoe,37.8,18.3,174,3400,FEMALE 23 | Adelie,Biscoe,37.7,18.7,180,3600,MALE 24 | Adelie,Biscoe,35.9,19.2,189,3800,FEMALE 25 | Adelie,Biscoe,38.2,18.1,185,3950,MALE 26 | Adelie,Biscoe,38.8,17.2,180,3800,MALE 27 | Adelie,Biscoe,35.3,18.9,187,3800,FEMALE 28 | Adelie,Biscoe,40.6,18.6,183,3550,MALE 29 | Adelie,Biscoe,40.5,17.9,187,3200,FEMALE 30 | Adelie,Biscoe,37.9,18.6,172,3150,FEMALE 31 | Adelie,Biscoe,40.5,18.9,180,3950,MALE 32 | Adelie,Dream,39.5,16.7,178,3250,FEMALE 33 | Adelie,Dream,37.2,18.1,178,3900,MALE 34 | Adelie,Dream,39.5,17.8,188,3300,FEMALE 35 | Adelie,Dream,40.9,18.9,184,3900,MALE 36 | Adelie,Dream,36.4,17,195,3325,FEMALE 37 | Adelie,Dream,39.2,21.1,196,4150,MALE 38 | Adelie,Dream,38.8,20,190,3950,MALE 39 | Adelie,Dream,42.2,18.5,180,3550,FEMALE 40 | Adelie,Dream,37.6,19.3,181,3300,FEMALE 41 | Adelie,Dream,39.8,19.1,184,4650,MALE 42 | Adelie,Dream,36.5,18,182,3150,FEMALE 43 | Adelie,Dream,40.8,18.4,195,3900,MALE 44 | Adelie,Dream,36,18.5,186,3100,FEMALE 45 | Adelie,Dream,44.1,19.7,196,4400,MALE 46 | Adelie,Dream,37,16.9,185,3000,FEMALE 47 | Adelie,Dream,39.6,18.8,190,4600,MALE 48 | Adelie,Dream,41.1,19,182,3425,MALE 49 | Adelie,Dream,37.5,18.9,179,2975,NA 50 | Adelie,Dream,36,17.9,190,3450,FEMALE 51 | Adelie,Dream,42.3,21.2,191,4150,MALE 52 | Adelie,Biscoe,39.6,17.7,186,3500,FEMALE 53 | Adelie,Biscoe,40.1,18.9,188,4300,MALE 54 | Adelie,Biscoe,35,17.9,190,3450,FEMALE 55 | Adelie,Biscoe,42,19.5,200,4050,MALE 56 | Adelie,Biscoe,34.5,18.1,187,2900,FEMALE 57 | Adelie,Biscoe,41.4,18.6,191,3700,MALE 58 | Adelie,Biscoe,39,17.5,186,3550,FEMALE 59 | Adelie,Biscoe,40.6,18.8,193,3800,MALE 60 | 
Adelie,Biscoe,36.5,16.6,181,2850,FEMALE 61 | Adelie,Biscoe,37.6,19.1,194,3750,MALE 62 | Adelie,Biscoe,35.7,16.9,185,3150,FEMALE 63 | Adelie,Biscoe,41.3,21.1,195,4400,MALE 64 | Adelie,Biscoe,37.6,17,185,3600,FEMALE 65 | Adelie,Biscoe,41.1,18.2,192,4050,MALE 66 | Adelie,Biscoe,36.4,17.1,184,2850,FEMALE 67 | Adelie,Biscoe,41.6,18,192,3950,MALE 68 | Adelie,Biscoe,35.5,16.2,195,3350,FEMALE 69 | Adelie,Biscoe,41.1,19.1,188,4100,MALE 70 | Adelie,Torgersen,35.9,16.6,190,3050,FEMALE 71 | Adelie,Torgersen,41.8,19.4,198,4450,MALE 72 | Adelie,Torgersen,33.5,19,190,3600,FEMALE 73 | Adelie,Torgersen,39.7,18.4,190,3900,MALE 74 | Adelie,Torgersen,39.6,17.2,196,3550,FEMALE 75 | Adelie,Torgersen,45.8,18.9,197,4150,MALE 76 | Adelie,Torgersen,35.5,17.5,190,3700,FEMALE 77 | Adelie,Torgersen,42.8,18.5,195,4250,MALE 78 | Adelie,Torgersen,40.9,16.8,191,3700,FEMALE 79 | Adelie,Torgersen,37.2,19.4,184,3900,MALE 80 | Adelie,Torgersen,36.2,16.1,187,3550,FEMALE 81 | Adelie,Torgersen,42.1,19.1,195,4000,MALE 82 | Adelie,Torgersen,34.6,17.2,189,3200,FEMALE 83 | Adelie,Torgersen,42.9,17.6,196,4700,MALE 84 | Adelie,Torgersen,36.7,18.8,187,3800,FEMALE 85 | Adelie,Torgersen,35.1,19.4,193,4200,MALE 86 | Adelie,Dream,37.3,17.8,191,3350,FEMALE 87 | Adelie,Dream,41.3,20.3,194,3550,MALE 88 | Adelie,Dream,36.3,19.5,190,3800,MALE 89 | Adelie,Dream,36.9,18.6,189,3500,FEMALE 90 | Adelie,Dream,38.3,19.2,189,3950,MALE 91 | Adelie,Dream,38.9,18.8,190,3600,FEMALE 92 | Adelie,Dream,35.7,18,202,3550,FEMALE 93 | Adelie,Dream,41.1,18.1,205,4300,MALE 94 | Adelie,Dream,34,17.1,185,3400,FEMALE 95 | Adelie,Dream,39.6,18.1,186,4450,MALE 96 | Adelie,Dream,36.2,17.3,187,3300,FEMALE 97 | Adelie,Dream,40.8,18.9,208,4300,MALE 98 | Adelie,Dream,38.1,18.6,190,3700,FEMALE 99 | Adelie,Dream,40.3,18.5,196,4350,MALE 100 | Adelie,Dream,33.1,16.1,178,2900,FEMALE 101 | Adelie,Dream,43.2,18.5,192,4100,MALE 102 | Adelie,Biscoe,35,17.9,192,3725,FEMALE 103 | Adelie,Biscoe,41,20,203,4725,MALE 104 | Adelie,Biscoe,37.7,16,183,3075,FEMALE 105 | Adelie,Biscoe,37.8,20,190,4250,MALE 106 | Adelie,Biscoe,37.9,18.6,193,2925,FEMALE 107 | Adelie,Biscoe,39.7,18.9,184,3550,MALE 108 | Adelie,Biscoe,38.6,17.2,199,3750,FEMALE 109 | Adelie,Biscoe,38.2,20,190,3900,MALE 110 | Adelie,Biscoe,38.1,17,181,3175,FEMALE 111 | Adelie,Biscoe,43.2,19,197,4775,MALE 112 | Adelie,Biscoe,38.1,16.5,198,3825,FEMALE 113 | Adelie,Biscoe,45.6,20.3,191,4600,MALE 114 | Adelie,Biscoe,39.7,17.7,193,3200,FEMALE 115 | Adelie,Biscoe,42.2,19.5,197,4275,MALE 116 | Adelie,Biscoe,39.6,20.7,191,3900,FEMALE 117 | Adelie,Biscoe,42.7,18.3,196,4075,MALE 118 | Adelie,Torgersen,38.6,17,188,2900,FEMALE 119 | Adelie,Torgersen,37.3,20.5,199,3775,MALE 120 | Adelie,Torgersen,35.7,17,189,3350,FEMALE 121 | Adelie,Torgersen,41.1,18.6,189,3325,MALE 122 | Adelie,Torgersen,36.2,17.2,187,3150,FEMALE 123 | Adelie,Torgersen,37.7,19.8,198,3500,MALE 124 | Adelie,Torgersen,40.2,17,176,3450,FEMALE 125 | Adelie,Torgersen,41.4,18.5,202,3875,MALE 126 | Adelie,Torgersen,35.2,15.9,186,3050,FEMALE 127 | Adelie,Torgersen,40.6,19,199,4000,MALE 128 | Adelie,Torgersen,38.8,17.6,191,3275,FEMALE 129 | Adelie,Torgersen,41.5,18.3,195,4300,MALE 130 | Adelie,Torgersen,39,17.1,191,3050,FEMALE 131 | Adelie,Torgersen,44.1,18,210,4000,MALE 132 | Adelie,Torgersen,38.5,17.9,190,3325,FEMALE 133 | Adelie,Torgersen,43.1,19.2,197,3500,MALE 134 | Adelie,Dream,36.8,18.5,193,3500,FEMALE 135 | Adelie,Dream,37.5,18.5,199,4475,MALE 136 | Adelie,Dream,38.1,17.6,187,3425,FEMALE 137 | Adelie,Dream,41.1,17.5,190,3900,MALE 138 | Adelie,Dream,35.6,17.5,191,3175,FEMALE 139 | 
Adelie,Dream,40.2,20.1,200,3975,MALE 140 | Adelie,Dream,37,16.5,185,3400,FEMALE 141 | Adelie,Dream,39.7,17.9,193,4250,MALE 142 | Adelie,Dream,40.2,17.1,193,3400,FEMALE 143 | Adelie,Dream,40.6,17.2,187,3475,MALE 144 | Adelie,Dream,32.1,15.5,188,3050,FEMALE 145 | Adelie,Dream,40.7,17,190,3725,MALE 146 | Adelie,Dream,37.3,16.8,192,3000,FEMALE 147 | Adelie,Dream,39,18.7,185,3650,MALE 148 | Adelie,Dream,39.2,18.6,190,4250,MALE 149 | Adelie,Dream,36.6,18.4,184,3475,FEMALE 150 | Adelie,Dream,36,17.8,195,3450,FEMALE 151 | Adelie,Dream,37.8,18.1,193,3750,MALE 152 | Adelie,Dream,36,17.1,187,3700,FEMALE 153 | Adelie,Dream,41.5,18.5,201,4000,MALE 154 | Chinstrap,Dream,46.5,17.9,192,3500,FEMALE 155 | Chinstrap,Dream,50,19.5,196,3900,MALE 156 | Chinstrap,Dream,51.3,19.2,193,3650,MALE 157 | Chinstrap,Dream,45.4,18.7,188,3525,FEMALE 158 | Chinstrap,Dream,52.7,19.8,197,3725,MALE 159 | Chinstrap,Dream,45.2,17.8,198,3950,FEMALE 160 | Chinstrap,Dream,46.1,18.2,178,3250,FEMALE 161 | Chinstrap,Dream,51.3,18.2,197,3750,MALE 162 | Chinstrap,Dream,46,18.9,195,4150,FEMALE 163 | Chinstrap,Dream,51.3,19.9,198,3700,MALE 164 | Chinstrap,Dream,46.6,17.8,193,3800,FEMALE 165 | Chinstrap,Dream,51.7,20.3,194,3775,MALE 166 | Chinstrap,Dream,47,17.3,185,3700,FEMALE 167 | Chinstrap,Dream,52,18.1,201,4050,MALE 168 | Chinstrap,Dream,45.9,17.1,190,3575,FEMALE 169 | Chinstrap,Dream,50.5,19.6,201,4050,MALE 170 | Chinstrap,Dream,50.3,20,197,3300,MALE 171 | Chinstrap,Dream,58,17.8,181,3700,FEMALE 172 | Chinstrap,Dream,46.4,18.6,190,3450,FEMALE 173 | Chinstrap,Dream,49.2,18.2,195,4400,MALE 174 | Chinstrap,Dream,42.4,17.3,181,3600,FEMALE 175 | Chinstrap,Dream,48.5,17.5,191,3400,MALE 176 | Chinstrap,Dream,43.2,16.6,187,2900,FEMALE 177 | Chinstrap,Dream,50.6,19.4,193,3800,MALE 178 | Chinstrap,Dream,46.7,17.9,195,3300,FEMALE 179 | Chinstrap,Dream,52,19,197,4150,MALE 180 | Chinstrap,Dream,50.5,18.4,200,3400,FEMALE 181 | Chinstrap,Dream,49.5,19,200,3800,MALE 182 | Chinstrap,Dream,46.4,17.8,191,3700,FEMALE 183 | Chinstrap,Dream,52.8,20,205,4550,MALE 184 | Chinstrap,Dream,40.9,16.6,187,3200,FEMALE 185 | Chinstrap,Dream,54.2,20.8,201,4300,MALE 186 | Chinstrap,Dream,42.5,16.7,187,3350,FEMALE 187 | Chinstrap,Dream,51,18.8,203,4100,MALE 188 | Chinstrap,Dream,49.7,18.6,195,3600,MALE 189 | Chinstrap,Dream,47.5,16.8,199,3900,FEMALE 190 | Chinstrap,Dream,47.6,18.3,195,3850,FEMALE 191 | Chinstrap,Dream,52,20.7,210,4800,MALE 192 | Chinstrap,Dream,46.9,16.6,192,2700,FEMALE 193 | Chinstrap,Dream,53.5,19.9,205,4500,MALE 194 | Chinstrap,Dream,49,19.5,210,3950,MALE 195 | Chinstrap,Dream,46.2,17.5,187,3650,FEMALE 196 | Chinstrap,Dream,50.9,19.1,196,3550,MALE 197 | Chinstrap,Dream,45.5,17,196,3500,FEMALE 198 | Chinstrap,Dream,50.9,17.9,196,3675,FEMALE 199 | Chinstrap,Dream,50.8,18.5,201,4450,MALE 200 | Chinstrap,Dream,50.1,17.9,190,3400,FEMALE 201 | Chinstrap,Dream,49,19.6,212,4300,MALE 202 | Chinstrap,Dream,51.5,18.7,187,3250,MALE 203 | Chinstrap,Dream,49.8,17.3,198,3675,FEMALE 204 | Chinstrap,Dream,48.1,16.4,199,3325,FEMALE 205 | Chinstrap,Dream,51.4,19,201,3950,MALE 206 | Chinstrap,Dream,45.7,17.3,193,3600,FEMALE 207 | Chinstrap,Dream,50.7,19.7,203,4050,MALE 208 | Chinstrap,Dream,42.5,17.3,187,3350,FEMALE 209 | Chinstrap,Dream,52.2,18.8,197,3450,MALE 210 | Chinstrap,Dream,45.2,16.6,191,3250,FEMALE 211 | Chinstrap,Dream,49.3,19.9,203,4050,MALE 212 | Chinstrap,Dream,50.2,18.8,202,3800,MALE 213 | Chinstrap,Dream,45.6,19.4,194,3525,FEMALE 214 | Chinstrap,Dream,51.9,19.5,206,3950,MALE 215 | Chinstrap,Dream,46.8,16.5,189,3650,FEMALE 216 | 
Chinstrap,Dream,45.7,17,195,3650,FEMALE 217 | Chinstrap,Dream,55.8,19.8,207,4000,MALE 218 | Chinstrap,Dream,43.5,18.1,202,3400,FEMALE 219 | Chinstrap,Dream,49.6,18.2,193,3775,MALE 220 | Chinstrap,Dream,50.8,19,210,4100,MALE 221 | Chinstrap,Dream,50.2,18.7,198,3775,FEMALE 222 | Gentoo,Biscoe,46.1,13.2,211,4500,FEMALE 223 | Gentoo,Biscoe,50,16.3,230,5700,MALE 224 | Gentoo,Biscoe,48.7,14.1,210,4450,FEMALE 225 | Gentoo,Biscoe,50,15.2,218,5700,MALE 226 | Gentoo,Biscoe,47.6,14.5,215,5400,MALE 227 | Gentoo,Biscoe,46.5,13.5,210,4550,FEMALE 228 | Gentoo,Biscoe,45.4,14.6,211,4800,FEMALE 229 | Gentoo,Biscoe,46.7,15.3,219,5200,MALE 230 | Gentoo,Biscoe,43.3,13.4,209,4400,FEMALE 231 | Gentoo,Biscoe,46.8,15.4,215,5150,MALE 232 | Gentoo,Biscoe,40.9,13.7,214,4650,FEMALE 233 | Gentoo,Biscoe,49,16.1,216,5550,MALE 234 | Gentoo,Biscoe,45.5,13.7,214,4650,FEMALE 235 | Gentoo,Biscoe,48.4,14.6,213,5850,MALE 236 | Gentoo,Biscoe,45.8,14.6,210,4200,FEMALE 237 | Gentoo,Biscoe,49.3,15.7,217,5850,MALE 238 | Gentoo,Biscoe,42,13.5,210,4150,FEMALE 239 | Gentoo,Biscoe,49.2,15.2,221,6300,MALE 240 | Gentoo,Biscoe,46.2,14.5,209,4800,FEMALE 241 | Gentoo,Biscoe,48.7,15.1,222,5350,MALE 242 | Gentoo,Biscoe,50.2,14.3,218,5700,MALE 243 | Gentoo,Biscoe,45.1,14.5,215,5000,FEMALE 244 | Gentoo,Biscoe,46.5,14.5,213,4400,FEMALE 245 | Gentoo,Biscoe,46.3,15.8,215,5050,MALE 246 | Gentoo,Biscoe,42.9,13.1,215,5000,FEMALE 247 | Gentoo,Biscoe,46.1,15.1,215,5100,MALE 248 | Gentoo,Biscoe,44.5,14.3,216,4100,NA 249 | Gentoo,Biscoe,47.8,15,215,5650,MALE 250 | Gentoo,Biscoe,48.2,14.3,210,4600,FEMALE 251 | Gentoo,Biscoe,50,15.3,220,5550,MALE 252 | Gentoo,Biscoe,47.3,15.3,222,5250,MALE 253 | Gentoo,Biscoe,42.8,14.2,209,4700,FEMALE 254 | Gentoo,Biscoe,45.1,14.5,207,5050,FEMALE 255 | Gentoo,Biscoe,59.6,17,230,6050,MALE 256 | Gentoo,Biscoe,49.1,14.8,220,5150,FEMALE 257 | Gentoo,Biscoe,48.4,16.3,220,5400,MALE 258 | Gentoo,Biscoe,42.6,13.7,213,4950,FEMALE 259 | Gentoo,Biscoe,44.4,17.3,219,5250,MALE 260 | Gentoo,Biscoe,44,13.6,208,4350,FEMALE 261 | Gentoo,Biscoe,48.7,15.7,208,5350,MALE 262 | Gentoo,Biscoe,42.7,13.7,208,3950,FEMALE 263 | Gentoo,Biscoe,49.6,16,225,5700,MALE 264 | Gentoo,Biscoe,45.3,13.7,210,4300,FEMALE 265 | Gentoo,Biscoe,49.6,15,216,4750,MALE 266 | Gentoo,Biscoe,50.5,15.9,222,5550,MALE 267 | Gentoo,Biscoe,43.6,13.9,217,4900,FEMALE 268 | Gentoo,Biscoe,45.5,13.9,210,4200,FEMALE 269 | Gentoo,Biscoe,50.5,15.9,225,5400,MALE 270 | Gentoo,Biscoe,44.9,13.3,213,5100,FEMALE 271 | Gentoo,Biscoe,45.2,15.8,215,5300,MALE 272 | Gentoo,Biscoe,46.6,14.2,210,4850,FEMALE 273 | Gentoo,Biscoe,48.5,14.1,220,5300,MALE 274 | Gentoo,Biscoe,45.1,14.4,210,4400,FEMALE 275 | Gentoo,Biscoe,50.1,15,225,5000,MALE 276 | Gentoo,Biscoe,46.5,14.4,217,4900,FEMALE 277 | Gentoo,Biscoe,45,15.4,220,5050,MALE 278 | Gentoo,Biscoe,43.8,13.9,208,4300,FEMALE 279 | Gentoo,Biscoe,45.5,15,220,5000,MALE 280 | Gentoo,Biscoe,43.2,14.5,208,4450,FEMALE 281 | Gentoo,Biscoe,50.4,15.3,224,5550,MALE 282 | Gentoo,Biscoe,45.3,13.8,208,4200,FEMALE 283 | Gentoo,Biscoe,46.2,14.9,221,5300,MALE 284 | Gentoo,Biscoe,45.7,13.9,214,4400,FEMALE 285 | Gentoo,Biscoe,54.3,15.7,231,5650,MALE 286 | Gentoo,Biscoe,45.8,14.2,219,4700,FEMALE 287 | Gentoo,Biscoe,49.8,16.8,230,5700,MALE 288 | Gentoo,Biscoe,46.2,14.4,214,4650,NA 289 | Gentoo,Biscoe,49.5,16.2,229,5800,MALE 290 | Gentoo,Biscoe,43.5,14.2,220,4700,FEMALE 291 | Gentoo,Biscoe,50.7,15,223,5550,MALE 292 | Gentoo,Biscoe,47.7,15,216,4750,FEMALE 293 | Gentoo,Biscoe,46.4,15.6,221,5000,MALE 294 | Gentoo,Biscoe,48.2,15.6,221,5100,MALE 295 | 
Gentoo,Biscoe,46.5,14.8,217,5200,FEMALE 296 | Gentoo,Biscoe,46.4,15,216,4700,FEMALE 297 | Gentoo,Biscoe,48.6,16,230,5800,MALE 298 | Gentoo,Biscoe,47.5,14.2,209,4600,FEMALE 299 | Gentoo,Biscoe,51.1,16.3,220,6000,MALE 300 | Gentoo,Biscoe,45.2,13.8,215,4750,FEMALE 301 | Gentoo,Biscoe,45.2,16.4,223,5950,MALE 302 | Gentoo,Biscoe,49.1,14.5,212,4625,FEMALE 303 | Gentoo,Biscoe,52.5,15.6,221,5450,MALE 304 | Gentoo,Biscoe,47.4,14.6,212,4725,FEMALE 305 | Gentoo,Biscoe,50,15.9,224,5350,MALE 306 | Gentoo,Biscoe,44.9,13.8,212,4750,FEMALE 307 | Gentoo,Biscoe,50.8,17.3,228,5600,MALE 308 | Gentoo,Biscoe,43.4,14.4,218,4600,FEMALE 309 | Gentoo,Biscoe,51.3,14.2,218,5300,MALE 310 | Gentoo,Biscoe,47.5,14,212,4875,FEMALE 311 | Gentoo,Biscoe,52.1,17,230,5550,MALE 312 | Gentoo,Biscoe,47.5,15,218,4950,FEMALE 313 | Gentoo,Biscoe,52.2,17.1,228,5400,MALE 314 | Gentoo,Biscoe,45.5,14.5,212,4750,FEMALE 315 | Gentoo,Biscoe,49.5,16.1,224,5650,MALE 316 | Gentoo,Biscoe,44.5,14.7,214,4850,FEMALE 317 | Gentoo,Biscoe,50.8,15.7,226,5200,MALE 318 | Gentoo,Biscoe,49.4,15.8,216,4925,MALE 319 | Gentoo,Biscoe,46.9,14.6,222,4875,FEMALE 320 | Gentoo,Biscoe,48.4,14.4,203,4625,FEMALE 321 | Gentoo,Biscoe,51.1,16.5,225,5250,MALE 322 | Gentoo,Biscoe,48.5,15,219,4850,FEMALE 323 | Gentoo,Biscoe,55.9,17,228,5600,MALE 324 | Gentoo,Biscoe,47.2,15.5,215,4975,FEMALE 325 | Gentoo,Biscoe,49.1,15,228,5500,MALE 326 | Gentoo,Biscoe,47.3,13.8,216,4725,NA 327 | Gentoo,Biscoe,46.8,16.1,215,5500,MALE 328 | Gentoo,Biscoe,41.7,14.7,210,4700,FEMALE 329 | Gentoo,Biscoe,53.4,15.8,219,5500,MALE 330 | Gentoo,Biscoe,43.3,14,208,4575,FEMALE 331 | Gentoo,Biscoe,48.1,15.1,209,5500,MALE 332 | Gentoo,Biscoe,50.5,15.2,216,5000,FEMALE 333 | Gentoo,Biscoe,49.8,15.9,229,5950,MALE 334 | Gentoo,Biscoe,43.5,15.2,213,4650,FEMALE 335 | Gentoo,Biscoe,51.5,16.3,230,5500,MALE 336 | Gentoo,Biscoe,46.2,14.1,217,4375,FEMALE 337 | Gentoo,Biscoe,55.1,16,230,5850,MALE 338 | Gentoo,Biscoe,44.5,15.7,217,4875,. 
339 | Gentoo,Biscoe,48.8,16.2,222,6000,MALE 340 | Gentoo,Biscoe,47.2,13.7,214,4925,FEMALE 341 | Gentoo,Biscoe,NA,NA,NA,NA,NA 342 | Gentoo,Biscoe,46.8,14.3,215,4850,FEMALE 343 | Gentoo,Biscoe,50.4,15.7,222,5750,MALE 344 | Gentoo,Biscoe,45.2,14.8,212,5200,FEMALE 345 | Gentoo,Biscoe,49.9,16.1,213,5400,MALE 346 | -------------------------------------------------------------------------------- /data/penguins_X_test.csv: -------------------------------------------------------------------------------- 1 | ,Dream,Torgersen,Male,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g 2 | 0,1.0,0.0,1.0,1.1137063470051127,1.1966994432156142,-0.34260576201176546,-0.38521456736415055 3 | 1,1.0,0.0,1.0,1.35500938885622,1.0424664404409307,-0.5528986989244922,-0.6928426214811282 4 | 2,1.0,0.0,0.0,0.22274126940102307,-0.29421958360632183,-0.6930939901996434,-1.1850475080682923 5 | 3,0.0,0.0,0.0,0.16705595205076665,-1.7337276095033645,0.778956568189444,0.6607208166335733 6 | 4,0.0,0.0,1.0,1.9118625623587762,-0.7569185919303723,2.1108118353033802,1.7681818114546928 7 | 5,0.0,0.0,0.0,0.5382914010524709,-1.32243960210421,1.4799330245652,0.8145348436920622 8 | 6,1.0,0.0,1.0,-1.2065152092555387,0.6825894339666704,-0.13231282509903866,0.322329957104898 9 | 7,1.0,0.0,1.0,-1.1508298919052837,0.4769454302670937,-0.5528986989244922,-0.5697913998343371 10 | 8,0.0,0.0,1.0,0.6125384908528114,-0.962562595629949,1.4799330245652,1.2759769248675286 11 | 9,0.0,0.0,1.0,-0.48260608370221575,0.7340004348915655,-0.6930939901996434,-0.6313170106577326 12 | 10,0.0,0.0,1.0,-1.0765828021049417,0.4769454302670937,-1.1136798640250969,-0.323688956540755 13 | 11,0.0,0.0,0.0,0.4640443112521303,-1.887960612278048,0.6387612769142929,0.41461837333999124 14 | 12,0.0,1.0,1.0,-1.6519977480575836,1.145288442290719,-0.5528986989244922,-0.016060902423777418 15 | 13,0.0,0.0,1.0,1.0951445745550272,-0.5512745882307937,0.8490542138270196,1.4605537573377152 16 | 14,0.0,0.0,0.0,-0.03712354490017095,-1.6823166085784702,0.4985659856391417,0.10699031922301364 17 | 15,0.0,0.0,0.0,-1.670559520507669,0.3741234284173035,-0.7631916358372189,-0.9389450647747103 18 | 16,0.0,1.0,1.0,-0.2041794969509376,0.21989042564262185,-0.34260576201176546,0.5991952058101778 19 | 17,0.0,0.0,0.0,-0.6496620357529824,0.3741234284173035,-0.9734845727499457,-1.246573118891688 20 | 18,1.0,0.0,1.0,1.8933007899086922,1.8650424552392413,0.007882466176112518,0.10699031922301364 21 | 19,1.0,0.0,1.0,1.484941796006817,1.8136314543143461,0.6387612769142929,0.7222464274569689 22 | 20,0.0,0.0,1.0,1.0208974847546866,-0.49986358730590036,1.970616544028229,1.9527586439248794 23 | 21,0.0,0.0,1.0,0.5011678561523012,-0.962562595629949,1.2696400876524732,1.214451314044133 24 | 22,0.0,0.0,0.0,1.2065152092555387,-1.013973596554844,1.0593471507397463,0.968348870750551 25 | 23,0.0,0.0,0.0,0.816717987803749,-1.4252616039539985,0.1480777574512637,0.5069067895750845 26 | 24,1.0,0.0,1.0,0.9837739398545157,1.4023434469151907,0.1480777574512637,-0.200637734893964 27 | 25,1.0,0.0,1.0,-0.8909650776040896,0.7340004348915655,-0.7631916358372189,0.04546470839961811 28 | 26,0.0,0.0,0.0,-0.129932407150597,-1.9393716132029422,0.5686636312767173,0.2300415408698047 29 | 27,1.0,0.0,0.0,-2.2088509215601397,-0.8597405937801605,-0.9033869271123701,-1.4311499513618744 30 | 28,1.0,0.0,1.0,1.484941796006817,0.4769454302670937,0.007882466176112518,-0.200637734893964 31 | 29,0.0,0.0,0.0,0.09280886225042606,-1.271028601179316,0.9191518594645952,0.7837720382803643 32 | 
30,1.0,0.0,0.0,-0.5754149459526419,-0.29421958360632183,-0.9734845727499457,-1.246573118891688 33 | 31,0.0,0.0,0.0,-1.5035035684569025,1.0424664404409307,-0.8332892814747945,-0.5082657890109417 34 | 32,0.0,1.0,0.0,-1.5035035684569025,-0.29421958360632183,-0.7631916358372189,-1.4311499513618744 35 | 33,0.0,0.0,1.0,-0.6311002633028969,0.8368224367413539,-0.5528986989244922,-0.5082657890109417 36 | 34,0.0,1.0,0.0,-0.9837739398545157,0.9396444385911422,-0.41270340764934105,-0.9389450647747103 37 | 35,0.0,0.0,1.0,0.761032670453494,-1.0653845974797382,0.5686636312767173,1.5836049789845061 38 | 36,1.0,0.0,1.0,1.0208974847546866,0.9396444385911422,-0.06221517946146307,-0.5082657890109417 39 | 37,0.0,1.0,0.0,-0.9466503949543461,0.3227124274924102,-1.3940704465753992,-0.7236054268928259 40 | 38,1.0,0.0,1.0,-0.5382914010524709,0.9396444385911422,-1.3239728009378238,-0.969707870186408 41 | 39,0.0,0.0,1.0,-0.7981562153536635,0.8882334376662472,-1.1837775096626726,-0.8158938431279192 42 | 40,0.0,1.0,0.0,-1.35500938885622,1.0938774413658257,-0.5528986989244922,-0.9389450647747103 43 | 41,1.0,0.0,1.0,1.2436387541557097,1.2995214450654025,0.1480777574512637,-0.200637734893964 44 | 42,0.0,1.0,1.0,-0.35267367655161874,0.9910554395160374,-0.41270340764934105,-0.2621633457173595 45 | 43,1.0,0.0,0.0,0.2969883592013636,1.145288442290719,-0.48280105328691664,-0.846656648539617 46 | 44,1.0,0.0,0.0,-1.3178858439560504,0.7340004348915655,-0.8332892814747945,-0.8774194539513147 47 | 45,0.0,1.0,1.0,0.3341119041015333,0.8882334376662472,-0.27250811637418987,-0.07758651324717294 48 | 46,0.0,0.0,1.0,1.7448066103080095,-0.7055075910054771,1.2696400876524732,1.5836049789845061 49 | 47,1.0,0.0,0.0,-1.2436387541557097,-0.19139758175653346,-0.6229963445620678,-1.49267556218527 50 | 48,1.0,0.0,1.0,1.2622005266057938,0.9396444385911422,0.6387612769142929,-0.13911212407056847 51 | 49,0.0,0.0,0.0,-0.24130304185110724,-1.7851386104282596,0.4985659856391417,-0.323688956540755 52 | 50,0.0,0.0,0.0,0.6867855806531533,-1.1167955984046325,1.0593471507397463,0.6607208166335733 53 | 51,0.0,0.0,0.0,0.9466503949543461,-1.3738506030291042,0.778956568189444,0.5069067895750845 54 | 52,0.0,0.0,1.0,0.5197296286023854,-0.5512745882307937,0.9892495051021708,1.5836049789845061 55 | 53,0.0,0.0,1.0,-0.5382914010524709,0.5283564311919869,-0.6229963445620678,-0.200637734893964 56 | 54,0.0,0.0,0.0,-1.4106947062064763,-0.03716457898184999,-1.1837775096626726,-1.6772523946554565 57 | 55,0.0,0.0,0.0,-0.09280886225042606,-1.5280836058037877,1.3397377332900489,0.5991952058101778 58 | 56,0.0,0.0,0.0,0.5939767184027273,-1.7851386104282596,0.9191518594645952,0.8760604545154577 59 | 57,0.0,0.0,0.0,0.0,-1.8365496113531536,0.4985659856391417,0.16851593004640916 60 | 58,0.0,0.0,1.0,1.3178858439560504,-0.44845258638100527,1.3397377332900489,2.1988610872184613 61 | 59,0.0,0.0,0.0,-0.07424708980034059,-1.6823166085784702,1.129444796377322,0.8452976491037599 62 | 60,0.0,0.0,1.0,-0.37123544900170424,1.1966994432156142,-0.06221517946146307,-0.200637734893964 63 | 61,1.0,0.0,0.0,-0.14849417960068118,-0.29421958360632183,-0.9734845727499457,-1.615726783832061 64 | 62,0.0,1.0,1.0,-0.5382914010524709,0.7340004348915655,-0.8332892814747945,-1.092759091833199 65 | 63,0.0,0.0,1.0,0.7053473531032375,-1.1167955984046325,0.9892495051021708,1.7681818114546928 66 | 64,1.0,0.0,1.0,1.3921329337563908,0.7854114358164588,-0.9734845727499457,-1.1850475080682923 67 | 65,1.0,0.0,0.0,-1.484941796006817,-0.03716457898184999,-0.9734845727499457,-0.6313170106577326 68 | 
66,0.0,0.0,0.0,0.48260608370221575,-1.5280836058037877,0.6387612769142929,0.7837720382803643 69 | 67,1.0,0.0,1.0,1.763368382758095,1.4023434469151907,0.2882730487264149,0.3530927625165957 70 | 68,0.0,1.0,1.0,-0.48260608370221575,0.6825894339666704,0.07798011181368811,-0.41597737277584834 71 | 69,0.0,0.0,0.0,-1.3921329337563908,-0.29421958360632183,-1.3940704465753992,-1.6772523946554565 72 | 70,1.0,0.0,1.0,0.9280886225042606,1.248110444140509,0.778956568189444,0.10699031922301364 73 | 71,0.0,0.0,0.0,-1.577750658257243,-0.49986358730590036,-0.41270340764934105,-1.0619962864215013 74 | 72,1.0,0.0,1.0,1.0580210296548576,0.7340004348915655,-0.41270340764934105,-0.7543682323045237 75 | 73,1.0,0.0,0.0,-0.3341119041015333,0.6825894339666704,-1.464168092212975,-0.8158938431279192 76 | 74,0.0,1.0,1.0,-0.2784265867512782,1.8136314543143461,-0.27250811637418987,0.3530927625165957 77 | 75,0.0,1.0,0.0,-1.3735711613063053,0.3227124274924102,-1.1136798640250969,-0.6313170106577326 78 | 76,1.0,0.0,0.0,-1.8561772450085212,-0.03716457898184999,-1.1136798640250969,-1.0004706755981059 79 | 77,0.0,0.0,1.0,-0.4454825388020448,0.4255344293421986,-0.6229963445620678,-0.323688956540755 80 | 78,0.0,0.0,0.0,-1.614874203157414,0.8882334376662472,-0.9734845727499457,-0.5082657890109417 81 | 79,0.0,0.0,1.0,1.0765828021049417,-0.19139758175653346,2.0407141896658048,1.8297074222780882 82 | 80,0.0,0.0,1.0,1.039459257204772,-1.1167955984046325,1.0593471507397463,0.6607208166335733 83 | 81,0.0,0.0,1.0,1.3178858439560504,-0.3456305845312169,1.6902259614779267,1.2759769248675286 84 | 82,0.0,0.0,0.0,0.4640443112521303,-1.4252616039539985,1.129444796377322,0.8452976491037599 85 | 83,0.0,0.0,0.0,-1.002335712304601,0.014246421943043286,-0.13231282509903866,-0.5697913998343371 86 | -------------------------------------------------------------------------------- /data/penguins_X_train.csv: -------------------------------------------------------------------------------- 1 | ,Dream,Torgersen,Male,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g 2 | 0,1.0,0.0,1.0,1.169391664355368,1.453754447840086,-0.27250811637418987,-1.1235218972448968 3 | 1,0.0,0.0,0.0,-1.1879534368054532,-0.08857557990674508,-1.1136798640250969,-0.7543682323045237 4 | 2,0.0,0.0,0.0,0.3341119041015333,-1.5280836058037877,1.2696400876524732,0.5991952058101778 5 | 3,1.0,0.0,1.0,-0.5382914010524709,0.16847942471772676,-0.7631916358372189,-0.38521456736415055 6 | 4,0.0,0.0,1.0,-0.723909125553323,0.8882334376662472,-0.9033869271123701,0.10699031922301364 7 | 5,0.0,0.0,1.0,0.816717987803749,-1.32243960210421,0.8490542138270196,2.014284254748275 8 | 6,1.0,0.0,0.0,1.0765828021049417,0.06565742286793838,-0.20241047073661425,-0.6620798160694304 9 | 7,0.0,0.0,0.0,-0.816717987803749,1.8136314543143461,-0.6930939901996434,-0.38521456736415055 10 | 8,0.0,0.0,0.0,0.9466503949543461,-1.2196176002544208,1.3397377332900489,1.1529257032207376 11 | 9,0.0,1.0,1.0,-1.169391664355368,1.3509324459902976,-0.20241047073661425,-0.8774194539513147 12 | 10,1.0,0.0,1.0,-0.7795944429035794,0.9910554395160374,-1.1837775096626726,0.5376695949867824 13 | 11,1.0,0.0,0.0,0.6496620357529824,-0.19139758175653346,-0.13231282509903866,-0.38521456736415055 14 | 12,0.0,1.0,0.0,-1.7448066103080095,0.014246421943043286,-0.8332892814747945,-1.246573118891688 15 | 13,1.0,0.0,0.0,0.37123544900170424,0.8882334376662472,-0.41270340764934105,-0.07758651324717294 16 | 14,1.0,0.0,1.0,-0.816717987803749,0.4769454302670937,-1.0435822183875214,0.2915671516932002 17 | 
15,0.0,0.0,0.0,-0.7981562153536635,0.2713014265675151,-0.5528986989244922,-1.246573118891688 18 | 16,1.0,0.0,1.0,1.522065340906988,0.8368224367413539,-0.27250811637418987,-0.9389450647747103 19 | 17,0.0,0.0,0.0,-0.2598648143011927,-1.7851386104282596,0.8490542138270196,0.9068232599271554 20 | 18,1.0,0.0,0.0,-1.2993240715059649,-0.3456305845312169,-1.1136798640250969,-1.0004706755981059 21 | 19,0.0,0.0,0.0,-0.09280886225042606,-1.013973596554844,0.8490542138270196,0.5376695949867824 22 | 20,1.0,0.0,1.0,1.4292564786565618,1.6079874506147693,-0.48280105328691664,-0.5390285944226394 23 | 21,0.0,0.0,0.0,0.6311002633028969,-1.32243960210421,0.778956568189444,0.6299580112218756 24 | 22,0.0,1.0,1.0,-1.2622005266057938,1.145288442290719,-1.1837775096626726,-0.38521456736415055 25 | 23,0.0,0.0,1.0,-1.0765828021049417,1.453754447840086,-0.7631916358372189,-0.38521456736415055 26 | 24,0.0,1.0,1.0,-0.9095268500541751,0.7854114358164588,-1.3940704465753992,-0.5697913998343371 27 | 25,1.0,0.0,0.0,-0.2784265867512782,0.06565742286793838,-0.9734845727499457,-1.0619962864215013 28 | 26,0.0,1.0,1.0,-1.2436387541557097,1.7108094524645578,-0.13231282509903866,-0.5390285944226394 29 | 27,0.0,0.0,1.0,0.9652121674044315,-1.013973596554844,1.4098353789276243,2.5680147521588346 30 | 28,0.0,0.0,0.0,-1.1322681194551982,0.7340004348915655,-2.0249492573135797,-1.3080987297150835 31 | 29,0.0,1.0,1.0,-0.16705595205076665,1.0424664404409307,-0.27250811637418987,-0.8774194539513147 32 | 30,0.0,0.0,0.0,0.4640443112521303,-1.3738506030291042,0.8490542138270196,0.2300415408698047 33 | 31,0.0,0.0,1.0,-0.6311002633028969,0.7340004348915655,-1.253875155300248,-0.8158938431279192 34 | 32,0.0,1.0,1.0,-0.8909650776040896,1.248110444140509,-0.41270340764934105,0.5684324003984801 35 | 33,0.0,0.0,1.0,0.6682238082030679,-1.3738506030291042,0.9892495051021708,1.4605537573377152 36 | 34,0.0,0.0,1.0,-0.5011678561523012,2.0192754580139245,-0.41270340764934105,0.2300415408698047 37 | 35,1.0,0.0,0.0,0.5568531735025564,0.06565742286793838,-1.1136798640250969,-0.6313170106577326 38 | 36,0.0,0.0,1.0,1.2065152092555387,-0.654096590080583,1.6902259614779267,1.4605537573377152 39 | 37,1.0,0.0,1.0,1.614874203157414,1.3509324459902976,-0.27250811637418987,-0.6005542052460349 40 | 38,0.0,0.0,1.0,1.1137063470051127,-0.654096590080583,1.6201283158403512,1.3990281465143197 41 | 39,0.0,0.0,0.0,0.16705595205076665,-1.9907826141278362,0.8490542138270196,1.091400092397342 42 | 40,1.0,0.0,0.0,0.31555013165144913,-0.08857557990674508,-0.41270340764934105,-0.6928426214811282 43 | 41,1.0,0.0,0.0,-0.8352797602538345,0.3227124274924102,-0.9033869271123701,-1.1235218972448968 44 | 42,1.0,0.0,1.0,1.2250769817056242,1.145288442290719,-0.5528986989244922,-0.5082657890109417 45 | 43,0.0,1.0,1.0,-1.7448066103080095,2.0192754580139245,-0.20241047073661425,0.2300415408698047 46 | 44,1.0,0.0,0.0,-0.2784265867512782,-0.24280858268142855,-0.9734845727499457,-1.0619962864215013 47 | 45,1.0,0.0,1.0,-0.5939767184027273,0.8882334376662472,0.4985659856391417,0.10699031922301364 48 | 46,0.0,0.0,0.0,0.6496620357529824,-1.5280836058037877,0.5686636312767173,0.47614398416338677 49 | 47,0.0,1.0,0.0,-1.35500938885622,0.8368224367413539,-0.9734845727499457,-0.5082657890109417 50 | 48,0.0,1.0,0.0,-1.447818251106646,-0.5512745882307937,-0.9734845727499457,-0.8158938431279192 51 | 49,1.0,0.0,0.0,-1.3921329337563908,0.4255344293421986,-1.3239728009378238,-1.3080987297150835 52 | 50,0.0,0.0,1.0,1.1879534368054532,-0.7569185919303723,1.4799330245652,1.8912330331014837 53 | 
51,0.0,0.0,0.0,0.4454825388020448,-1.1167955984046325,1.0593471507397463,0.5991952058101778 54 | 52,0.0,0.0,1.0,0.2784265867512782,-1.1167955984046325,1.3397377332900489,0.968348870750551 55 | 53,0.0,0.0,0.0,0.3341119041015333,-1.32243960210421,0.6387612769142929,-0.016060902423777418 56 | 54,0.0,0.0,0.0,-1.0951445745550272,-0.3456305845312169,-0.20241047073661425,-0.47750298359924387 57 | 55,0.0,0.0,0.0,0.4640443112521303,-1.2196176002544208,1.129444796377322,1.214451314044133 58 | 56,0.0,0.0,0.0,0.2784265867512782,-1.6823166085784702,0.6387612769142929,-0.016060902423777418 59 | 57,1.0,0.0,0.0,-1.3735711613063053,0.6311784330417753,-1.1837775096626726,-0.9081822593630126 60 | 58,1.0,0.0,0.0,-2.023233197059288,-0.5512745882307937,-1.6043633834881261,-1.615726783832061 61 | 59,0.0,0.0,1.0,0.8352797602538345,-1.579494606728682,1.3397377332900489,1.337502535690924 62 | 60,1.0,0.0,0.0,-1.5406271133570721,0.4255344293421986,0.07798011181368811,-0.8158938431279192 63 | 61,0.0,0.0,1.0,0.22274126940102307,-0.397041585456112,1.5500306702027755,2.137335476395066 64 | 62,0.0,0.0,0.0,-0.37123544900170424,-1.887960612278048,0.6387612769142929,-0.07758651324717294 65 | 63,0.0,0.0,1.0,0.2969883592013636,1.6079874506147693,-0.6930939901996434,0.47614398416338677 66 | 64,1.0,0.0,1.0,-0.4640443112521303,0.6825894339666704,0.007882466176112518,-0.2621633457173595 67 | 65,0.0,0.0,1.0,1.3921329337563908,-0.44845258638100527,2.0407141896658048,1.5836049789845061 68 | 66,1.0,0.0,0.0,0.4454825388020448,0.3227124274924102,-0.6930939901996434,-0.6313170106577326 69 | 67,0.0,0.0,1.0,0.9280886225042606,-0.5512745882307937,1.0593471507397463,1.6451305898079018 70 | 68,0.0,0.0,1.0,0.8909650776040896,-0.49986358730590036,1.4799330245652,2.1988610872184613 71 | 69,1.0,0.0,0.0,0.2598648143011927,0.7854114358164588,-0.9033869271123701,-0.846656648539617 72 | 70,0.0,0.0,1.0,-0.5568531735025564,1.453754447840086,0.1480777574512637,0.6299580112218756 73 | 71,0.0,0.0,1.0,0.22274126940102307,-0.7055075910054771,0.9892495051021708,1.337502535690924 74 | 72,1.0,0.0,1.0,-0.5754149459526419,0.8882334376662472,-1.1837775096626726,-0.38521456736415055 75 | 73,1.0,0.0,1.0,1.1508298919052837,0.8368224367413539,0.07798011181368811,-0.5082657890109417 76 | 74,0.0,0.0,0.0,0.8352797602538345,-1.1167955984046325,1.2696400876524732,0.7837720382803643 77 | 75,0.0,0.0,1.0,-0.24130304185110724,0.5797674321168821,-0.34260576201176546,-0.16987492948226623 78 | 76,1.0,0.0,0.0,-1.484941796006817,0.3741234284173035,-0.7631916358372189,-0.9389450647747103 79 | 77,1.0,0.0,1.0,1.484941796006817,0.9396444385911422,-0.27250811637418987,-0.07758651324717294 80 | 78,0.0,0.0,1.0,0.42692076635195936,-0.7055075910054771,0.9892495051021708,1.0298744815739465 81 | 79,0.0,0.0,1.0,1.1879534368054532,-0.962562595629949,1.6201283158403512,1.6451305898079018 82 | 80,1.0,0.0,1.0,1.039459257204772,0.5283564311919869,-0.5528986989244922,-0.5390285944226394 83 | 81,0.0,1.0,0.0,-1.002335712304601,-0.08857557990674508,-0.9033869271123701,-1.615726783832061 84 | 82,1.0,0.0,1.0,1.35500938885622,0.5283564311919869,-0.27250811637418987,-0.5697913998343371 85 | 83,1.0,0.0,0.0,-1.3364476164061359,0.6825894339666704,-0.5528986989244922,-0.8774194539513147 86 | 84,1.0,0.0,1.0,1.633435975607498,1.453754447840086,0.2882730487264149,0.41461837333999124 87 | 85,1.0,0.0,0.0,2.5986481430119297,0.3227124274924102,-1.3940704465753992,-0.6313170106577326 88 | 86,1.0,0.0,0.0,-0.8352797602538345,-0.24280858268142855,-1.6043633834881261,-1.1850475080682923 89 | 
87,1.0,0.0,0.0,0.48260608370221575,0.3227124274924102,-0.5528986989244922,-0.5082657890109417 90 | 88,0.0,0.0,0.0,0.2041794969509376,-1.4252616039539985,0.6387612769142929,0.2300415408698047 91 | 89,1.0,0.0,0.0,1.2807622990558793,0.3741234284173035,-0.34260576201176546,-0.6620798160694304 92 | 90,1.0,0.0,1.0,-0.6867855806531533,0.6825894339666704,-0.34260576201176546,0.16851593004640916 93 | 91,0.0,1.0,0.0,-0.8352797602538345,0.11706842379283165,-1.0435822183875214,-0.5082657890109417 94 | 92,1.0,0.0,0.0,-1.447818251106646,0.06565742286793838,-0.9734845727499457,-1.1235218972448968 95 | 93,1.0,0.0,0.0,1.1508298919052837,0.7854114358164588,-0.20241047073661425,-0.5390285944226394 96 | 94,1.0,0.0,1.0,1.3735711613063053,0.9396444385911422,0.007882466176112518,-0.323688956540755 97 | 95,1.0,0.0,1.0,-0.31555013165144913,2.070686458938818,-0.6930939901996434,-0.07758651324717294 98 | 96,1.0,0.0,1.0,0.9280886225042606,1.1966994432156142,0.6387612769142929,-0.323688956540755 99 | 97,0.0,0.0,0.0,0.24130304185110724,-1.7337276095033645,0.4985659856391417,-0.016060902423777418 100 | 98,1.0,0.0,1.0,1.35500938885622,1.4023434469151907,-0.20241047073661425,-0.6313170106577326 101 | 99,0.0,0.0,0.0,-1.0951445745550272,-0.08857557990674508,-1.3940704465753992,-1.2773359243033857 102 | 100,1.0,0.0,1.0,2.1902891491100545,1.3509324459902976,0.4284683400015661,-0.2621633457173595 103 | 101,0.0,1.0,1.0,-0.7981562153536635,0.6311784330417753,-0.7631916358372189,-0.38521456736415055 104 | 102,0.0,0.0,1.0,0.3897972214517897,-1.0653845974797382,0.9892495051021708,1.091400092397342 105 | 103,0.0,0.0,0.0,-0.22274126940102307,-1.5280836058037877,0.5686636312767173,0.5991952058101778 106 | 104,0.0,0.0,0.0,0.2598648143011927,-1.32243960210421,0.7088589225518684,0.7222464274569689 107 | 105,1.0,0.0,1.0,1.2622005266057938,0.6825894339666704,0.007882466176112518,0.2915671516932002 108 | 106,1.0,0.0,0.0,-1.1879534368054532,1.0938774413658257,-1.3940704465753992,-1.1235218972448968 109 | 107,1.0,0.0,1.0,0.9652121674044315,0.5283564311919869,-0.41270340764934105,0.2300415408698047 110 | 108,0.0,1.0,0.0,-0.816717987803749,0.014246421943043286,-0.34260576201176546,-0.8158938431279192 111 | 109,0.0,1.0,1.0,-0.6311002633028969,0.9396444385911422,-0.13231282509903866,-0.2621633457173595 112 | 110,0.0,0.0,1.0,1.522065340906988,-0.03716457898184999,1.9005188983906536,1.4605537573377152 113 | 111,1.0,0.0,0.0,0.6682238082030679,0.5797674321168821,-0.41270340764934105,-0.4467401781875461 114 | 112,0.0,0.0,1.0,0.4454825388020448,-0.8083295928552664,1.4098353789276243,0.968348870750551 115 | 113,0.0,0.0,1.0,1.5035035684569025,-0.08857557990674508,2.0407141896658048,1.6451305898079018 116 | 114,0.0,0.0,1.0,0.07424708980034059,0.06565742286793838,1.2696400876524732,1.2759769248675286 117 | 115,1.0,0.0,1.0,-1.0580210296548576,1.0424664404409307,-0.8332892814747945,-0.323688956540755 118 | 116,0.0,0.0,1.0,2.8956365022132933,-0.08857557990674508,2.0407141896658048,2.2603866980418568 119 | 117,0.0,1.0,0.0,-1.5406271133570721,-0.08857557990674508,-0.8332892814747945,-1.0619962864215013 120 | 118,0.0,0.0,0.0,-1.5406271133570721,-0.13998658083164017,-1.1136798640250969,-1.3080987297150835 121 | 119,0.0,0.0,1.0,1.0765828021049417,-0.654096590080583,1.970616544028229,2.137335476395066 122 | 120,1.0,0.0,0.0,-1.2436387541557097,0.3227124274924102,-0.6930939901996434,-1.0619962864215013 123 | 121,1.0,0.0,0.0,0.22274126940102307,0.3227124274924102,-0.20241047073661425,-0.323688956540755 124 | 
122,1.0,0.0,0.0,0.5197296286023854,-0.3456305845312169,-0.8332892814747945,-0.6928426214811282 125 | 123,0.0,0.0,1.0,-0.6496620357529824,0.8882334376662472,-1.464168092212975,-0.323688956540755 126 | 124,1.0,0.0,0.0,-1.484941796006817,0.6825894339666704,-1.0435822183875214,-1.369624340538479 127 | 125,0.0,0.0,0.0,0.7795944429035794,-1.4766726048788925,0.6387612769142929,0.47614398416338677 128 | 126,0.0,1.0,1.0,0.37123544900170424,2.2249194617135015,-0.48280105328691664,-0.016060902423777418 129 | 127,0.0,0.0,0.0,-1.670559520507669,0.3741234284173035,-0.6229963445620678,-0.6005542052460349 130 | 128,1.0,0.0,1.0,0.8352797602538345,0.16847942471772676,-0.6930939901996434,-1.0004706755981059 131 | 129,0.0,1.0,0.0,-1.0208974847546866,0.3741234284173035,-0.7631916358372189,-1.092759091833199 132 | 130,1.0,0.0,1.0,-0.7981562153536635,0.3741234284173035,-0.5528986989244922,0.04546470839961811 133 | 131,1.0,0.0,0.0,-1.5591888858071574,0.16847942471772676,-0.6930939901996434,-1.2773359243033857 134 | 132,0.0,1.0,0.0,-0.9280886225042606,-0.03716457898184999,-0.6930939901996434,-1.4311499513618744 135 | 133,1.0,0.0,0.0,0.5011678561523012,0.3741234284173035,-0.41270340764934105,-1.1235218972448968 136 | 134,0.0,1.0,0.0,-0.5754149459526419,-0.19139758175653346,-0.6930939901996434,-0.6313170106577326 137 | 135,0.0,0.0,1.0,0.5197296286023854,-0.9111515947050548,0.9892495051021708,1.1529257032207376 138 | 136,0.0,1.0,1.0,-0.4640443112521303,0.5797674321168821,-0.41270340764934105,0.10699031922301364 139 | 137,0.0,0.0,0.0,0.6496620357529824,-1.6309056076535762,0.778956568189444,0.8145348436920622 140 | 138,1.0,0.0,0.0,0.35267367655161874,-0.03716457898184999,-0.7631916358372189,-0.7851310377162215 141 | 139,1.0,0.0,1.0,-0.7053473531032375,1.505165448764981,-0.06221517946146307,-0.2929261511290573 142 | 140,0.0,0.0,0.0,0.2784265867512782,-1.3738506030291042,0.778956568189444,0.6607208166335733 143 | 141,1.0,0.0,1.0,-0.5011678561523012,1.6079874506147693,-0.48280105328691664,-0.8158938431279192 144 | 142,1.0,0.0,1.0,-0.14849417960068118,0.6825894339666704,-0.6229963445620678,-0.13911212407056847 145 | 143,0.0,0.0,0.0,-1.763368382758095,0.4769454302670937,-0.9734845727499457,-1.615726783832061 146 | 144,0.0,1.0,0.0,-1.577750658257243,0.16847942471772676,-0.7631916358372189,-0.6313170106577326 147 | 145,1.0,0.0,0.0,0.3897972214517897,0.5283564311919869,-1.6043633834881261,-1.1850475080682923 148 | 146,0.0,0.0,0.0,0.4083589939018752,-1.579494606728682,1.129444796377322,0.19927873545810693 149 | 147,1.0,0.0,1.0,0.018561772450085477,1.2995214450654025,-0.34260576201176546,0.2300415408698047 150 | 148,1.0,0.0,1.0,1.2065152092555387,1.248110444140509,0.007882466176112518,-0.200637734893964 151 | 149,1.0,0.0,0.0,-0.7053473531032375,-0.03716457898184999,-0.5528986989244922,-1.0004706755981059 152 | 150,1.0,0.0,1.0,-0.9652121674044315,1.453754447840086,-0.7631916358372189,-0.323688956540755 153 | 151,1.0,0.0,0.0,1.1322681194551982,0.3741234284173035,-0.7631916358372189,-1.0004706755981059 154 | 152,1.0,0.0,0.0,0.761032670453494,-0.397041585456112,-0.13231282509903866,-1.092759091833199 155 | 153,0.0,1.0,0.0,-0.9652121674044315,0.21989042564262185,-0.6930939901996434,-1.1542847026565946 156 | 154,0.0,0.0,1.0,1.2622005266057938,-0.7569185919303723,1.7603236071155024,1.214451314044133 157 | 155,1.0,0.0,0.0,0.5382914010524709,-0.29421958360632183,-0.6229963445620678,-1.8618292271256431 158 | 156,0.0,1.0,1.0,-0.4083589939018752,1.145288442290719,-0.20241047073661425,0.2915671516932002 159 | 
157,1.0,0.0,1.0,1.2807622990558793,0.9910554395160374,-0.34260576201176546,-0.8158938431279192 160 | 158,0.0,0.0,1.0,-0.9652121674044315,0.014246421943043286,-1.464168092212975,-0.5082657890109417 161 | 159,1.0,0.0,1.0,-0.816717987803749,0.8368224367413539,-0.7631916358372189,0.47614398416338677 162 | 160,0.0,1.0,0.0,-0.6867855806531533,0.4255344293421986,-0.41270340764934105,-1.1850475080682923 163 | 161,0.0,0.0,0.0,0.2041794969509376,-1.3738506030291042,0.9892495051021708,0.968348870750551 164 | 162,0.0,0.0,1.0,1.2065152092555387,-0.654096590080583,1.4799330245652,1.6451305898079018 165 | 163,0.0,1.0,1.0,-0.22274126940102307,0.6825894339666704,-0.41270340764934105,0.04546470839961811 166 | 164,0.0,0.0,1.0,1.039459257204772,-0.6026855891556887,1.6902259614779267,1.8297074222780882 167 | 165,0.0,0.0,1.0,1.0208974847546866,-0.5512745882307937,1.6201283158403512,1.7681818114546928 168 | 166,0.0,0.0,0.0,-0.11137063470051153,-1.4252616039539985,1.1995424420148977,0.47614398416338677 169 | 167,0.0,0.0,1.0,1.1508298919052837,-1.4766726048788925,1.1995424420148977,1.8297074222780882 170 | 168,1.0,0.0,1.0,-1.4292564786565618,1.1966994432156142,-0.7631916358372189,-0.5082657890109417 171 | 169,0.0,0.0,0.0,0.22274126940102307,-1.2196176002544208,0.778956568189444,1.214451314044133 172 | 170,1.0,0.0,0.0,-1.4106947062064763,-0.08857557990674508,-0.41270340764934105,-1.092759091833199 173 | 171,1.0,0.0,0.0,0.2784265867512782,-0.08857557990674508,-0.34260576201176546,-0.8774194539513147 174 | 172,0.0,1.0,0.0,-1.633435975607498,-0.654096590080583,-1.0435822183875214,-1.4311499513618744 175 | 173,1.0,0.0,0.0,-0.09280886225042606,0.4769454302670937,0.07798011181368811,-1.0004706755981059 176 | 174,1.0,0.0,1.0,1.2993240715059649,0.8368224367413539,0.1480777574512637,-0.13911212407056847 177 | 175,1.0,0.0,0.0,-1.484941796006817,0.3227124274924102,-0.41270340764934105,-0.9389450647747103 178 | 176,0.0,0.0,1.0,1.2436387541557097,-1.1167955984046325,1.5500306702027755,1.6451305898079018 179 | 177,0.0,0.0,1.0,-1.1879534368054532,0.9910554395160374,-0.48280105328691664,-0.5697913998343371 180 | 178,0.0,0.0,1.0,-1.1508298919052837,1.453754447840086,-0.7631916358372189,0.04546470839961811 181 | 179,0.0,0.0,0.0,-0.816717987803749,0.2713014265675151,-1.0435822183875214,-0.8774194539513147 182 | 180,1.0,0.0,0.0,0.31555013165144913,0.06565742286793838,-0.5528986989244922,-0.7543682323045237 183 | 181,0.0,0.0,1.0,1.1137063470051127,-0.44845258638100527,2.0407141896658048,1.8297074222780882 184 | 182,0.0,0.0,0.0,-0.42692076635195936,-1.271028601179316,0.6387612769142929,0.5991952058101778 185 | 183,1.0,0.0,0.0,-0.2969883592013636,0.06565742286793838,-1.3940704465753992,-0.7543682323045237 186 | 184,0.0,0.0,0.0,0.4083589939018752,-1.3738506030291042,0.5686636312767173,0.7222464274569689 187 | 185,0.0,1.0,1.0,-1.002335712304601,2.070686458938818,-0.6930939901996434,-0.5082657890109417 188 | 186,1.0,0.0,0.0,1.2065152092555387,0.6311784330417753,-0.06221517946146307,-1.0004706755981059 189 | 187,1.0,0.0,0.0,-1.0951445745550272,0.7340004348915655,-0.7631916358372189,-0.6313170106577326 190 | 188,0.0,0.0,1.0,1.35500938885622,-1.5280836058037877,1.1995424420148977,1.337502535690924 191 | 189,0.0,0.0,0.0,-1.169391664355368,-0.6026855891556887,-1.253875155300248,-1.4003871459501767 192 | 190,0.0,1.0,1.0,-0.8724033051540054,1.7622204533894528,-0.7631916358372189,-0.6928426214811282 193 | 191,0.0,0.0,1.0,0.18561772450085212,-0.9111515947050548,1.3397377332900489,1.0298744815739465 194 | 
192,1.0,0.0,1.0,-0.8909650776040896,2.0192754580139245,-0.34260576201176546,-0.07758651324717294 195 | 193,1.0,0.0,1.0,-0.6125384908528114,-0.08857557990674508,-0.7631916358372189,-0.6005542052460349 196 | 194,1.0,0.0,1.0,-0.5382914010524709,0.4769454302670937,0.2882730487264149,0.10699031922301364 197 | 195,0.0,0.0,0.0,0.24130304185110724,-1.7851386104282596,0.6387612769142929,0.10699031922301364 198 | 196,0.0,0.0,1.0,0.4083589939018752,-1.1682065993295265,1.4098353789276243,1.337502535690924 199 | 197,0.0,0.0,1.0,1.577750658257243,-0.8083295928552664,1.4098353789276243,1.5220793681611107 200 | 198,0.0,0.0,1.0,-1.169391664355368,0.7854114358164588,-1.464168092212975,-0.7543682323045237 201 | 199,0.0,0.0,1.0,0.816717987803749,-0.44845258638100527,1.3397377332900489,1.4605537573377152 202 | 200,0.0,0.0,0.0,-0.129932407150597,-1.6309056076535762,0.4985659856391417,0.44538117875168903 203 | 201,1.0,0.0,0.0,-1.0951445745550272,0.21989042564262185,-0.9734845727499457,-0.969707870186408 204 | 202,0.0,0.0,0.0,-1.1508298919052837,0.5797674321168821,-1.8847539660384285,-1.0004706755981059 205 | 203,0.0,0.0,0.0,0.3897972214517897,-2.0421936150527316,0.7088589225518684,0.3530927625165957 206 | 204,0.0,0.0,0.0,-0.14849417960068118,-1.3738506030291042,0.4985659856391417,0.2915671516932002 207 | 205,0.0,0.0,0.0,-0.2041794969509376,-2.0936046159776254,0.9892495051021708,0.968348870750551 208 | 206,0.0,0.0,0.0,0.6496620357529824,-1.1167955984046325,1.1995424420148977,0.9068232599271554 209 | 207,0.0,0.0,0.0,0.31555013165144913,-1.6823166085784702,0.9191518594645952,0.2300415408698047 210 | 208,0.0,0.0,1.0,1.1137063470051127,-0.962562595629949,1.3397377332900489,1.6451305898079018 211 | 209,0.0,0.0,0.0,0.5939767184027273,-0.8597405937801605,0.9892495051021708,0.9375860653388532 212 | 210,0.0,1.0,0.0,-1.7819301552081805,0.6311784330417753,-1.1837775096626726,-1.092759091833199 213 | 211,0.0,0.0,1.0,-0.5382914010524709,0.9910554395160374,-0.9033869271123701,-0.13911212407056847 214 | 212,0.0,0.0,1.0,0.8724033051540054,-1.0653845974797382,1.4799330245652,1.3990281465143197 215 | 213,1.0,0.0,1.0,-0.6311002633028969,0.014246421943043286,-0.9734845727499457,-0.9081822593630126 216 | 214,1.0,0.0,1.0,1.4663800235567315,1.1966994432156142,0.3583706943639905,-0.323688956540755 217 | 215,0.0,0.0,1.0,1.1322681194551982,-1.1167955984046325,1.6902259614779267,0.968348870750551 218 | 216,0.0,0.0,0.0,-0.5754149459526419,-1.7851386104282596,0.9191518594645952,0.5376695949867824 219 | 217,0.0,0.0,1.0,-0.14849417960068118,0.9396444385911422,-0.27250811637418987,0.6914836220452711 220 | 218,0.0,0.0,1.0,-0.3341119041015333,1.1966994432156142,-0.27250811637418987,0.07622751381131587 221 | 219,0.0,0.0,0.0,-0.9280886225042606,0.16847942471772676,-1.0435822183875214,-0.8158938431279192 222 | 220,0.0,0.0,0.0,0.2041794969509376,-1.3738506030291042,0.4284683400015661,1.0298744815739465 223 | 221,0.0,0.0,1.0,0.7795944429035794,-0.8083295928552664,1.4098353789276243,1.091400092397342 224 | 222,0.0,0.0,0.0,0.5197296286023854,-1.4766726048788925,0.9892495051021708,0.7837720382803643 225 | 223,0.0,0.0,0.0,0.2784265867512782,-1.7851386104282596,0.9191518594645952,0.5376695949867824 226 | 224,0.0,1.0,0.0,-1.9489861072589472,0.9396444385911422,-0.7631916358372189,-0.7543682323045237 227 | 225,0.0,1.0,0.0,-0.5382914010524709,0.21989042564262185,-1.3239728009378238,-1.246573118891688 228 | 226,1.0,0.0,1.0,-0.5939767184027273,0.6311784330417753,-0.41270340764934105,-0.38521456736415055 229 | 
227,0.0,0.0,1.0,0.9837739398545157,-0.7569185919303723,1.129444796377322,2.014284254748275 230 | 228,1.0,0.0,1.0,-0.9280886225042606,0.7854114358164588,-1.1136798640250969,-0.6928426214811282 231 | 229,1.0,0.0,1.0,-1.2622005266057938,0.4769454302670937,-1.6043633834881261,-0.38521456736415055 232 | 230,0.0,0.0,0.0,0.22274126940102307,-1.7337276095033645,0.9892495051021708,0.6607208166335733 233 | 231,0.0,0.0,1.0,0.8724033051540054,-0.7569185919303723,0.4985659856391417,1.3990281465143197 234 | 232,0.0,0.0,1.0,1.002335712304601,-0.7055075910054771,1.0593471507397463,0.8760604545154577 235 | 233,0.0,0.0,0.0,0.8724033051540054,-1.579494606728682,0.6387612769142929,0.2915671516932002 236 | 234,0.0,0.0,1.0,0.85384153270392,-0.6026855891556887,2.0407141896658048,1.9527586439248794 237 | 235,0.0,1.0,0.0,-0.7053473531032375,-0.08857557990674508,-1.7445586747632773,-0.9389450647747103 238 | 236,0.0,0.0,1.0,1.1137063470051127,-1.013973596554844,1.1995424420148977,1.8297074222780882 239 | 237,0.0,0.0,1.0,2.2088509215601397,-0.08857557990674508,1.9005188983906536,1.7066562006312973 240 | 238,1.0,0.0,0.0,0.4083589939018752,0.16847942471772676,-0.9734845727499457,-0.6928426214811282 241 | 239,0.0,0.0,1.0,1.2622005266057938,0.06565742286793838,1.9005188983906536,1.7066562006312973 242 | 240,0.0,0.0,0.0,-1.1322681194551982,0.7340004348915655,-0.5528986989244922,-1.5849639784203633 243 | 241,1.0,0.0,0.0,-1.2993240715059649,-0.13998658083164017,-1.1136798640250969,-1.49267556218527 244 | 242,0.0,1.0,0.0,-1.447818251106646,0.014246421943043286,-0.9734845727499457,-1.3080987297150835 245 | 243,1.0,0.0,0.0,0.4640443112521303,0.3741234284173035,-0.6229963445620678,-0.8774194539513147 246 | 244,0.0,0.0,1.0,0.9466503949543461,-1.1167955984046325,1.9005188983906536,1.5836049789845061 247 | 245,1.0,0.0,0.0,-0.9466503949543461,0.8368224367413539,-0.7631916358372189,-0.7543682323045237 248 | 246,0.0,0.0,1.0,2.060356741959459,-0.6026855891556887,2.0407141896658048,2.014284254748275 249 | 247,0.0,1.0,1.0,0.018561772450085477,0.4255344293421986,0.6387612769142929,-0.2621633457173595 250 | 248,1.0,0.0,0.0,0.4454825388020448,0.7340004348915655,-0.7631916358372189,-0.9389450647747103 251 | -------------------------------------------------------------------------------- /data/penguins_y_test.csv: -------------------------------------------------------------------------------- 1 | ,species 2 | 153,Chinstrap 3 | 154,Chinstrap 4 | 208,Chinstrap 5 | 304,Gentoo 6 | 283,Gentoo 7 | 317,Gentoo 8 | 133,Adelie 9 | 149,Adelie 10 | 250,Gentoo 11 | 55,Adelie 12 | 23,Adelie 13 | 225,Gentoo 14 | 83,Adelie 15 | 343,Gentoo 16 | 276,Gentoo 17 | 52,Adelie 18 | 81,Adelie 19 | 27,Adelie 20 | 183,Chinstrap 21 | 189,Chinstrap 22 | 287,Gentoo 23 | 227,Gentoo 24 | 330,Gentoo 25 | 318,Gentoo 26 | 209,Chinstrap 27 | 146,Adelie 28 | 228,Gentoo 29 | 142,Adelie 30 | 165,Chinstrap 31 | 314,Gentoo 32 | 182,Chinstrap 33 | 22,Adelie 34 | 68,Adelie 35 | 57,Adelie 36 | 16,Adelie 37 | 329,Gentoo 38 | 179,Chinstrap 39 | 6,Adelie 40 | 46,Adelie 41 | 105,Adelie 42 | 4,Adelie 43 | 205,Chinstrap 44 | 79,Adelie 45 | 211,Chinstrap 46 | 87,Adelie 47 | 73,Adelie 48 | 327,Gentoo 49 | 144,Adelie 50 | 218,Chinstrap 51 | 260,Gentoo 52 | 290,Gentoo 53 | 300,Gentoo 54 | 325,Gentoo 55 | 63,Adelie 56 | 64,Adelie 57 | 288,Gentoo 58 | 338,Gentoo 59 | 258,Gentoo 60 | 297,Gentoo 61 | 265,Gentoo 62 | 53,Adelie 63 | 174,Chinstrap 64 | 119,Adelie 65 | 247,Gentoo 66 | 200,Chinstrap 67 | 150,Adelie 68 | 270,Gentoo 69 | 191,Chinstrap 70 | 123,Adelie 71 | 58,Adelie 72 | 199,Chinstrap 73 
| 66,Adelie 74 | 186,Chinstrap 75 | 37,Adelie 76 | 17,Adelie 77 | 15,Adelie 78 | 92,Adelie 79 | 65,Adelie 80 | 25,Adelie 81 | 285,Gentoo 82 | 263,Gentoo 83 | 319,Gentoo 84 | 274,Gentoo 85 | 106,Adelie 86 | -------------------------------------------------------------------------------- /data/penguins_y_train.csv: -------------------------------------------------------------------------------- 1 | ,species 2 | 168,Chinstrap 3 | 62,Adelie 4 | 284,Gentoo 5 | 135,Adelie 6 | 51,Adelie 7 | 233,Gentoo 8 | 201,Chinstrap 9 | 114,Adelie 10 | 254,Gentoo 11 | 121,Adelie 12 | 39,Adelie 13 | 187,Chinstrap 14 | 80,Adelie 15 | 160,Chinstrap 16 | 93,Adelie 17 | 112,Adelie 18 | 207,Chinstrap 19 | 256,Gentoo 20 | 138,Adelie 21 | 332,Gentoo 22 | 163,Chinstrap 23 | 302,Gentoo 24 | 77,Adelie 25 | 107,Adelie 26 | 0,Adelie 27 | 206,Chinstrap 28 | 117,Adelie 29 | 237,Gentoo 30 | 28,Adelie 31 | 131,Adelie 32 | 242,Gentoo 33 | 26,Adelie 34 | 7,Adelie 35 | 224,Gentoo 36 | 61,Adelie 37 | 164,Chinstrap 38 | 267,Gentoo 39 | 156,Chinstrap 40 | 303,Gentoo 41 | 268,Gentoo 42 | 214,Chinstrap 43 | 32,Adelie 44 | 175,Chinstrap 45 | 14,Adelie 46 | 184,Chinstrap 47 | 95,Adelie 48 | 296,Gentoo 49 | 82,Adelie 50 | 78,Adelie 51 | 40,Adelie 52 | 341,Gentoo 53 | 294,Gentoo 54 | 277,Gentoo 55 | 234,Gentoo 56 | 110,Adelie 57 | 293,Gentoo 58 | 266,Gentoo 59 | 147,Adelie 60 | 98,Adelie 61 | 271,Gentoo 62 | 90,Adelie 63 | 299,Gentoo 64 | 236,Gentoo 65 | 111,Adelie 66 | 151,Adelie 67 | 333,Gentoo 68 | 180,Chinstrap 69 | 231,Gentoo 70 | 337,Gentoo 71 | 155,Chinstrap 72 | 101,Adelie 73 | 269,Gentoo 74 | 33,Adelie 75 | 210,Chinstrap 76 | 320,Gentoo 77 | 115,Adelie 78 | 48,Adelie 79 | 177,Chinstrap 80 | 243,Gentoo 81 | 279,Gentoo 82 | 217,Chinstrap 83 | 116,Adelie 84 | 159,Chinstrap 85 | 132,Adelie 86 | 181,Chinstrap 87 | 169,Chinstrap 88 | 30,Adelie 89 | 162,Chinstrap 90 | 272,Gentoo 91 | 196,Chinstrap 92 | 97,Adelie 93 | 1,Adelie 94 | 94,Adelie 95 | 219,Chinstrap 96 | 203,Chinstrap 97 | 49,Adelie 98 | 192,Chinstrap 99 | 280,Gentoo 100 | 161,Chinstrap 101 | 108,Adelie 102 | 215,Chinstrap 103 | 71,Adelie 104 | 245,Gentoo 105 | 251,Gentoo 106 | 226,Gentoo 107 | 197,Chinstrap 108 | 38,Adelie 109 | 171,Chinstrap 110 | 72,Adelie 111 | 125,Adelie 112 | 311,Gentoo 113 | 188,Chinstrap 114 | 291,Gentoo 115 | 309,Gentoo 116 | 257,Gentoo 117 | 88,Adelie 118 | 253,Gentoo 119 | 118,Adelie 120 | 60,Adelie 121 | 331,Gentoo 122 | 84,Adelie 123 | 157,Chinstrap 124 | 213,Chinstrap 125 | 29,Adelie 126 | 42,Adelie 127 | 248,Gentoo 128 | 19,Adelie 129 | 100,Adelie 130 | 173,Chinstrap 131 | 130,Adelie 132 | 139,Adelie 133 | 136,Adelie 134 | 128,Adelie 135 | 176,Chinstrap 136 | 76,Adelie 137 | 229,Gentoo 138 | 127,Adelie 139 | 308,Gentoo 140 | 166,Chinstrap 141 | 137,Adelie 142 | 312,Gentoo 143 | 85,Adelie 144 | 99,Adelie 145 | 54,Adelie 146 | 74,Adelie 147 | 158,Chinstrap 148 | 334,Gentoo 149 | 43,Adelie 150 | 167,Chinstrap 151 | 140,Adelie 152 | 36,Adelie 153 | 198,Chinstrap 154 | 202,Chinstrap 155 | 126,Adelie 156 | 315,Gentoo 157 | 190,Chinstrap 158 | 69,Adelie 159 | 194,Chinstrap 160 | 24,Adelie 161 | 45,Adelie 162 | 2,Adelie 163 | 241,Gentoo 164 | 264,Gentoo 165 | 75,Adelie 166 | 261,Gentoo 167 | 313,Gentoo 168 | 306,Gentoo 169 | 240,Gentoo 170 | 86,Adelie 171 | 342,Gentoo 172 | 34,Adelie 173 | 195,Chinstrap 174 | 124,Adelie 175 | 216,Chinstrap 176 | 185,Chinstrap 177 | 148,Adelie 178 | 289,Gentoo 179 | 59,Adelie 180 | 103,Adelie 181 | 50,Adelie 182 | 204,Chinstrap 183 | 221,Gentoo 184 | 326,Gentoo 185 | 172,Chinstrap 186 | 238,Gentoo 187 | 13,Adelie 188 | 
178,Chinstrap 189 | 96,Adelie 190 | 307,Gentoo 191 | 102,Adelie 192 | 5,Adelie 193 | 275,Gentoo 194 | 35,Adelie 195 | 143,Adelie 196 | 91,Adelie 197 | 262,Gentoo 198 | 281,Gentoo 199 | 301,Gentoo 200 | 21,Adelie 201 | 255,Gentoo 202 | 328,Gentoo 203 | 134,Adelie 204 | 20,Adelie 205 | 220,Gentoo 206 | 278,Gentoo 207 | 244,Gentoo 208 | 310,Gentoo 209 | 282,Gentoo 210 | 249,Gentoo 211 | 322,Gentoo 212 | 18,Adelie 213 | 67,Adelie 214 | 239,Gentoo 215 | 141,Adelie 216 | 212,Chinstrap 217 | 273,Gentoo 218 | 230,Gentoo 219 | 109,Adelie 220 | 113,Adelie 221 | 56,Adelie 222 | 252,Gentoo 223 | 292,Gentoo 224 | 340,Gentoo 225 | 232,Gentoo 226 | 70,Adelie 227 | 12,Adelie 228 | 41,Adelie 229 | 235,Gentoo 230 | 145,Adelie 231 | 31,Adelie 232 | 298,Gentoo 233 | 259,Gentoo 234 | 316,Gentoo 235 | 222,Gentoo 236 | 295,Gentoo 237 | 122,Adelie 238 | 223,Gentoo 239 | 321,Gentoo 240 | 193,Chinstrap 241 | 305,Gentoo 242 | 104,Adelie 243 | 44,Adelie 244 | 120,Adelie 245 | 152,Chinstrap 246 | 323,Gentoo 247 | 89,Adelie 248 | 335,Gentoo 249 | 129,Adelie 250 | 170,Chinstrap 251 | -------------------------------------------------------------------------------- /data/world_happiness.csv: -------------------------------------------------------------------------------- 1 | rank,country,happiness_score,gdp,social_support,life_expectancy,freedom,generosity,corruption,continent 2 | 1,Finland,7.821,1.892,1.258,0.775,0.736,0.109,0.534,Europe 3 | 2,Denmark,7.636,1.953,1.243,0.777,0.719,0.188,0.532,Europe 4 | 3,Iceland,7.557,1.936,1.32,0.803,0.718,0.27,0.191,Europe 5 | 4,Switzerland,7.512,2.026,1.226,0.822,0.677,0.147,0.461,Europe 6 | 5,Netherlands,7.415,1.945,1.206,0.787,0.651,0.271,0.419,Europe 7 | 6,Luxembourg*,7.404,2.209,1.155,0.79,0.7,0.12,0.388,Europe 8 | 7,Sweden,7.384,1.92,1.204,0.803,0.724,0.218,0.512,Europe 9 | 8,Norway,7.365,1.997,1.239,0.786,0.728,0.217,0.474,Europe 10 | 9,Israel,7.364,1.826,1.221,0.818,0.568,0.155,0.143,Asia 11 | 10,New Zealand,7.2,1.852,1.235,0.752,0.68,0.245,0.483,Oceania 12 | 11,Austria,7.163,1.931,1.165,0.774,0.623,0.193,0.329,Europe 13 | 12,Australia,7.162,1.9,1.203,0.772,0.676,0.258,0.341,Oceania 14 | 13,Ireland,7.041,2.129,1.166,0.779,0.627,0.19,0.408,Europe 15 | 14,Germany,7.034,1.924,1.088,0.776,0.585,0.163,0.358,Europe 16 | 15,Canada,7.025,1.886,1.188,0.783,0.659,0.217,0.368,North America 17 | 16,United States,6.977,1.982,1.182,0.628,0.574,0.22,0.177,North America 18 | 17,United Kingdom,6.943,1.867,1.143,0.75,0.597,0.289,0.329,Europe 19 | 18,Czechia,6.92,1.815,1.26,0.715,0.66,0.158,0.048,Europe 20 | 19,Belgium,6.805,1.907,1.106,0.764,0.492,0.049,0.204,Europe 21 | 20,France,6.687,1.863,1.219,0.808,0.567,0.07,0.266,Europe 22 | 21,Bahrain,6.647,1.854,1.029,0.625,0.693,0.199,0.155,Asia 23 | 22,Slovenia,6.63,1.81,1.249,0.769,0.685,0.118,0.115,Europe 24 | 23,Costa Rica,6.582,1.584,1.054,0.744,0.661,0.089,0.102,North America 25 | 24,United Arab Emirates,6.576,1.998,0.98,0.633,0.702,0.204,0.25,Asia 26 | 25,Saudi Arabia,6.523,1.87,1.092,0.577,0.651,0.078,0.18,Asia 27 | 26,Taiwan Province of China,6.512,1.897,1.095,0.733,0.542,0.075,0.168,Asia 28 | 27,Singapore,6.48,2.149,1.127,0.851,0.672,0.163,0.587,Asia 29 | 28,Romania,6.477,1.719,1.006,0.655,0.605,0.039,0.006,Europe 30 | 29,Spain,6.476,1.808,1.211,0.808,0.505,0.101,0.149,Europe 31 | 30,Uruguay,6.474,1.615,1.18,0.672,0.665,0.103,0.265,South America 32 | 31,Italy,6.467,1.834,1.052,0.801,0.412,0.085,0.059,Europe 33 | 32,Kosovo,6.455,1.362,0.949,0.569,0.599,0.309,0.035,Europe 34 | 33,Malta,6.447,1.838,1.169,0.789,0.679,0.174,0.166,Europe 35 | 
34,Lithuania,6.446,1.804,1.204,0.659,0.496,0.053,0.077,Europe 36 | 35,Slovakia,6.391,1.736,1.232,0.707,0.479,0.118,0.025,Europe 37 | 36,Estonia,6.341,1.793,1.232,0.728,0.689,0.123,0.333,Europe 38 | 37,Panama,6.309,1.715,1.107,0.709,0.592,0.049,0.051,North America 39 | 38,Brazil,6.293,1.462,1.044,0.615,0.546,0.131,0.134,South America 40 | 39,Guatemala*,6.262,1.274,0.831,0.522,0.662,0.112,0.115,North America 41 | 40,Kazakhstan,6.234,1.668,1.22,0.611,0.584,0.134,0.157,Asia 42 | 41,Cyprus,6.221,1.815,0.909,0.819,0.448,0.123,0.062,Europe 43 | 42,Latvia,6.18,1.732,1.221,0.637,0.502,0.075,0.09,Europe 44 | 43,Serbia,6.178,1.55,1.086,0.658,0.546,0.219,0.088,Europe 45 | 44,Chile,6.172,1.651,1.08,0.748,0.46,0.124,0.069,South America 46 | 45,Nicaragua,6.165,1.105,1.029,0.617,0.617,0.168,0.212,North America 47 | 46,Mexico,6.128,1.552,0.886,0.623,0.621,0.092,0.115,North America 48 | 47,Croatia,6.125,1.705,1.183,0.709,0.535,0.109,0.0,Europe 49 | 48,Poland,6.123,1.758,1.174,0.712,0.523,0.124,0.14,Europe 50 | 49,El Salvador,6.12,1.265,0.768,0.607,0.666,0.089,0.212,North America 51 | 50,Kuwait*,6.106,1.904,0.983,0.747,0.617,0.087,0.147,Asia 52 | 51,Hungary,6.086,1.748,1.233,0.668,0.485,0.078,0.064,Europe 53 | 52,Mauritius,6.071,1.591,1.116,0.568,0.589,0.131,0.107,Africa 54 | 53,Uzbekistan,6.063,1.219,1.092,0.6,0.716,0.283,0.24,Asia 55 | 54,Japan,6.039,1.835,1.089,0.866,0.537,0.007,0.218,Asia 56 | 55,Honduras,6.022,1.111,0.885,0.555,0.582,0.202,0.076,North America 57 | 56,Portugal,6.016,1.76,1.078,0.777,0.655,0.016,0.039,Europe 58 | 57,Argentina,5.967,1.592,1.102,0.662,0.555,0.081,0.085,South America 59 | 58,Greece,5.948,1.703,0.98,0.774,0.249,0.015,0.108,Europe 60 | 59,South Korea,5.935,1.851,0.886,0.841,0.414,0.111,0.176,Asia 61 | 60,Philippines,5.904,1.268,0.912,0.514,0.678,0.107,0.142,Asia 62 | 61,Thailand,5.891,1.535,1.096,0.697,0.617,0.321,0.026,Asia 63 | 62,Moldova,5.857,1.417,1.008,0.597,0.561,0.102,0.028,Europe 64 | 63,Jamaica,5.85,1.296,1.045,0.646,0.567,0.08,0.053,North America 65 | 64,Kyrgyzstan,5.828,1.069,1.109,0.638,0.693,0.208,0.025,Asia 66 | 65,Belarus*,5.821,1.562,1.157,0.629,0.342,0.04,0.282,Europe 67 | 66,Colombia,5.781,1.452,0.929,0.72,0.545,0.087,0.077,South America 68 | 67,Bosnia and Herzegovina,5.768,1.468,1.068,0.665,0.448,0.244,0.006,Europe 69 | 68,Mongolia,5.761,1.393,1.197,0.467,0.398,0.247,0.059,Asia 70 | 69,Dominican Republic,5.737,1.538,1.003,0.577,0.606,0.084,0.179,North America 71 | 70,Malaysia,5.711,1.689,0.938,0.62,0.654,0.213,0.126,Asia 72 | 71,Bolivia,5.6,1.256,0.88,0.555,0.627,0.112,0.064,South America 73 | 72,China,5.585,1.508,0.958,0.705,0.656,0.099,0.142,Asia 74 | 73,Paraguay,5.578,1.409,1.13,0.624,0.629,0.171,0.059,South America 75 | 74,Peru,5.559,1.397,0.865,0.735,0.545,0.09,0.037,South America 76 | 75,Montenegro,5.547,1.573,1.023,0.659,0.46,0.135,0.077,Europe 77 | 76,Ecuador,5.533,1.352,0.879,0.708,0.565,0.08,0.083,South America 78 | 77,Vietnam,5.485,1.252,0.932,0.611,0.707,0.143,0.105,Asia 79 | 78,Turkmenistan*,5.474,1.484,1.319,0.516,0.649,0.314,0.032,Asia 80 | 79,North Cyprus*,5.467,1.815,0.888,0.819,0.523,0.13,0.213,Europe 81 | 80,Russia,5.459,1.685,1.095,0.586,0.401,0.117,0.08,Europe 82 | 81,Hong Kong S.A.R. 
of China,5.425,1.957,0.954,0.942,0.4,0.147,0.383,Asia 83 | 82,Armenia,5.399,1.434,0.82,0.668,0.558,0.054,0.21,Europe 84 | 83,Tajikistan,5.377,0.966,1.005,0.518,0.572,0.118,0.304,Asia 85 | 84,Nepal,5.377,0.984,0.784,0.499,0.519,0.237,0.13,Asia 86 | 85,Bulgaria,5.371,1.625,1.163,0.64,0.563,0.123,0.021,Europe 87 | 86,Libya*,5.33,1.476,0.943,0.606,0.477,0.106,0.179,Africa 88 | 87,Indonesia,5.24,1.382,0.883,0.539,0.62,0.468,0.047,Asia 89 | 88,Ivory Coast,5.235,1.094,0.442,0.322,0.451,0.149,0.124,Africa 90 | 89,North Macedonia,5.199,1.505,0.863,0.637,0.488,0.215,0.031,Europe 91 | 90,Albania,5.199,1.439,0.646,0.719,0.511,0.138,0.028,Europe 92 | 91,South Africa,5.194,1.425,1.088,0.361,0.442,0.089,0.046,Africa 93 | 92,Azerbaijan*,5.173,1.458,1.093,0.56,0.601,0.023,0.341,Europe 94 | 93,Gambia*,5.164,0.785,0.621,0.369,0.367,0.388,0.103,Africa 95 | 94,Bangladesh,5.155,1.06,0.614,0.581,0.622,0.125,0.187,Asia 96 | 95,Laos,5.14,1.239,0.654,0.479,0.679,0.197,0.184,Asia 97 | 96,Algeria,5.122,1.363,0.97,0.643,0.146,0.106,0.15,Africa 98 | 97,Liberia*,5.122,0.636,0.67,0.309,0.405,0.178,0.08,Africa 99 | 98,Ukraine,5.084,1.411,1.081,0.583,0.473,0.188,0.017,Europe 100 | 99,Congo,5.075,0.95,0.405,0.355,0.431,0.13,0.146,Africa 101 | 100,Morocco,5.06,1.208,0.268,0.565,0.492,0.02,0.102,Africa 102 | 101,Mozambique,5.048,0.578,0.66,0.191,0.593,0.185,0.2,Africa 103 | 102,Cameroon,5.048,0.968,0.672,0.317,0.397,0.152,0.074,Africa 104 | 103,Senegal,5.046,0.933,0.53,0.447,0.494,0.143,0.081,Africa 105 | 104,Niger*,5.003,0.57,0.56,0.326,0.571,0.165,0.145,Africa 106 | 105,Georgia,4.973,1.467,0.612,0.595,0.508,0.0,0.208,Europe 107 | 106,Gabon,4.958,1.459,0.738,0.396,0.343,0.032,0.099,Africa 108 | 107,Iraq,4.941,1.289,0.682,0.554,0.328,0.147,0.046,Asia 109 | 108,Venezuela,4.925,0.0,0.968,0.578,0.283,0.225,0.082,South America 110 | 109,Guinea,4.891,0.848,0.566,0.275,0.334,0.214,0.116,Africa 111 | 110,Iran,4.888,1.41,0.741,0.642,0.281,0.241,0.146,Asia 112 | 111,Ghana,4.872,1.112,0.595,0.409,0.5,0.23,0.056,Africa 113 | 112,Turkey,4.744,1.707,0.865,0.702,0.209,0.087,0.115,Europe 114 | 113,Burkina Faso,4.67,0.779,0.565,0.32,0.382,0.186,0.126,Africa 115 | 114,Cambodia,4.64,1.019,0.732,0.505,0.74,0.166,0.068,Asia 116 | 115,Benin,4.623,0.932,0.064,0.335,0.479,0.127,0.23,Africa 117 | 116,Comoros*,4.609,0.899,0.476,0.424,0.185,0.195,0.125,Africa 118 | 117,Uganda,4.603,0.777,0.875,0.418,0.402,0.222,0.066,Africa 119 | 118,Nigeria,4.552,1.079,0.732,0.3,0.444,0.175,0.038,Africa 120 | 119,Kenya,4.543,1.032,0.605,0.401,0.44,0.322,0.082,Africa 121 | 120,Tunisia,4.516,1.35,0.596,0.656,0.316,0.029,0.029,Africa 122 | 121,Pakistan,4.516,1.049,0.413,0.374,0.448,0.181,0.112,Asia 123 | 122,Palestinian Territories*,4.483,1.148,0.957,0.521,0.336,0.073,0.079,Asia 124 | 123,Mali,4.479,0.792,0.483,0.311,0.35,0.128,0.042,Africa 125 | 124,Namibia,4.459,1.292,0.877,0.354,0.384,0.067,0.071,Africa 126 | 125,"Eswatini, Kingdom of*",4.396,1.274,0.786,0.197,0.259,0.038,0.154,Africa 127 | 126,Myanmar,4.394,1.038,0.829,0.491,0.513,0.452,0.194,Asia 128 | 127,Sri Lanka,4.362,1.415,0.934,0.66,0.529,0.15,0.079,Asia 129 | 128,Madagascar*,4.339,0.67,0.645,0.378,0.202,0.143,0.154,Africa 130 | 129,Egypt,4.288,1.388,0.732,0.548,0.469,0.041,0.254,Africa 131 | 130,Chad*,4.251,0.662,0.506,0.225,0.18,0.182,0.077,Africa 132 | 131,Ethiopia,4.241,0.788,0.809,0.457,0.472,0.205,0.136,Africa 133 | 132,Yemen*,4.197,0.691,1.043,0.384,0.33,0.09,0.098,Asia 134 | 133,Mauritania*,4.153,1.1,0.865,0.45,0.304,0.088,0.138,Africa 135 | 134,Jordan,4.152,1.324,0.724,0.675,0.476,0.058,0.2,Asia 136 
| 135,Togo,4.112,0.771,0.322,0.36,0.292,0.174,0.132,Africa 137 | 136,India,3.777,1.167,0.376,0.471,0.647,0.198,0.123,Asia 138 | 137,Zambia,3.76,0.93,0.577,0.306,0.525,0.203,0.083,Africa 139 | 138,Malawi,3.75,0.648,0.279,0.388,0.477,0.14,0.157,Africa 140 | 139,Tanzania,3.702,0.848,0.597,0.425,0.578,0.248,0.27,Africa 141 | 140,Sierra Leone,3.574,0.686,0.416,0.273,0.387,0.202,0.055,Africa 142 | 141,Lesotho*,3.512,0.839,0.848,0.0,0.419,0.076,0.018,Africa 143 | 142,Botswana*,3.471,1.503,0.815,0.28,0.571,0.012,0.102,Africa 144 | 143,Rwanda*,3.268,0.785,0.133,0.462,0.621,0.187,0.544,Africa 145 | 144,Zimbabwe,2.995,0.947,0.69,0.27,0.329,0.106,0.105,Africa 146 | 145,Lebanon,2.955,1.392,0.498,0.631,0.103,0.082,0.034,Asia 147 | 146,Afghanistan,2.404,0.758,0.0,0.289,0.0,0.089,0.005,Asia 148 | -------------------------------------------------------------------------------- /images/Confusion_Matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/Python-Machine-Learning/230e0a749724a7d0f5640dea15fef368f3498968/images/Confusion_Matrix.png -------------------------------------------------------------------------------- /images/KNN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/Python-Machine-Learning/230e0a749724a7d0f5640dea15fef368f3498968/images/KNN.png -------------------------------------------------------------------------------- /images/linear_regression_hyperplane.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/Python-Machine-Learning/230e0a749724a7d0f5640dea15fef368f3498968/images/linear_regression_hyperplane.jpeg -------------------------------------------------------------------------------- /images/linear_regression_line.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/Python-Machine-Learning/230e0a749724a7d0f5640dea15fef368f3498968/images/linear_regression_line.png -------------------------------------------------------------------------------- /images/overfitting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/Python-Machine-Learning/230e0a749724a7d0f5640dea15fef368f3498968/images/overfitting.png -------------------------------------------------------------------------------- /images/validation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/Python-Machine-Learning/230e0a749724a7d0f5640dea15fef368f3498968/images/validation.png -------------------------------------------------------------------------------- /lessons/00_introduction.md: -------------------------------------------------------------------------------- 1 | # Python Machine Learning: Introduction 2 | 3 | Please refer to these [introductory slides](https://docs.google.com/presentation/d/1IwlTdkOGXVGwgCxPVyWEXOOyRf_sZtAcKFSoJ9k4jig/edit?usp=sharing) for the first component of the workshop. 
-------------------------------------------------------------------------------- /lessons/02_regularization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "3cfcb6e0-d342-4a48-9812-4a1176599965", 6 | "metadata": { 7 | "tags": [] 8 | }, 9 | "source": [ 10 | "# Python Machine Learning: Regularization\n", 11 | "\n", 12 | "In machine learning, the name of the game is generalization. We want to have a model perform well on the training set, but we need to make sure that the patterns the model learns can actually generalize to data the model hasn't seen before.\n", 13 | "\n", 14 | "So, the scenario we want to avoid is that of **overfitting**. This occurs when our model too strongly learns patterns in the training data, and doesn't generalize well. Overfit models tend to exhibit large generalization gaps: large differences in predictive performance between the training and test data.\n", 15 | "\n", 16 | "Overfitting can happen for a variety of reasons, the most well known of which is having a model that's too complicated. Luckily, all is not lost. There are a variety of approaches we can use to combat overfitting. In general, these approaches are called **regularization**." 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "68f833e6-4c67-4f64-8f00-83b848024c93", 22 | "metadata": {}, 23 | "source": [ 24 | "## Overfitting and Regularization\n", 25 | "\n", 26 | "In the previous lesson, we discussed feature engineering, the process by which we create new features in order to make our model more expressive. One tradeoff to adding features to the model is that the model becomes more complex, which makes it prone to overfitting. \n", 27 | "\n", 28 | "For example, consider a basic regression with the points shown below:\n", 29 | "\n", 30 | "![overfitting](../images/overfitting.png)\n", 31 | "\n", 32 | "We could fit a simple line to this data, which will exhibit some error. However, we could also fit a more complex model - say, a polynomial - which could perfectly fit the training data. There will be no error in the training predictions, which seems great!\n", 33 | "\n", 34 | "But do we *really* think the polynomial is making good predictions on *all* possible data points? Look at how it behaves in between the training examples. It's very likely that on *new* data - that is, when the model needs to generalize - the linear model will perform much better than the polynomial model. This is because the polynomial model overfit to the data.\n", 35 | "\n", 36 | "So, it's common in machine learning to follow a \"parsimony principle\". Specifically, we aim to choose simpler models that can still be predictive, because simpler models are less likely to overfit, and thus generalize decently well.\n", 37 | "\n", 38 | "Regularization is often thought of in terms of the **bias-variance tradeoff**. Specifically, prediction errors often break down in terms of two components: bias and variance. The linear model exhibits higher bias, since it exhibits large errors on the training examples. But the polynomial model has higher variance - it's more likely to give wildly different predictions for training samples close together.\n", 39 | "\n", 40 | "We don't always have to use linear regression in the spirit of opting for simpler models. Sometimes, it's good to use the complicated model, particularly if it makes sense in a specific context.
This is where **regularization** is useful: a technique we can use to make a model less prone to overfitting during training. It's important to note that regularization is more of a concept than it is a specific, standardized technique. There are many approaches used for regularizing. Today, we're going to cover the usage of **penalty terms** to regularize linear models." 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "id": "08518305-c290-46bd-8998-c728d68ada22", 46 | "metadata": {}, 47 | "source": [ 48 | "---\n", 49 | "### Challenge 1: Warm-Up\n", 50 | "\n", 51 | "Before we get started, let's warm up by importing our data and performing a train/test split. We've provided the import code for you. Go ahead and split the data into train/test sets using an 80/20 split, and a random state of 23.\n", 52 | "\n", 53 | "---" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "id": "32a8b441-f363-45ac-8abf-236df98f8612", 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "import pandas as pd\n", 64 | "import numpy as np\n", 65 | "import matplotlib.pyplot as plt\n", 66 | "\n", 67 | "from sklearn.metrics import mean_squared_error\n", 68 | "from sklearn.model_selection import train_test_split" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "id": "dccf9b24-e59a-41e9-8a82-2c2054363427", 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "# Import data\n", 79 | "data = pd.read_csv('../data/auto-mpg.csv')\n", 80 | "# Remove the response variable and car name\n", 81 | "X = data.drop(columns=['car name', 'mpg'])\n", 82 | "# Assign response variable to its own variable\n", 83 | "y = data['mpg'].astype(np.float64)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "id": "6cecbfb1-0121-42f7-81ab-180c2acd805c", 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "# YOUR CODE HERE\n" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "id": "89d536fb-a981-461f-a100-99dd469f6be4", 99 | "metadata": {}, 100 | "source": [ 101 | "## Ridge Regression\n", 102 | "\n", 103 | "Recall the formulation of a linear model. We have the parameters we are trying to estimate, given in the model:\n", 104 | "\n", 105 | "$$Y = \\beta_0 + \\beta_1 X_1 + \\ldots + \\beta_P X_P$$\n", 106 | "\n", 107 | "We do this by minimizing the following objective function:\n", 108 | "\n", 109 | "$$\n", 110 | "\\begin{align}\n", 111 | "\\text{MSE} = L(\\beta) &= \\frac{1}{N}\\sum_{i=1}^{N}(y_i - \\hat{y}_i)^2 \\\\\n", 112 | "&= \\frac{1}{N}\\sum_{i=1}^{N}\\left(y_i - \\beta_0 - \\sum_{j=1}^P \\beta_j X_j\\right)^2\n", 113 | "\\end{align}\n", 114 | "$$\n", 115 | "\n", 116 | "We're going to regularize this model. We're not going to change the actual linear model - that's the top equation - but we will change how we choose the $\\beta$ parameters. Specifically, we're going to do **ridge regression** (also called $\\ell_2$ regularization and Tikhonov regularization). Instead of using the least squares objective function, specified in the second equation, we're going to use the following objective function: \n", 117 | "\n", 118 | "$$ L(\\beta) = \\sum_{i=1}^N (y_i - \\hat y_i)^2 + \\alpha \\sum_{j=1}^P \\beta_j^2 $$ \n", 119 | "\n", 120 | "What's the difference? There's a second term added on, which is equal to the sum of the squares of the $\\beta$ values. What does this mean?\n", 121 | "\n", 122 | "Our goal is for the loss, $L(\\beta)$, to be as small as possible.
The first term says we can make that small if we make our errors, $y_i - \\hat y_i$, small. The second term says that we increase the loss if the $\\beta$ values get too large. There's a tradeoff here: if we make the $\\beta$ values all zero to accommodate the second term, then the first term will be large. So, in ridge regression, we try to minimize the errors, while trying hard not to make the coefficients too big.\n", 123 | "\n", 124 | "Also, note that ridge regression requires a **hyperparameter**, called $\\alpha$ (sometimes $\\lambda$). This hyperparameter indicates how much regularization should be done. In other words, how much do we care about the coefficient penalty term vs. how much do we care about the sum of squared errors term? The higher the value of $\\alpha$, the more regularization, and the smaller the resulting coefficients will be. On the other hand, if we use an $\\alpha$ value of 0, we get the same solution as ordinary least squares (OLS) regression.\n", 125 | "\n", 126 | "Why does ridge regression serve as a good regularizer? The penalty actually does several things, which are beneficial for our model:\n", 127 | "1. **Multicollinearity:** Ridge regression was devised largely to combat multicollinearity, which occurs when features are highly correlated with each other. Ordinary least squares struggles in these scenarios, because multicollinearity can cause a huge increase in variance: it makes the parameter estimates unstable. Adding the penalty term stabilizes the parameter estimates, at a small cost in bias. This results in better generalization performance.\n", 128 | "2. **Low Number of Samples:** The most common scenario where you might overfit is when you have many features, but not many samples. Adding the penalty term stabilizes the model in these scenarios. There's not a great intuition for this without diving into the math, so you can just take it at face value. \n", 129 | "3. **Shrinkage:** The $\\ell_2$ penalty results in shrinkage, or a small reduction in the size of the parameters. This is effectively a bias, but helps regularize by reducing the variance that often comes with overfit models." 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "id": "107971a3-075d-4cc0-8fe3-c9fefe582ed0", 135 | "metadata": {}, 136 | "source": [ 137 | "## Ridge Regression in Practice\n", 138 | "\n", 139 | "As with linear regression, `scikit-learn` makes it easy to fit a ridge regression. We simply use the `Ridge` class from `scikit-learn`. This time, however, we're going to specify some arguments when we create the ridge regression object.
The most important one is the regularization penalty, $\\alpha$, which we need to choose:" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "id": "862640b7-4dee-487b-8308-d8c21aa61bae", 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "from sklearn.linear_model import Ridge\n", 150 | "# Create model\n", 151 | "ridge = Ridge(\n", 152 | " # Regularization penalty\n", 153 | " alpha=10,\n", 154 | " random_state=1)\n", 155 | "# Fit model to the training data\n", 156 | "ridge.fit(X_train, y_train)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "id": "fa21a66c-d694-410e-a198-ac73df50eb0f", 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "# Run predictions\n", 167 | "y_train_pred_ridge = ridge.predict(X_train)\n", 168 | "y_test_pred_ridge = ridge.predict(X_test)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "id": "a98614ab-de89-401f-b166-0eb216a2fe0f", 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "# Evaluate model\n", 179 | "print(f'Training R^2: {ridge.score(X_train, y_train)}')\n", 180 | "print(f'Test R^2: {ridge.score(X_test, y_test)}')\n", 181 | "print(f'Train RMSE: {mean_squared_error(y_train, y_train_pred_ridge, squared=False)}')\n", 182 | "print(f'Test RMSE: {mean_squared_error(y_test, y_test_pred_ridge, squared=False)}')" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "id": "08c82e6e-a112-4c22-a048-6ca4c533a986", 188 | "metadata": {}, 189 | "source": [ 190 | "---\n", 191 | "### Challenge 2: Benchmarking\n", 192 | "\n", 193 | "Re-run ordinary least squares on the data using `LinearRegression`. Then, create a new ridge regression where the `alpha` penalty is set equal to zero. How do the performances of these models compare to each other? How do they compare with the original ridge regression? Be sure to compare both the training performances and test performances.\n", 194 | "\n", 195 | "---" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "id": "d77e1c87-b0a7-44ef-ba8f-7246f97b57d5", 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "from sklearn.linear_model import LinearRegression\n", 206 | "# YOUR CODE HERE\n", 207 | "# Create models\n", 208 | "\n", 209 | "# Fit models\n", 210 | "\n", 211 | "# Run predictions\n", 212 | "\n", 213 | "# Evaluate models\n" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "id": "02c5045a-a835-4de2-aa86-36d9511baef1", 219 | "metadata": {}, 220 | "source": [ 221 | "Based on your experiments, you probably found that ridge regression resulted in worse training performance, but slightly better generalization performance! So the regularization can help, particularly in this case where we know the features are correlated with each other." 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "id": "c4a8103a-e35e-4369-b694-1232e71f5981", 227 | "metadata": {}, 228 | "source": [ 229 | "## Choosing Hyperparameters: Validation Sets\n", 230 | "\n", 231 | "The issue with our analysis thus far is that we don't know what $\\alpha$ value we should use. Since hyperparameters are chosen *before* we fit the model, we can't just choose them based on the training data. So, how should we go about conducting **hyperparameter search**: identifying the best hyperparameter(s) to use?\n", 232 | "\n", 233 | "Let's think back to our original goal. We want a model that generalizes to unseen data.
So, ideally, the choice of the hyperparameter should be such that the performance on unseen data is the best. We can't use the test set for this, but what if we had another set of held-out data? \n", 234 | "\n", 235 | "This is the basis for a **validation set**. If we had an extra held-out dataset, we could try a bunch of hyperparameters on the training set, and see which one results in a model that performs the best on the validation set. We then would choose that hyperparameter, and use it to refit the model on both the training data and validation data. We could then, finally, evaluate on the test set.\n", 236 | "\n", 237 | "![validation](../images/validation.png)\n", 238 | "\n", 239 | "So, you'll often see a dataset not only split up into training/test sets, but training/validation/test sets, particularly when you need to choose a hyperparameter.\n", 240 | "\n", 241 | "### Cross-Validation\n", 242 | "\n", 243 | "We just formulated the process of choosing a hyperparameter with a single validation set. However, there are many ways to perform validation. The most common way is **cross-validation**. Cross-validation is motivated by the concern that we may not choose the best hyperparameter if we're only validating on a small fraction of the data. If the validation set, just by chance, contains unrepresentative samples, we may bias our model in favor of those samples, and limit its generalizability.\n", 244 | "\n", 245 | "So, during cross-validation, we effectively validate on the *entire* dataset, by breaking it up into folds. Here's the process:\n", 246 | "\n", 247 | "1. Perform a train/test split, as you normally would.\n", 248 | "2. Choose a number of folds - the most common is $K=5$ - and split up your training data into those equally sized \"folds\".\n", 249 | "3. For *each* hyperparameter, we're going to fit $K$ models. Let's assume $K=5$. The first model will be fit on Folds 2-5, and validated on Fold 1. The second model will be fit on Folds 1, 3-5, and validated on Fold 2. This process continues for all 5 splits.\n", 250 | "4. Each hyperparameter's performance is summarized by the average predictive performance on all 5 held-out folds. We then choose the hyperparameter that had the best average performance.\n", 251 | "5. We can then refit a new model to the entire training set, using our chosen hyperparameter. That's our final model - evaluate it on the test set!\n", 252 | "\n", 253 | "![cross-validation](https://scikit-learn.org/stable/_images/grid_search_cross_validation.png)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "id": "6f1edb4d-60bf-45a6-a128-5f1097244cd5", 259 | "metadata": {}, 260 | "source": [ 261 | "## Cross-Validation in Practice\n", 262 | "\n", 263 | "You guessed it: `scikit-learn` makes it really easy to fit a model with cross-validation. We'll use the `RidgeCV` class. Check out the [documentation](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeCV.html) for details about it.\n", 264 | "\n", 265 | "`RidgeCV` is going to need to know a few things from us: which hyperparameter values do we want to try? How many folds should we use? We'll specify these when creating the model object."
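To make the fold mechanics concrete, below is a minimal sketch of the search loop that `RidgeCV` automates, assuming the `X_train`/`y_train` split from Challenge 1 (the candidate grid here is illustrative). It is for intuition only; the `RidgeCV` call that follows does all of this in one step.

```python
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

# For each candidate alpha, estimate generalization performance with
# 5-fold cross-validation on the training set, then keep the best one.
candidate_alphas = np.logspace(-1, 3, 10)
mean_scores = [
    cross_val_score(Ridge(alpha=a), X_train, y_train, cv=5).mean()
    for a in candidate_alphas
]
best_alpha = candidate_alphas[np.argmax(mean_scores)]
print(f'Best alpha by cross-validation: {best_alpha}')
```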
266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "id": "1f632ebb-b5ae-45aa-a7de-2139454ca79e", 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "from sklearn.linear_model import RidgeCV\n", 276 | "# Create ridge model, with CV\n", 277 | "ridge_cv = RidgeCV(\n", 278 | " # Which alpha values to test\n", 279 | " alphas=np.logspace(-1, 3, 100),\n", 280 | " # Number of folds\n", 281 | " cv=5)\n", 282 | "# Fit model\n", 283 | "ridge_cv.fit(X_train, y_train)\n", 284 | "# Evaluate model\n", 285 | "print(ridge_cv.score(X_train, y_train))\n", 286 | "print(ridge_cv.score(X_test, y_test))" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "id": "1ac60684-031b-43a3-b6f4-81f8b85e00da", 292 | "metadata": {}, 293 | "source": [ 294 | "We can also access the best $\\alpha$ value:" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "id": "f3237a9d-7083-4681-a1de-b39dc6457a53", 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "ridge_cv.alpha_" 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "id": "5bf18376-c4a0-4f74-8f4d-d9d52b0ffede", 310 | "metadata": {}, 311 | "source": [ 312 | "As well as the coefficients:" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "id": "85f569bf-38be-479d-8dea-5b8ef6e56ea6", 319 | "metadata": {}, 320 | "outputs": [], 321 | "source": [ 322 | "ridge_cv.coef_" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "id": "06a73083-aa63-4b5d-97c0-2ee8668993cb", 328 | "metadata": { 329 | "tags": [] 330 | }, 331 | "source": [ 332 | "## Bonus Material: Lasso Regression\n", 333 | "\n", 334 | "**Lasso regression** (also called $\\ell_1$ regularization) is another form of regularized regression that penalizes the coefficients. Rather than taking a squared penalty of the coefficients, Lasso uses an absolute value penalty: \n", 335 | "\n", 336 | "$$ L(\\beta) = \\sum_{i=1}^N (y_i - \\hat y_i)^2 + \\alpha \\sum_{j=1}^P |\\beta_j| $$ \n", 337 | "\n", 338 | "This has a similar effect of making the coefficients smaller, but also has a tendency to force some coefficients to be set *exactly equal to 0*. This leads to what are called **sparser** models, and is another way to reduce overfitting introduced by more complex models.\n", 339 | "\n", 340 | "Setting some coefficients exactly equal to zero has the added benefit of performing **feature selection**: it can exactly identify if some features are not worth including in the model, because their coefficients are set exactly to 0 (meaning that their values would have no impact on prediction)." 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "id": "57f36182-7007-4403-8ad8-f79271a0bd54", 346 | "metadata": {}, 347 | "source": [ 348 | "---\n", 349 | "### Challenge 3: Performing a Lasso Fit\n", 350 | "\n", 351 | "Below, we've imported the `Lasso` object from `scikit-learn` for you. Just like `Ridge`, it needs to know what the strength of the regularization penalty is before fitting to the data. \n", 352 | "\n", 353 | "Fit several Lasso models, with different regularization strengths. Try one with a small but non-zero regularization strength, and try one with a very large regularization strength. Look at the coefficients.
What do you notice?\n", 354 | "\n", 355 | "---" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "id": "80ac5a53-31e1-4238-8314-ebedb5200079", 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "from sklearn.linear_model import Lasso\n", 366 | "# YOUR CODE HERE\n" 367 | ] 368 | } 369 | ], 370 | "metadata": { 371 | "kernelspec": { 372 | "display_name": "Python 3 (ipykernel)", 373 | "language": "python", 374 | "name": "python3" 375 | }, 376 | "language_info": { 377 | "codemirror_mode": { 378 | "name": "ipython", 379 | "version": 3 380 | }, 381 | "file_extension": ".py", 382 | "mimetype": "text/x-python", 383 | "name": "python", 384 | "nbconvert_exporter": "python", 385 | "pygments_lexer": "ipython3", 386 | "version": "3.9.12" 387 | } 388 | }, 389 | "nbformat": 4, 390 | "nbformat_minor": 5 391 | } 392 | -------------------------------------------------------------------------------- /lessons/03_preprocessing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "52dcf6e0-34d7-487a-afc7-0404106c4741", 6 | "metadata": {}, 7 | "source": [ 8 | "# Python Machine Learning: Preprocessing\n", 9 | "\n", 10 | "Preprocessing is an essential step of the machine learning workflow and is important for the performance of models. This notebook will introduce the major steps of preprocessing for machine learning. \n" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "id": "86cd8e28-1334-4520-b2d9-1b510ddb5819", 16 | "metadata": { 17 | "tags": [] 18 | }, 19 | "source": [ 20 | "## Load Data" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "d1b25be6-01f4-4555-b8ae-66956d67ace5", 26 | "metadata": {}, 27 | "source": [ 28 | "For today, we will be working with the `penguins` data set. This data set is from [Kaggle](https://www.kaggle.com/parulpandey/penguin-dataset-the-new-iris) and includes some penguins of three different species, their location, and some measurements for each penguin.\n", 29 | "\n", 30 | "First, let's import some packages we'll need." 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "id": "f0142813-ac28-4ead-9996-39b2ada322ca", 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "import warnings\n", 41 | "\n", 42 | "import pandas as pd\n", 43 | "import numpy as np\n", 44 | "import matplotlib.pyplot as plt\n", 45 | "from sklearn.model_selection import train_test_split" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "id": "e769ae58", 51 | "metadata": {}, 52 | "source": [ 53 | "Now, let's load in the data from the `data` subfolder of this directory.\n", 54 | "\n", 55 | "**Question:** How many columns are there in this data set? How many rows?" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "id": "a612a6fb-fd37-4603-a430-2c018c5d7f29", 62 | "metadata": { 63 | "scrolled": true 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "data = pd.read_csv('../data/penguins.csv')\n", 68 | "data" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "id": "35f79ca2-f223-4a2d-b5a1-edd1e2df3d96", 74 | "metadata": {}, 75 | "source": [ 76 | "Below is the information for each of the columns:\n", 77 | "1. **species**: Species of penguin [Adelie, Chinstrap, Gentoo]\n", 78 | "2. **island**: Island where the penguin was found [Torgersen, Biscoe, Dream]\n", 79 | "3. **culmen_length_mm**: Length of upper part of penguin's bill (millimeters)\n", 80 | "4.
**culmen_depth_mm**: Height of upper part of bill (millimeters)\n", 81 | "5. **flipper_length_mm**: Length of penguin flipper (millimeters)\n", 82 | "6. **body_mass_g**: Body mass of the penguin (grams)\n", 83 | "7. **sex**: Biological sex of the penguin [MALE, FEMALE]\n", 84 | "\n", 85 | "\n", 86 | "**Question:** Which of the columns are continuous? Which are categorical?\n", 87 | "\n", 88 | "\n", 89 | "We will need to treat the numeric and categorical data differently in preprocessing.\n" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "id": "75343925-7865-43e6-bba1-f7fff9a673c1", 95 | "metadata": {}, 96 | "source": [ 97 | "## Missing Data Preprocessing\n", 98 | "\n", 99 | "First, let's check to see if there are any missing values in the data set. Missing values are represented by `NaN`. \n", 100 | "\n", 101 | "**Question:** In this case, what do the missing values represent?" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "id": "0fbb04bc-4a44-493f-85d6-739adb1c7d8d", 108 | "metadata": { 109 | "scrolled": true 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "data.isnull().sum()" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "id": "fd318fc2", 119 | "metadata": {}, 120 | "source": [ 121 | "It is also possible to have non-`NaN` missing values. For example, let's take a look at the `sex` column." 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "id": "2d613dce", 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "data['sex'].unique()" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "id": "eed852c0", 137 | "metadata": {}, 138 | "source": [ 139 | "In this case, the `.` represents a missing value, so let's replace those with `np.nan` objects." 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "id": "d980a391", 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "data.replace('.', np.nan, inplace=True)\n", 150 | "\n", 151 | "data['sex'].unique()" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "id": "737bbd99-c5ba-474b-a194-0003ae520a04", 157 | "metadata": {}, 158 | "source": [ 159 | "### Imputation\n", 160 | "\n", 161 | "One way to handle missing values is to fill them in with a best guess. This is called **imputation**. Here we'll impute any missing values using the average, or mean, of all the data that does exist, as that's the best guess for a data point if all we have is the data itself. To do that we'll use the `SimpleImputer` to assign the mean to all missing values in the data.\n", 162 | "\n", 163 | "There are also other strategies that can be used to impute missing data ([see documentation](https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html)).\n", 164 | "\n", 165 | "Let's see how the `SimpleImputer` works on a subset of the data.
" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "id": "af30fe06-eb35-48af-88a2-b4cbd74e1335", 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "from sklearn.impute import SimpleImputer\n", 176 | "\n", 177 | "imputer = SimpleImputer(missing_values=np.nan,\n", 178 | " strategy='mean', \n", 179 | " copy=True)\n", 180 | "imputed = imputer.fit_transform(data[['body_mass_g','flipper_length_mm']])\n" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "id": "04e085f8", 186 | "metadata": {}, 187 | "source": [ 188 | "Now let's check that the previously null values have been filled in. " 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "id": "bc7157f2", 195 | "metadata": { 196 | "scrolled": false 197 | }, 198 | "outputs": [], 199 | "source": [ 200 | "print(imputed[data[data['body_mass_g'].isna()].index])" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "id": "de080754", 206 | "metadata": {}, 207 | "source": [ 208 | "### Dropping Null Values" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "id": "a2f21878", 214 | "metadata": {}, 215 | "source": [ 216 | "Another option option is to use `pd.dropna()` to drop `Null` values from the `DataFrame`. This should almost always be used with the `subset` argument which restricts the function to only dropping values that are null in a certain column(s)." 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "id": "db11f7e0", 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "data = data.dropna(subset='sex')\n", 227 | "\n", 228 | "# Now this line will return an empty dataframe\n", 229 | "data[data['sex'].isna()]" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "id": "173e3fff-ded3-4c7a-9dfe-a3b9ff62a566", 235 | "metadata": {}, 236 | "source": [ 237 | "## Categorical Data Processing\n", 238 | "\n", 239 | "As we saw earlier, the `penguins` dataset contains both categorical and continuous features, which will each need to be preprocessed in different ways. First, we want to transform the categorical variables from strings to **indicator variables**. Indicator variables have one column per level, For example, the island variable will change from Biscoe/Dream/Torgersen --> Biscoe (1/0), Dream (1/0), and Torgerson (1/0). For each set of indicator variables, there should be a 1 in exactly one column." 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "id": "fb9bc33e-2b97-4b31-83d1-985dec1e5950", 245 | "metadata": {}, 246 | "source": [ 247 | " Let's make a list of the categorical variable names to be transformed into indicator variables." 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "id": "3113d6a3-474c-4b57-9804-8040c38117a8", 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "# Define the variable names that are categorical for use later\n", 258 | "cat_var_names = ['island', 'sex']\n", 259 | "data_cat = data[cat_var_names]\n", 260 | "data_cat.head()" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "id": "61a2d7ba-036f-49e2-ab9e-dc06086eaed6", 266 | "metadata": {}, 267 | "source": [ 268 | "### Categorical Variable Encoding (One-hot & Dummy)\n", 269 | "\n", 270 | "Many machine learning algorithms require that categorical data be encoded numerically in some fashion. 
There are two main ways to do so:\n", 271 | "\n", 272 | "\n", 273 | "- **One-hot-encoding**, which creates `k` new variables for a single categorical variable with `k` categories (or levels), where each new variable is coded with a `1` for the observations that contain that category, and a `0` for each observation that doesn't. \n", 274 | "- **Dummy encoding**, which creates `k-1` new variables for a categorical variable with `k` categories.\n", 275 | "\n", 276 | "However, with some machine learning algorithms we can run into the so-called [\"Dummy Variable Trap\"](https://www.algosome.com/articles/dummy-variable-trap-regression.html) when using One-Hot-Encoding on multiple categorical variables within the same set of features. This occurs because each set of one-hot-encoded variables can be added together across columns to create a single column of all `1`s, and so the encoded columns are multicollinear when multiple one-hot-encoded variables exist within a given model. This can lead to misleading results. \n", 277 | "\n", 278 | "To resolve this, we can simply add an intercept term to our model (which is all `1`s) and remove the first one-hot-encoded variable for each categorical variable, resulting in `k-1` so-called \"Dummy Variables\". \n", 279 | "\n", 280 | "Luckily the `OneHotEncoder` from `sklearn` can perform both one-hot and dummy encoding simply by setting the `drop` parameter (`drop = 'first'` for Dummy Encoding and `drop = None` for One Hot Encoding). \n", 281 | "\n", 282 | "**Question:** How many total columns will there be in the output?" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "id": "a9384a9e-453f-4b62-8bbf-7866b8ac441c", 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "from sklearn.preprocessing import OneHotEncoder\n", 293 | "dummy_e = OneHotEncoder(categories='auto', drop='first', sparse=False)\n", 294 | "dummy_e.fit(data_cat);\n", 295 | "dummy_e.categories_" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "id": "e4091b24-0e57-47e3-a58a-d88826ab5c8b", 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "temp = dummy_e.transform(data_cat)" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "id": "fec19bc9-6aee-48d1-b043-04ab71e4208b", 311 | "metadata": { 312 | "tags": [] 313 | }, 314 | "source": [ 315 | "## Continuous Data Preprocessing\n", 316 | "\n", 317 | "For numeric data, we don't need to create indicator variables; instead, we need to normalize our variables, which helps improve the performance of many machine learning models.\n", 318 | "\n", 319 | " Let's subset out the continuous variables to be normalized." 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "id": "06511352-4ba4-4bb5-8da4-82430ac080a9", 326 | "metadata": { 327 | "tags": [] 328 | }, 329 | "outputs": [], 330 | "source": [ 331 | "data_num = data.drop(columns=cat_var_names + ['species'])\n", 332 | "data_num.head()" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "id": "a13162f8-71d0-4f34-8edb-2b95516b4fa0", 338 | "metadata": {}, 339 | "source": [ 340 | "### Normalization\n", 341 | "\n", 342 | "[Normalization](https://en.wikipedia.org/wiki/Normalization_(statistics)) is a transformation that puts data into some known \"normal\" scale. We use normalization to improve the performance of many machine learning algorithms (see [here](https://en.wikipedia.org/wiki/Feature_scaling)).
There are many forms of normalization, but perhaps the most useful to machine learning algorithms is called the \"z-score\", also known as the standard score. \n", 343 | "\n", 344 | "To z-score normalize the data, we simply subtract the mean of the data, and divide by the standard deviation. This results in data with a mean of `0` and a standard deviation of `1`.\n", 345 | "\n", 346 | "We'll use the `StandardScaler` from `sklearn` to do normalization." 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "id": "19f872ea-59e4-46a6-b366-578f6d0716a7", 353 | "metadata": { 354 | "scrolled": true 355 | }, 356 | "outputs": [], 357 | "source": [ 358 | "from sklearn.preprocessing import StandardScaler\n", 359 | "norm_e = StandardScaler()\n", 360 | "norm_e.fit_transform(data_num).mean(axis=0)\n" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "id": "f71c20c9", 366 | "metadata": {}, 367 | "source": [ 368 | "To check that the normalization works, let's look at the mean and standard deviation of the resulting columns. \n", 369 | "\n", 370 | "**Question:** What should the mean and standard deviation be?" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "id": "1ac3fe89", 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [ 380 | "print('mean:',norm_e.fit_transform(data_num).mean(axis=0))\n", 381 | "print('std:',norm_e.fit_transform(data_num).std(axis=0))" 382 | ] 383 | }, 384 | { 385 | "cell_type": "markdown", 386 | "id": "202c54f4", 387 | "metadata": {}, 388 | "source": [ 389 | "---\n", 390 | "## Challenge 1: Fitting preprocessing functions\n", 391 | "\n", 392 | "The simple imputer, normalization, and one-hot-encoding rely on sklearn functions that are fit to a data set. \n", 393 | "\n", 394 | "1) What is being fit for each of the three functions?\n", 395 | " 1) One Hot Encoding\n", 396 | " 2) Standard Scaler\n", 397 | " 3) Simple Imputer\n", 398 | " \n", 399 | "*YOUR ANSWER HERE*\n", 400 | "\n", 401 | "When we are preprocessing data we have a few options: \n", 402 | "1) Fit on the whole data set\n", 403 | "2) Fit on the training data\n", 404 | "3) Fit on the testing data\n", 405 | "\n", 406 | "Which of the above methods would you use and why?\n", 407 | "\n", 408 | "*YOUR ANSWER HERE*\n", 409 | "\n", 410 | "---\n" 411 | ] 412 | }, 413 | { 414 | "cell_type": "markdown", 415 | "id": "03d7c3bf-c215-4de8-830d-c933ed52c505", 416 | "metadata": {}, 417 | "source": [ 418 | "## Combine it all together\n", 419 | "\n", 420 | "Now let's combine what we've learned to preprocess the entire dataset.\n", 421 | "\n", 422 | "First we will reload the data set to start with a clean copy."
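(An aside before the manual walkthrough: scikit-learn can also bundle the encoding, imputation, and scaling steps into a single object. The sketch below, using `ColumnTransformer` and `Pipeline`, is shown only for reference; this lesson keeps the steps separate so each one is visible. The column lists match the penguins data used here.)

```python
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Route categorical and numerical columns through their own steps.
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(drop='first'), ['island', 'sex']),
    ('num', Pipeline([('impute', SimpleImputer(strategy='mean')),
                      ('scale', StandardScaler())]),
     ['culmen_length_mm', 'culmen_depth_mm',
      'flipper_length_mm', 'body_mass_g']),
])
# Fit on the training split only, then transform both splits:
# X_train_pre = preprocessor.fit_transform(X_train)
# X_test_pre = preprocessor.transform(X_test)
```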
423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": null, 428 | "id": "4b097530", 429 | "metadata": {}, 430 | "outputs": [], 431 | "source": [ 432 | "data = pd.read_csv('../data/penguins.csv')\n", 433 | "data.replace('.', np.nan, inplace=True)\n", 434 | "data = data.dropna(subset='sex')\n" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": null, 440 | "id": "cea1cd98", 441 | "metadata": {}, 442 | "outputs": [], 443 | "source": [ 444 | "# Perform the train-test split\n", 445 | "y = data['species']\n", 446 | "X = data.drop('species', axis=1, inplace=False)\n", 447 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, stratify=y)\n", 448 | "print(X_train.shape)\n" 449 | ] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "id": "bbadb45c", 454 | "metadata": {}, 455 | "source": [ 456 | "We want to fit our preprocessing functions on the training data using `fit_transform`, then `transform` the test data. This more closely resembles what the workflow would look like if you are bringing in brand new test data." 457 | ] 458 | }, 459 | { 460 | "cell_type": "markdown", 461 | "id": "ae2be342-483d-4d5b-b3ba-105b60e2cfeb", 462 | "metadata": {}, 463 | "source": [ 464 | "First, we will subset out the categorical and numerical features separately. " 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": null, 470 | "id": "af05022a-a041-4d01-b189-5ceb5e1e0468", 471 | "metadata": {}, 472 | "outputs": [], 473 | "source": [ 474 | "# Get the categorical and numerical variable column names\n", 475 | "cat_var = ['island', 'sex']\n", 476 | "num_var = ['culmen_length_mm', 'culmen_depth_mm',\n", 477 | " 'flipper_length_mm', 'body_mass_g']\n", 478 | "# Subset the training data\n", 479 | "X_train_cat = X_train[cat_var]\n", 480 | "X_train_num = X_train[num_var]\n", 481 | "\n", 482 | "# Subset the test data\n", 483 | "X_test_cat = X_test[cat_var]\n", 484 | "X_test_num = X_test[num_var]" 485 | ] 486 | }, 487 | { 488 | "cell_type": "markdown", 489 | "id": "9b746b78-8d31-40e9-819e-2273278c2f88", 490 | "metadata": {}, 491 | "source": [ 492 | "Now, let's process the categorical data with **Dummy encoding**." 493 | ] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "execution_count": null, 498 | "id": "c45d20a3-73b9-490c-9f81-23e37fc09a2d", 499 | "metadata": {}, 500 | "outputs": [], 501 | "source": [ 502 | "warnings.filterwarnings('ignore')\n", 503 | "\n", 504 | "# Categorical feature encoding\n", 505 | "X_train_dummy = dummy_e.fit_transform(X_train_cat)\n", 506 | "X_test_dummy = dummy_e.transform(X_test_cat)\n", 507 | "\n", 508 | "\n", 509 | "# Check the shape\n", 510 | "X_train_dummy.shape, X_test_dummy.shape" 511 | ] 512 | }, 513 | { 514 | "cell_type": "markdown", 515 | "id": "0ae07768", 516 | "metadata": {}, 517 | "source": [ 518 | "Now, let's process the numerical data by imputing any missing values and normalizing the results.
519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": null, 524 | "id": "127c7fc4-fd8e-4deb-832a-8e02d82909d6", 525 | "metadata": {}, 526 | "outputs": [], 527 | "source": [ 528 | "# Numerical feature standardization\n", 529 | "\n", 530 | "# Impute the data\n", 531 | "X_train_imp = imputer.fit_transform(X_train_num)\n", 532 | "X_test_imp = imputer.transform(X_test_num)\n", 533 | "\n", 534 | "# Check for missing values\n", 535 | "print(np.isnan(X_train_imp).any(), np.isnan(X_test_imp).any())\n", 536 | "\n", 537 | "# Normalize the imputed data\n", 538 | "X_train_norm = norm_e.fit_transform(X_train_imp)\n", 539 | "X_test_norm = norm_e.transform(X_test_imp)\n", 540 | "\n", 541 | "X_train_norm.shape, X_test_norm.shape" 542 | ] 543 | }, 544 | { 545 | "cell_type": "markdown", 546 | "id": "f309dc2b-bdf8-420c-a3f3-fe93c854c3eb", 547 | "metadata": {}, 548 | "source": [ 549 | "Now that we've processed the numerical and categorical data separately, we can put the two arrays back together." 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": null, 555 | "id": "5a97ace9-bd20-49c0-bae9-bd629a8b7a29", 556 | "metadata": {}, 557 | "outputs": [], 558 | "source": [ 559 | "X_train = np.hstack((X_train_dummy, X_train_norm))\n", 560 | "X_test = np.hstack((X_test_dummy, X_test_norm))\n", 561 | "\n", 562 | "X_train.shape, X_test.shape" 563 | ] 564 | }, 565 | { 566 | "cell_type": "markdown", 567 | "id": "eab00968", 568 | "metadata": {}, 569 | "source": [ 570 | "---\n", 571 | "## Challenge 2: Order of Preprocessing\n", 572 | "\n", 573 | "In the preprocessing, we did the following steps: \n", 574 | "\n", 575 | "1) Dropping null values\n", 576 | "2) One-hot-encoding\n", 577 | "3) Imputation\n", 578 | "4) Normalization\n", 579 | "\n", 580 | "Now, consider that we change the order of the steps in the following ways.
What effect might that have on the algorithms?\n", 581 | "**Hint**: Try copying the code from above and trying it out!\n", 582 | "\n", 583 | "- One-Hot-Encoding before Null Values\n", 584 | "- Normalization before Null Values\n", 585 | "\n", 586 | "**Bonus:** Are there any other switches in order that might affect preprocessing?\n", 587 | "\n", 588 | "---" 589 | ] 590 | }, 591 | { 592 | "cell_type": "code", 593 | "execution_count": null, 594 | "id": "d36e3bd7", 595 | "metadata": {}, 596 | "outputs": [], 597 | "source": [ 598 | "# YOUR CODE HERE" 599 | ] 600 | }, 601 | { 602 | "cell_type": "markdown", 603 | "id": "92c4ecff-fb89-4f71-a7ef-70aa43ccc691", 604 | "metadata": {}, 605 | "source": [ 606 | "Finally, let's save our results as separate `.csv` files, so we won't have to run the preprocessing again.\n", 607 | "\n", 608 | "First we will convert them to DataFrames, add column names, and save them as `.csv` files." 609 | ] 610 | }, 611 | { 612 | "cell_type": "code", 613 | "execution_count": null, 614 | "id": "1f18fab4", 615 | "metadata": {}, 616 | "outputs": [], 617 | "source": [ 618 | "X_train = pd.DataFrame(X_train)\n", 619 | "X_train.columns = ['Dream','Torgersen', 'Male',\n", 620 | " 'culmen_length_mm', 'culmen_depth_mm',\n", 621 | " 'flipper_length_mm', 'body_mass_g']\n", 622 | "\n", 623 | "X_test = pd.DataFrame(X_test)\n", 624 | "\n", 625 | "X_test.columns = ['Dream','Torgersen', 'Male',\n", 626 | " 'culmen_length_mm', 'culmen_depth_mm',\n", 627 | " 'flipper_length_mm', 'body_mass_g']\n", 628 | "y_train = pd.DataFrame(y_train)\n", 629 | "y_train.columns = ['species']\n", 630 | "\n", 631 | "y_test = pd.DataFrame(y_test)\n", 632 | "y_test.columns = ['species']\n", 633 | "\n", 634 | "X_train.to_csv('../data/penguins_X_train.csv')\n", 635 | "X_test.to_csv('../data/penguins_X_test.csv')\n", 636 | "y_train.to_csv('../data/penguins_y_train.csv')\n", 637 | "y_test.to_csv('../data/penguins_y_test.csv')\n" 638 | ] 639 | }, 640 | { 641 | "cell_type": "markdown", 642 | "id": "2a6de745", 643 | "metadata": {}, 644 | "source": [ 645 | "Although we will now move on to classification, all of the choices we make in the preprocessing pipeline are extremely important to machine learning." 646 | ] 647 | }, 648 | { 649 | "cell_type": "markdown", 650 | "id": "06995721", 651 | "metadata": {}, 652 | "source": [ 653 | "---\n", 654 | "## Challenge 3: Preprocessing and regularization\n", 655 | "\n", 656 | "We are preprocessing data in preparation for a classification task down the line. However, preprocessing also applies to regression. \n", 657 | "\n", 658 | "Consider the regularization task applied in the previous notebook.
How might the preprocessing steps affect the performance of regularization?\n", 659 | "\n", 660 | "---" 661 | ] 662 | }, 663 | { 664 | "cell_type": "code", 665 | "execution_count": null, 666 | "id": "b0895317", 667 | "metadata": {}, 668 | "outputs": [], 669 | "source": [ 670 | "# YOUR CODE HERE" 671 | ] 672 | } 673 | ], 674 | "metadata": { 675 | "kernelspec": { 676 | "display_name": "Python 3 (ipykernel)", 677 | "language": "python", 678 | "name": "python3" 679 | }, 680 | "language_info": { 681 | "codemirror_mode": { 682 | "name": "ipython", 683 | "version": 3 684 | }, 685 | "file_extension": ".py", 686 | "mimetype": "text/x-python", 687 | "name": "python", 688 | "nbconvert_exporter": "python", 689 | "pygments_lexer": "ipython3", 690 | "version": "3.9.12" 691 | } 692 | }, 693 | "nbformat": 4, 694 | "nbformat_minor": 5 695 | } 696 | -------------------------------------------------------------------------------- /lessons/04_classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Python Machine Learning: Classification" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "A common task in computational research is to classify an object based on a set of features. In supervised machine learning, we can give an algorithm a dataset of training examples that say \"here are specific features, and this is the target class it belongs to\". With enough training examples, a model can be built that recognizes important features in determining an object's class. This model can then be used to predict the class of an object given its known features.\n", 15 | "\n", 16 | "\n", 17 | "First, let's import the packages that we need for this notebook." 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import pandas as pd\n", 27 | "import numpy as np\n", 28 | "import matplotlib.pyplot as plt\n", 29 | "import seaborn as sb\n", 30 | "\n", 31 | "from sklearn.tree import DecisionTreeClassifier, plot_tree\n", 32 | "from sklearn.linear_model import LogisticRegression\n", 33 | "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n", 34 | "from sklearn.model_selection import train_test_split, cross_val_score, KFold\n", 35 | "from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score, precision_score, f1_score" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## Penguin Data" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "Let's say that we are studying penguins in Antarctica. We have body measurements for a set of penguins of three different species: Adelie, Chinstrap, and Gentoo. We are interested in being able to differentiate between these three species based on the measurements. First, let's take a look at our data set. \n", 50 | "\n", 51 | "\n", 52 | "Now, let's load in our preprocessed `penguins` data set.
\n" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "X_train = pd.read_csv('../data/penguins_X_train.csv')\n", 62 | "X_test = pd.read_csv('../data/penguins_X_test.csv')\n", 63 | "y_train = pd.read_csv('../data/penguins_y_train.csv')\n", 64 | "y_test = pd.read_csv('../data/penguins_y_test.csv')" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "Let's start with just two penguin species: Adelie and Gentoo. " 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "X_train = X_train[y_train['species'].isin(['Adelie','Gentoo'])].reset_index()\n", 81 | "X_test = X_test[y_test['species'].isin(['Adelie','Gentoo'])].reset_index()\n", 82 | "y_train = y_train[y_train['species'].isin(['Adelie','Gentoo'])].reset_index()\n", 83 | "y_test = y_test[y_test['species'].isin(['Adelie','Gentoo'])].reset_index()" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "## Null Accuracy\n", 91 | "\n", 92 | "Let's say that we wanted to assign a species to each unknown measured penguin. One way to do this is to assign all observations to the majority classes. The code below shows the proportion of each species in the training data.\n", 93 | "\n", 94 | "**Question:** If we want to maximize accuracy, which species label would we assign to all observations? " 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": { 101 | "scrolled": true 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "y_train.value_counts('species')/sum(y_train.value_counts('species'))" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "This accuracy is our **baseline model**, and is the number that we will try to improve on with classification." 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "Let's get to know our dataset by conducting some exploratory data analysis. We'll be using some rudimentary data analysis to see there's a relationship between the independent variables across species." 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "Let's say that we decide that body mass might be a good way to differentiate between Adelie and Gentoo penguins. We can look at a plot of the histogram to see how the distribution of this variable changes between species.\n", 127 | "\n", 128 | "**Question**: Where would you place a line to minimize the overlap in the distribution? " 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "scrolled": true 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "sb.histplot(data=X_train.loc[y_train['species'].isin(['Adelie','Gentoo'])],\n", 140 | " x = 'body_mass_g',\n", 141 | " hue = y_train['species'],kde=True,bins=20)\n", 142 | "#plt.axvline(.28,color= 'red')" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "Now let's apply this same decision boundary to the test data. \n", 150 | "\n", 151 | "**Question:** Is this still the best boundary?" 
152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "sb.histplot(data=X_test.loc[y_test['species'].isin(['Gentoo','Adelie'])],\n", 161 | " x = 'body_mass_g',\n", 162 | " hue = y_test['species'],kde=True,bins=20)\n", 163 | "#plt.axvline(.28,color= 'red')" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "This is the basic goal of classification. Based on your boundary criteria, you would **classify** each of the penguins. However, there would be some error involved. We can be more confident in our classification at the far ends of the distribution, and less confident where the distributions overlap. \n" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "Now let's figure out how to separate out these groups mathematically. For this, we will start by using an algorithm called Logistic Regression." 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "## Logistic Regression\n", 185 | "\n", 186 | "Logistic regression is a supervised classification algorithm that is used to predict a binary outcome. Similar to linear regression, this model uses coefficients or betas to make its predictions. However, unlike linear regression, its predictions range from 0 to 1, where values near 0 and 1 indicate a confident prediction of class A or class B, respectively. Predictions in the middle of that range indicate less confidence.\n", 187 | "\n", 188 | "The function for the logistic regression is:\n", 189 | "$$ p(x) = \\frac{1}{1 + e^{-(\\beta_0+\\beta_1x_1 + \\ldots)}}$$\n", 190 | "\n", 191 | "where $\\beta$ are the learned parameters and $x$ are the input features.\n" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "Let's train a logistic regression model on the variable `body_mass_g`." 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "### Modeling with Logistic Regression" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "\n", 213 | "Logistic regression uses the same general steps as many other `sklearn` algorithms:\n", 214 | "1. Initialize Model\n", 215 | "2. Fit model on training data\n", 216 | "3. Evaluate on training and testing datasets" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "#1) Initialize Model\n", 226 | "lr = LogisticRegression(max_iter=170)\n", 227 | "\n", 228 | "#2) Fit model\n", 229 | "lr.fit(X_train['body_mass_g'].values.reshape(-1, 1), y_train['species'])\n", 230 | "\n", 231 | "#3) Evaluate \n", 232 | "train_score = lr.score(X_train['body_mass_g'].values.reshape(-1, 1), y_train['species'])\n", 233 | "test_score = lr.score(X_test['body_mass_g'].values.reshape(-1, 1), y_test['species'])\n", 234 | "\n", 235 | "print(\"Training score:\", train_score.round(3), \"Testing score:\", test_score.round(3))" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "**Question:** How well did the model do compared to baseline?"
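To tie the fitted model back to the logistic function defined above, here is a quick sketch (using the `lr` model and the imports already in scope) that computes $p(x)$ by hand from `lr.intercept_` and `lr.coef_` and checks it against `predict_proba`:

```python
# Compute p(x) = 1 / (1 + e^-(b0 + b1*x)) manually and compare it with
# the probability scikit-learn reports for the second class (Gentoo).
x = X_test['body_mass_g'].values
manual_p = 1 / (1 + np.exp(-(lr.intercept_[0] + lr.coef_[0][0] * x)))
sklearn_p = lr.predict_proba(x.reshape(-1, 1))[:, 1]
print(np.allclose(manual_p, sklearn_p))  # Should print True
```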
243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": {}, 248 | "source": [ 249 | "## Multivariate Logistic Regression\n", 250 | "\n", 251 | "\n", 252 | "The logistic regression did a pretty good job of classifying the penguins. However, we have more than just body mass on which to base our species decision. For example, let's look at the combination of culmen depth and body mass in our data by using a scatterplot.\n", 253 | "\n", 254 | "In two-dimensional space, the intuition is that we want to draw a line that separates the classes. \n", 255 | "\n", 256 | "**Question:** Is it possible to draw a line that separates the groups? If it is, this is a **linearly separable** problem." 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": { 263 | "scrolled": true 264 | }, 265 | "outputs": [], 266 | "source": [ 267 | "sb.scatterplot(data=X_train.loc[y_train['species'].isin(['Adelie','Gentoo'])],\n", 268 | " x = 'culmen_depth_mm',\n", 269 | " y = 'body_mass_g',\n", 270 | " hue = y_train['species'])" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": [ 277 | "Let's retrain the logistic model with two variables." 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [ 286 | "lr = LogisticRegression(max_iter=170)\n", 287 | "lr.fit(X_train[['body_mass_g','culmen_depth_mm']], y_train['species'])\n", 288 | "\n", 289 | "train_score = lr.score(X_train[['body_mass_g','culmen_depth_mm']], y_train['species'])\n", 290 | "test_score = lr.score(X_test[['body_mass_g','culmen_depth_mm']], y_test['species'])\n", 291 | "\n", 292 | "print(\"Training score = {}, testing score = {}\".format(train_score.round(3), test_score.round(3)))" 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": {}, 298 | "source": [ 299 | "While this doesn't happen often in real life, we got a perfect score! We could add more features to the model, but there isn't a need since our model is already performing perfectly. Now let's take a look at the coefficients of the model. We reference the `lr.coef_` attribute to see the coefficients." 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": null, 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [ 308 | "\n", 309 | "coef = pd.Series(index=['body_mass_g','culmen_depth_mm'], data=lr.coef_[0])\n", 310 | "\n", 311 | "coef.sort_values()" 312 | ] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "metadata": {}, 317 | "source": [ 318 | "**Question:** What do you think the *magnitude* and *sign* of the coefficients mean about how these variables are related to each category?\n", 319 | "**Hint:** Refer back to the scatter plot!"
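]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"One lens on that question (a sketch, with the illustrative name `odds_ratios`): exponentiating a logistic regression coefficient gives an odds ratio, the multiplicative change in the odds of the positive class for a one-unit increase in that feature. Values above 1 push a penguin toward the positive class (`lr.classes_[1]`); values below 1 push it away.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np  # in case it isn't already imported at the top of the notebook\n",
"\n",
"# Odds ratios: the multiplicative effect of a one-unit feature increase\n",
"# on the odds of the positive class (the second label in lr.classes_)\n",
"odds_ratios = np.exp(coef)\n",
"\n",
"print('Positive class:', lr.classes_[1])\n",
"print(odds_ratios.sort_values())"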
320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": {}, 325 | "source": [ 326 | "## Model evaluation" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": {}, 332 | "source": [ 333 | "We've already covered accuracy, but there is a whole litany of other ways to evaluate the performance of a classification model.\n", 334 | "\n", 335 | "In a binary classification task, there are four major types of predictions:\n", 336 | "\n", 337 | "[Confusion Matrix (Wikipedia)](https://en.wikipedia.org/wiki/Confusion_matrix): \n", 338 | "- true positive (TP): A test result that correctly indicates the presence of a condition or characteristic\n", 339 | "- true negative (TN): A test result that correctly indicates the absence of a condition or characteristic\n", 340 | "- false positive (FP): A test result which wrongly indicates that a particular condition or attribute is present\n", 341 | "- false negative (FN): A test result which wrongly indicates that a particular condition or attribute is absent\n", 342 | "\n", 343 | "\n", 344 | "Accuracy, the most common metric used with classification, can be characterized as:\n", 345 | "\n", 346 | "$$ Accuracy= \\frac{\\sum{\\text{True Positives}}+\\sum{\\text{True Negatives}}}{\\sum{\\text{Total Population}}}$$" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": {}, 352 | "source": [ 353 | "We can combine the prediction measures above to create three helpful metrics for evaluating classification: **precision**, **recall**, and **specificity**. " 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "\n", 361 | "1. **Precision**: \n", 362 | "$$\\frac{\\sum{\\text{True Positives}}}{\\sum{\\text{Predicted Positives}}}$$\n", 363 | "2. **Recall** (or **Sensitivity**): \n", 364 | "$$\\frac{\\sum{\\text{True Positives}}}{\\sum{\\text{Condition Positives}}}$$ \n", 365 | "3. **Specificity** (like recall for negative examples): \n", 366 | "$$\\frac{\\sum{\\text{True Negatives}}}{\\sum{\\text{Condition Negatives}}}$$\n" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": {}, 372 | "source": [ 373 | "Let's make a confusion matrix and derive the recall and precision scores." 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "metadata": {}, 379 | "source": [ 380 | "First, let's go back to the original (not perfect) model so we can see what these rates look like.\n", 381 | "\n", 382 | "We will retrain the model and make predictions on the test set." 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": null, 388 | "metadata": {}, 389 | "outputs": [], 390 | "source": [ 391 | "lr.fit(X_train['body_mass_g'].values.reshape(-1, 1), y_train['species'])\n", 392 | "preds = lr.predict(X_test[['body_mass_g']])" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": null, 398 | "metadata": {}, 399 | "outputs": [], 400 | "source": [ 401 | "# Pass y_test and preds into confusion_matrix\n", 402 | "confusion_matrix(y_test['species'], preds)" 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": {}, 408 | "source": [ 409 | "## Challenge 1: Model Evaluation\n", 410 | "\n", 411 | "1). What are the TP, FP, TN, FN in these model results?\n", 412 | "\n", 413 | "2). What is the precision and recall for this model?\n", 414 | "\n", 415 | "3). Which is more important, precision or recall?"
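]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you want to check your answers numerically, here is a short sketch using `sklearn.metrics` (assuming the `preds` computed above, and treating 'Gentoo' as the positive class; swap `pos_label` to take the other species' perspective):\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import precision_score, recall_score\n",
"\n",
"# pos_label picks which species counts as the 'positive' class\n",
"print('Precision:', precision_score(y_test['species'], preds, pos_label='Gentoo').round(3))\n",
"print('Recall:', recall_score(y_test['species'], preds, pos_label='Gentoo').round(3))"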
416 | ] 417 | }, 418 | { 419 | "cell_type": "markdown", 420 | "metadata": {}, 421 | "source": [ 422 | "Depending on your task, metrics other than accuracy might be more useful for understanding your model's performance. At the very least, examining the confusion matrix is a great way to get a better sense of how your model is performing across classes." 423 | ] 424 | }, 425 | { 426 | "cell_type": "markdown", 427 | "metadata": {}, 428 | "source": [ 429 | "## Decision Trees\n", 430 | "\n", 431 | "Let's now include all three species of penguin that we want to differentiate between. We can turn to other models that can handle two or more classes for classification. One such example is the Decision Tree Classifier. In terms of logic, this is like a flow chart.\n", 432 | "\n", 433 | "\n", 434 | "In this flow chart, the observation is a lamp that doesn't work, the features are information about how the lamp doesn't work, and the class is the action taken at the end.\n", 435 | "\n", 436 | "![Alt](https://upload.wikimedia.org/wikipedia/commons/9/91/LampFlowchart.svg)" 437 | ] 438 | }, 439 | { 440 | "cell_type": "markdown", 441 | "metadata": {}, 442 | "source": [ 443 | "While the ultimate goal of classification remains the same, machine learning algorithms vary widely in terms of *how* they go about this task. The neat thing about `sklearn` is that many algorithms use the same syntax, which makes comparing their performance on a task fairly straightforward. However, each model will have different underlying parameters and methods to identify the optimal split. When you are using a new model, it is helpful to read up on how the model works. \n", 444 | "\n", 445 | "The documentation is a great way to do that.\n", 446 | "Read the [documentation](https://scikit-learn.org/stable/modules/tree.html#tree) for the Decision Tree and let's try to answer the following questions:\n", 447 | "\n", 448 | "1). What are two advantages and two disadvantages of the Decision Tree?\n", 449 | "2). What measure do Decision Trees use to determine the optimal split?\n", 450 | "3). How do you import the Decision Tree from sklearn?\n", 451 | "\n", 452 | "**Decision Trees** are a classification/regression supervised learning algorithm that uses a series of splits to make its predictions.\n", 453 | "\n", 454 | "Decision Trees learn from the data by picking the feature-threshold combination that maximizes the information gain of the target variable. In other words, it chooses a splitting point that produces the purest possible class proportions in the target variable. The goal of the model is to keep splitting until all the data in a terminal node or leaf are exclusively one class.\n", 455 | "\n", 456 | "The model iterates through a set of candidate values for each feature, calculates the impurity of each resulting split, and the split that produces the lowest impurity (equivalently, the highest information gain) is the designated split." 457 | ] 458 | }, 459 | { 460 | "cell_type": "markdown", 461 | "metadata": {}, 462 | "source": [ 463 | "**Parameters**\n", 464 | "\n", 465 | "There are many [parameters](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier) for the Decision Tree Classifier. A few relevant to this notebook are described here:\n", 466 | "\n", 467 | "**criterion**: The function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain.\n", 468 | "\n", 469 | "**splitter**: The strategy used to choose the split at each node. 
Supported strategies are “best” to choose the best split and “random” to choose the best random split.\n", 470 | "\n", 471 | "**max_depth**: The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.\n", 472 | "\n", 473 | "**min_samples_split**: The minimum number of samples required to split an internal node.\n", 474 | "\n", 475 | "**min_samples_leaf**: The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression.\n", 476 | "\n", 477 | "**max_features**: The number of features to consider when looking for the best split." 478 | ] 479 | }, 480 | { 481 | "cell_type": "markdown", 482 | "metadata": {}, 483 | "source": [ 484 | "Now let's train a decision tree model on the penguins data set. We are going to start with a default DT model, meaning we're not going to pass in any parameters of our own. Like we did before, we are going to fit a model and then evaluate it on the training and testing datasets. Let's start with a single x-feature." 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": null, 490 | "metadata": {}, 491 | "outputs": [], 492 | "source": [ 493 | "# Initialize model\n", 494 | "dt = DecisionTreeClassifier()\n", 495 | "\n", 496 | "# Fit model on the dataset\n", 497 | "dt.fit(X_train[['body_mass_g']], y_train['species'])\n", 498 | "\n", 499 | "# Derive the training accuracy score\n", 500 | "dt.score(X_train[['body_mass_g']], y_train['species'])" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "metadata": {}, 507 | "outputs": [], 508 | "source": [ 509 | "# Test score\n", 510 | "dt.score(X_test[['body_mass_g']], y_test['species'])" 511 | ] 512 | }, 513 | { 514 | "cell_type": "markdown", 515 | "metadata": {}, 516 | "source": [ 517 | "**Question:** Our testing score is considerably lower. When the testing score is lower than the training score, what does that mean?" 518 | ] 519 | }, 520 | { 521 | "cell_type": "markdown", 522 | "metadata": {}, 523 | "source": [ 524 | "We can take advantage of some of the parameters of the decision tree in order to help prevent overfitting of the model. Let's try a model in which we impose some constraints on the tree.\n", 525 | "\n", 526 | "**Question:** From the documentation, what is one parameter that might help?" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "metadata": {}, 533 | "outputs": [], 534 | "source": [ 535 | "# Initialize\n", 536 | "dt = DecisionTreeClassifier(max_depth=2)\n", 537 | "# Fit \n", 538 | "dt.fit(X_train[['body_mass_g']], y_train['species'])\n", 539 | "\n", 540 | "# Evaluate\n", 541 | "train_score = dt.score(X_train[['body_mass_g']], y_train['species'])\n", 542 | "test_score = dt.score(X_test[['body_mass_g']], y_test['species'])\n", 543 | "\n", 544 | "print(\"Our training score is {} and our testing score is {}\".format(train_score.round(3), test_score.round(3)))" 545 | ] 546 | }, 547 | { 548 | "cell_type": "markdown", 549 | "metadata": {}, 550 | "source": [ 551 | "The gap between the two scores is considerably smaller. Arguably, we no longer have an overfit model. However, we could likely improve on the accuracy of this model by including more features, as shown in the quick sketch below."
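]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sketch of that last point (the `dt2` name is just illustrative, and any pair of columns would do), here is the same constrained tree fit on two features instead of one:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Same constrained tree, now with a second feature\n",
"dt2 = DecisionTreeClassifier(max_depth=2)\n",
"dt2.fit(X_train[['body_mass_g', 'culmen_depth_mm']], y_train['species'])\n",
"\n",
"train_score = dt2.score(X_train[['body_mass_g', 'culmen_depth_mm']], y_train['species'])\n",
"test_score = dt2.score(X_test[['body_mass_g', 'culmen_depth_mm']], y_test['species'])\n",
"\n",
"print('Our training score is {} and our testing score is {}'.format(train_score.round(3), test_score.round(3)))"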
552 | ] 553 | }, 554 | { 555 | "cell_type": "markdown", 556 | "metadata": {}, 557 | "source": [ 558 | "### Tree Visualization\n", 559 | "\n", 560 | "One big advantage of the Decision Tree is that it can be visualized no matter how many features are involved.\n", 561 | "\n", 562 | "Let's retrain it with a small `max_depth`." 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": null, 568 | "metadata": {}, 569 | "outputs": [], 570 | "source": [ 571 | "dt = DecisionTreeClassifier(max_depth=2)\n", 572 | "dt.fit(X_train[['body_mass_g']], y_train['species'])" 573 | ] 574 | }, 575 | { 576 | "cell_type": "markdown", 577 | "metadata": {}, 578 | "source": [ 579 | "**Question:** What is the first criterion used to split the decision tree? " 580 | ] 581 | }, 582 | { 583 | "cell_type": "code", 584 | "execution_count": null, 585 | "metadata": {}, 586 | "outputs": [], 587 | "source": [ 588 | "plt.figure(figsize=(28, 20))\n", 589 | "plot_tree(dt, feature_names=['body_mass_g'], class_names=[\"Adelie\", \"Chinstrap\",\"Gentoo\"], \n", 590 | " filled = True, proportion=True, fontsize=18\n", 591 | " );" 592 | ] 593 | }, 594 | { 595 | "cell_type": "markdown", 596 | "metadata": {}, 597 | "source": [ 598 | "Using the tree, how would we make predictions about the following penguins?\n", 599 | "\n", 600 | "\n", 601 | " - Penguin A: Body Mass of .5\n", 602 | " - Penguin B: Body Mass of 0" 603 | ] 604 | }, 605 | { 606 | "cell_type": "markdown", 607 | "metadata": {}, 608 | "source": [ 609 | "## Challenge 2: Classification with SVM\n", 610 | "\n", 611 | "Now let's try another new model. The [Support Vector Machine](https://scikit-learn.org/stable/modules/svm.html#classification) is another class of machine learning algorithm that is used for classification. \n", 612 | "\n", 613 | "Choose two features of the data set to train your model on. Then, using the documentation for the support vector machine, follow the steps to:\n", 614 | "- Initialize the model\n", 615 | "- Fit it to the training data\n", 616 | "- Evaluate the model on both the training and testing data\n", 617 | "\n", 618 | "Is your model underfit? Is it overfit?\n", 619 | "\n", 620 | "How does SVM fit in with the **linearly separable** problem identified in the scatter plots above?"
621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": null, 626 | "metadata": {}, 627 | "outputs": [], 628 | "source": [ 629 | "## YOUR CODE HERE\n", 630 | "from sklearn.svm import SVC\n", 631 | "X_train_subset = X_train[['feature1','feature2']]\n", 632 | "X_test_subset = X_test[['feature1','feature2']]\n", 633 | "y_train_subset = y_train['species']\n", 634 | "y_test_subset = y_test['species']\n", 635 | "\n", 636 | "##1) Initialize SVM\n", 637 | "\n", 638 | "##2) Train SVM on Training data \n", 639 | "\n", 640 | "##3) Evaluate SVM on Training and Test Data" 641 | ] 642 | }, 643 | { 644 | "cell_type": "code", 645 | "execution_count": null, 646 | "metadata": {}, 647 | "outputs": [], 648 | "source": [] 649 | } 650 | ], 651 | "metadata": { 652 | "anaconda-cloud": {}, 653 | "hide_input": false, 654 | "kernelspec": { 655 | "display_name": "Python 3 (ipykernel)", 656 | "language": "python", 657 | "name": "python3" 658 | }, 659 | "language_info": { 660 | "codemirror_mode": { 661 | "name": "ipython", 662 | "version": 3 663 | }, 664 | "file_extension": ".py", 665 | "mimetype": "text/x-python", 666 | "name": "python", 667 | "nbconvert_exporter": "python", 668 | "pygments_lexer": "ipython3", 669 | "version": "3.9.12" 670 | }, 671 | "toc": { 672 | "base_numbering": 1, 673 | "nav_menu": {}, 674 | "number_sections": false, 675 | "sideBar": true, 676 | "skip_h1_title": false, 677 | "title_cell": "Table of Contents", 678 | "title_sidebar": "Contents", 679 | "toc_cell": false, 680 | "toc_position": {}, 681 | "toc_section_display": "block", 682 | "toc_window_display": true 683 | }, 684 | "varInspector": { 685 | "cols": { 686 | "lenName": 16, 687 | "lenType": 16, 688 | "lenVar": 40 689 | }, 690 | "kernels_config": { 691 | "python": { 692 | "delete_cmd_postfix": "", 693 | "delete_cmd_prefix": "del ", 694 | "library": "var_list.py", 695 | "varRefreshCmd": "print(var_dic_list())" 696 | }, 697 | "r": { 698 | "delete_cmd_postfix": ") ", 699 | "delete_cmd_prefix": "rm(", 700 | "library": "var_list.r", 701 | "varRefreshCmd": "cat(var_dic_list()) " 702 | } 703 | }, 704 | "types_to_exclude": [ 705 | "module", 706 | "function", 707 | "builtin_function_or_method", 708 | "instance", 709 | "_Feature" 710 | ], 711 | "window_display": false 712 | } 713 | }, 714 | "nbformat": 4, 715 | "nbformat_minor": 4 716 | } 717 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib 2 | numpy>=1.16.3 3 | pandas>=0.24.2 4 | scipy>=1.3.1 5 | scikit-learn>=0.22.0 6 | tpot 7 | -------------------------------------------------------------------------------- /solutions/01_regression_solutions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "a027a99e-1ac6-4336-b87a-f0d5d79e22e2", 6 | "metadata": {}, 7 | "source": [ 8 | "# Python Machine Learning: Regression Solutions" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "8546cbf5-1c72-40c5-be75-234d1c3c9f3b", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import pandas as pd\n", 19 | "import numpy as np\n", 20 | "import matplotlib.pyplot as plt\n", 21 | "\n", 22 | "from sklearn.linear_model import LinearRegression\n", 23 | "from sklearn.model_selection import train_test_split\n", 24 | "\n", 25 | "%matplotlib inline" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | 
"execution_count": null, 31 | "id": "772d7956-975b-4489-8336-40dc93e3f528", 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "data = pd.read_csv('../data/auto-mpg.csv')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "id": "59c90903-4781-4a14-a73f-5322e7003705", 41 | "metadata": {}, 42 | "source": [ 43 | "---\n", 44 | "### Challenge 1: More EDA\n", 45 | "\n", 46 | "Create the following plots, or examine the following distributions, while exploring your data:\n", 47 | "\n", 48 | "1. A histogram of the displacement.\n", 49 | "2. A histogram of the horsepower.\n", 50 | "3. A histogram of the weight.\n", 51 | "4. A histogram of the acceleration.\n", 52 | "5. What are the unique model years, and their counts?\n", 53 | "6. What are the unique origin values, and their counts?\n", 54 | "\n", 55 | "---" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "id": "859bccf7-82fa-4095-a6ff-523ef9eb7759", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "ax = data['displacement'].hist(grid=False, bins=np.linspace(75, 450, 15))\n", 66 | "ax.set_xlabel('Displacement')\n", 67 | "ax.set_ylabel('Frequency')\n", 68 | "plt.show()" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "id": "631de034-f513-4199-9e76-e2a1388d0475", 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "ax = data['horsepower'].hist(grid=False, bins=np.linspace(45, 230, 15))\n", 79 | "ax.set_xlabel('Horsepower')\n", 80 | "ax.set_ylabel('Frequency')\n", 81 | "plt.show()" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "id": "0b5c0f99-584f-4d52-ad12-051eeb238067", 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "ax = data['weight'].hist(grid=False)\n", 92 | "ax.set_xlabel('Weight')\n", 93 | "ax.set_ylabel('Frequency')\n", 94 | "plt.show()" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "id": "95c88602-8d09-4b1c-ab93-d7a0329cee4f", 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "ax = data['acceleration'].hist(grid=False)\n", 105 | "ax.set_xlabel('Acceleration')\n", 106 | "ax.set_ylabel('Frequency')\n", 107 | "plt.show()" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "id": "e40bdacb-9b47-491a-995c-961430fcb4b2", 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "data['model year'].value_counts().sort_index()" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "id": "d56a338a-1929-4c19-a7bc-3beeb7045335", 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "data['origin'].value_counts().sort_index()" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "id": "c391bc78-fb9c-441c-8c04-e6708645c157", 133 | "metadata": {}, 134 | "source": [ 135 | "---\n", 136 | "### Challenge 2: Mean Absolute Error\n", 137 | "\n", 138 | "Another commonly used metric in regression is the **Mean Absolute Error (MAE)**. As the name suggests, this can be calculated by taking the mean of the absolute errors. Calculate the mean absolute error on the training and test data with your trained model. 
We've imported the MAE for you below:\n", 139 | "\n", 140 | "---" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "id": "5b6f6d56-5967-468c-bcd2-0ceb8819e630", 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "# Remove the response variable and car name\n", 151 | "X = data.drop(columns=['car name', 'mpg'])\n", 152 | "# Assign response variable to its own variable\n", 153 | "y = data['mpg'].astype(np.float64)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "id": "edc3dbcb-9610-4342-96a3-5a4b7d400a15", 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "id": "f6eb59e4-1597-468e-b18d-ef5ecc519caf", 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "model = LinearRegression()" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "id": "0994de85-ae86-43aa-9fe1-0ded209edbc9", 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "model.fit(X_train, y_train)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "id": "8c54289e-f6d0-4892-84bb-8728d8591402", 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "y_train_pred = model.predict(X_train)\n", 194 | "y_test_pred = model.predict(X_test)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "id": "1a7e56aa-35d8-4066-9fe1-29de73c359c3", 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "from sklearn.metrics import mean_absolute_error\n", 205 | "print(mean_absolute_error(y_train, y_train_pred))\n", 206 | "print(mean_absolute_error(y_test, y_test_pred))" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "id": "c4205dbf-87e5-4bbc-97e2-f80c3bde8530", 212 | "metadata": {}, 213 | "source": [ 214 | "---\n", 215 | "### Challenge 3: Feature Engineering\n", 216 | "\n", 217 | "You might notice that the `origin` variable has only three values. So, it's really a categorical variable, where each sample has one of three origins. In this scenario, we've treated it like a continuous variable. \n", 218 | "\n", 219 | "How can we properly treat this variable as categorical? This is a question of preprocessing and **feature engineering**.\n", 220 | "\n", 221 | "What we can do is replace the `origin` feature with two binary variables. The first tells us whether origin is equal to 2. The second tells us whether origin is equal to 3. If both are false, that means origin is equal to 1.\n", 222 | "\n", 223 | "By fitting a linear regression with these two binary features rather than treating `origin` as continuous, we can get a better sense for how the origin impacts the MPG.\n", 224 | "\n", 225 | "Create two new binary features corresponding to origin, and then recreate the training and test data. Then, fit a linear model to the new data. 
What do you find about the performance and new coefficients?\n", 226 | "\n", 227 | "---" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "id": "651f4a11-aa7f-45d5-84de-d3c6f8b551bd", 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "data['origin_2'] = (data['origin'] == 2).astype('int')\n", 238 | "data['origin_3'] = (data['origin'] == 3).astype('int')" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "id": "0ba5b282-fb1f-4550-a2e6-ce156ae4bb51", 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "# Remove the response variable and car name\n", 249 | "X = data.drop(columns=['car name', 'mpg', 'origin'])\n", 250 | "# Assign response variable to its own variable\n", 251 | "y = data['mpg'].astype(np.float64)" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "id": "b633c0f1-de8a-46ad-a573-7b37b50089a9", 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "# Split\n", 262 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)\n", 263 | "# Fit model\n", 264 | "model = LinearRegression()\n", 265 | "model.fit(X_train, y_train)\n", 266 | "# Evaluate model\n", 267 | "print(model.score(X_test, y_test))\n", 268 | "print(model.coef_)" 269 | ] 270 | } 271 | ], 272 | "metadata": { 273 | "kernelspec": { 274 | "display_name": "nlp", 275 | "language": "python", 276 | "name": "nlp" 277 | }, 278 | "language_info": { 279 | "codemirror_mode": { 280 | "name": "ipython", 281 | "version": 3 282 | }, 283 | "file_extension": ".py", 284 | "mimetype": "text/x-python", 285 | "name": "python", 286 | "nbconvert_exporter": "python", 287 | "pygments_lexer": "ipython3", 288 | "version": "3.9.7" 289 | } 290 | }, 291 | "nbformat": 4, 292 | "nbformat_minor": 5 293 | } 294 | -------------------------------------------------------------------------------- /solutions/02_regularization_solutions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "cb96b065-4e7f-4836-8697-fd8137f80185", 6 | "metadata": {}, 7 | "source": [ 8 | "# Python Machine Learning: Regularization Solutions" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "c2e63187-653f-4b31-9ed5-567a5f20b280", 14 | "metadata": {}, 15 | "source": [ 16 | "---\n", 17 | "### Challenge 1: Warm-Up\n", 18 | "\n", 19 | "Before we get started, let's warm up by importing our data and performing a train test split. We've provided the importing code for you. 
Go ahead and split the data into train/test sets using an 80/20 split, and a random state of 23.\n", 20 | "\n", 21 | "---" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "id": "fcc3bf13-a08f-43df-a10b-bde060265645", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "import pandas as pd\n", 32 | "import numpy as np\n", 33 | "\n", 34 | "from sklearn.linear_model import Lasso, LinearRegression, Ridge, RidgeCV\n", 35 | "from sklearn.metrics import mean_squared_error\n", 36 | "from sklearn.model_selection import train_test_split" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "id": "3daed737-14ca-48e7-bd17-86f63c9cea5e", 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "# Import data\n", 47 | "data = pd.read_csv('../data/auto-mpg.csv')\n", 48 | "# Remove the response variable and car name\n", 49 | "X = data.drop(columns=['car name', 'mpg'])\n", 50 | "# Assign response variable to its own variable\n", 51 | "y = data['mpg'].astype(np.float64)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "id": "74f27e92-06ac-4136-bf0c-77fa8fdc5a32", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "id": "799213f6-7e18-43fe-ae78-aa65e66e26f7", 67 | "metadata": {}, 68 | "source": [ 69 | "---\n", 70 | "### Challenge 2: Benchmarking\n", 71 | "\n", 72 | "Re-run the ordinary least squares on the data using `LinearRegression`. Then, create a new ridge regression where the `alpha` penalty is set equal to zero. How do the performances of these models compare to each other? How do they compare with the original ridge regression? 
Be sure to compare both the training performances and test performances.\n", 73 | "\n", 74 | "---" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "id": "58941489-36cd-4ad2-8b0d-25cc83151519", 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "from sklearn.linear_model import Ridge\n", 85 | "# Create models\n", 86 | "ridge = Ridge(\n", 87 | " # Regularization penalty\n", 88 | " alpha=10,\n", 89 | " random_state=1)\n", 90 | "# Fit object\n", 91 | "ridge.fit(X_train, y_train)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "id": "ecb16cc3-6328-451a-917c-7d47a0a39cbd", 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "# Linear regression\n", 102 | "ols = LinearRegression()\n", 103 | "ols.fit(X_train, y_train)\n", 104 | "# Ridge, no penalty\n", 105 | "ridge2 = Ridge(alpha=0, random_state=2) \n", 106 | "ridge2.fit(X_train, y_train)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "id": "5116c014-77cc-4456-85b4-383d80087808", 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "# Evaluate\n", 117 | "print(f'Training R^2, Original Ridge: {ridge.score(X_train, y_train)}')\n", 118 | "print(f'Test R^2, Original Ridge: {ridge.score(X_test, y_test)}')\n", 119 | "print(f'Training R^2, OLS: {ols.score(X_train, y_train)}')\n", 120 | "print(f'Test R^2, OLS: {ols.score(X_test, y_test)}')\n", 121 | "print(f'Training R^2, Ridge with no penalty: {ridge2.score(X_train, y_train)}')\n", 122 | "print(f'Test R^2, Ridge with no penalty: {ridge2.score(X_test, y_test)}')" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "id": "9ce10e26-c494-4a21-9274-7e70801c2594", 128 | "metadata": {}, 129 | "source": [ 130 | "- Ridge with no penalty is the same as OLS.\n", 131 | "- Ridge regression with a penalty has slightly worse training performance, but slightly better test performance." 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "id": "27cc54c3-f936-4c95-ac6b-57c07bbc371f", 137 | "metadata": {}, 138 | "source": [ 139 | "---\n", 140 | "### Challenge 3: Performing a Lasso Fit\n", 141 | "\n", 142 | "Below, we've imported the `Lasso` object from `scikit-learn` for you. Just like `Ridge`, it needs to know what the strength of the regularization penalty is before fitting to the data. \n", 143 | "\n", 144 | "Fit several Lasso models, with different regularization strengths. Try one with a regularization strength of zero, try one with a small but non-zero regularization strength, and try one with a very large regularization strength. Look at the coefficients. 
What do you notice?\n", 145 | "\n", 146 | "---" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "id": "263cd73b-3b2d-4161-b3d5-a24f1bd7488a", 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "lasso1 = Lasso(alpha=0.01)\n", 157 | "lasso1.fit(X_train, y_train)\n", 158 | "lasso1.coef_" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "id": "efc7b978-efb3-4766-8f95-8c16c51bbcd7", 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "lasso2 = Lasso(alpha=10)\n", 169 | "lasso2.fit(X_train, y_train)\n", 170 | "lasso2.coef_" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "id": "9c31efb4-933d-4fd5-9bfc-d4ec2b98ac46", 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "lasso3 = Lasso(alpha=10000)\n", 181 | "lasso3.fit(X_train, y_train)\n", 182 | "lasso3.coef_" 183 | ] 184 | } 185 | ], 186 | "metadata": { 187 | "kernelspec": { 188 | "display_name": "nlp", 189 | "language": "python", 190 | "name": "nlp" 191 | }, 192 | "language_info": { 193 | "codemirror_mode": { 194 | "name": "ipython", 195 | "version": 3 196 | }, 197 | "file_extension": ".py", 198 | "mimetype": "text/x-python", 199 | "name": "python", 200 | "nbconvert_exporter": "python", 201 | "pygments_lexer": "ipython3", 202 | "version": "3.9.7" 203 | } 204 | }, 205 | "nbformat": 4, 206 | "nbformat_minor": 5 207 | } 208 | -------------------------------------------------------------------------------- /solutions/03_preprocessing_solutions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "51dbac37", 6 | "metadata": {}, 7 | "source": [ 8 | "## Challenge 1: Fitting preprocessing functions\n", 9 | "\n", 10 | "The simple imputer, normalization and one-hot-encoding rely on sklearn functions that are fit to a data set. \n", 11 | "\n", 12 | "1) What is being fit for each of the three functions?\n", 13 | "\n", 14 | "**Solution:**\n", 15 | "\n", 16 | " 1) One Hot Encoding - Levels for each categorical variable\n", 17 | " \n", 18 | " 2) Standard Scaler - Mean / std deviation for each column\n", 19 | " \n", 20 | " 3) Simple Imputer - Mean for each column\n", 21 | " \n", 22 | "\n", 23 | "When we are preprocessing data we have a few options: \n", 24 | "1) Fit on the whole data set\n", 25 | "2) Fit on the training data\n", 26 | "3) Fit on the testing data\n", 27 | "\n", 28 | "Which of the above methods would you use and why?\n", 29 | "\n", 30 | "**Solution:** Best practice is to fit on the training data. This avoids **data leakage** or influence of test data information on training data." 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "7d9b2692", 36 | "metadata": {}, 37 | "source": [ 38 | "## Challenge 2: Order of Preprocessing\n", 39 | "\n", 40 | "In the preprocessing we did the following steps: \n", 41 | "\n", 42 | "1) Null values\n", 43 | "2) One-hot-encoding\n", 44 | "3) Imputation\n", 45 | "4) Normalization\n", 46 | "\n", 47 | "Now, consider that we change the order of the steps in the following ways. 
What effect might that have on the algorithms?\n", 48 | "**Hint**: Try copying the code from above and trying it out!\n", 49 | "\n", 50 | "- One-Hot-Encoding before Null Values - This will include null values as levels in one-hot-encoding\n", 51 | "- Normalization before Null values - This may cause errors due to null values.\n", 52 | "\n", 53 | "**Bonus:** Are there any other switches in order that might affect preprocessing?\n" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "id": "bcde87a2", 59 | "metadata": {}, 60 | "source": [ 61 | "## Challenge 3: Preprocessing and regularization\n", 62 | "\n", 63 | "We are preprocessing data in preparation for a classification task down the line. However, preprocessing also applies to regression. \n", 64 | "\n", 65 | "Consider the regularization task applied in the previous notebook. How might the preprocessing steps affect the performance of regularization?" 66 | ] 67 | } 68 | ], 69 | "metadata": { 70 | "kernelspec": { 71 | "display_name": "Python 3 (ipykernel)", 72 | "language": "python", 73 | "name": "python3" 74 | }, 75 | "language_info": { 76 | "codemirror_mode": { 77 | "name": "ipython", 78 | "version": 3 79 | }, 80 | "file_extension": ".py", 81 | "mimetype": "text/x-python", 82 | "name": "python", 83 | "nbconvert_exporter": "python", 84 | "pygments_lexer": "ipython3", 85 | "version": "3.9.12" 86 | } 87 | }, 88 | "nbformat": 4, 89 | "nbformat_minor": 5 90 | } 91 | -------------------------------------------------------------------------------- /solutions/04_classification_solutions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "599e3581", 6 | "metadata": {}, 7 | "source": [ 8 | "## Challenge 1: Model Evaluation\n", 9 | "\n", 10 | "1). What are the TP, FP, TN, FN in these model results?\n", 11 | "\n", 12 | "- TP: 26\n", 13 | "- FP: 3\n", 14 | "- TN: 34\n", 15 | "- FN: 4\n", 16 | "\n", 17 | "\n", 18 | "2). What is the precision and recall for this model?\n", 19 | "\n", 20 | "- **precision**: 26 / 29 ≈ 0.897\n", 21 | "- **recall**: 26 / 30 ≈ 0.867\n", 22 | "\n", 23 | "3). Which is more important, precision or recall?\n", 24 | "\n", 25 | "**Solution:** It depends on the problem: precision matters more when false positives are costly, while recall matters more when false negatives are costly." 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "id": "824b97aa", 31 | "metadata": {}, 32 | "source": [ 33 | "## Challenge 2: Classification with SVM\n", 34 | "\n", 35 | "Now let's try another new model. The [Support Vector Machine](https://scikit-learn.org/stable/modules/svm.html#classification) is another class of machine learning algorithm that is used for classification. \n", 36 | "\n", 37 | "Choose two features of the data set to train your model on. Then, using the documentation for the support vector machine, follow the steps to:\n", 38 | "- Initialize the model\n", 39 | "- Fit it to the training data\n", 40 | "- Evaluate the model on both the training and testing data\n", 41 | "\n", 42 | "Is your model underfit? Is it overfit? \n", 43 | "\n", 44 | "How does SVM fit in with the **linearly separable** problem identified in the scatter plots above?"
45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "id": "6ac4d9a3", 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "## YOUR CODE HERE\n", 55 | "from sklearn.svm import SVC\n", 56 | "X_train_subset = X_train[['body_mass_g','culmen_depth_mm']]\n", 57 | "X_test_subset = X_test[['body_mass_g','culmen_depth_mm']]\n", 58 | "y_train_subset = y_train['species']\n", 59 | "y_test_subset = y_test['species']\n", 60 | "\n", 61 | "##1) Initialize SVM\n", 62 | "model = SVC()\n", 63 | "\n", 64 | "##2) Train SVM on Training data \n", 65 | "model.fit(X_train_subset,y_train_subset)\n", 66 | "##3) Evaluate SVM on Training and Test Data\n", 67 | "print('Training score:', model.score(X_train_subset,y_train_subset))\n", 68 | "print('Testing score:', model.score(X_test_subset,y_test_subset))" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "id": "a031ab81", 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [] 78 | } 79 | ], 80 | "metadata": { 81 | "kernelspec": { 82 | "display_name": "Python 3 (ipykernel)", 83 | "language": "python", 84 | "name": "python3" 85 | }, 86 | "language_info": { 87 | "codemirror_mode": { 88 | "name": "ipython", 89 | "version": 3 90 | }, 91 | "file_extension": ".py", 92 | "mimetype": "text/x-python", 93 | "name": "python", 94 | "nbconvert_exporter": "python", 95 | "pygments_lexer": "ipython3", 96 | "version": "3.9.12" 97 | }, 98 | "nbformat": 4, 99 | "nbformat_minor": 5 100 | } 101 | 102 | --------------------------------------------------------------------------------