├── .gitignore
├── LICENSE
├── README.md
├── data
│   ├── auto-mpg.csv
│   ├── auto-mpg.names
│   ├── diamonds.csv
│   ├── heart_2020_cleaned_sample.csv
│   ├── penguins.csv
│   ├── penguins_X_test.csv
│   ├── penguins_X_train.csv
│   ├── penguins_y_test.csv
│   ├── penguins_y_train.csv
│   ├── spotify_features.csv
│   ├── telco_churn.csv
│   └── world_happiness.csv
├── images
│   ├── Confusion_Matrix.png
│   ├── KNN.png
│   ├── linear_regression_hyperplane.jpeg
│   ├── linear_regression_line.png
│   ├── overfitting.png
│   └── validation.png
├── lessons
│   ├── 00_introduction.md
│   ├── 01_regression.ipynb
│   ├── 02_regularization.ipynb
│   ├── 03_preprocessing.ipynb
│   ├── 04_classification.ipynb
│   └── future
│       ├── 05_walkthrough.ipynb
│       ├── 06_clustering.ipynb
│       └── 07_dimensionality_reduction.ipynb
├── requirements.txt
└── solutions
    ├── 01_regression_solutions.ipynb
    ├── 02_regularization_solutions.ipynb
    ├── 03_preprocessing_solutions.ipynb
    └── 04_classification_solutions.ipynb

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Created by https://www.toptal.com/developers/gitignore/api/windows,macos,python,visualstudiocode,pycharm,jupyternotebooks,pydev
# Edit at https://www.toptal.com/developers/gitignore?templates=windows,macos,python,visualstudiocode,pycharm,jupyternotebooks,pydev

### JupyterNotebooks ###
# gitignore template for Jupyter Notebooks
# website: http://jupyter.org/

.ipynb_checkpoints
*/.ipynb_checkpoints/*

# IPython
profile_default/
ipython_config.py

# Remove previous ipynb_checkpoints
# git rm -r .ipynb_checkpoints/

### macOS ###
# General
.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two \r
Icon


# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk

### macOS Patch ###
# iCloud generated files
*.icloud

### PyCharm ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf

# AWS User-specific
.idea/**/aws.xml

# Generated files
.idea/**/contentModel.xml

# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml

# Gradle
.idea/**/gradle.xml
.idea/**/libraries

# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/artifacts
# .idea/compiler.xml
# .idea/jarRepositories.xml
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr

# CMake
cmake-build-*/

# Mongo Explorer plugin
.idea/**/mongoSettings.xml

# File-based project format
*.iws

# IntelliJ
out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Cursive Clojure plugin
.idea/replstate.xml

# SonarLint plugin
.idea/sonarlint/

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

# Editor-based Rest Client
.idea/httpRequests

# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser

### PyCharm Patch ###
# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721

# *.iml
# modules.xml
# .idea/misc.xml
# *.ipr

# Sonarlint plugin
# https://plugins.jetbrains.com/plugin/7973-sonarlint
.idea/**/sonarlint/

# SonarQube Plugin
# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
.idea/**/sonarIssues.xml

# Markdown Navigator plugin
# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
.idea/**/markdown-navigator.xml
.idea/**/markdown-navigator-enh.xml
.idea/**/markdown-navigator/

# Cache file creation bug
# See https://youtrack.jetbrains.com/issue/JBR-2257
.idea/$CACHE_FILE$

# CodeStream plugin
# https://plugins.jetbrains.com/plugin/12206-codestream
.idea/codestream.xml

# Azure Toolkit for IntelliJ plugin
# https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij
.idea/**/azureSettings.xml

### pydev ###
.pydevproject

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook

# IPython

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

### VisualStudioCode ###
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
!.vscode/*.code-snippets

# Local History for Visual Studio Code
.history/

# Built Visual Studio Code Extensions
*.vsix

### VisualStudioCode Patch ###
# Ignore all local history of files
.history
.ionide

# Support for Project snippet scope
.vscode/*.code-snippets

# Ignore code-workspaces
*.code-workspace

### Windows ###
# Windows thumbnail cache files
Thumbs.db
Thumbs.db:encryptable
ehthumbs.db
ehthumbs_vista.db

# Dump file
*.stackdump

# Folder config file
[Dd]esktop.ini

# Recycle Bin used on file shares
$RECYCLE.BIN/

# Windows Installer files
*.cab
*.msi
*.msix
*.msm
*.msp

# Windows shortcuts
*.lnk

# End of https://www.toptal.com/developers/gitignore/api/windows,macos,python,visualstudiocode,pycharm,jupyternotebooks,pydev

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------

Creative Commons Attribution-NonCommercial 4.0 International Public License

By exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution-NonCommercial 4.0 International Public License ("Public License"). To the extent this Public License may be interpreted as a contract, You are granted the Licensed Rights in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions.

Section 1 – Definitions.

Adapted Material means material subject to Copyright and Similar Rights that is derived from or based upon the Licensed Material and in which the Licensed Material is translated, altered, arranged, transformed, or otherwise modified in a manner requiring permission under the Copyright and Similar Rights held by the Licensor. For purposes of this Public License, where the Licensed Material is a musical work, performance, or sound recording, Adapted Material is always produced where the Licensed Material is synched in timed relation with a moving image.
Adapter's License means the license You apply to Your Copyright and Similar Rights in Your contributions to Adapted Material in accordance with the terms and conditions of this Public License.
Copyright and Similar Rights means copyright and/or similar rights closely related to copyright including, without limitation, performance, broadcast, sound recording, and Sui Generis Database Rights, without regard to how the rights are labeled or categorized. For purposes of this Public License, the rights specified in Section 2(b)(1)-(2) are not Copyright and Similar Rights.
Effective Technological Measures means those measures that, in the absence of proper authority, may not be circumvented under laws fulfilling obligations under Article 11 of the WIPO Copyright Treaty adopted on December 20, 1996, and/or similar international agreements.
Exceptions and Limitations means fair use, fair dealing, and/or any other exception or limitation to Copyright and Similar Rights that applies to Your use of the Licensed Material.
Licensed Material means the artistic or literary work, database, or other material to which the Licensor applied this Public License.
Licensed Rights means the rights granted to You subject to the terms and conditions of this Public License, which are limited to all Copyright and Similar Rights that apply to Your use of the Licensed Material and that the Licensor has authority to license.
Licensor means the individual(s) or entity(ies) granting rights under this Public License.
NonCommercial means not primarily intended for or directed towards commercial advantage or monetary compensation. For purposes of this Public License, the exchange of the Licensed Material for other material subject to Copyright and Similar Rights by digital file-sharing or similar means is NonCommercial provided there is no payment of monetary compensation in connection with the exchange.
Share means to provide material to the public by any means or process that requires permission under the Licensed Rights, such as reproduction, public display, public performance, distribution, dissemination, communication, or importation, and to make material available to the public including in ways that members of the public may access the material from a place and at a time individually chosen by them.
Sui Generis Database Rights means rights other than copyright resulting from Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, as amended and/or succeeded, as well as other essentially equivalent rights anywhere in the world.
You means the individual or entity exercising the Licensed Rights under this Public License. Your has a corresponding meaning.
Section 2 – Scope.

License grant.
Subject to the terms and conditions of this Public License, the Licensor hereby grants You a worldwide, royalty-free, non-sublicensable, non-exclusive, irrevocable license to exercise the Licensed Rights in the Licensed Material to:
reproduce and Share the Licensed Material, in whole or in part, for NonCommercial purposes only; and
produce, reproduce, and Share Adapted Material for NonCommercial purposes only.
Exceptions and Limitations. For the avoidance of doubt, where Exceptions and Limitations apply to Your use, this Public License does not apply, and You do not need to comply with its terms and conditions.
Term. The term of this Public License is specified in Section 6(a).
Media and formats; technical modifications allowed. The Licensor authorizes You to exercise the Licensed Rights in all media and formats whether now known or hereafter created, and to make technical modifications necessary to do so. The Licensor waives and/or agrees not to assert any right or authority to forbid You from making technical modifications necessary to exercise the Licensed Rights, including technical modifications necessary to circumvent Effective Technological Measures. For purposes of this Public License, simply making modifications authorized by this Section 2(a)(4) never produces Adapted Material.
Downstream recipients.
Offer from the Licensor – Licensed Material.
Every recipient of the Licensed Material automatically receives an offer from the Licensor to exercise the Licensed Rights under the terms and conditions of this Public License.
No downstream restrictions. You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, the Licensed Material if doing so restricts exercise of the Licensed Rights by any recipient of the Licensed Material.
No endorsement. Nothing in this Public License constitutes or may be construed as permission to assert or imply that You are, or that Your use of the Licensed Material is, connected with, or sponsored, endorsed, or granted official status by, the Licensor or others designated to receive attribution as provided in Section 3(a)(1)(A)(i).
Other rights.

Moral rights, such as the right of integrity, are not licensed under this Public License, nor are publicity, privacy, and/or other similar personality rights; however, to the extent possible, the Licensor waives and/or agrees not to assert any such rights held by the Licensor to the limited extent necessary to allow You to exercise the Licensed Rights, but not otherwise.
Patent and trademark rights are not licensed under this Public License.
To the extent possible, the Licensor waives any right to collect royalties from You for the exercise of the Licensed Rights, whether directly or through a collecting society under any voluntary or waivable statutory or compulsory licensing scheme. In all other cases the Licensor expressly reserves any right to collect such royalties, including when the Licensed Material is used other than for NonCommercial purposes.
Section 3 – License Conditions.

Your exercise of the Licensed Rights is expressly made subject to the following conditions.

Attribution.

If You Share the Licensed Material (including in modified form), You must:

retain the following if it is supplied by the Licensor with the Licensed Material:
identification of the creator(s) of the Licensed Material and any others designated to receive attribution, in any reasonable manner requested by the Licensor (including by pseudonym if designated);
a copyright notice;
a notice that refers to this Public License;
a notice that refers to the disclaimer of warranties;
a URI or hyperlink to the Licensed Material to the extent reasonably practicable;
indicate if You modified the Licensed Material and retain an indication of any previous modifications; and
indicate the Licensed Material is licensed under this Public License, and include the text of, or the URI or hyperlink to, this Public License.
You may satisfy the conditions in Section 3(a)(1) in any reasonable manner based on the medium, means, and context in which You Share the Licensed Material. For example, it may be reasonable to satisfy the conditions by providing a URI or hyperlink to a resource that includes the required information.
If requested by the Licensor, You must remove any of the information required by Section 3(a)(1)(A) to the extent reasonably practicable.
If You Share Adapted Material You produce, the Adapter's License You apply must not prevent recipients of the Adapted Material from complying with this Public License.
Section 4 – Sui Generis Database Rights.

Where the Licensed Rights include Sui Generis Database Rights that apply to Your use of the Licensed Material:

for the avoidance of doubt, Section 2(a)(1) grants You the right to extract, reuse, reproduce, and Share all or a substantial portion of the contents of the database for NonCommercial purposes only;
if You include all or a substantial portion of the database contents in a database in which You have Sui Generis Database Rights, then the database in which You have Sui Generis Database Rights (but not its individual contents) is Adapted Material; and
You must comply with the conditions in Section 3(a) if You Share all or a substantial portion of the contents of the database.
For the avoidance of doubt, this Section 4 supplements and does not replace Your obligations under this Public License where the Licensed Rights include other Copyright and Similar Rights.
Section 5 – Disclaimer of Warranties and Limitation of Liability.

Unless otherwise separately undertaken by the Licensor, to the extent possible, the Licensor offers the Licensed Material as-is and as-available, and makes no representations or warranties of any kind concerning the Licensed Material, whether express, implied, statutory, or other. This includes, without limitation, warranties of title, merchantability, fitness for a particular purpose, non-infringement, absence of latent or other defects, accuracy, or the presence or absence of errors, whether or not known or discoverable. Where disclaimers of warranties are not allowed in full or in part, this disclaimer may not apply to You.
To the extent possible, in no event will the Licensor be liable to You on any legal theory (including, without limitation, negligence) or otherwise for any direct, special, indirect, incidental, consequential, punitive, exemplary, or other losses, costs, expenses, or damages arising out of this Public License or use of the Licensed Material, even if the Licensor has been advised of the possibility of such losses, costs, expenses, or damages. Where a limitation of liability is not allowed in full or in part, this limitation may not apply to You.
The disclaimer of warranties and limitation of liability provided above shall be interpreted in a manner that, to the extent possible, most closely approximates an absolute disclaimer and waiver of all liability.
Section 6 – Term and Termination.

This Public License applies for the term of the Copyright and Similar Rights licensed here. However, if You fail to comply with this Public License, then Your rights under this Public License terminate automatically.
Where Your right to use the Licensed Material has terminated under Section 6(a), it reinstates:

automatically as of the date the violation is cured, provided it is cured within 30 days of Your discovery of the violation; or
upon express reinstatement by the Licensor.
For the avoidance of doubt, this Section 6(b) does not affect any right the Licensor may have to seek remedies for Your violations of this Public License.
For the avoidance of doubt, the Licensor may also offer the Licensed Material under separate terms or conditions or stop distributing the Licensed Material at any time; however, doing so will not terminate this Public License.
Sections 1, 5, 6, 7, and 8 survive termination of this Public License.
Section 7 – Other Terms and Conditions.

The Licensor shall not be bound by any additional or different terms or conditions communicated by You unless expressly agreed.
Any arrangements, understandings, or agreements regarding the Licensed Material not stated herein are separate from and independent of the terms and conditions of this Public License.
Section 8 – Interpretation.

For the avoidance of doubt, this Public License does not, and shall not be interpreted to, reduce, limit, restrict, or impose conditions on any use of the Licensed Material that could lawfully be made without permission under this Public License.
To the extent possible, if any provision of this Public License is deemed unenforceable, it shall be automatically reformed to the minimum extent necessary to make it enforceable. If the provision cannot be reformed, it shall be severed from this Public License without affecting the enforceability of the remaining terms and conditions.
No term or condition of this Public License will be waived and no failure to comply consented to unless expressly agreed to by the Licensor.
Nothing in this Public License constitutes or may be interpreted as a limitation upon, or waiver of, any privileges and immunities that apply to the Licensor or You, including from the legal processes of any jurisdiction or authority.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# D-Lab's Python Machine Learning Workshop

[![Datahub](https://img.shields.io/badge/launch-datahub-blue)](https://dlab.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2Fdlab-berkeley%2FPython-Machine-Learning&urlpath=lab%2Ftree%2FPython-Machine-Learning%2F&branch=main) [![Binder](http://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/dlab-berkeley/Python-Machine-Learning/HEAD)

This repository contains the materials for D-Lab’s Python Machine Learning workshop.

### Prerequisites
Prior experience with [Python Fundamentals](https://github.com/dlab-berkeley/Python-Fundamentals), [Python Data Visualization](https://github.com/dlab-berkeley/Python-Data-Visualization), and [Python Data Wrangling](https://github.com/dlab-berkeley/Python-Data-Wrangling) is assumed.

Check D-Lab's [Learning Pathways](https://dlab-berkeley.github.io/dlab-workshops/python_path.html) to figure out which of our workshops to take!

## Workshop Goals

In this workshop, we provide an introduction to machine learning in Python. First, we'll cover some machine learning basics, including its foundational principles. Then, we'll dive into code to understand how to perform regression, regularization, preprocessing, and classification. Additional components of the workshop explore building machine learning pipelines and unsupervised learning. We'll demonstrate how to perform these tasks using `scikit-learn`, the main package used for machine learning in Python.

This workshop is divided into the following parts:

1. **Part 1: Regression and Regularization.** How can we use linear models to predict continuous outputs, and how can we prevent them from overfitting?
2. **Part 2: Preprocessing and Classification.** What preprocessing steps do we need to take before fitting models? Then, how do we perform classification?
3. **Part 3: Machine Learning Pipeline.** We'll walk through a machine learning task, from exploratory data analysis to building an entire machine learning pipeline.

The first two parts are taught as a joint series. Part 3 can be attended on its own, but prior knowledge of Parts 1 and 2 is assumed. A sketch of the kind of workflow the workshop builds up to is shown below.
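
As a preview of where Parts 1 and 2 are headed, here is a minimal sketch of a `scikit-learn` workflow on the `data/auto-mpg.csv` file included in this repository. It is illustrative rather than official workshop code, and it assumes the handful of missing `horsepower` values noted in `data/auto-mpg.names` are encoded as `?`:

```python
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Load the data; assume missing horsepower values are encoded as "?" and
# drop those rows (the lessons treat missing data more carefully).
mpg = pd.read_csv("data/auto-mpg.csv", na_values="?").dropna()

# Predict fuel efficiency from a few numeric features.
X = mpg[["cylinders", "displacement", "horsepower", "weight", "acceleration"]]
y = mpg["mpg"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Scale the features, then fit a regularized (ridge) linear regression.
model = make_pipeline(StandardScaler(), Ridge(alpha=1.0))
model.fit(X_train, y_train)
print("Held-out R^2:", model.score(X_test, y_test))
```

The lessons build up each of these steps (train/test splits, scaling, choosing the regularization strength) one at a time rather than jumping straight to a pipeline.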

## Installation Instructions

Anaconda is package management software that makes it easy to run Python and Jupyter notebooks. Installing Anaconda is the easiest way to make sure you have all the necessary software to run the materials for this workshop. Complete the following steps:

1. [Download and install Anaconda (Python 3.9 distribution)](https://www.anaconda.com/products/individual). Click "Download" and then click the 64-bit "Graphical Installer" for your current operating system.

2. Download the [Python-Machine-Learning workshop materials](https://github.com/dlab-berkeley/Python-Machine-Learning):

    * Click the green "Code" button in the top right of the repository information.
    * Click "Download ZIP".
    * Extract this file to a folder on your computer where you can easily access it (we recommend Desktop).

3. Optional: if you're familiar with `git`, you can instead clone this repository by opening a terminal and entering `git clone git@github.com:dlab-berkeley/Python-Machine-Learning.git`.

## Run the code

Now that you have all the required software and materials, you need to run the code:

1. Open the Anaconda Navigator application. You should see the green snake logo appear on your screen. Note that this can take a few minutes to load the first time.

2. Click the "Launch" button under "Jupyter Notebook" and navigate through your file system to the `Python-Machine-Learning` folder you downloaded above.

3. Click `00_introduction.md` to begin.

4. Press Shift + Enter (or Ctrl + Enter) to run a cell.
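
To check that the core libraries are available in your environment, you can run a quick sanity check in a notebook cell. This is a minimal sketch; `requirements.txt` in this repository is the authoritative list of dependencies:

```python
# Confirm the main libraries used throughout the lessons are importable.
import numpy as np
import pandas as pd
import sklearn

print("numpy:", np.__version__)
print("pandas:", pd.__version__)
print("scikit-learn:", sklearn.__version__)
```

If any of these imports fail, try reinstalling Anaconda or installing the workshop dependencies with `pip install -r requirements.txt`.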

## Is Python not working on your computer?

If you have a Berkeley CalNet ID, you can run these lessons on UC Berkeley's DataHub by clicking this button:

[![Datahub](https://img.shields.io/badge/launch-datahub-blue)](https://dlab.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2Fdlab-berkeley%2FPython-Machine-Learning&urlpath=lab%2Ftree%2FPython-Machine-Learning%2F&branch=main)

By using this link, you can save your work and come back to it at any time. When you want to return to your saved work, just go straight to DataHub ([https://datahub.berkeley.edu](https://datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2Fdlab-berkeley%2FPython-Machine-Learning&urlpath=tree%2FPython-Machine-Learning%2F&branch=main)), sign in, and click on the `Python-Machine-Learning` folder.

If you don't have a Berkeley CalNet ID, you can still run these lessons in the cloud by clicking this button:

[![Binder](http://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/dlab-berkeley/Python-Machine-Learning/main?urlpath=tree)

Note that Binder, unfortunately, does not save your work.

# Additional Resources

Check out the following resources to learn more about machine learning:

* [scikit-learn Tutorials](https://scikit-learn.org/stable/tutorial/index.html).
* [Stanford's CS229 course materials](https://cs229.stanford.edu/syllabus.html).
* [IBM's free course on machine learning in Python](https://www.edx.org/course/machine-learning-with-python-a-practical-introduct).
* The [Elements of AI course](https://course.elementsofai.com/).

# About the UC Berkeley D-Lab

D-Lab works with Berkeley faculty, research staff, and students to advance data-intensive social science and humanities research. Our goal at D-Lab is to provide practical training, staff support, resources, and space to enable you to use Python for your own research applications. Our services cater to all skill levels, and no programming, statistical, or computer science background is necessary. We offer these services in the form of workshops, one-to-one consulting, and working groups that cover a variety of research topics, digital tools, and programming languages.

Visit the [D-Lab homepage](https://dlab.berkeley.edu/) to learn more about us. You can view our [calendar](https://dlab.berkeley.edu/events/calendar) for upcoming events, learn about how to utilize our [consulting](https://dlab.berkeley.edu/consulting) and [data](https://dlab.berkeley.edu/data) services, and check out upcoming [workshops](https://dlab.berkeley.edu/events/workshops).

# Other D-Lab Python Workshops

Here are other Python workshops offered by the D-Lab:

## Basic competency

* [Python Fundamentals](https://github.com/dlab-berkeley/python-fundamentals)
* [Introduction to Pandas](https://github.com/dlab-berkeley/introduction-to-pandas)
* [Geospatial Fundamentals in Python](https://github.com/dlab-berkeley/Geospatial-Fundamentals-in-Python)
* [Python Visualization](https://github.com/dlab-berkeley/Python-Data-Visualization)

## Intermediate/advanced competency

* [Python Text Analysis](https://github.com/dlab-berkeley/Python-Text-Analysis)
* [Python Deep Learning](https://github.com/dlab-berkeley/Python-Deep-Learning)
* [Fairness and Bias in Machine Learning](https://github.com/dlab-berkeley/fairML)

# Contributors

* Pratik Sachdeva
* Emily Grabowski
* George McIntire
* Sam Temlock
* Samy Abdel-Ghaffar
* Sean Perez
* Christopher Hench

--------------------------------------------------------------------------------
/data/auto-mpg.csv:
--------------------------------------------------------------------------------
1 | car name,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
2 | chevrolet chevelle malibu,18.0,8,307.0,130,3504,12.0,70,1
3 | buick skylark 320,15.0,8,350.0,165,3693,11.5,70,1
4 | plymouth satellite,18.0,8,318.0,150,3436,11.0,70,1
5 | amc rebel sst,16.0,8,304.0,150,3433,12.0,70,1
6 | ford torino,17.0,8,302.0,140,3449,10.5,70,1
7 | ford galaxie 500,15.0,8,429.0,198,4341,10.0,70,1
8 | chevrolet impala,14.0,8,454.0,220,4354,9.0,70,1
9 | plymouth fury iii,14.0,8,440.0,215,4312,8.5,70,1
10 | pontiac catalina,14.0,8,455.0,225,4425,10.0,70,1
11 | amc ambassador dpl,15.0,8,390.0,190,3850,8.5,70,1
12 | dodge challenger se,15.0,8,383.0,170,3563,10.0,70,1
13 | plymouth 'cuda 340,14.0,8,340.0,160,3609,8.0,70,1
14 | chevrolet monte carlo,15.0,8,400.0,150,3761,9.5,70,1
15 | buick estate wagon (sw),14.0,8,455.0,225,3086,10.0,70,1
16 | toyota corona mark ii,24.0,4,113.0,95,2372,15.0,70,3
17 | plymouth duster,22.0,6,198.0,95,2833,15.5,70,1
18 | amc hornet,18.0,6,199.0,97,2774,15.5,70,1
19 | ford maverick,21.0,6,200.0,85,2587,16.0,70,1
20 | datsun pl510,27.0,4,97.0,88,2130,14.5,70,3
21 | volkswagen 1131
deluxe sedan,26.0,4,97.0,46,1835,20.5,70,2 22 | peugeot 504,25.0,4,110.0,87,2672,17.5,70,2 23 | audi 100 ls,24.0,4,107.0,90,2430,14.5,70,2 24 | saab 99e,25.0,4,104.0,95,2375,17.5,70,2 25 | bmw 2002,26.0,4,121.0,113,2234,12.5,70,2 26 | amc gremlin,21.0,6,199.0,90,2648,15.0,70,1 27 | ford f250,10.0,8,360.0,215,4615,14.0,70,1 28 | chevy c20,10.0,8,307.0,200,4376,15.0,70,1 29 | dodge d200,11.0,8,318.0,210,4382,13.5,70,1 30 | hi 1200d,9.0,8,304.0,193,4732,18.5,70,1 31 | datsun pl510,27.0,4,97.0,88,2130,14.5,71,3 32 | chevrolet vega 2300,28.0,4,140.0,90,2264,15.5,71,1 33 | toyota corona,25.0,4,113.0,95,2228,14.0,71,3 34 | amc gremlin,19.0,6,232.0,100,2634,13.0,71,1 35 | plymouth satellite custom,16.0,6,225.0,105,3439,15.5,71,1 36 | chevrolet chevelle malibu,17.0,6,250.0,100,3329,15.5,71,1 37 | ford torino 500,19.0,6,250.0,88,3302,15.5,71,1 38 | amc matador,18.0,6,232.0,100,3288,15.5,71,1 39 | chevrolet impala,14.0,8,350.0,165,4209,12.0,71,1 40 | pontiac catalina brougham,14.0,8,400.0,175,4464,11.5,71,1 41 | ford galaxie 500,14.0,8,351.0,153,4154,13.5,71,1 42 | plymouth fury iii,14.0,8,318.0,150,4096,13.0,71,1 43 | dodge monaco (sw),12.0,8,383.0,180,4955,11.5,71,1 44 | ford country squire (sw),13.0,8,400.0,170,4746,12.0,71,1 45 | pontiac safari (sw),13.0,8,400.0,175,5140,12.0,71,1 46 | amc hornet sportabout (sw),18.0,6,258.0,110,2962,13.5,71,1 47 | chevrolet vega (sw),22.0,4,140.0,72,2408,19.0,71,1 48 | pontiac firebird,19.0,6,250.0,100,3282,15.0,71,1 49 | ford mustang,18.0,6,250.0,88,3139,14.5,71,1 50 | mercury capri 2000,23.0,4,122.0,86,2220,14.0,71,1 51 | opel 1900,28.0,4,116.0,90,2123,14.0,71,2 52 | peugeot 304,30.0,4,79.0,70,2074,19.5,71,2 53 | fiat 124b,30.0,4,88.0,76,2065,14.5,71,2 54 | toyota corolla 1200,31.0,4,71.0,65,1773,19.0,71,3 55 | datsun 1200,35.0,4,72.0,69,1613,18.0,71,3 56 | volkswagen model 111,27.0,4,97.0,60,1834,19.0,71,2 57 | plymouth cricket,26.0,4,91.0,70,1955,20.5,71,1 58 | toyota corona hardtop,24.0,4,113.0,95,2278,15.5,72,3 59 | dodge colt hardtop,25.0,4,97.5,80,2126,17.0,72,1 60 | volkswagen type 3,23.0,4,97.0,54,2254,23.5,72,2 61 | chevrolet vega,20.0,4,140.0,90,2408,19.5,72,1 62 | ford pinto runabout,21.0,4,122.0,86,2226,16.5,72,1 63 | chevrolet impala,13.0,8,350.0,165,4274,12.0,72,1 64 | pontiac catalina,14.0,8,400.0,175,4385,12.0,72,1 65 | plymouth fury iii,15.0,8,318.0,150,4135,13.5,72,1 66 | ford galaxie 500,14.0,8,351.0,153,4129,13.0,72,1 67 | amc ambassador sst,17.0,8,304.0,150,3672,11.5,72,1 68 | mercury marquis,11.0,8,429.0,208,4633,11.0,72,1 69 | buick lesabre custom,13.0,8,350.0,155,4502,13.5,72,1 70 | oldsmobile delta 88 royale,12.0,8,350.0,160,4456,13.5,72,1 71 | chrysler newport royal,13.0,8,400.0,190,4422,12.5,72,1 72 | mazda rx2 coupe,19.0,3,70.0,97,2330,13.5,72,3 73 | amc matador (sw),15.0,8,304.0,150,3892,12.5,72,1 74 | chevrolet chevelle concours (sw),13.0,8,307.0,130,4098,14.0,72,1 75 | ford gran torino (sw),13.0,8,302.0,140,4294,16.0,72,1 76 | plymouth satellite custom (sw),14.0,8,318.0,150,4077,14.0,72,1 77 | volvo 145e (sw),18.0,4,121.0,112,2933,14.5,72,2 78 | volkswagen 411 (sw),22.0,4,121.0,76,2511,18.0,72,2 79 | peugeot 504 (sw),21.0,4,120.0,87,2979,19.5,72,2 80 | renault 12 (sw),26.0,4,96.0,69,2189,18.0,72,2 81 | ford pinto (sw),22.0,4,122.0,86,2395,16.0,72,1 82 | datsun 510 (sw),28.0,4,97.0,92,2288,17.0,72,3 83 | toyouta corona mark ii (sw),23.0,4,120.0,97,2506,14.5,72,3 84 | dodge colt (sw),28.0,4,98.0,80,2164,15.0,72,1 85 | toyota corolla 1600 (sw),27.0,4,97.0,88,2100,16.5,72,3 86 | buick century 350,13.0,8,350.0,175,4100,13.0,73,1 87 | 
amc matador,14.0,8,304.0,150,3672,11.5,73,1 88 | chevrolet malibu,13.0,8,350.0,145,3988,13.0,73,1 89 | ford gran torino,14.0,8,302.0,137,4042,14.5,73,1 90 | dodge coronet custom,15.0,8,318.0,150,3777,12.5,73,1 91 | mercury marquis brougham,12.0,8,429.0,198,4952,11.5,73,1 92 | chevrolet caprice classic,13.0,8,400.0,150,4464,12.0,73,1 93 | ford ltd,13.0,8,351.0,158,4363,13.0,73,1 94 | plymouth fury gran sedan,14.0,8,318.0,150,4237,14.5,73,1 95 | chrysler new yorker brougham,13.0,8,440.0,215,4735,11.0,73,1 96 | buick electra 225 custom,12.0,8,455.0,225,4951,11.0,73,1 97 | amc ambassador brougham,13.0,8,360.0,175,3821,11.0,73,1 98 | plymouth valiant,18.0,6,225.0,105,3121,16.5,73,1 99 | chevrolet nova custom,16.0,6,250.0,100,3278,18.0,73,1 100 | amc hornet,18.0,6,232.0,100,2945,16.0,73,1 101 | ford maverick,18.0,6,250.0,88,3021,16.5,73,1 102 | plymouth duster,23.0,6,198.0,95,2904,16.0,73,1 103 | volkswagen super beetle,26.0,4,97.0,46,1950,21.0,73,2 104 | chevrolet impala,11.0,8,400.0,150,4997,14.0,73,1 105 | ford country,12.0,8,400.0,167,4906,12.5,73,1 106 | plymouth custom suburb,13.0,8,360.0,170,4654,13.0,73,1 107 | oldsmobile vista cruiser,12.0,8,350.0,180,4499,12.5,73,1 108 | amc gremlin,18.0,6,232.0,100,2789,15.0,73,1 109 | toyota carina,20.0,4,97.0,88,2279,19.0,73,3 110 | chevrolet vega,21.0,4,140.0,72,2401,19.5,73,1 111 | datsun 610,22.0,4,108.0,94,2379,16.5,73,3 112 | maxda rx3,18.0,3,70.0,90,2124,13.5,73,3 113 | ford pinto,19.0,4,122.0,85,2310,18.5,73,1 114 | mercury capri v6,21.0,6,155.0,107,2472,14.0,73,1 115 | fiat 124 sport coupe,26.0,4,98.0,90,2265,15.5,73,2 116 | chevrolet monte carlo s,15.0,8,350.0,145,4082,13.0,73,1 117 | pontiac grand prix,16.0,8,400.0,230,4278,9.5,73,1 118 | fiat 128,29.0,4,68.0,49,1867,19.5,73,2 119 | opel manta,24.0,4,116.0,75,2158,15.5,73,2 120 | audi 100ls,20.0,4,114.0,91,2582,14.0,73,2 121 | volvo 144ea,19.0,4,121.0,112,2868,15.5,73,2 122 | dodge dart custom,15.0,8,318.0,150,3399,11.0,73,1 123 | saab 99le,24.0,4,121.0,110,2660,14.0,73,2 124 | toyota mark ii,20.0,6,156.0,122,2807,13.5,73,3 125 | oldsmobile omega,11.0,8,350.0,180,3664,11.0,73,1 126 | plymouth duster,20.0,6,198.0,95,3102,16.5,74,1 127 | amc hornet,19.0,6,232.0,100,2901,16.0,74,1 128 | chevrolet nova,15.0,6,250.0,100,3336,17.0,74,1 129 | datsun b210,31.0,4,79.0,67,1950,19.0,74,3 130 | ford pinto,26.0,4,122.0,80,2451,16.5,74,1 131 | toyota corolla 1200,32.0,4,71.0,65,1836,21.0,74,3 132 | chevrolet vega,25.0,4,140.0,75,2542,17.0,74,1 133 | chevrolet chevelle malibu classic,16.0,6,250.0,100,3781,17.0,74,1 134 | amc matador,16.0,6,258.0,110,3632,18.0,74,1 135 | plymouth satellite sebring,18.0,6,225.0,105,3613,16.5,74,1 136 | ford gran torino,16.0,8,302.0,140,4141,14.0,74,1 137 | buick century luxus (sw),13.0,8,350.0,150,4699,14.5,74,1 138 | dodge coronet custom (sw),14.0,8,318.0,150,4457,13.5,74,1 139 | ford gran torino (sw),14.0,8,302.0,140,4638,16.0,74,1 140 | amc matador (sw),14.0,8,304.0,150,4257,15.5,74,1 141 | audi fox,29.0,4,98.0,83,2219,16.5,74,2 142 | volkswagen dasher,26.0,4,79.0,67,1963,15.5,74,2 143 | opel manta,26.0,4,97.0,78,2300,14.5,74,2 144 | toyota corona,31.0,4,76.0,52,1649,16.5,74,3 145 | datsun 710,32.0,4,83.0,61,2003,19.0,74,3 146 | dodge colt,28.0,4,90.0,75,2125,14.5,74,1 147 | fiat 128,24.0,4,90.0,75,2108,15.5,74,2 148 | fiat 124 tc,26.0,4,116.0,75,2246,14.0,74,2 149 | honda civic,24.0,4,120.0,97,2489,15.0,74,3 150 | subaru,26.0,4,108.0,93,2391,15.5,74,3 151 | fiat x1.9,31.0,4,79.0,67,2000,16.0,74,2 152 | plymouth valiant custom,19.0,6,225.0,95,3264,16.0,75,1 153 | 
chevrolet nova,18.0,6,250.0,105,3459,16.0,75,1 154 | mercury monarch,15.0,6,250.0,72,3432,21.0,75,1 155 | ford maverick,15.0,6,250.0,72,3158,19.5,75,1 156 | pontiac catalina,16.0,8,400.0,170,4668,11.5,75,1 157 | chevrolet bel air,15.0,8,350.0,145,4440,14.0,75,1 158 | plymouth grand fury,16.0,8,318.0,150,4498,14.5,75,1 159 | ford ltd,14.0,8,351.0,148,4657,13.5,75,1 160 | buick century,17.0,6,231.0,110,3907,21.0,75,1 161 | chevroelt chevelle malibu,16.0,6,250.0,105,3897,18.5,75,1 162 | amc matador,15.0,6,258.0,110,3730,19.0,75,1 163 | plymouth fury,18.0,6,225.0,95,3785,19.0,75,1 164 | buick skyhawk,21.0,6,231.0,110,3039,15.0,75,1 165 | chevrolet monza 2+2,20.0,8,262.0,110,3221,13.5,75,1 166 | ford mustang ii,13.0,8,302.0,129,3169,12.0,75,1 167 | toyota corolla,29.0,4,97.0,75,2171,16.0,75,3 168 | ford pinto,23.0,4,140.0,83,2639,17.0,75,1 169 | amc gremlin,20.0,6,232.0,100,2914,16.0,75,1 170 | pontiac astro,23.0,4,140.0,78,2592,18.5,75,1 171 | toyota corona,24.0,4,134.0,96,2702,13.5,75,3 172 | volkswagen dasher,25.0,4,90.0,71,2223,16.5,75,2 173 | datsun 710,24.0,4,119.0,97,2545,17.0,75,3 174 | ford pinto,18.0,6,171.0,97,2984,14.5,75,1 175 | volkswagen rabbit,29.0,4,90.0,70,1937,14.0,75,2 176 | amc pacer,19.0,6,232.0,90,3211,17.0,75,1 177 | audi 100ls,23.0,4,115.0,95,2694,15.0,75,2 178 | peugeot 504,23.0,4,120.0,88,2957,17.0,75,2 179 | volvo 244dl,22.0,4,121.0,98,2945,14.5,75,2 180 | saab 99le,25.0,4,121.0,115,2671,13.5,75,2 181 | honda civic cvcc,33.0,4,91.0,53,1795,17.5,75,3 182 | fiat 131,28.0,4,107.0,86,2464,15.5,76,2 183 | opel 1900,25.0,4,116.0,81,2220,16.9,76,2 184 | capri ii,25.0,4,140.0,92,2572,14.9,76,1 185 | dodge colt,26.0,4,98.0,79,2255,17.7,76,1 186 | renault 12tl,27.0,4,101.0,83,2202,15.3,76,2 187 | chevrolet chevelle malibu classic,17.5,8,305.0,140,4215,13.0,76,1 188 | dodge coronet brougham,16.0,8,318.0,150,4190,13.0,76,1 189 | amc matador,15.5,8,304.0,120,3962,13.9,76,1 190 | ford gran torino,14.5,8,351.0,152,4215,12.8,76,1 191 | plymouth valiant,22.0,6,225.0,100,3233,15.4,76,1 192 | chevrolet nova,22.0,6,250.0,105,3353,14.5,76,1 193 | ford maverick,24.0,6,200.0,81,3012,17.6,76,1 194 | amc hornet,22.5,6,232.0,90,3085,17.6,76,1 195 | chevrolet chevette,29.0,4,85.0,52,2035,22.2,76,1 196 | chevrolet woody,24.5,4,98.0,60,2164,22.1,76,1 197 | vw rabbit,29.0,4,90.0,70,1937,14.2,76,2 198 | honda civic,33.0,4,91.0,53,1795,17.4,76,3 199 | dodge aspen se,20.0,6,225.0,100,3651,17.7,76,1 200 | ford granada ghia,18.0,6,250.0,78,3574,21.0,76,1 201 | pontiac ventura sj,18.5,6,250.0,110,3645,16.2,76,1 202 | amc pacer d/l,17.5,6,258.0,95,3193,17.8,76,1 203 | volkswagen rabbit,29.5,4,97.0,71,1825,12.2,76,2 204 | datsun b-210,32.0,4,85.0,70,1990,17.0,76,3 205 | toyota corolla,28.0,4,97.0,75,2155,16.4,76,3 206 | ford pinto,26.5,4,140.0,72,2565,13.6,76,1 207 | volvo 245,20.0,4,130.0,102,3150,15.7,76,2 208 | plymouth volare premier v8,13.0,8,318.0,150,3940,13.2,76,1 209 | peugeot 504,19.0,4,120.0,88,3270,21.9,76,2 210 | toyota mark ii,19.0,6,156.0,108,2930,15.5,76,3 211 | mercedes-benz 280s,16.5,6,168.0,120,3820,16.7,76,2 212 | cadillac seville,16.5,8,350.0,180,4380,12.1,76,1 213 | chevy c10,13.0,8,350.0,145,4055,12.0,76,1 214 | ford f108,13.0,8,302.0,130,3870,15.0,76,1 215 | dodge d100,13.0,8,318.0,150,3755,14.0,76,1 216 | honda accord cvcc,31.5,4,98.0,68,2045,18.5,77,3 217 | buick opel isuzu deluxe,30.0,4,111.0,80,2155,14.8,77,1 218 | renault 5 gtl,36.0,4,79.0,58,1825,18.6,77,2 219 | plymouth arrow gs,25.5,4,122.0,96,2300,15.5,77,1 220 | datsun f-10 hatchback,33.5,4,85.0,70,1945,16.8,77,3 221 | 
chevrolet caprice classic,17.5,8,305.0,145,3880,12.5,77,1 222 | oldsmobile cutlass supreme,17.0,8,260.0,110,4060,19.0,77,1 223 | dodge monaco brougham,15.5,8,318.0,145,4140,13.7,77,1 224 | mercury cougar brougham,15.0,8,302.0,130,4295,14.9,77,1 225 | chevrolet concours,17.5,6,250.0,110,3520,16.4,77,1 226 | buick skylark,20.5,6,231.0,105,3425,16.9,77,1 227 | plymouth volare custom,19.0,6,225.0,100,3630,17.7,77,1 228 | ford granada,18.5,6,250.0,98,3525,19.0,77,1 229 | pontiac grand prix lj,16.0,8,400.0,180,4220,11.1,77,1 230 | chevrolet monte carlo landau,15.5,8,350.0,170,4165,11.4,77,1 231 | chrysler cordoba,15.5,8,400.0,190,4325,12.2,77,1 232 | ford thunderbird,16.0,8,351.0,149,4335,14.5,77,1 233 | volkswagen rabbit custom,29.0,4,97.0,78,1940,14.5,77,2 234 | pontiac sunbird coupe,24.5,4,151.0,88,2740,16.0,77,1 235 | toyota corolla liftback,26.0,4,97.0,75,2265,18.2,77,3 236 | ford mustang ii 2+2,25.5,4,140.0,89,2755,15.8,77,1 237 | chevrolet chevette,30.5,4,98.0,63,2051,17.0,77,1 238 | dodge colt m/m,33.5,4,98.0,83,2075,15.9,77,1 239 | subaru dl,30.0,4,97.0,67,1985,16.4,77,3 240 | volkswagen dasher,30.5,4,97.0,78,2190,14.1,77,2 241 | datsun 810,22.0,6,146.0,97,2815,14.5,77,3 242 | bmw 320i,21.5,4,121.0,110,2600,12.8,77,2 243 | mazda rx-4,21.5,3,80.0,110,2720,13.5,77,3 244 | volkswagen rabbit custom diesel,43.1,4,90.0,48,1985,21.5,78,2 245 | ford fiesta,36.1,4,98.0,66,1800,14.4,78,1 246 | mazda glc deluxe,32.8,4,78.0,52,1985,19.4,78,3 247 | datsun b210 gx,39.4,4,85.0,70,2070,18.6,78,3 248 | honda civic cvcc,36.1,4,91.0,60,1800,16.4,78,3 249 | oldsmobile cutlass salon brougham,19.9,8,260.0,110,3365,15.5,78,1 250 | dodge diplomat,19.4,8,318.0,140,3735,13.2,78,1 251 | mercury monarch ghia,20.2,8,302.0,139,3570,12.8,78,1 252 | pontiac phoenix lj,19.2,6,231.0,105,3535,19.2,78,1 253 | chevrolet malibu,20.5,6,200.0,95,3155,18.2,78,1 254 | ford fairmont (auto),20.2,6,200.0,85,2965,15.8,78,1 255 | ford fairmont (man),25.1,4,140.0,88,2720,15.4,78,1 256 | plymouth volare,20.5,6,225.0,100,3430,17.2,78,1 257 | amc concord,19.4,6,232.0,90,3210,17.2,78,1 258 | buick century special,20.6,6,231.0,105,3380,15.8,78,1 259 | mercury zephyr,20.8,6,200.0,85,3070,16.7,78,1 260 | dodge aspen,18.6,6,225.0,110,3620,18.7,78,1 261 | amc concord d/l,18.1,6,258.0,120,3410,15.1,78,1 262 | chevrolet monte carlo landau,19.2,8,305.0,145,3425,13.2,78,1 263 | buick regal sport coupe (turbo),17.7,6,231.0,165,3445,13.4,78,1 264 | ford futura,18.1,8,302.0,139,3205,11.2,78,1 265 | dodge magnum xe,17.5,8,318.0,140,4080,13.7,78,1 266 | chevrolet chevette,30.0,4,98.0,68,2155,16.5,78,1 267 | toyota corona,27.5,4,134.0,95,2560,14.2,78,3 268 | datsun 510,27.2,4,119.0,97,2300,14.7,78,3 269 | dodge omni,30.9,4,105.0,75,2230,14.5,78,1 270 | toyota celica gt liftback,21.1,4,134.0,95,2515,14.8,78,3 271 | plymouth sapporo,23.2,4,156.0,105,2745,16.7,78,1 272 | oldsmobile starfire sx,23.8,4,151.0,85,2855,17.6,78,1 273 | datsun 200-sx,23.9,4,119.0,97,2405,14.9,78,3 274 | audi 5000,20.3,5,131.0,103,2830,15.9,78,2 275 | volvo 264gl,17.0,6,163.0,125,3140,13.6,78,2 276 | saab 99gle,21.6,4,121.0,115,2795,15.7,78,2 277 | peugeot 604sl,16.2,6,163.0,133,3410,15.8,78,2 278 | volkswagen scirocco,31.5,4,89.0,71,1990,14.9,78,2 279 | honda accord lx,29.5,4,98.0,68,2135,16.6,78,3 280 | pontiac lemans v6,21.5,6,231.0,115,3245,15.4,79,1 281 | mercury zephyr 6,19.8,6,200.0,85,2990,18.2,79,1 282 | ford fairmont 4,22.3,4,140.0,88,2890,17.3,79,1 283 | amc concord dl 6,20.2,6,232.0,90,3265,18.2,79,1 284 | dodge aspen 6,20.6,6,225.0,110,3360,16.6,79,1 285 | chevrolet 
caprice classic,17.0,8,305.0,130,3840,15.4,79,1 286 | ford ltd landau,17.6,8,302.0,129,3725,13.4,79,1 287 | mercury grand marquis,16.5,8,351.0,138,3955,13.2,79,1 288 | dodge st. regis,18.2,8,318.0,135,3830,15.2,79,1 289 | buick estate wagon (sw),16.9,8,350.0,155,4360,14.9,79,1 290 | ford country squire (sw),15.5,8,351.0,142,4054,14.3,79,1 291 | chevrolet malibu classic (sw),19.2,8,267.0,125,3605,15.0,79,1 292 | chrysler lebaron town @ country (sw),18.5,8,360.0,150,3940,13.0,79,1 293 | vw rabbit custom,31.9,4,89.0,71,1925,14.0,79,2 294 | maxda glc deluxe,34.1,4,86.0,65,1975,15.2,79,3 295 | dodge colt hatchback custom,35.7,4,98.0,80,1915,14.4,79,1 296 | amc spirit dl,27.4,4,121.0,80,2670,15.0,79,1 297 | mercedes benz 300d,25.4,5,183.0,77,3530,20.1,79,2 298 | cadillac eldorado,23.0,8,350.0,125,3900,17.4,79,1 299 | peugeot 504,27.2,4,141.0,71,3190,24.8,79,2 300 | oldsmobile cutlass salon brougham,23.9,8,260.0,90,3420,22.2,79,1 301 | plymouth horizon,34.2,4,105.0,70,2200,13.2,79,1 302 | plymouth horizon tc3,34.5,4,105.0,70,2150,14.9,79,1 303 | datsun 210,31.8,4,85.0,65,2020,19.2,79,3 304 | fiat strada custom,37.3,4,91.0,69,2130,14.7,79,2 305 | buick skylark limited,28.4,4,151.0,90,2670,16.0,79,1 306 | chevrolet citation,28.8,6,173.0,115,2595,11.3,79,1 307 | oldsmobile omega brougham,26.8,6,173.0,115,2700,12.9,79,1 308 | pontiac phoenix,33.5,4,151.0,90,2556,13.2,79,1 309 | vw rabbit,41.5,4,98.0,76,2144,14.7,80,2 310 | toyota corolla tercel,38.1,4,89.0,60,1968,18.8,80,3 311 | chevrolet chevette,32.1,4,98.0,70,2120,15.5,80,1 312 | datsun 310,37.2,4,86.0,65,2019,16.4,80,3 313 | chevrolet citation,28.0,4,151.0,90,2678,16.5,80,1 314 | ford fairmont,26.4,4,140.0,88,2870,18.1,80,1 315 | amc concord,24.3,4,151.0,90,3003,20.1,80,1 316 | dodge aspen,19.1,6,225.0,90,3381,18.7,80,1 317 | audi 4000,34.3,4,97.0,78,2188,15.8,80,2 318 | toyota corona liftback,29.8,4,134.0,90,2711,15.5,80,3 319 | mazda 626,31.3,4,120.0,75,2542,17.5,80,3 320 | datsun 510 hatchback,37.0,4,119.0,92,2434,15.0,80,3 321 | toyota corolla,32.2,4,108.0,75,2265,15.2,80,3 322 | mazda glc,46.6,4,86.0,65,2110,17.9,80,3 323 | dodge colt,27.9,4,156.0,105,2800,14.4,80,1 324 | datsun 210,40.8,4,85.0,65,2110,19.2,80,3 325 | vw rabbit c (diesel),44.3,4,90.0,48,2085,21.7,80,2 326 | vw dasher (diesel),43.4,4,90.0,48,2335,23.7,80,2 327 | audi 5000s (diesel),36.4,5,121.0,67,2950,19.9,80,2 328 | mercedes-benz 240d,30.0,4,146.0,67,3250,21.8,80,2 329 | honda civic 1500 gl,44.6,4,91.0,67,1850,13.8,80,3 330 | subaru dl,33.8,4,97.0,67,2145,18.0,80,3 331 | vokswagen rabbit,29.8,4,89.0,62,1845,15.3,80,2 332 | datsun 280-zx,32.7,6,168.0,132,2910,11.4,80,3 333 | mazda rx-7 gs,23.7,3,70.0,100,2420,12.5,80,3 334 | triumph tr7 coupe,35.0,4,122.0,88,2500,15.1,80,2 335 | honda accord,32.4,4,107.0,72,2290,17.0,80,3 336 | plymouth reliant,27.2,4,135.0,84,2490,15.7,81,1 337 | buick skylark,26.6,4,151.0,84,2635,16.4,81,1 338 | dodge aries wagon (sw),25.8,4,156.0,92,2620,14.4,81,1 339 | chevrolet citation,23.5,6,173.0,110,2725,12.6,81,1 340 | plymouth reliant,30.0,4,135.0,84,2385,12.9,81,1 341 | toyota starlet,39.1,4,79.0,58,1755,16.9,81,3 342 | plymouth champ,39.0,4,86.0,64,1875,16.4,81,1 343 | honda civic 1300,35.1,4,81.0,60,1760,16.1,81,3 344 | subaru,32.3,4,97.0,67,2065,17.8,81,3 345 | datsun 210 mpg,37.0,4,85.0,65,1975,19.4,81,3 346 | toyota tercel,37.7,4,89.0,62,2050,17.3,81,3 347 | mazda glc 4,34.1,4,91.0,68,1985,16.0,81,3 348 | plymouth horizon 4,34.7,4,105.0,63,2215,14.9,81,1 349 | ford escort 4w,34.4,4,98.0,65,2045,16.2,81,1 350 | ford escort 
2h,29.9,4,98.0,65,2380,20.7,81,1 351 | volkswagen jetta,33.0,4,105.0,74,2190,14.2,81,2 352 | honda prelude,33.7,4,107.0,75,2210,14.4,81,3 353 | toyota corolla,32.4,4,108.0,75,2350,16.8,81,3 354 | datsun 200sx,32.9,4,119.0,100,2615,14.8,81,3 355 | mazda 626,31.6,4,120.0,74,2635,18.3,81,3 356 | peugeot 505s turbo diesel,28.1,4,141.0,80,3230,20.4,81,2 357 | volvo diesel,30.7,6,145.0,76,3160,19.6,81,2 358 | toyota cressida,25.4,6,168.0,116,2900,12.6,81,3 359 | datsun 810 maxima,24.2,6,146.0,120,2930,13.8,81,3 360 | buick century,22.4,6,231.0,110,3415,15.8,81,1 361 | oldsmobile cutlass ls,26.6,8,350.0,105,3725,19.0,81,1 362 | ford granada gl,20.2,6,200.0,88,3060,17.1,81,1 363 | chrysler lebaron salon,17.6,6,225.0,85,3465,16.6,81,1 364 | chevrolet cavalier,28.0,4,112.0,88,2605,19.6,82,1 365 | chevrolet cavalier wagon,27.0,4,112.0,88,2640,18.6,82,1 366 | chevrolet cavalier 2-door,34.0,4,112.0,88,2395,18.0,82,1 367 | pontiac j2000 se hatchback,31.0,4,112.0,85,2575,16.2,82,1 368 | dodge aries se,29.0,4,135.0,84,2525,16.0,82,1 369 | pontiac phoenix,27.0,4,151.0,90,2735,18.0,82,1 370 | ford fairmont futura,24.0,4,140.0,92,2865,16.4,82,1 371 | volkswagen rabbit l,36.0,4,105.0,74,1980,15.3,82,2 372 | mazda glc custom l,37.0,4,91.0,68,2025,18.2,82,3 373 | mazda glc custom,31.0,4,91.0,68,1970,17.6,82,3 374 | plymouth horizon miser,38.0,4,105.0,63,2125,14.7,82,1 375 | mercury lynx l,36.0,4,98.0,70,2125,17.3,82,1 376 | nissan stanza xe,36.0,4,120.0,88,2160,14.5,82,3 377 | honda accord,36.0,4,107.0,75,2205,14.5,82,3 378 | toyota corolla,34.0,4,108.0,70,2245,16.9,82,3 379 | honda civic,38.0,4,91.0,67,1965,15.0,82,3 380 | honda civic (auto),32.0,4,91.0,67,1965,15.7,82,3 381 | datsun 310 gx,38.0,4,91.0,67,1995,16.2,82,3 382 | buick century limited,25.0,6,181.0,110,2945,16.4,82,1 383 | oldsmobile cutlass ciera (diesel),38.0,6,262.0,85,3015,17.0,82,1 384 | chrysler lebaron medallion,26.0,4,156.0,92,2585,14.5,82,1 385 | ford granada l,22.0,6,232.0,112,2835,14.7,82,1 386 | toyota celica gt,32.0,4,144.0,96,2665,13.9,82,3 387 | dodge charger 2.2,36.0,4,135.0,84,2370,13.0,82,1 388 | chevrolet camaro,27.0,4,151.0,90,2950,17.3,82,1 389 | ford mustang gl,27.0,4,140.0,86,2790,15.6,82,1 390 | vw pickup,44.0,4,97.0,52,2130,24.6,82,2 391 | dodge rampage,32.0,4,135.0,84,2295,11.6,82,1 392 | ford ranger,28.0,4,120.0,79,2625,18.6,82,1 393 | chevy s-10,31.0,4,119.0,82,2720,19.4,82,1 394 | -------------------------------------------------------------------------------- /data/auto-mpg.names: -------------------------------------------------------------------------------- 1 | 1. Title: Auto-Mpg Data 2 | 3 | 2. Sources: 4 | (a) Origin: This dataset was taken from the StatLib library which is 5 | maintained at Carnegie Mellon University. The dataset was 6 | used in the 1983 American Statistical Association Exposition. 7 | (c) Date: July 7, 1993 8 | 9 | 3. Past Usage: 10 | - See 2b (above) 11 | - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. 12 | In Proceedings on the Tenth International Conference of Machine 13 | Learning, 236-243, University of Massachusetts, Amherst. Morgan 14 | Kaufmann. 15 | 16 | 4. Relevant Information: 17 | 18 | This dataset is a slightly modified version of the dataset provided in 19 | the StatLib library. In line with the use by Ross Quinlan (1993) in 20 | predicting the attribute "mpg", 8 of the original instances were removed 21 | because they had unknown values for the "mpg" attribute. The original 22 | dataset is available in the file "auto-mpg.data-original". 
23 | 24 | "The data concerns city-cycle fuel consumption in miles per gallon, 25 | to be predicted in terms of 3 multivalued discrete and 5 continuous 26 | attributes." (Quinlan, 1993) 27 | 28 | 5. Number of Instances: 398 29 | 30 | 6. Number of Attributes: 9 including the class attribute 31 | 32 | 7. Attribute Information: 33 | 34 | 1. mpg: continuous 35 | 2. cylinders: multi-valued discrete 36 | 3. displacement: continuous 37 | 4. horsepower: continuous 38 | 5. weight: continuous 39 | 6. acceleration: continuous 40 | 7. model year: multi-valued discrete 41 | 8. origin: multi-valued discrete 42 | 9. car name: string (unique for each instance) 43 | 44 | 8. Missing Attribute Values: horsepower has 6 missing values 45 | 46 | -------------------------------------------------------------------------------- /data/penguins.csv: -------------------------------------------------------------------------------- 1 | species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex 2 | Adelie,Torgersen,39.1,18.7,181,3750,MALE 3 | Adelie,Torgersen,39.5,17.4,186,3800,FEMALE 4 | Adelie,Torgersen,40.3,18,195,3250,FEMALE 5 | Adelie,Torgersen,NA,NA,NA,NA,NA 6 | Adelie,Torgersen,36.7,19.3,193,3450,FEMALE 7 | Adelie,Torgersen,39.3,20.6,190,3650,MALE 8 | Adelie,Torgersen,38.9,17.8,181,3625,FEMALE 9 | Adelie,Torgersen,39.2,19.6,195,4675,MALE 10 | Adelie,Torgersen,34.1,18.1,193,3475,NA 11 | Adelie,Torgersen,42,20.2,190,4250,NA 12 | Adelie,Torgersen,37.8,17.1,186,3300,NA 13 | Adelie,Torgersen,37.8,17.3,180,3700,NA 14 | Adelie,Torgersen,41.1,17.6,182,3200,FEMALE 15 | Adelie,Torgersen,38.6,21.2,191,3800,MALE 16 | Adelie,Torgersen,34.6,21.1,198,4400,MALE 17 | Adelie,Torgersen,36.6,17.8,185,3700,FEMALE 18 | Adelie,Torgersen,38.7,19,195,3450,FEMALE 19 | Adelie,Torgersen,42.5,20.7,197,4500,MALE 20 | Adelie,Torgersen,34.4,18.4,184,3325,FEMALE 21 | Adelie,Torgersen,46,21.5,194,4200,MALE 22 | Adelie,Biscoe,37.8,18.3,174,3400,FEMALE 23 | Adelie,Biscoe,37.7,18.7,180,3600,MALE 24 | Adelie,Biscoe,35.9,19.2,189,3800,FEMALE 25 | Adelie,Biscoe,38.2,18.1,185,3950,MALE 26 | Adelie,Biscoe,38.8,17.2,180,3800,MALE 27 | Adelie,Biscoe,35.3,18.9,187,3800,FEMALE 28 | Adelie,Biscoe,40.6,18.6,183,3550,MALE 29 | Adelie,Biscoe,40.5,17.9,187,3200,FEMALE 30 | Adelie,Biscoe,37.9,18.6,172,3150,FEMALE 31 | Adelie,Biscoe,40.5,18.9,180,3950,MALE 32 | Adelie,Dream,39.5,16.7,178,3250,FEMALE 33 | Adelie,Dream,37.2,18.1,178,3900,MALE 34 | Adelie,Dream,39.5,17.8,188,3300,FEMALE 35 | Adelie,Dream,40.9,18.9,184,3900,MALE 36 | Adelie,Dream,36.4,17,195,3325,FEMALE 37 | Adelie,Dream,39.2,21.1,196,4150,MALE 38 | Adelie,Dream,38.8,20,190,3950,MALE 39 | Adelie,Dream,42.2,18.5,180,3550,FEMALE 40 | Adelie,Dream,37.6,19.3,181,3300,FEMALE 41 | Adelie,Dream,39.8,19.1,184,4650,MALE 42 | Adelie,Dream,36.5,18,182,3150,FEMALE 43 | Adelie,Dream,40.8,18.4,195,3900,MALE 44 | Adelie,Dream,36,18.5,186,3100,FEMALE 45 | Adelie,Dream,44.1,19.7,196,4400,MALE 46 | Adelie,Dream,37,16.9,185,3000,FEMALE 47 | Adelie,Dream,39.6,18.8,190,4600,MALE 48 | Adelie,Dream,41.1,19,182,3425,MALE 49 | Adelie,Dream,37.5,18.9,179,2975,NA 50 | Adelie,Dream,36,17.9,190,3450,FEMALE 51 | Adelie,Dream,42.3,21.2,191,4150,MALE 52 | Adelie,Biscoe,39.6,17.7,186,3500,FEMALE 53 | Adelie,Biscoe,40.1,18.9,188,4300,MALE 54 | Adelie,Biscoe,35,17.9,190,3450,FEMALE 55 | Adelie,Biscoe,42,19.5,200,4050,MALE 56 | Adelie,Biscoe,34.5,18.1,187,2900,FEMALE 57 | Adelie,Biscoe,41.4,18.6,191,3700,MALE 58 | Adelie,Biscoe,39,17.5,186,3550,FEMALE 59 | Adelie,Biscoe,40.6,18.8,193,3800,MALE 60 | 
Adelie,Biscoe,36.5,16.6,181,2850,FEMALE 61 | Adelie,Biscoe,37.6,19.1,194,3750,MALE 62 | Adelie,Biscoe,35.7,16.9,185,3150,FEMALE 63 | Adelie,Biscoe,41.3,21.1,195,4400,MALE 64 | Adelie,Biscoe,37.6,17,185,3600,FEMALE 65 | Adelie,Biscoe,41.1,18.2,192,4050,MALE 66 | Adelie,Biscoe,36.4,17.1,184,2850,FEMALE 67 | Adelie,Biscoe,41.6,18,192,3950,MALE 68 | Adelie,Biscoe,35.5,16.2,195,3350,FEMALE 69 | Adelie,Biscoe,41.1,19.1,188,4100,MALE 70 | Adelie,Torgersen,35.9,16.6,190,3050,FEMALE 71 | Adelie,Torgersen,41.8,19.4,198,4450,MALE 72 | Adelie,Torgersen,33.5,19,190,3600,FEMALE 73 | Adelie,Torgersen,39.7,18.4,190,3900,MALE 74 | Adelie,Torgersen,39.6,17.2,196,3550,FEMALE 75 | Adelie,Torgersen,45.8,18.9,197,4150,MALE 76 | Adelie,Torgersen,35.5,17.5,190,3700,FEMALE 77 | Adelie,Torgersen,42.8,18.5,195,4250,MALE 78 | Adelie,Torgersen,40.9,16.8,191,3700,FEMALE 79 | Adelie,Torgersen,37.2,19.4,184,3900,MALE 80 | Adelie,Torgersen,36.2,16.1,187,3550,FEMALE 81 | Adelie,Torgersen,42.1,19.1,195,4000,MALE 82 | Adelie,Torgersen,34.6,17.2,189,3200,FEMALE 83 | Adelie,Torgersen,42.9,17.6,196,4700,MALE 84 | Adelie,Torgersen,36.7,18.8,187,3800,FEMALE 85 | Adelie,Torgersen,35.1,19.4,193,4200,MALE 86 | Adelie,Dream,37.3,17.8,191,3350,FEMALE 87 | Adelie,Dream,41.3,20.3,194,3550,MALE 88 | Adelie,Dream,36.3,19.5,190,3800,MALE 89 | Adelie,Dream,36.9,18.6,189,3500,FEMALE 90 | Adelie,Dream,38.3,19.2,189,3950,MALE 91 | Adelie,Dream,38.9,18.8,190,3600,FEMALE 92 | Adelie,Dream,35.7,18,202,3550,FEMALE 93 | Adelie,Dream,41.1,18.1,205,4300,MALE 94 | Adelie,Dream,34,17.1,185,3400,FEMALE 95 | Adelie,Dream,39.6,18.1,186,4450,MALE 96 | Adelie,Dream,36.2,17.3,187,3300,FEMALE 97 | Adelie,Dream,40.8,18.9,208,4300,MALE 98 | Adelie,Dream,38.1,18.6,190,3700,FEMALE 99 | Adelie,Dream,40.3,18.5,196,4350,MALE 100 | Adelie,Dream,33.1,16.1,178,2900,FEMALE 101 | Adelie,Dream,43.2,18.5,192,4100,MALE 102 | Adelie,Biscoe,35,17.9,192,3725,FEMALE 103 | Adelie,Biscoe,41,20,203,4725,MALE 104 | Adelie,Biscoe,37.7,16,183,3075,FEMALE 105 | Adelie,Biscoe,37.8,20,190,4250,MALE 106 | Adelie,Biscoe,37.9,18.6,193,2925,FEMALE 107 | Adelie,Biscoe,39.7,18.9,184,3550,MALE 108 | Adelie,Biscoe,38.6,17.2,199,3750,FEMALE 109 | Adelie,Biscoe,38.2,20,190,3900,MALE 110 | Adelie,Biscoe,38.1,17,181,3175,FEMALE 111 | Adelie,Biscoe,43.2,19,197,4775,MALE 112 | Adelie,Biscoe,38.1,16.5,198,3825,FEMALE 113 | Adelie,Biscoe,45.6,20.3,191,4600,MALE 114 | Adelie,Biscoe,39.7,17.7,193,3200,FEMALE 115 | Adelie,Biscoe,42.2,19.5,197,4275,MALE 116 | Adelie,Biscoe,39.6,20.7,191,3900,FEMALE 117 | Adelie,Biscoe,42.7,18.3,196,4075,MALE 118 | Adelie,Torgersen,38.6,17,188,2900,FEMALE 119 | Adelie,Torgersen,37.3,20.5,199,3775,MALE 120 | Adelie,Torgersen,35.7,17,189,3350,FEMALE 121 | Adelie,Torgersen,41.1,18.6,189,3325,MALE 122 | Adelie,Torgersen,36.2,17.2,187,3150,FEMALE 123 | Adelie,Torgersen,37.7,19.8,198,3500,MALE 124 | Adelie,Torgersen,40.2,17,176,3450,FEMALE 125 | Adelie,Torgersen,41.4,18.5,202,3875,MALE 126 | Adelie,Torgersen,35.2,15.9,186,3050,FEMALE 127 | Adelie,Torgersen,40.6,19,199,4000,MALE 128 | Adelie,Torgersen,38.8,17.6,191,3275,FEMALE 129 | Adelie,Torgersen,41.5,18.3,195,4300,MALE 130 | Adelie,Torgersen,39,17.1,191,3050,FEMALE 131 | Adelie,Torgersen,44.1,18,210,4000,MALE 132 | Adelie,Torgersen,38.5,17.9,190,3325,FEMALE 133 | Adelie,Torgersen,43.1,19.2,197,3500,MALE 134 | Adelie,Dream,36.8,18.5,193,3500,FEMALE 135 | Adelie,Dream,37.5,18.5,199,4475,MALE 136 | Adelie,Dream,38.1,17.6,187,3425,FEMALE 137 | Adelie,Dream,41.1,17.5,190,3900,MALE 138 | Adelie,Dream,35.6,17.5,191,3175,FEMALE 139 | 
Adelie,Dream,40.2,20.1,200,3975,MALE 140 | Adelie,Dream,37,16.5,185,3400,FEMALE 141 | Adelie,Dream,39.7,17.9,193,4250,MALE 142 | Adelie,Dream,40.2,17.1,193,3400,FEMALE 143 | Adelie,Dream,40.6,17.2,187,3475,MALE 144 | Adelie,Dream,32.1,15.5,188,3050,FEMALE 145 | Adelie,Dream,40.7,17,190,3725,MALE 146 | Adelie,Dream,37.3,16.8,192,3000,FEMALE 147 | Adelie,Dream,39,18.7,185,3650,MALE 148 | Adelie,Dream,39.2,18.6,190,4250,MALE 149 | Adelie,Dream,36.6,18.4,184,3475,FEMALE 150 | Adelie,Dream,36,17.8,195,3450,FEMALE 151 | Adelie,Dream,37.8,18.1,193,3750,MALE 152 | Adelie,Dream,36,17.1,187,3700,FEMALE 153 | Adelie,Dream,41.5,18.5,201,4000,MALE 154 | Chinstrap,Dream,46.5,17.9,192,3500,FEMALE 155 | Chinstrap,Dream,50,19.5,196,3900,MALE 156 | Chinstrap,Dream,51.3,19.2,193,3650,MALE 157 | Chinstrap,Dream,45.4,18.7,188,3525,FEMALE 158 | Chinstrap,Dream,52.7,19.8,197,3725,MALE 159 | Chinstrap,Dream,45.2,17.8,198,3950,FEMALE 160 | Chinstrap,Dream,46.1,18.2,178,3250,FEMALE 161 | Chinstrap,Dream,51.3,18.2,197,3750,MALE 162 | Chinstrap,Dream,46,18.9,195,4150,FEMALE 163 | Chinstrap,Dream,51.3,19.9,198,3700,MALE 164 | Chinstrap,Dream,46.6,17.8,193,3800,FEMALE 165 | Chinstrap,Dream,51.7,20.3,194,3775,MALE 166 | Chinstrap,Dream,47,17.3,185,3700,FEMALE 167 | Chinstrap,Dream,52,18.1,201,4050,MALE 168 | Chinstrap,Dream,45.9,17.1,190,3575,FEMALE 169 | Chinstrap,Dream,50.5,19.6,201,4050,MALE 170 | Chinstrap,Dream,50.3,20,197,3300,MALE 171 | Chinstrap,Dream,58,17.8,181,3700,FEMALE 172 | Chinstrap,Dream,46.4,18.6,190,3450,FEMALE 173 | Chinstrap,Dream,49.2,18.2,195,4400,MALE 174 | Chinstrap,Dream,42.4,17.3,181,3600,FEMALE 175 | Chinstrap,Dream,48.5,17.5,191,3400,MALE 176 | Chinstrap,Dream,43.2,16.6,187,2900,FEMALE 177 | Chinstrap,Dream,50.6,19.4,193,3800,MALE 178 | Chinstrap,Dream,46.7,17.9,195,3300,FEMALE 179 | Chinstrap,Dream,52,19,197,4150,MALE 180 | Chinstrap,Dream,50.5,18.4,200,3400,FEMALE 181 | Chinstrap,Dream,49.5,19,200,3800,MALE 182 | Chinstrap,Dream,46.4,17.8,191,3700,FEMALE 183 | Chinstrap,Dream,52.8,20,205,4550,MALE 184 | Chinstrap,Dream,40.9,16.6,187,3200,FEMALE 185 | Chinstrap,Dream,54.2,20.8,201,4300,MALE 186 | Chinstrap,Dream,42.5,16.7,187,3350,FEMALE 187 | Chinstrap,Dream,51,18.8,203,4100,MALE 188 | Chinstrap,Dream,49.7,18.6,195,3600,MALE 189 | Chinstrap,Dream,47.5,16.8,199,3900,FEMALE 190 | Chinstrap,Dream,47.6,18.3,195,3850,FEMALE 191 | Chinstrap,Dream,52,20.7,210,4800,MALE 192 | Chinstrap,Dream,46.9,16.6,192,2700,FEMALE 193 | Chinstrap,Dream,53.5,19.9,205,4500,MALE 194 | Chinstrap,Dream,49,19.5,210,3950,MALE 195 | Chinstrap,Dream,46.2,17.5,187,3650,FEMALE 196 | Chinstrap,Dream,50.9,19.1,196,3550,MALE 197 | Chinstrap,Dream,45.5,17,196,3500,FEMALE 198 | Chinstrap,Dream,50.9,17.9,196,3675,FEMALE 199 | Chinstrap,Dream,50.8,18.5,201,4450,MALE 200 | Chinstrap,Dream,50.1,17.9,190,3400,FEMALE 201 | Chinstrap,Dream,49,19.6,212,4300,MALE 202 | Chinstrap,Dream,51.5,18.7,187,3250,MALE 203 | Chinstrap,Dream,49.8,17.3,198,3675,FEMALE 204 | Chinstrap,Dream,48.1,16.4,199,3325,FEMALE 205 | Chinstrap,Dream,51.4,19,201,3950,MALE 206 | Chinstrap,Dream,45.7,17.3,193,3600,FEMALE 207 | Chinstrap,Dream,50.7,19.7,203,4050,MALE 208 | Chinstrap,Dream,42.5,17.3,187,3350,FEMALE 209 | Chinstrap,Dream,52.2,18.8,197,3450,MALE 210 | Chinstrap,Dream,45.2,16.6,191,3250,FEMALE 211 | Chinstrap,Dream,49.3,19.9,203,4050,MALE 212 | Chinstrap,Dream,50.2,18.8,202,3800,MALE 213 | Chinstrap,Dream,45.6,19.4,194,3525,FEMALE 214 | Chinstrap,Dream,51.9,19.5,206,3950,MALE 215 | Chinstrap,Dream,46.8,16.5,189,3650,FEMALE 216 | 
Chinstrap,Dream,45.7,17,195,3650,FEMALE 217 | Chinstrap,Dream,55.8,19.8,207,4000,MALE 218 | Chinstrap,Dream,43.5,18.1,202,3400,FEMALE 219 | Chinstrap,Dream,49.6,18.2,193,3775,MALE 220 | Chinstrap,Dream,50.8,19,210,4100,MALE 221 | Chinstrap,Dream,50.2,18.7,198,3775,FEMALE 222 | Gentoo,Biscoe,46.1,13.2,211,4500,FEMALE 223 | Gentoo,Biscoe,50,16.3,230,5700,MALE 224 | Gentoo,Biscoe,48.7,14.1,210,4450,FEMALE 225 | Gentoo,Biscoe,50,15.2,218,5700,MALE 226 | Gentoo,Biscoe,47.6,14.5,215,5400,MALE 227 | Gentoo,Biscoe,46.5,13.5,210,4550,FEMALE 228 | Gentoo,Biscoe,45.4,14.6,211,4800,FEMALE 229 | Gentoo,Biscoe,46.7,15.3,219,5200,MALE 230 | Gentoo,Biscoe,43.3,13.4,209,4400,FEMALE 231 | Gentoo,Biscoe,46.8,15.4,215,5150,MALE 232 | Gentoo,Biscoe,40.9,13.7,214,4650,FEMALE 233 | Gentoo,Biscoe,49,16.1,216,5550,MALE 234 | Gentoo,Biscoe,45.5,13.7,214,4650,FEMALE 235 | Gentoo,Biscoe,48.4,14.6,213,5850,MALE 236 | Gentoo,Biscoe,45.8,14.6,210,4200,FEMALE 237 | Gentoo,Biscoe,49.3,15.7,217,5850,MALE 238 | Gentoo,Biscoe,42,13.5,210,4150,FEMALE 239 | Gentoo,Biscoe,49.2,15.2,221,6300,MALE 240 | Gentoo,Biscoe,46.2,14.5,209,4800,FEMALE 241 | Gentoo,Biscoe,48.7,15.1,222,5350,MALE 242 | Gentoo,Biscoe,50.2,14.3,218,5700,MALE 243 | Gentoo,Biscoe,45.1,14.5,215,5000,FEMALE 244 | Gentoo,Biscoe,46.5,14.5,213,4400,FEMALE 245 | Gentoo,Biscoe,46.3,15.8,215,5050,MALE 246 | Gentoo,Biscoe,42.9,13.1,215,5000,FEMALE 247 | Gentoo,Biscoe,46.1,15.1,215,5100,MALE 248 | Gentoo,Biscoe,44.5,14.3,216,4100,NA 249 | Gentoo,Biscoe,47.8,15,215,5650,MALE 250 | Gentoo,Biscoe,48.2,14.3,210,4600,FEMALE 251 | Gentoo,Biscoe,50,15.3,220,5550,MALE 252 | Gentoo,Biscoe,47.3,15.3,222,5250,MALE 253 | Gentoo,Biscoe,42.8,14.2,209,4700,FEMALE 254 | Gentoo,Biscoe,45.1,14.5,207,5050,FEMALE 255 | Gentoo,Biscoe,59.6,17,230,6050,MALE 256 | Gentoo,Biscoe,49.1,14.8,220,5150,FEMALE 257 | Gentoo,Biscoe,48.4,16.3,220,5400,MALE 258 | Gentoo,Biscoe,42.6,13.7,213,4950,FEMALE 259 | Gentoo,Biscoe,44.4,17.3,219,5250,MALE 260 | Gentoo,Biscoe,44,13.6,208,4350,FEMALE 261 | Gentoo,Biscoe,48.7,15.7,208,5350,MALE 262 | Gentoo,Biscoe,42.7,13.7,208,3950,FEMALE 263 | Gentoo,Biscoe,49.6,16,225,5700,MALE 264 | Gentoo,Biscoe,45.3,13.7,210,4300,FEMALE 265 | Gentoo,Biscoe,49.6,15,216,4750,MALE 266 | Gentoo,Biscoe,50.5,15.9,222,5550,MALE 267 | Gentoo,Biscoe,43.6,13.9,217,4900,FEMALE 268 | Gentoo,Biscoe,45.5,13.9,210,4200,FEMALE 269 | Gentoo,Biscoe,50.5,15.9,225,5400,MALE 270 | Gentoo,Biscoe,44.9,13.3,213,5100,FEMALE 271 | Gentoo,Biscoe,45.2,15.8,215,5300,MALE 272 | Gentoo,Biscoe,46.6,14.2,210,4850,FEMALE 273 | Gentoo,Biscoe,48.5,14.1,220,5300,MALE 274 | Gentoo,Biscoe,45.1,14.4,210,4400,FEMALE 275 | Gentoo,Biscoe,50.1,15,225,5000,MALE 276 | Gentoo,Biscoe,46.5,14.4,217,4900,FEMALE 277 | Gentoo,Biscoe,45,15.4,220,5050,MALE 278 | Gentoo,Biscoe,43.8,13.9,208,4300,FEMALE 279 | Gentoo,Biscoe,45.5,15,220,5000,MALE 280 | Gentoo,Biscoe,43.2,14.5,208,4450,FEMALE 281 | Gentoo,Biscoe,50.4,15.3,224,5550,MALE 282 | Gentoo,Biscoe,45.3,13.8,208,4200,FEMALE 283 | Gentoo,Biscoe,46.2,14.9,221,5300,MALE 284 | Gentoo,Biscoe,45.7,13.9,214,4400,FEMALE 285 | Gentoo,Biscoe,54.3,15.7,231,5650,MALE 286 | Gentoo,Biscoe,45.8,14.2,219,4700,FEMALE 287 | Gentoo,Biscoe,49.8,16.8,230,5700,MALE 288 | Gentoo,Biscoe,46.2,14.4,214,4650,NA 289 | Gentoo,Biscoe,49.5,16.2,229,5800,MALE 290 | Gentoo,Biscoe,43.5,14.2,220,4700,FEMALE 291 | Gentoo,Biscoe,50.7,15,223,5550,MALE 292 | Gentoo,Biscoe,47.7,15,216,4750,FEMALE 293 | Gentoo,Biscoe,46.4,15.6,221,5000,MALE 294 | Gentoo,Biscoe,48.2,15.6,221,5100,MALE 295 | 
Gentoo,Biscoe,46.5,14.8,217,5200,FEMALE 296 | Gentoo,Biscoe,46.4,15,216,4700,FEMALE 297 | Gentoo,Biscoe,48.6,16,230,5800,MALE 298 | Gentoo,Biscoe,47.5,14.2,209,4600,FEMALE 299 | Gentoo,Biscoe,51.1,16.3,220,6000,MALE 300 | Gentoo,Biscoe,45.2,13.8,215,4750,FEMALE 301 | Gentoo,Biscoe,45.2,16.4,223,5950,MALE 302 | Gentoo,Biscoe,49.1,14.5,212,4625,FEMALE 303 | Gentoo,Biscoe,52.5,15.6,221,5450,MALE 304 | Gentoo,Biscoe,47.4,14.6,212,4725,FEMALE 305 | Gentoo,Biscoe,50,15.9,224,5350,MALE 306 | Gentoo,Biscoe,44.9,13.8,212,4750,FEMALE 307 | Gentoo,Biscoe,50.8,17.3,228,5600,MALE 308 | Gentoo,Biscoe,43.4,14.4,218,4600,FEMALE 309 | Gentoo,Biscoe,51.3,14.2,218,5300,MALE 310 | Gentoo,Biscoe,47.5,14,212,4875,FEMALE 311 | Gentoo,Biscoe,52.1,17,230,5550,MALE 312 | Gentoo,Biscoe,47.5,15,218,4950,FEMALE 313 | Gentoo,Biscoe,52.2,17.1,228,5400,MALE 314 | Gentoo,Biscoe,45.5,14.5,212,4750,FEMALE 315 | Gentoo,Biscoe,49.5,16.1,224,5650,MALE 316 | Gentoo,Biscoe,44.5,14.7,214,4850,FEMALE 317 | Gentoo,Biscoe,50.8,15.7,226,5200,MALE 318 | Gentoo,Biscoe,49.4,15.8,216,4925,MALE 319 | Gentoo,Biscoe,46.9,14.6,222,4875,FEMALE 320 | Gentoo,Biscoe,48.4,14.4,203,4625,FEMALE 321 | Gentoo,Biscoe,51.1,16.5,225,5250,MALE 322 | Gentoo,Biscoe,48.5,15,219,4850,FEMALE 323 | Gentoo,Biscoe,55.9,17,228,5600,MALE 324 | Gentoo,Biscoe,47.2,15.5,215,4975,FEMALE 325 | Gentoo,Biscoe,49.1,15,228,5500,MALE 326 | Gentoo,Biscoe,47.3,13.8,216,4725,NA 327 | Gentoo,Biscoe,46.8,16.1,215,5500,MALE 328 | Gentoo,Biscoe,41.7,14.7,210,4700,FEMALE 329 | Gentoo,Biscoe,53.4,15.8,219,5500,MALE 330 | Gentoo,Biscoe,43.3,14,208,4575,FEMALE 331 | Gentoo,Biscoe,48.1,15.1,209,5500,MALE 332 | Gentoo,Biscoe,50.5,15.2,216,5000,FEMALE 333 | Gentoo,Biscoe,49.8,15.9,229,5950,MALE 334 | Gentoo,Biscoe,43.5,15.2,213,4650,FEMALE 335 | Gentoo,Biscoe,51.5,16.3,230,5500,MALE 336 | Gentoo,Biscoe,46.2,14.1,217,4375,FEMALE 337 | Gentoo,Biscoe,55.1,16,230,5850,MALE 338 | Gentoo,Biscoe,44.5,15.7,217,4875,. 
339 | Gentoo,Biscoe,48.8,16.2,222,6000,MALE 340 | Gentoo,Biscoe,47.2,13.7,214,4925,FEMALE 341 | Gentoo,Biscoe,NA,NA,NA,NA,NA 342 | Gentoo,Biscoe,46.8,14.3,215,4850,FEMALE 343 | Gentoo,Biscoe,50.4,15.7,222,5750,MALE 344 | Gentoo,Biscoe,45.2,14.8,212,5200,FEMALE 345 | Gentoo,Biscoe,49.9,16.1,213,5400,MALE 346 | -------------------------------------------------------------------------------- /data/penguins_X_test.csv: -------------------------------------------------------------------------------- 1 | ,Dream,Torgersen,Male,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g 2 | 0,1.0,0.0,1.0,1.1137063470051127,1.1966994432156142,-0.34260576201176546,-0.38521456736415055 3 | 1,1.0,0.0,1.0,1.35500938885622,1.0424664404409307,-0.5528986989244922,-0.6928426214811282 4 | 2,1.0,0.0,0.0,0.22274126940102307,-0.29421958360632183,-0.6930939901996434,-1.1850475080682923 5 | 3,0.0,0.0,0.0,0.16705595205076665,-1.7337276095033645,0.778956568189444,0.6607208166335733 6 | 4,0.0,0.0,1.0,1.9118625623587762,-0.7569185919303723,2.1108118353033802,1.7681818114546928 7 | 5,0.0,0.0,0.0,0.5382914010524709,-1.32243960210421,1.4799330245652,0.8145348436920622 8 | 6,1.0,0.0,1.0,-1.2065152092555387,0.6825894339666704,-0.13231282509903866,0.322329957104898 9 | 7,1.0,0.0,1.0,-1.1508298919052837,0.4769454302670937,-0.5528986989244922,-0.5697913998343371 10 | 8,0.0,0.0,1.0,0.6125384908528114,-0.962562595629949,1.4799330245652,1.2759769248675286 11 | 9,0.0,0.0,1.0,-0.48260608370221575,0.7340004348915655,-0.6930939901996434,-0.6313170106577326 12 | 10,0.0,0.0,1.0,-1.0765828021049417,0.4769454302670937,-1.1136798640250969,-0.323688956540755 13 | 11,0.0,0.0,0.0,0.4640443112521303,-1.887960612278048,0.6387612769142929,0.41461837333999124 14 | 12,0.0,1.0,1.0,-1.6519977480575836,1.145288442290719,-0.5528986989244922,-0.016060902423777418 15 | 13,0.0,0.0,1.0,1.0951445745550272,-0.5512745882307937,0.8490542138270196,1.4605537573377152 16 | 14,0.0,0.0,0.0,-0.03712354490017095,-1.6823166085784702,0.4985659856391417,0.10699031922301364 17 | 15,0.0,0.0,0.0,-1.670559520507669,0.3741234284173035,-0.7631916358372189,-0.9389450647747103 18 | 16,0.0,1.0,1.0,-0.2041794969509376,0.21989042564262185,-0.34260576201176546,0.5991952058101778 19 | 17,0.0,0.0,0.0,-0.6496620357529824,0.3741234284173035,-0.9734845727499457,-1.246573118891688 20 | 18,1.0,0.0,1.0,1.8933007899086922,1.8650424552392413,0.007882466176112518,0.10699031922301364 21 | 19,1.0,0.0,1.0,1.484941796006817,1.8136314543143461,0.6387612769142929,0.7222464274569689 22 | 20,0.0,0.0,1.0,1.0208974847546866,-0.49986358730590036,1.970616544028229,1.9527586439248794 23 | 21,0.0,0.0,1.0,0.5011678561523012,-0.962562595629949,1.2696400876524732,1.214451314044133 24 | 22,0.0,0.0,0.0,1.2065152092555387,-1.013973596554844,1.0593471507397463,0.968348870750551 25 | 23,0.0,0.0,0.0,0.816717987803749,-1.4252616039539985,0.1480777574512637,0.5069067895750845 26 | 24,1.0,0.0,1.0,0.9837739398545157,1.4023434469151907,0.1480777574512637,-0.200637734893964 27 | 25,1.0,0.0,1.0,-0.8909650776040896,0.7340004348915655,-0.7631916358372189,0.04546470839961811 28 | 26,0.0,0.0,0.0,-0.129932407150597,-1.9393716132029422,0.5686636312767173,0.2300415408698047 29 | 27,1.0,0.0,0.0,-2.2088509215601397,-0.8597405937801605,-0.9033869271123701,-1.4311499513618744 30 | 28,1.0,0.0,1.0,1.484941796006817,0.4769454302670937,0.007882466176112518,-0.200637734893964 31 | 29,0.0,0.0,0.0,0.09280886225042606,-1.271028601179316,0.9191518594645952,0.7837720382803643 32 | 
30,1.0,0.0,0.0,-0.5754149459526419,-0.29421958360632183,-0.9734845727499457,-1.246573118891688 33 | 31,0.0,0.0,0.0,-1.5035035684569025,1.0424664404409307,-0.8332892814747945,-0.5082657890109417 34 | 32,0.0,1.0,0.0,-1.5035035684569025,-0.29421958360632183,-0.7631916358372189,-1.4311499513618744 35 | 33,0.0,0.0,1.0,-0.6311002633028969,0.8368224367413539,-0.5528986989244922,-0.5082657890109417 36 | 34,0.0,1.0,0.0,-0.9837739398545157,0.9396444385911422,-0.41270340764934105,-0.9389450647747103 37 | 35,0.0,0.0,1.0,0.761032670453494,-1.0653845974797382,0.5686636312767173,1.5836049789845061 38 | 36,1.0,0.0,1.0,1.0208974847546866,0.9396444385911422,-0.06221517946146307,-0.5082657890109417 39 | 37,0.0,1.0,0.0,-0.9466503949543461,0.3227124274924102,-1.3940704465753992,-0.7236054268928259 40 | 38,1.0,0.0,1.0,-0.5382914010524709,0.9396444385911422,-1.3239728009378238,-0.969707870186408 41 | 39,0.0,0.0,1.0,-0.7981562153536635,0.8882334376662472,-1.1837775096626726,-0.8158938431279192 42 | 40,0.0,1.0,0.0,-1.35500938885622,1.0938774413658257,-0.5528986989244922,-0.9389450647747103 43 | 41,1.0,0.0,1.0,1.2436387541557097,1.2995214450654025,0.1480777574512637,-0.200637734893964 44 | 42,0.0,1.0,1.0,-0.35267367655161874,0.9910554395160374,-0.41270340764934105,-0.2621633457173595 45 | 43,1.0,0.0,0.0,0.2969883592013636,1.145288442290719,-0.48280105328691664,-0.846656648539617 46 | 44,1.0,0.0,0.0,-1.3178858439560504,0.7340004348915655,-0.8332892814747945,-0.8774194539513147 47 | 45,0.0,1.0,1.0,0.3341119041015333,0.8882334376662472,-0.27250811637418987,-0.07758651324717294 48 | 46,0.0,0.0,1.0,1.7448066103080095,-0.7055075910054771,1.2696400876524732,1.5836049789845061 49 | 47,1.0,0.0,0.0,-1.2436387541557097,-0.19139758175653346,-0.6229963445620678,-1.49267556218527 50 | 48,1.0,0.0,1.0,1.2622005266057938,0.9396444385911422,0.6387612769142929,-0.13911212407056847 51 | 49,0.0,0.0,0.0,-0.24130304185110724,-1.7851386104282596,0.4985659856391417,-0.323688956540755 52 | 50,0.0,0.0,0.0,0.6867855806531533,-1.1167955984046325,1.0593471507397463,0.6607208166335733 53 | 51,0.0,0.0,0.0,0.9466503949543461,-1.3738506030291042,0.778956568189444,0.5069067895750845 54 | 52,0.0,0.0,1.0,0.5197296286023854,-0.5512745882307937,0.9892495051021708,1.5836049789845061 55 | 53,0.0,0.0,1.0,-0.5382914010524709,0.5283564311919869,-0.6229963445620678,-0.200637734893964 56 | 54,0.0,0.0,0.0,-1.4106947062064763,-0.03716457898184999,-1.1837775096626726,-1.6772523946554565 57 | 55,0.0,0.0,0.0,-0.09280886225042606,-1.5280836058037877,1.3397377332900489,0.5991952058101778 58 | 56,0.0,0.0,0.0,0.5939767184027273,-1.7851386104282596,0.9191518594645952,0.8760604545154577 59 | 57,0.0,0.0,0.0,0.0,-1.8365496113531536,0.4985659856391417,0.16851593004640916 60 | 58,0.0,0.0,1.0,1.3178858439560504,-0.44845258638100527,1.3397377332900489,2.1988610872184613 61 | 59,0.0,0.0,0.0,-0.07424708980034059,-1.6823166085784702,1.129444796377322,0.8452976491037599 62 | 60,0.0,0.0,1.0,-0.37123544900170424,1.1966994432156142,-0.06221517946146307,-0.200637734893964 63 | 61,1.0,0.0,0.0,-0.14849417960068118,-0.29421958360632183,-0.9734845727499457,-1.615726783832061 64 | 62,0.0,1.0,1.0,-0.5382914010524709,0.7340004348915655,-0.8332892814747945,-1.092759091833199 65 | 63,0.0,0.0,1.0,0.7053473531032375,-1.1167955984046325,0.9892495051021708,1.7681818114546928 66 | 64,1.0,0.0,1.0,1.3921329337563908,0.7854114358164588,-0.9734845727499457,-1.1850475080682923 67 | 65,1.0,0.0,0.0,-1.484941796006817,-0.03716457898184999,-0.9734845727499457,-0.6313170106577326 68 | 
66,0.0,0.0,0.0,0.48260608370221575,-1.5280836058037877,0.6387612769142929,0.7837720382803643 69 | 67,1.0,0.0,1.0,1.763368382758095,1.4023434469151907,0.2882730487264149,0.3530927625165957 70 | 68,0.0,1.0,1.0,-0.48260608370221575,0.6825894339666704,0.07798011181368811,-0.41597737277584834 71 | 69,0.0,0.0,0.0,-1.3921329337563908,-0.29421958360632183,-1.3940704465753992,-1.6772523946554565 72 | 70,1.0,0.0,1.0,0.9280886225042606,1.248110444140509,0.778956568189444,0.10699031922301364 73 | 71,0.0,0.0,0.0,-1.577750658257243,-0.49986358730590036,-0.41270340764934105,-1.0619962864215013 74 | 72,1.0,0.0,1.0,1.0580210296548576,0.7340004348915655,-0.41270340764934105,-0.7543682323045237 75 | 73,1.0,0.0,0.0,-0.3341119041015333,0.6825894339666704,-1.464168092212975,-0.8158938431279192 76 | 74,0.0,1.0,1.0,-0.2784265867512782,1.8136314543143461,-0.27250811637418987,0.3530927625165957 77 | 75,0.0,1.0,0.0,-1.3735711613063053,0.3227124274924102,-1.1136798640250969,-0.6313170106577326 78 | 76,1.0,0.0,0.0,-1.8561772450085212,-0.03716457898184999,-1.1136798640250969,-1.0004706755981059 79 | 77,0.0,0.0,1.0,-0.4454825388020448,0.4255344293421986,-0.6229963445620678,-0.323688956540755 80 | 78,0.0,0.0,0.0,-1.614874203157414,0.8882334376662472,-0.9734845727499457,-0.5082657890109417 81 | 79,0.0,0.0,1.0,1.0765828021049417,-0.19139758175653346,2.0407141896658048,1.8297074222780882 82 | 80,0.0,0.0,1.0,1.039459257204772,-1.1167955984046325,1.0593471507397463,0.6607208166335733 83 | 81,0.0,0.0,1.0,1.3178858439560504,-0.3456305845312169,1.6902259614779267,1.2759769248675286 84 | 82,0.0,0.0,0.0,0.4640443112521303,-1.4252616039539985,1.129444796377322,0.8452976491037599 85 | 83,0.0,0.0,0.0,-1.002335712304601,0.014246421943043286,-0.13231282509903866,-0.5697913998343371 86 | -------------------------------------------------------------------------------- /data/penguins_X_train.csv: -------------------------------------------------------------------------------- 1 | ,Dream,Torgersen,Male,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g 2 | 0,1.0,0.0,1.0,1.169391664355368,1.453754447840086,-0.27250811637418987,-1.1235218972448968 3 | 1,0.0,0.0,0.0,-1.1879534368054532,-0.08857557990674508,-1.1136798640250969,-0.7543682323045237 4 | 2,0.0,0.0,0.0,0.3341119041015333,-1.5280836058037877,1.2696400876524732,0.5991952058101778 5 | 3,1.0,0.0,1.0,-0.5382914010524709,0.16847942471772676,-0.7631916358372189,-0.38521456736415055 6 | 4,0.0,0.0,1.0,-0.723909125553323,0.8882334376662472,-0.9033869271123701,0.10699031922301364 7 | 5,0.0,0.0,1.0,0.816717987803749,-1.32243960210421,0.8490542138270196,2.014284254748275 8 | 6,1.0,0.0,0.0,1.0765828021049417,0.06565742286793838,-0.20241047073661425,-0.6620798160694304 9 | 7,0.0,0.0,0.0,-0.816717987803749,1.8136314543143461,-0.6930939901996434,-0.38521456736415055 10 | 8,0.0,0.0,0.0,0.9466503949543461,-1.2196176002544208,1.3397377332900489,1.1529257032207376 11 | 9,0.0,1.0,1.0,-1.169391664355368,1.3509324459902976,-0.20241047073661425,-0.8774194539513147 12 | 10,1.0,0.0,1.0,-0.7795944429035794,0.9910554395160374,-1.1837775096626726,0.5376695949867824 13 | 11,1.0,0.0,0.0,0.6496620357529824,-0.19139758175653346,-0.13231282509903866,-0.38521456736415055 14 | 12,0.0,1.0,0.0,-1.7448066103080095,0.014246421943043286,-0.8332892814747945,-1.246573118891688 15 | 13,1.0,0.0,0.0,0.37123544900170424,0.8882334376662472,-0.41270340764934105,-0.07758651324717294 16 | 14,1.0,0.0,1.0,-0.816717987803749,0.4769454302670937,-1.0435822183875214,0.2915671516932002 17 | 
15,0.0,0.0,0.0,-0.7981562153536635,0.2713014265675151,-0.5528986989244922,-1.246573118891688 18 | 16,1.0,0.0,1.0,1.522065340906988,0.8368224367413539,-0.27250811637418987,-0.9389450647747103 19 | 17,0.0,0.0,0.0,-0.2598648143011927,-1.7851386104282596,0.8490542138270196,0.9068232599271554 20 | 18,1.0,0.0,0.0,-1.2993240715059649,-0.3456305845312169,-1.1136798640250969,-1.0004706755981059 21 | 19,0.0,0.0,0.0,-0.09280886225042606,-1.013973596554844,0.8490542138270196,0.5376695949867824 22 | 20,1.0,0.0,1.0,1.4292564786565618,1.6079874506147693,-0.48280105328691664,-0.5390285944226394 23 | 21,0.0,0.0,0.0,0.6311002633028969,-1.32243960210421,0.778956568189444,0.6299580112218756 24 | 22,0.0,1.0,1.0,-1.2622005266057938,1.145288442290719,-1.1837775096626726,-0.38521456736415055 25 | 23,0.0,0.0,1.0,-1.0765828021049417,1.453754447840086,-0.7631916358372189,-0.38521456736415055 26 | 24,0.0,1.0,1.0,-0.9095268500541751,0.7854114358164588,-1.3940704465753992,-0.5697913998343371 27 | 25,1.0,0.0,0.0,-0.2784265867512782,0.06565742286793838,-0.9734845727499457,-1.0619962864215013 28 | 26,0.0,1.0,1.0,-1.2436387541557097,1.7108094524645578,-0.13231282509903866,-0.5390285944226394 29 | 27,0.0,0.0,1.0,0.9652121674044315,-1.013973596554844,1.4098353789276243,2.5680147521588346 30 | 28,0.0,0.0,0.0,-1.1322681194551982,0.7340004348915655,-2.0249492573135797,-1.3080987297150835 31 | 29,0.0,1.0,1.0,-0.16705595205076665,1.0424664404409307,-0.27250811637418987,-0.8774194539513147 32 | 30,0.0,0.0,0.0,0.4640443112521303,-1.3738506030291042,0.8490542138270196,0.2300415408698047 33 | 31,0.0,0.0,1.0,-0.6311002633028969,0.7340004348915655,-1.253875155300248,-0.8158938431279192 34 | 32,0.0,1.0,1.0,-0.8909650776040896,1.248110444140509,-0.41270340764934105,0.5684324003984801 35 | 33,0.0,0.0,1.0,0.6682238082030679,-1.3738506030291042,0.9892495051021708,1.4605537573377152 36 | 34,0.0,0.0,1.0,-0.5011678561523012,2.0192754580139245,-0.41270340764934105,0.2300415408698047 37 | 35,1.0,0.0,0.0,0.5568531735025564,0.06565742286793838,-1.1136798640250969,-0.6313170106577326 38 | 36,0.0,0.0,1.0,1.2065152092555387,-0.654096590080583,1.6902259614779267,1.4605537573377152 39 | 37,1.0,0.0,1.0,1.614874203157414,1.3509324459902976,-0.27250811637418987,-0.6005542052460349 40 | 38,0.0,0.0,1.0,1.1137063470051127,-0.654096590080583,1.6201283158403512,1.3990281465143197 41 | 39,0.0,0.0,0.0,0.16705595205076665,-1.9907826141278362,0.8490542138270196,1.091400092397342 42 | 40,1.0,0.0,0.0,0.31555013165144913,-0.08857557990674508,-0.41270340764934105,-0.6928426214811282 43 | 41,1.0,0.0,0.0,-0.8352797602538345,0.3227124274924102,-0.9033869271123701,-1.1235218972448968 44 | 42,1.0,0.0,1.0,1.2250769817056242,1.145288442290719,-0.5528986989244922,-0.5082657890109417 45 | 43,0.0,1.0,1.0,-1.7448066103080095,2.0192754580139245,-0.20241047073661425,0.2300415408698047 46 | 44,1.0,0.0,0.0,-0.2784265867512782,-0.24280858268142855,-0.9734845727499457,-1.0619962864215013 47 | 45,1.0,0.0,1.0,-0.5939767184027273,0.8882334376662472,0.4985659856391417,0.10699031922301364 48 | 46,0.0,0.0,0.0,0.6496620357529824,-1.5280836058037877,0.5686636312767173,0.47614398416338677 49 | 47,0.0,1.0,0.0,-1.35500938885622,0.8368224367413539,-0.9734845727499457,-0.5082657890109417 50 | 48,0.0,1.0,0.0,-1.447818251106646,-0.5512745882307937,-0.9734845727499457,-0.8158938431279192 51 | 49,1.0,0.0,0.0,-1.3921329337563908,0.4255344293421986,-1.3239728009378238,-1.3080987297150835 52 | 50,0.0,0.0,1.0,1.1879534368054532,-0.7569185919303723,1.4799330245652,1.8912330331014837 53 | 
51,0.0,0.0,0.0,0.4454825388020448,-1.1167955984046325,1.0593471507397463,0.5991952058101778 54 | 52,0.0,0.0,1.0,0.2784265867512782,-1.1167955984046325,1.3397377332900489,0.968348870750551 55 | 53,0.0,0.0,0.0,0.3341119041015333,-1.32243960210421,0.6387612769142929,-0.016060902423777418 56 | 54,0.0,0.0,0.0,-1.0951445745550272,-0.3456305845312169,-0.20241047073661425,-0.47750298359924387 57 | 55,0.0,0.0,0.0,0.4640443112521303,-1.2196176002544208,1.129444796377322,1.214451314044133 58 | 56,0.0,0.0,0.0,0.2784265867512782,-1.6823166085784702,0.6387612769142929,-0.016060902423777418 59 | 57,1.0,0.0,0.0,-1.3735711613063053,0.6311784330417753,-1.1837775096626726,-0.9081822593630126 60 | 58,1.0,0.0,0.0,-2.023233197059288,-0.5512745882307937,-1.6043633834881261,-1.615726783832061 61 | 59,0.0,0.0,1.0,0.8352797602538345,-1.579494606728682,1.3397377332900489,1.337502535690924 62 | 60,1.0,0.0,0.0,-1.5406271133570721,0.4255344293421986,0.07798011181368811,-0.8158938431279192 63 | 61,0.0,0.0,1.0,0.22274126940102307,-0.397041585456112,1.5500306702027755,2.137335476395066 64 | 62,0.0,0.0,0.0,-0.37123544900170424,-1.887960612278048,0.6387612769142929,-0.07758651324717294 65 | 63,0.0,0.0,1.0,0.2969883592013636,1.6079874506147693,-0.6930939901996434,0.47614398416338677 66 | 64,1.0,0.0,1.0,-0.4640443112521303,0.6825894339666704,0.007882466176112518,-0.2621633457173595 67 | 65,0.0,0.0,1.0,1.3921329337563908,-0.44845258638100527,2.0407141896658048,1.5836049789845061 68 | 66,1.0,0.0,0.0,0.4454825388020448,0.3227124274924102,-0.6930939901996434,-0.6313170106577326 69 | 67,0.0,0.0,1.0,0.9280886225042606,-0.5512745882307937,1.0593471507397463,1.6451305898079018 70 | 68,0.0,0.0,1.0,0.8909650776040896,-0.49986358730590036,1.4799330245652,2.1988610872184613 71 | 69,1.0,0.0,0.0,0.2598648143011927,0.7854114358164588,-0.9033869271123701,-0.846656648539617 72 | 70,0.0,0.0,1.0,-0.5568531735025564,1.453754447840086,0.1480777574512637,0.6299580112218756 73 | 71,0.0,0.0,1.0,0.22274126940102307,-0.7055075910054771,0.9892495051021708,1.337502535690924 74 | 72,1.0,0.0,1.0,-0.5754149459526419,0.8882334376662472,-1.1837775096626726,-0.38521456736415055 75 | 73,1.0,0.0,1.0,1.1508298919052837,0.8368224367413539,0.07798011181368811,-0.5082657890109417 76 | 74,0.0,0.0,0.0,0.8352797602538345,-1.1167955984046325,1.2696400876524732,0.7837720382803643 77 | 75,0.0,0.0,1.0,-0.24130304185110724,0.5797674321168821,-0.34260576201176546,-0.16987492948226623 78 | 76,1.0,0.0,0.0,-1.484941796006817,0.3741234284173035,-0.7631916358372189,-0.9389450647747103 79 | 77,1.0,0.0,1.0,1.484941796006817,0.9396444385911422,-0.27250811637418987,-0.07758651324717294 80 | 78,0.0,0.0,1.0,0.42692076635195936,-0.7055075910054771,0.9892495051021708,1.0298744815739465 81 | 79,0.0,0.0,1.0,1.1879534368054532,-0.962562595629949,1.6201283158403512,1.6451305898079018 82 | 80,1.0,0.0,1.0,1.039459257204772,0.5283564311919869,-0.5528986989244922,-0.5390285944226394 83 | 81,0.0,1.0,0.0,-1.002335712304601,-0.08857557990674508,-0.9033869271123701,-1.615726783832061 84 | 82,1.0,0.0,1.0,1.35500938885622,0.5283564311919869,-0.27250811637418987,-0.5697913998343371 85 | 83,1.0,0.0,0.0,-1.3364476164061359,0.6825894339666704,-0.5528986989244922,-0.8774194539513147 86 | 84,1.0,0.0,1.0,1.633435975607498,1.453754447840086,0.2882730487264149,0.41461837333999124 87 | 85,1.0,0.0,0.0,2.5986481430119297,0.3227124274924102,-1.3940704465753992,-0.6313170106577326 88 | 86,1.0,0.0,0.0,-0.8352797602538345,-0.24280858268142855,-1.6043633834881261,-1.1850475080682923 89 | 
87,1.0,0.0,0.0,0.48260608370221575,0.3227124274924102,-0.5528986989244922,-0.5082657890109417 90 | 88,0.0,0.0,0.0,0.2041794969509376,-1.4252616039539985,0.6387612769142929,0.2300415408698047 91 | 89,1.0,0.0,0.0,1.2807622990558793,0.3741234284173035,-0.34260576201176546,-0.6620798160694304 92 | 90,1.0,0.0,1.0,-0.6867855806531533,0.6825894339666704,-0.34260576201176546,0.16851593004640916 93 | 91,0.0,1.0,0.0,-0.8352797602538345,0.11706842379283165,-1.0435822183875214,-0.5082657890109417 94 | 92,1.0,0.0,0.0,-1.447818251106646,0.06565742286793838,-0.9734845727499457,-1.1235218972448968 95 | 93,1.0,0.0,0.0,1.1508298919052837,0.7854114358164588,-0.20241047073661425,-0.5390285944226394 96 | 94,1.0,0.0,1.0,1.3735711613063053,0.9396444385911422,0.007882466176112518,-0.323688956540755 97 | 95,1.0,0.0,1.0,-0.31555013165144913,2.070686458938818,-0.6930939901996434,-0.07758651324717294 98 | 96,1.0,0.0,1.0,0.9280886225042606,1.1966994432156142,0.6387612769142929,-0.323688956540755 99 | 97,0.0,0.0,0.0,0.24130304185110724,-1.7337276095033645,0.4985659856391417,-0.016060902423777418 100 | 98,1.0,0.0,1.0,1.35500938885622,1.4023434469151907,-0.20241047073661425,-0.6313170106577326 101 | 99,0.0,0.0,0.0,-1.0951445745550272,-0.08857557990674508,-1.3940704465753992,-1.2773359243033857 102 | 100,1.0,0.0,1.0,2.1902891491100545,1.3509324459902976,0.4284683400015661,-0.2621633457173595 103 | 101,0.0,1.0,1.0,-0.7981562153536635,0.6311784330417753,-0.7631916358372189,-0.38521456736415055 104 | 102,0.0,0.0,1.0,0.3897972214517897,-1.0653845974797382,0.9892495051021708,1.091400092397342 105 | 103,0.0,0.0,0.0,-0.22274126940102307,-1.5280836058037877,0.5686636312767173,0.5991952058101778 106 | 104,0.0,0.0,0.0,0.2598648143011927,-1.32243960210421,0.7088589225518684,0.7222464274569689 107 | 105,1.0,0.0,1.0,1.2622005266057938,0.6825894339666704,0.007882466176112518,0.2915671516932002 108 | 106,1.0,0.0,0.0,-1.1879534368054532,1.0938774413658257,-1.3940704465753992,-1.1235218972448968 109 | 107,1.0,0.0,1.0,0.9652121674044315,0.5283564311919869,-0.41270340764934105,0.2300415408698047 110 | 108,0.0,1.0,0.0,-0.816717987803749,0.014246421943043286,-0.34260576201176546,-0.8158938431279192 111 | 109,0.0,1.0,1.0,-0.6311002633028969,0.9396444385911422,-0.13231282509903866,-0.2621633457173595 112 | 110,0.0,0.0,1.0,1.522065340906988,-0.03716457898184999,1.9005188983906536,1.4605537573377152 113 | 111,1.0,0.0,0.0,0.6682238082030679,0.5797674321168821,-0.41270340764934105,-0.4467401781875461 114 | 112,0.0,0.0,1.0,0.4454825388020448,-0.8083295928552664,1.4098353789276243,0.968348870750551 115 | 113,0.0,0.0,1.0,1.5035035684569025,-0.08857557990674508,2.0407141896658048,1.6451305898079018 116 | 114,0.0,0.0,1.0,0.07424708980034059,0.06565742286793838,1.2696400876524732,1.2759769248675286 117 | 115,1.0,0.0,1.0,-1.0580210296548576,1.0424664404409307,-0.8332892814747945,-0.323688956540755 118 | 116,0.0,0.0,1.0,2.8956365022132933,-0.08857557990674508,2.0407141896658048,2.2603866980418568 119 | 117,0.0,1.0,0.0,-1.5406271133570721,-0.08857557990674508,-0.8332892814747945,-1.0619962864215013 120 | 118,0.0,0.0,0.0,-1.5406271133570721,-0.13998658083164017,-1.1136798640250969,-1.3080987297150835 121 | 119,0.0,0.0,1.0,1.0765828021049417,-0.654096590080583,1.970616544028229,2.137335476395066 122 | 120,1.0,0.0,0.0,-1.2436387541557097,0.3227124274924102,-0.6930939901996434,-1.0619962864215013 123 | 121,1.0,0.0,0.0,0.22274126940102307,0.3227124274924102,-0.20241047073661425,-0.323688956540755 124 | 
122,1.0,0.0,0.0,0.5197296286023854,-0.3456305845312169,-0.8332892814747945,-0.6928426214811282 125 | 123,0.0,0.0,1.0,-0.6496620357529824,0.8882334376662472,-1.464168092212975,-0.323688956540755 126 | 124,1.0,0.0,0.0,-1.484941796006817,0.6825894339666704,-1.0435822183875214,-1.369624340538479 127 | 125,0.0,0.0,0.0,0.7795944429035794,-1.4766726048788925,0.6387612769142929,0.47614398416338677 128 | 126,0.0,1.0,1.0,0.37123544900170424,2.2249194617135015,-0.48280105328691664,-0.016060902423777418 129 | 127,0.0,0.0,0.0,-1.670559520507669,0.3741234284173035,-0.6229963445620678,-0.6005542052460349 130 | 128,1.0,0.0,1.0,0.8352797602538345,0.16847942471772676,-0.6930939901996434,-1.0004706755981059 131 | 129,0.0,1.0,0.0,-1.0208974847546866,0.3741234284173035,-0.7631916358372189,-1.092759091833199 132 | 130,1.0,0.0,1.0,-0.7981562153536635,0.3741234284173035,-0.5528986989244922,0.04546470839961811 133 | 131,1.0,0.0,0.0,-1.5591888858071574,0.16847942471772676,-0.6930939901996434,-1.2773359243033857 134 | 132,0.0,1.0,0.0,-0.9280886225042606,-0.03716457898184999,-0.6930939901996434,-1.4311499513618744 135 | 133,1.0,0.0,0.0,0.5011678561523012,0.3741234284173035,-0.41270340764934105,-1.1235218972448968 136 | 134,0.0,1.0,0.0,-0.5754149459526419,-0.19139758175653346,-0.6930939901996434,-0.6313170106577326 137 | 135,0.0,0.0,1.0,0.5197296286023854,-0.9111515947050548,0.9892495051021708,1.1529257032207376 138 | 136,0.0,1.0,1.0,-0.4640443112521303,0.5797674321168821,-0.41270340764934105,0.10699031922301364 139 | 137,0.0,0.0,0.0,0.6496620357529824,-1.6309056076535762,0.778956568189444,0.8145348436920622 140 | 138,1.0,0.0,0.0,0.35267367655161874,-0.03716457898184999,-0.7631916358372189,-0.7851310377162215 141 | 139,1.0,0.0,1.0,-0.7053473531032375,1.505165448764981,-0.06221517946146307,-0.2929261511290573 142 | 140,0.0,0.0,0.0,0.2784265867512782,-1.3738506030291042,0.778956568189444,0.6607208166335733 143 | 141,1.0,0.0,1.0,-0.5011678561523012,1.6079874506147693,-0.48280105328691664,-0.8158938431279192 144 | 142,1.0,0.0,1.0,-0.14849417960068118,0.6825894339666704,-0.6229963445620678,-0.13911212407056847 145 | 143,0.0,0.0,0.0,-1.763368382758095,0.4769454302670937,-0.9734845727499457,-1.615726783832061 146 | 144,0.0,1.0,0.0,-1.577750658257243,0.16847942471772676,-0.7631916358372189,-0.6313170106577326 147 | 145,1.0,0.0,0.0,0.3897972214517897,0.5283564311919869,-1.6043633834881261,-1.1850475080682923 148 | 146,0.0,0.0,0.0,0.4083589939018752,-1.579494606728682,1.129444796377322,0.19927873545810693 149 | 147,1.0,0.0,1.0,0.018561772450085477,1.2995214450654025,-0.34260576201176546,0.2300415408698047 150 | 148,1.0,0.0,1.0,1.2065152092555387,1.248110444140509,0.007882466176112518,-0.200637734893964 151 | 149,1.0,0.0,0.0,-0.7053473531032375,-0.03716457898184999,-0.5528986989244922,-1.0004706755981059 152 | 150,1.0,0.0,1.0,-0.9652121674044315,1.453754447840086,-0.7631916358372189,-0.323688956540755 153 | 151,1.0,0.0,0.0,1.1322681194551982,0.3741234284173035,-0.7631916358372189,-1.0004706755981059 154 | 152,1.0,0.0,0.0,0.761032670453494,-0.397041585456112,-0.13231282509903866,-1.092759091833199 155 | 153,0.0,1.0,0.0,-0.9652121674044315,0.21989042564262185,-0.6930939901996434,-1.1542847026565946 156 | 154,0.0,0.0,1.0,1.2622005266057938,-0.7569185919303723,1.7603236071155024,1.214451314044133 157 | 155,1.0,0.0,0.0,0.5382914010524709,-0.29421958360632183,-0.6229963445620678,-1.8618292271256431 158 | 156,0.0,1.0,1.0,-0.4083589939018752,1.145288442290719,-0.20241047073661425,0.2915671516932002 159 | 
157,1.0,0.0,1.0,1.2807622990558793,0.9910554395160374,-0.34260576201176546,-0.8158938431279192 160 | 158,0.0,0.0,1.0,-0.9652121674044315,0.014246421943043286,-1.464168092212975,-0.5082657890109417 161 | 159,1.0,0.0,1.0,-0.816717987803749,0.8368224367413539,-0.7631916358372189,0.47614398416338677 162 | 160,0.0,1.0,0.0,-0.6867855806531533,0.4255344293421986,-0.41270340764934105,-1.1850475080682923 163 | 161,0.0,0.0,0.0,0.2041794969509376,-1.3738506030291042,0.9892495051021708,0.968348870750551 164 | 162,0.0,0.0,1.0,1.2065152092555387,-0.654096590080583,1.4799330245652,1.6451305898079018 165 | 163,0.0,1.0,1.0,-0.22274126940102307,0.6825894339666704,-0.41270340764934105,0.04546470839961811 166 | 164,0.0,0.0,1.0,1.039459257204772,-0.6026855891556887,1.6902259614779267,1.8297074222780882 167 | 165,0.0,0.0,1.0,1.0208974847546866,-0.5512745882307937,1.6201283158403512,1.7681818114546928 168 | 166,0.0,0.0,0.0,-0.11137063470051153,-1.4252616039539985,1.1995424420148977,0.47614398416338677 169 | 167,0.0,0.0,1.0,1.1508298919052837,-1.4766726048788925,1.1995424420148977,1.8297074222780882 170 | 168,1.0,0.0,1.0,-1.4292564786565618,1.1966994432156142,-0.7631916358372189,-0.5082657890109417 171 | 169,0.0,0.0,0.0,0.22274126940102307,-1.2196176002544208,0.778956568189444,1.214451314044133 172 | 170,1.0,0.0,0.0,-1.4106947062064763,-0.08857557990674508,-0.41270340764934105,-1.092759091833199 173 | 171,1.0,0.0,0.0,0.2784265867512782,-0.08857557990674508,-0.34260576201176546,-0.8774194539513147 174 | 172,0.0,1.0,0.0,-1.633435975607498,-0.654096590080583,-1.0435822183875214,-1.4311499513618744 175 | 173,1.0,0.0,0.0,-0.09280886225042606,0.4769454302670937,0.07798011181368811,-1.0004706755981059 176 | 174,1.0,0.0,1.0,1.2993240715059649,0.8368224367413539,0.1480777574512637,-0.13911212407056847 177 | 175,1.0,0.0,0.0,-1.484941796006817,0.3227124274924102,-0.41270340764934105,-0.9389450647747103 178 | 176,0.0,0.0,1.0,1.2436387541557097,-1.1167955984046325,1.5500306702027755,1.6451305898079018 179 | 177,0.0,0.0,1.0,-1.1879534368054532,0.9910554395160374,-0.48280105328691664,-0.5697913998343371 180 | 178,0.0,0.0,1.0,-1.1508298919052837,1.453754447840086,-0.7631916358372189,0.04546470839961811 181 | 179,0.0,0.0,0.0,-0.816717987803749,0.2713014265675151,-1.0435822183875214,-0.8774194539513147 182 | 180,1.0,0.0,0.0,0.31555013165144913,0.06565742286793838,-0.5528986989244922,-0.7543682323045237 183 | 181,0.0,0.0,1.0,1.1137063470051127,-0.44845258638100527,2.0407141896658048,1.8297074222780882 184 | 182,0.0,0.0,0.0,-0.42692076635195936,-1.271028601179316,0.6387612769142929,0.5991952058101778 185 | 183,1.0,0.0,0.0,-0.2969883592013636,0.06565742286793838,-1.3940704465753992,-0.7543682323045237 186 | 184,0.0,0.0,0.0,0.4083589939018752,-1.3738506030291042,0.5686636312767173,0.7222464274569689 187 | 185,0.0,1.0,1.0,-1.002335712304601,2.070686458938818,-0.6930939901996434,-0.5082657890109417 188 | 186,1.0,0.0,0.0,1.2065152092555387,0.6311784330417753,-0.06221517946146307,-1.0004706755981059 189 | 187,1.0,0.0,0.0,-1.0951445745550272,0.7340004348915655,-0.7631916358372189,-0.6313170106577326 190 | 188,0.0,0.0,1.0,1.35500938885622,-1.5280836058037877,1.1995424420148977,1.337502535690924 191 | 189,0.0,0.0,0.0,-1.169391664355368,-0.6026855891556887,-1.253875155300248,-1.4003871459501767 192 | 190,0.0,1.0,1.0,-0.8724033051540054,1.7622204533894528,-0.7631916358372189,-0.6928426214811282 193 | 191,0.0,0.0,1.0,0.18561772450085212,-0.9111515947050548,1.3397377332900489,1.0298744815739465 194 | 
192,1.0,0.0,1.0,-0.8909650776040896,2.0192754580139245,-0.34260576201176546,-0.07758651324717294 195 | 193,1.0,0.0,1.0,-0.6125384908528114,-0.08857557990674508,-0.7631916358372189,-0.6005542052460349 196 | 194,1.0,0.0,1.0,-0.5382914010524709,0.4769454302670937,0.2882730487264149,0.10699031922301364 197 | 195,0.0,0.0,0.0,0.24130304185110724,-1.7851386104282596,0.6387612769142929,0.10699031922301364 198 | 196,0.0,0.0,1.0,0.4083589939018752,-1.1682065993295265,1.4098353789276243,1.337502535690924 199 | 197,0.0,0.0,1.0,1.577750658257243,-0.8083295928552664,1.4098353789276243,1.5220793681611107 200 | 198,0.0,0.0,1.0,-1.169391664355368,0.7854114358164588,-1.464168092212975,-0.7543682323045237 201 | 199,0.0,0.0,1.0,0.816717987803749,-0.44845258638100527,1.3397377332900489,1.4605537573377152 202 | 200,0.0,0.0,0.0,-0.129932407150597,-1.6309056076535762,0.4985659856391417,0.44538117875168903 203 | 201,1.0,0.0,0.0,-1.0951445745550272,0.21989042564262185,-0.9734845727499457,-0.969707870186408 204 | 202,0.0,0.0,0.0,-1.1508298919052837,0.5797674321168821,-1.8847539660384285,-1.0004706755981059 205 | 203,0.0,0.0,0.0,0.3897972214517897,-2.0421936150527316,0.7088589225518684,0.3530927625165957 206 | 204,0.0,0.0,0.0,-0.14849417960068118,-1.3738506030291042,0.4985659856391417,0.2915671516932002 207 | 205,0.0,0.0,0.0,-0.2041794969509376,-2.0936046159776254,0.9892495051021708,0.968348870750551 208 | 206,0.0,0.0,0.0,0.6496620357529824,-1.1167955984046325,1.1995424420148977,0.9068232599271554 209 | 207,0.0,0.0,0.0,0.31555013165144913,-1.6823166085784702,0.9191518594645952,0.2300415408698047 210 | 208,0.0,0.0,1.0,1.1137063470051127,-0.962562595629949,1.3397377332900489,1.6451305898079018 211 | 209,0.0,0.0,0.0,0.5939767184027273,-0.8597405937801605,0.9892495051021708,0.9375860653388532 212 | 210,0.0,1.0,0.0,-1.7819301552081805,0.6311784330417753,-1.1837775096626726,-1.092759091833199 213 | 211,0.0,0.0,1.0,-0.5382914010524709,0.9910554395160374,-0.9033869271123701,-0.13911212407056847 214 | 212,0.0,0.0,1.0,0.8724033051540054,-1.0653845974797382,1.4799330245652,1.3990281465143197 215 | 213,1.0,0.0,1.0,-0.6311002633028969,0.014246421943043286,-0.9734845727499457,-0.9081822593630126 216 | 214,1.0,0.0,1.0,1.4663800235567315,1.1966994432156142,0.3583706943639905,-0.323688956540755 217 | 215,0.0,0.0,1.0,1.1322681194551982,-1.1167955984046325,1.6902259614779267,0.968348870750551 218 | 216,0.0,0.0,0.0,-0.5754149459526419,-1.7851386104282596,0.9191518594645952,0.5376695949867824 219 | 217,0.0,0.0,1.0,-0.14849417960068118,0.9396444385911422,-0.27250811637418987,0.6914836220452711 220 | 218,0.0,0.0,1.0,-0.3341119041015333,1.1966994432156142,-0.27250811637418987,0.07622751381131587 221 | 219,0.0,0.0,0.0,-0.9280886225042606,0.16847942471772676,-1.0435822183875214,-0.8158938431279192 222 | 220,0.0,0.0,0.0,0.2041794969509376,-1.3738506030291042,0.4284683400015661,1.0298744815739465 223 | 221,0.0,0.0,1.0,0.7795944429035794,-0.8083295928552664,1.4098353789276243,1.091400092397342 224 | 222,0.0,0.0,0.0,0.5197296286023854,-1.4766726048788925,0.9892495051021708,0.7837720382803643 225 | 223,0.0,0.0,0.0,0.2784265867512782,-1.7851386104282596,0.9191518594645952,0.5376695949867824 226 | 224,0.0,1.0,0.0,-1.9489861072589472,0.9396444385911422,-0.7631916358372189,-0.7543682323045237 227 | 225,0.0,1.0,0.0,-0.5382914010524709,0.21989042564262185,-1.3239728009378238,-1.246573118891688 228 | 226,1.0,0.0,1.0,-0.5939767184027273,0.6311784330417753,-0.41270340764934105,-0.38521456736415055 229 | 
227,0.0,0.0,1.0,0.9837739398545157,-0.7569185919303723,1.129444796377322,2.014284254748275 230 | 228,1.0,0.0,1.0,-0.9280886225042606,0.7854114358164588,-1.1136798640250969,-0.6928426214811282 231 | 229,1.0,0.0,1.0,-1.2622005266057938,0.4769454302670937,-1.6043633834881261,-0.38521456736415055 232 | 230,0.0,0.0,0.0,0.22274126940102307,-1.7337276095033645,0.9892495051021708,0.6607208166335733 233 | 231,0.0,0.0,1.0,0.8724033051540054,-0.7569185919303723,0.4985659856391417,1.3990281465143197 234 | 232,0.0,0.0,1.0,1.002335712304601,-0.7055075910054771,1.0593471507397463,0.8760604545154577 235 | 233,0.0,0.0,0.0,0.8724033051540054,-1.579494606728682,0.6387612769142929,0.2915671516932002 236 | 234,0.0,0.0,1.0,0.85384153270392,-0.6026855891556887,2.0407141896658048,1.9527586439248794 237 | 235,0.0,1.0,0.0,-0.7053473531032375,-0.08857557990674508,-1.7445586747632773,-0.9389450647747103 238 | 236,0.0,0.0,1.0,1.1137063470051127,-1.013973596554844,1.1995424420148977,1.8297074222780882 239 | 237,0.0,0.0,1.0,2.2088509215601397,-0.08857557990674508,1.9005188983906536,1.7066562006312973 240 | 238,1.0,0.0,0.0,0.4083589939018752,0.16847942471772676,-0.9734845727499457,-0.6928426214811282 241 | 239,0.0,0.0,1.0,1.2622005266057938,0.06565742286793838,1.9005188983906536,1.7066562006312973 242 | 240,0.0,0.0,0.0,-1.1322681194551982,0.7340004348915655,-0.5528986989244922,-1.5849639784203633 243 | 241,1.0,0.0,0.0,-1.2993240715059649,-0.13998658083164017,-1.1136798640250969,-1.49267556218527 244 | 242,0.0,1.0,0.0,-1.447818251106646,0.014246421943043286,-0.9734845727499457,-1.3080987297150835 245 | 243,1.0,0.0,0.0,0.4640443112521303,0.3741234284173035,-0.6229963445620678,-0.8774194539513147 246 | 244,0.0,0.0,1.0,0.9466503949543461,-1.1167955984046325,1.9005188983906536,1.5836049789845061 247 | 245,1.0,0.0,0.0,-0.9466503949543461,0.8368224367413539,-0.7631916358372189,-0.7543682323045237 248 | 246,0.0,0.0,1.0,2.060356741959459,-0.6026855891556887,2.0407141896658048,2.014284254748275 249 | 247,0.0,1.0,1.0,0.018561772450085477,0.4255344293421986,0.6387612769142929,-0.2621633457173595 250 | 248,1.0,0.0,0.0,0.4454825388020448,0.7340004348915655,-0.7631916358372189,-0.9389450647747103 251 | -------------------------------------------------------------------------------- /data/penguins_y_test.csv: -------------------------------------------------------------------------------- 1 | ,species 2 | 153,Chinstrap 3 | 154,Chinstrap 4 | 208,Chinstrap 5 | 304,Gentoo 6 | 283,Gentoo 7 | 317,Gentoo 8 | 133,Adelie 9 | 149,Adelie 10 | 250,Gentoo 11 | 55,Adelie 12 | 23,Adelie 13 | 225,Gentoo 14 | 83,Adelie 15 | 343,Gentoo 16 | 276,Gentoo 17 | 52,Adelie 18 | 81,Adelie 19 | 27,Adelie 20 | 183,Chinstrap 21 | 189,Chinstrap 22 | 287,Gentoo 23 | 227,Gentoo 24 | 330,Gentoo 25 | 318,Gentoo 26 | 209,Chinstrap 27 | 146,Adelie 28 | 228,Gentoo 29 | 142,Adelie 30 | 165,Chinstrap 31 | 314,Gentoo 32 | 182,Chinstrap 33 | 22,Adelie 34 | 68,Adelie 35 | 57,Adelie 36 | 16,Adelie 37 | 329,Gentoo 38 | 179,Chinstrap 39 | 6,Adelie 40 | 46,Adelie 41 | 105,Adelie 42 | 4,Adelie 43 | 205,Chinstrap 44 | 79,Adelie 45 | 211,Chinstrap 46 | 87,Adelie 47 | 73,Adelie 48 | 327,Gentoo 49 | 144,Adelie 50 | 218,Chinstrap 51 | 260,Gentoo 52 | 290,Gentoo 53 | 300,Gentoo 54 | 325,Gentoo 55 | 63,Adelie 56 | 64,Adelie 57 | 288,Gentoo 58 | 338,Gentoo 59 | 258,Gentoo 60 | 297,Gentoo 61 | 265,Gentoo 62 | 53,Adelie 63 | 174,Chinstrap 64 | 119,Adelie 65 | 247,Gentoo 66 | 200,Chinstrap 67 | 150,Adelie 68 | 270,Gentoo 69 | 191,Chinstrap 70 | 123,Adelie 71 | 58,Adelie 72 | 199,Chinstrap 73 
| 66,Adelie 74 | 186,Chinstrap 75 | 37,Adelie 76 | 17,Adelie 77 | 15,Adelie 78 | 92,Adelie 79 | 65,Adelie 80 | 25,Adelie 81 | 285,Gentoo 82 | 263,Gentoo 83 | 319,Gentoo 84 | 274,Gentoo 85 | 106,Adelie 86 | -------------------------------------------------------------------------------- /data/penguins_y_train.csv: -------------------------------------------------------------------------------- 1 | ,species 2 | 168,Chinstrap 3 | 62,Adelie 4 | 284,Gentoo 5 | 135,Adelie 6 | 51,Adelie 7 | 233,Gentoo 8 | 201,Chinstrap 9 | 114,Adelie 10 | 254,Gentoo 11 | 121,Adelie 12 | 39,Adelie 13 | 187,Chinstrap 14 | 80,Adelie 15 | 160,Chinstrap 16 | 93,Adelie 17 | 112,Adelie 18 | 207,Chinstrap 19 | 256,Gentoo 20 | 138,Adelie 21 | 332,Gentoo 22 | 163,Chinstrap 23 | 302,Gentoo 24 | 77,Adelie 25 | 107,Adelie 26 | 0,Adelie 27 | 206,Chinstrap 28 | 117,Adelie 29 | 237,Gentoo 30 | 28,Adelie 31 | 131,Adelie 32 | 242,Gentoo 33 | 26,Adelie 34 | 7,Adelie 35 | 224,Gentoo 36 | 61,Adelie 37 | 164,Chinstrap 38 | 267,Gentoo 39 | 156,Chinstrap 40 | 303,Gentoo 41 | 268,Gentoo 42 | 214,Chinstrap 43 | 32,Adelie 44 | 175,Chinstrap 45 | 14,Adelie 46 | 184,Chinstrap 47 | 95,Adelie 48 | 296,Gentoo 49 | 82,Adelie 50 | 78,Adelie 51 | 40,Adelie 52 | 341,Gentoo 53 | 294,Gentoo 54 | 277,Gentoo 55 | 234,Gentoo 56 | 110,Adelie 57 | 293,Gentoo 58 | 266,Gentoo 59 | 147,Adelie 60 | 98,Adelie 61 | 271,Gentoo 62 | 90,Adelie 63 | 299,Gentoo 64 | 236,Gentoo 65 | 111,Adelie 66 | 151,Adelie 67 | 333,Gentoo 68 | 180,Chinstrap 69 | 231,Gentoo 70 | 337,Gentoo 71 | 155,Chinstrap 72 | 101,Adelie 73 | 269,Gentoo 74 | 33,Adelie 75 | 210,Chinstrap 76 | 320,Gentoo 77 | 115,Adelie 78 | 48,Adelie 79 | 177,Chinstrap 80 | 243,Gentoo 81 | 279,Gentoo 82 | 217,Chinstrap 83 | 116,Adelie 84 | 159,Chinstrap 85 | 132,Adelie 86 | 181,Chinstrap 87 | 169,Chinstrap 88 | 30,Adelie 89 | 162,Chinstrap 90 | 272,Gentoo 91 | 196,Chinstrap 92 | 97,Adelie 93 | 1,Adelie 94 | 94,Adelie 95 | 219,Chinstrap 96 | 203,Chinstrap 97 | 49,Adelie 98 | 192,Chinstrap 99 | 280,Gentoo 100 | 161,Chinstrap 101 | 108,Adelie 102 | 215,Chinstrap 103 | 71,Adelie 104 | 245,Gentoo 105 | 251,Gentoo 106 | 226,Gentoo 107 | 197,Chinstrap 108 | 38,Adelie 109 | 171,Chinstrap 110 | 72,Adelie 111 | 125,Adelie 112 | 311,Gentoo 113 | 188,Chinstrap 114 | 291,Gentoo 115 | 309,Gentoo 116 | 257,Gentoo 117 | 88,Adelie 118 | 253,Gentoo 119 | 118,Adelie 120 | 60,Adelie 121 | 331,Gentoo 122 | 84,Adelie 123 | 157,Chinstrap 124 | 213,Chinstrap 125 | 29,Adelie 126 | 42,Adelie 127 | 248,Gentoo 128 | 19,Adelie 129 | 100,Adelie 130 | 173,Chinstrap 131 | 130,Adelie 132 | 139,Adelie 133 | 136,Adelie 134 | 128,Adelie 135 | 176,Chinstrap 136 | 76,Adelie 137 | 229,Gentoo 138 | 127,Adelie 139 | 308,Gentoo 140 | 166,Chinstrap 141 | 137,Adelie 142 | 312,Gentoo 143 | 85,Adelie 144 | 99,Adelie 145 | 54,Adelie 146 | 74,Adelie 147 | 158,Chinstrap 148 | 334,Gentoo 149 | 43,Adelie 150 | 167,Chinstrap 151 | 140,Adelie 152 | 36,Adelie 153 | 198,Chinstrap 154 | 202,Chinstrap 155 | 126,Adelie 156 | 315,Gentoo 157 | 190,Chinstrap 158 | 69,Adelie 159 | 194,Chinstrap 160 | 24,Adelie 161 | 45,Adelie 162 | 2,Adelie 163 | 241,Gentoo 164 | 264,Gentoo 165 | 75,Adelie 166 | 261,Gentoo 167 | 313,Gentoo 168 | 306,Gentoo 169 | 240,Gentoo 170 | 86,Adelie 171 | 342,Gentoo 172 | 34,Adelie 173 | 195,Chinstrap 174 | 124,Adelie 175 | 216,Chinstrap 176 | 185,Chinstrap 177 | 148,Adelie 178 | 289,Gentoo 179 | 59,Adelie 180 | 103,Adelie 181 | 50,Adelie 182 | 204,Chinstrap 183 | 221,Gentoo 184 | 326,Gentoo 185 | 172,Chinstrap 186 | 238,Gentoo 187 | 13,Adelie 188 | 
178,Chinstrap 189 | 96,Adelie 190 | 307,Gentoo 191 | 102,Adelie 192 | 5,Adelie 193 | 275,Gentoo 194 | 35,Adelie 195 | 143,Adelie 196 | 91,Adelie 197 | 262,Gentoo 198 | 281,Gentoo 199 | 301,Gentoo 200 | 21,Adelie 201 | 255,Gentoo 202 | 328,Gentoo 203 | 134,Adelie 204 | 20,Adelie 205 | 220,Gentoo 206 | 278,Gentoo 207 | 244,Gentoo 208 | 310,Gentoo 209 | 282,Gentoo 210 | 249,Gentoo 211 | 322,Gentoo 212 | 18,Adelie 213 | 67,Adelie 214 | 239,Gentoo 215 | 141,Adelie 216 | 212,Chinstrap 217 | 273,Gentoo 218 | 230,Gentoo 219 | 109,Adelie 220 | 113,Adelie 221 | 56,Adelie 222 | 252,Gentoo 223 | 292,Gentoo 224 | 340,Gentoo 225 | 232,Gentoo 226 | 70,Adelie 227 | 12,Adelie 228 | 41,Adelie 229 | 235,Gentoo 230 | 145,Adelie 231 | 31,Adelie 232 | 298,Gentoo 233 | 259,Gentoo 234 | 316,Gentoo 235 | 222,Gentoo 236 | 295,Gentoo 237 | 122,Adelie 238 | 223,Gentoo 239 | 321,Gentoo 240 | 193,Chinstrap 241 | 305,Gentoo 242 | 104,Adelie 243 | 44,Adelie 244 | 120,Adelie 245 | 152,Chinstrap 246 | 323,Gentoo 247 | 89,Adelie 248 | 335,Gentoo 249 | 129,Adelie 250 | 170,Chinstrap 251 | -------------------------------------------------------------------------------- /data/world_happiness.csv: -------------------------------------------------------------------------------- 1 | rank,country,happiness_score,gdp,social_support,life_expectancy,freedom,generosity,corruption,continent 2 | 1,Finland,7.821,1.892,1.258,0.775,0.736,0.109,0.534,Europe 3 | 2,Denmark,7.636,1.953,1.243,0.777,0.719,0.188,0.532,Europe 4 | 3,Iceland,7.557,1.936,1.32,0.803,0.718,0.27,0.191,Europe 5 | 4,Switzerland,7.512,2.026,1.226,0.822,0.677,0.147,0.461,Europe 6 | 5,Netherlands,7.415,1.945,1.206,0.787,0.651,0.271,0.419,Europe 7 | 6,Luxembourg*,7.404,2.209,1.155,0.79,0.7,0.12,0.388,Europe 8 | 7,Sweden,7.384,1.92,1.204,0.803,0.724,0.218,0.512,Europe 9 | 8,Norway,7.365,1.997,1.239,0.786,0.728,0.217,0.474,Europe 10 | 9,Israel,7.364,1.826,1.221,0.818,0.568,0.155,0.143,Asia 11 | 10,New Zealand,7.2,1.852,1.235,0.752,0.68,0.245,0.483,Oceania 12 | 11,Austria,7.163,1.931,1.165,0.774,0.623,0.193,0.329,Europe 13 | 12,Australia,7.162,1.9,1.203,0.772,0.676,0.258,0.341,Oceania 14 | 13,Ireland,7.041,2.129,1.166,0.779,0.627,0.19,0.408,Europe 15 | 14,Germany,7.034,1.924,1.088,0.776,0.585,0.163,0.358,Europe 16 | 15,Canada,7.025,1.886,1.188,0.783,0.659,0.217,0.368,North America 17 | 16,United States,6.977,1.982,1.182,0.628,0.574,0.22,0.177,North America 18 | 17,United Kingdom,6.943,1.867,1.143,0.75,0.597,0.289,0.329,Europe 19 | 18,Czechia,6.92,1.815,1.26,0.715,0.66,0.158,0.048,Europe 20 | 19,Belgium,6.805,1.907,1.106,0.764,0.492,0.049,0.204,Europe 21 | 20,France,6.687,1.863,1.219,0.808,0.567,0.07,0.266,Europe 22 | 21,Bahrain,6.647,1.854,1.029,0.625,0.693,0.199,0.155,Asia 23 | 22,Slovenia,6.63,1.81,1.249,0.769,0.685,0.118,0.115,Europe 24 | 23,Costa Rica,6.582,1.584,1.054,0.744,0.661,0.089,0.102,North America 25 | 24,United Arab Emirates,6.576,1.998,0.98,0.633,0.702,0.204,0.25,Asia 26 | 25,Saudi Arabia,6.523,1.87,1.092,0.577,0.651,0.078,0.18,Asia 27 | 26,Taiwan Province of China,6.512,1.897,1.095,0.733,0.542,0.075,0.168,Asia 28 | 27,Singapore,6.48,2.149,1.127,0.851,0.672,0.163,0.587,Asia 29 | 28,Romania,6.477,1.719,1.006,0.655,0.605,0.039,0.006,Europe 30 | 29,Spain,6.476,1.808,1.211,0.808,0.505,0.101,0.149,Europe 31 | 30,Uruguay,6.474,1.615,1.18,0.672,0.665,0.103,0.265,South America 32 | 31,Italy,6.467,1.834,1.052,0.801,0.412,0.085,0.059,Europe 33 | 32,Kosovo,6.455,1.362,0.949,0.569,0.599,0.309,0.035,Europe 34 | 33,Malta,6.447,1.838,1.169,0.789,0.679,0.174,0.166,Europe 35 | 
34,Lithuania,6.446,1.804,1.204,0.659,0.496,0.053,0.077,Europe 36 | 35,Slovakia,6.391,1.736,1.232,0.707,0.479,0.118,0.025,Europe 37 | 36,Estonia,6.341,1.793,1.232,0.728,0.689,0.123,0.333,Europe 38 | 37,Panama,6.309,1.715,1.107,0.709,0.592,0.049,0.051,North America 39 | 38,Brazil,6.293,1.462,1.044,0.615,0.546,0.131,0.134,South America 40 | 39,Guatemala*,6.262,1.274,0.831,0.522,0.662,0.112,0.115,North America 41 | 40,Kazakhstan,6.234,1.668,1.22,0.611,0.584,0.134,0.157,Asia 42 | 41,Cyprus,6.221,1.815,0.909,0.819,0.448,0.123,0.062,Europe 43 | 42,Latvia,6.18,1.732,1.221,0.637,0.502,0.075,0.09,Europe 44 | 43,Serbia,6.178,1.55,1.086,0.658,0.546,0.219,0.088,Europe 45 | 44,Chile,6.172,1.651,1.08,0.748,0.46,0.124,0.069,South America 46 | 45,Nicaragua,6.165,1.105,1.029,0.617,0.617,0.168,0.212,North America 47 | 46,Mexico,6.128,1.552,0.886,0.623,0.621,0.092,0.115,North America 48 | 47,Croatia,6.125,1.705,1.183,0.709,0.535,0.109,0.0,Europe 49 | 48,Poland,6.123,1.758,1.174,0.712,0.523,0.124,0.14,Europe 50 | 49,El Salvador,6.12,1.265,0.768,0.607,0.666,0.089,0.212,North America 51 | 50,Kuwait*,6.106,1.904,0.983,0.747,0.617,0.087,0.147,Asia 52 | 51,Hungary,6.086,1.748,1.233,0.668,0.485,0.078,0.064,Europe 53 | 52,Mauritius,6.071,1.591,1.116,0.568,0.589,0.131,0.107,Africa 54 | 53,Uzbekistan,6.063,1.219,1.092,0.6,0.716,0.283,0.24,Asia 55 | 54,Japan,6.039,1.835,1.089,0.866,0.537,0.007,0.218,Asia 56 | 55,Honduras,6.022,1.111,0.885,0.555,0.582,0.202,0.076,North America 57 | 56,Portugal,6.016,1.76,1.078,0.777,0.655,0.016,0.039,Europe 58 | 57,Argentina,5.967,1.592,1.102,0.662,0.555,0.081,0.085,South America 59 | 58,Greece,5.948,1.703,0.98,0.774,0.249,0.015,0.108,Europe 60 | 59,South Korea,5.935,1.851,0.886,0.841,0.414,0.111,0.176,Asia 61 | 60,Philippines,5.904,1.268,0.912,0.514,0.678,0.107,0.142,Asia 62 | 61,Thailand,5.891,1.535,1.096,0.697,0.617,0.321,0.026,Asia 63 | 62,Moldova,5.857,1.417,1.008,0.597,0.561,0.102,0.028,Europe 64 | 63,Jamaica,5.85,1.296,1.045,0.646,0.567,0.08,0.053,North America 65 | 64,Kyrgyzstan,5.828,1.069,1.109,0.638,0.693,0.208,0.025,Asia 66 | 65,Belarus*,5.821,1.562,1.157,0.629,0.342,0.04,0.282,Europe 67 | 66,Colombia,5.781,1.452,0.929,0.72,0.545,0.087,0.077,South America 68 | 67,Bosnia and Herzegovina,5.768,1.468,1.068,0.665,0.448,0.244,0.006,Europe 69 | 68,Mongolia,5.761,1.393,1.197,0.467,0.398,0.247,0.059,Asia 70 | 69,Dominican Republic,5.737,1.538,1.003,0.577,0.606,0.084,0.179,North America 71 | 70,Malaysia,5.711,1.689,0.938,0.62,0.654,0.213,0.126,Asia 72 | 71,Bolivia,5.6,1.256,0.88,0.555,0.627,0.112,0.064,South America 73 | 72,China,5.585,1.508,0.958,0.705,0.656,0.099,0.142,Asia 74 | 73,Paraguay,5.578,1.409,1.13,0.624,0.629,0.171,0.059,South America 75 | 74,Peru,5.559,1.397,0.865,0.735,0.545,0.09,0.037,South America 76 | 75,Montenegro,5.547,1.573,1.023,0.659,0.46,0.135,0.077,Europe 77 | 76,Ecuador,5.533,1.352,0.879,0.708,0.565,0.08,0.083,South America 78 | 77,Vietnam,5.485,1.252,0.932,0.611,0.707,0.143,0.105,Asia 79 | 78,Turkmenistan*,5.474,1.484,1.319,0.516,0.649,0.314,0.032,Asia 80 | 79,North Cyprus*,5.467,1.815,0.888,0.819,0.523,0.13,0.213,Europe 81 | 80,Russia,5.459,1.685,1.095,0.586,0.401,0.117,0.08,Europe 82 | 81,Hong Kong S.A.R. 
of China,5.425,1.957,0.954,0.942,0.4,0.147,0.383,Asia 83 | 82,Armenia,5.399,1.434,0.82,0.668,0.558,0.054,0.21,Europe 84 | 83,Tajikistan,5.377,0.966,1.005,0.518,0.572,0.118,0.304,Asia 85 | 84,Nepal,5.377,0.984,0.784,0.499,0.519,0.237,0.13,Asia 86 | 85,Bulgaria,5.371,1.625,1.163,0.64,0.563,0.123,0.021,Europe 87 | 86,Libya*,5.33,1.476,0.943,0.606,0.477,0.106,0.179,Africa 88 | 87,Indonesia,5.24,1.382,0.883,0.539,0.62,0.468,0.047,Asia 89 | 88,Ivory Coast,5.235,1.094,0.442,0.322,0.451,0.149,0.124,Africa 90 | 89,North Macedonia,5.199,1.505,0.863,0.637,0.488,0.215,0.031,Europe 91 | 90,Albania,5.199,1.439,0.646,0.719,0.511,0.138,0.028,Europe 92 | 91,South Africa,5.194,1.425,1.088,0.361,0.442,0.089,0.046,Africa 93 | 92,Azerbaijan*,5.173,1.458,1.093,0.56,0.601,0.023,0.341,Europe 94 | 93,Gambia*,5.164,0.785,0.621,0.369,0.367,0.388,0.103,Africa 95 | 94,Bangladesh,5.155,1.06,0.614,0.581,0.622,0.125,0.187,Asia 96 | 95,Laos,5.14,1.239,0.654,0.479,0.679,0.197,0.184,Asia 97 | 96,Algeria,5.122,1.363,0.97,0.643,0.146,0.106,0.15,Africa 98 | 97,Liberia*,5.122,0.636,0.67,0.309,0.405,0.178,0.08,Africa 99 | 98,Ukraine,5.084,1.411,1.081,0.583,0.473,0.188,0.017,Europe 100 | 99,Congo,5.075,0.95,0.405,0.355,0.431,0.13,0.146,Africa 101 | 100,Morocco,5.06,1.208,0.268,0.565,0.492,0.02,0.102,Africa 102 | 101,Mozambique,5.048,0.578,0.66,0.191,0.593,0.185,0.2,Africa 103 | 102,Cameroon,5.048,0.968,0.672,0.317,0.397,0.152,0.074,Africa 104 | 103,Senegal,5.046,0.933,0.53,0.447,0.494,0.143,0.081,Africa 105 | 104,Niger*,5.003,0.57,0.56,0.326,0.571,0.165,0.145,Africa 106 | 105,Georgia,4.973,1.467,0.612,0.595,0.508,0.0,0.208,Europe 107 | 106,Gabon,4.958,1.459,0.738,0.396,0.343,0.032,0.099,Africa 108 | 107,Iraq,4.941,1.289,0.682,0.554,0.328,0.147,0.046,Asia 109 | 108,Venezuela,4.925,0.0,0.968,0.578,0.283,0.225,0.082,South America 110 | 109,Guinea,4.891,0.848,0.566,0.275,0.334,0.214,0.116,Africa 111 | 110,Iran,4.888,1.41,0.741,0.642,0.281,0.241,0.146,Asia 112 | 111,Ghana,4.872,1.112,0.595,0.409,0.5,0.23,0.056,Africa 113 | 112,Turkey,4.744,1.707,0.865,0.702,0.209,0.087,0.115,Europe 114 | 113,Burkina Faso,4.67,0.779,0.565,0.32,0.382,0.186,0.126,Africa 115 | 114,Cambodia,4.64,1.019,0.732,0.505,0.74,0.166,0.068,Asia 116 | 115,Benin,4.623,0.932,0.064,0.335,0.479,0.127,0.23,Africa 117 | 116,Comoros*,4.609,0.899,0.476,0.424,0.185,0.195,0.125,Africa 118 | 117,Uganda,4.603,0.777,0.875,0.418,0.402,0.222,0.066,Africa 119 | 118,Nigeria,4.552,1.079,0.732,0.3,0.444,0.175,0.038,Africa 120 | 119,Kenya,4.543,1.032,0.605,0.401,0.44,0.322,0.082,Africa 121 | 120,Tunisia,4.516,1.35,0.596,0.656,0.316,0.029,0.029,Africa 122 | 121,Pakistan,4.516,1.049,0.413,0.374,0.448,0.181,0.112,Asia 123 | 122,Palestinian Territories*,4.483,1.148,0.957,0.521,0.336,0.073,0.079,Asia 124 | 123,Mali,4.479,0.792,0.483,0.311,0.35,0.128,0.042,Africa 125 | 124,Namibia,4.459,1.292,0.877,0.354,0.384,0.067,0.071,Africa 126 | 125,"Eswatini, Kingdom of*",4.396,1.274,0.786,0.197,0.259,0.038,0.154,Africa 127 | 126,Myanmar,4.394,1.038,0.829,0.491,0.513,0.452,0.194,Asia 128 | 127,Sri Lanka,4.362,1.415,0.934,0.66,0.529,0.15,0.079,Asia 129 | 128,Madagascar*,4.339,0.67,0.645,0.378,0.202,0.143,0.154,Africa 130 | 129,Egypt,4.288,1.388,0.732,0.548,0.469,0.041,0.254,Africa 131 | 130,Chad*,4.251,0.662,0.506,0.225,0.18,0.182,0.077,Africa 132 | 131,Ethiopia,4.241,0.788,0.809,0.457,0.472,0.205,0.136,Africa 133 | 132,Yemen*,4.197,0.691,1.043,0.384,0.33,0.09,0.098,Asia 134 | 133,Mauritania*,4.153,1.1,0.865,0.45,0.304,0.088,0.138,Africa 135 | 134,Jordan,4.152,1.324,0.724,0.675,0.476,0.058,0.2,Asia 136 
| 135,Togo,4.112,0.771,0.322,0.36,0.292,0.174,0.132,Africa 137 | 136,India,3.777,1.167,0.376,0.471,0.647,0.198,0.123,Asia 138 | 137,Zambia,3.76,0.93,0.577,0.306,0.525,0.203,0.083,Africa 139 | 138,Malawi,3.75,0.648,0.279,0.388,0.477,0.14,0.157,Africa 140 | 139,Tanzania,3.702,0.848,0.597,0.425,0.578,0.248,0.27,Africa 141 | 140,Sierra Leone,3.574,0.686,0.416,0.273,0.387,0.202,0.055,Africa 142 | 141,Lesotho*,3.512,0.839,0.848,0.0,0.419,0.076,0.018,Africa 143 | 142,Botswana*,3.471,1.503,0.815,0.28,0.571,0.012,0.102,Africa 144 | 143,Rwanda*,3.268,0.785,0.133,0.462,0.621,0.187,0.544,Africa 145 | 144,Zimbabwe,2.995,0.947,0.69,0.27,0.329,0.106,0.105,Africa 146 | 145,Lebanon,2.955,1.392,0.498,0.631,0.103,0.082,0.034,Asia 147 | 146,Afghanistan,2.404,0.758,0.0,0.289,0.0,0.089,0.005,Asia 148 | -------------------------------------------------------------------------------- /images/Confusion_Matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/Python-Machine-Learning/230e0a749724a7d0f5640dea15fef368f3498968/images/Confusion_Matrix.png -------------------------------------------------------------------------------- /images/KNN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/Python-Machine-Learning/230e0a749724a7d0f5640dea15fef368f3498968/images/KNN.png -------------------------------------------------------------------------------- /images/linear_regression_hyperplane.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/Python-Machine-Learning/230e0a749724a7d0f5640dea15fef368f3498968/images/linear_regression_hyperplane.jpeg -------------------------------------------------------------------------------- /images/linear_regression_line.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/Python-Machine-Learning/230e0a749724a7d0f5640dea15fef368f3498968/images/linear_regression_line.png -------------------------------------------------------------------------------- /images/overfitting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/Python-Machine-Learning/230e0a749724a7d0f5640dea15fef368f3498968/images/overfitting.png -------------------------------------------------------------------------------- /images/validation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/Python-Machine-Learning/230e0a749724a7d0f5640dea15fef368f3498968/images/validation.png -------------------------------------------------------------------------------- /lessons/00_introduction.md: -------------------------------------------------------------------------------- 1 | # Python Machine Learning: Introduction 2 | 3 | Please refer to these [introductory slides](https://docs.google.com/presentation/d/1IwlTdkOGXVGwgCxPVyWEXOOyRf_sZtAcKFSoJ9k4jig/edit?usp=sharing) for the first component of the workshop. 
-------------------------------------------------------------------------------- /lessons/02_regularization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "3cfcb6e0-d342-4a48-9812-4a1176599965", 6 | "metadata": { 7 | "tags": [] 8 | }, 9 | "source": [ 10 | "# Python Machine Learning: Regularization\n", 11 | "\n", 12 | "In machine learning, the name of the game is generalization. We want to have a model perform well on the training set, but we need to make sure that the patterns the model learns can actually generalize to data the model hasn't seen before.\n", 13 | "\n", 14 | "So, the scenario we want to avoid is that of **overfitting**. This occurs when our model too strongly learns patterns in the training data, and doesn't generalize well. Overfit models tend to exhibit large generalization gaps: large differences in predictive performance between the training and test data.\n", 15 | "\n", 16 | "Overfitting can happen for a variety of reasons, the most well known of which is having a model that's too complicated. Luckily, all is not lost. There are a variety of approaches we can use to combat overfitting. In general, these approaches are called **regularization**." 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "68f833e6-4c67-4f64-8f00-83b848024c93", 22 | "metadata": {}, 23 | "source": [ 24 | "## Overfitting and Regularization\n", 25 | "\n", 26 | "In the previous lesson, we discussed feature engineering, the process by which we create new features in order to make our model more expressive. One tradeoff to adding features to the model is that the model becomes more complex, which makes it prone to overfitting. \n", 27 | "\n", 28 | "For example, consider a basic regression with the points shown below:\n", 29 | "\n", 30 | "![overfitting](../images/overfitting.png)\n", 31 | "\n", 32 | "We could fit a simple line to this data, which will exhibit some error. However, we could also fit a more complex model - say, a polynomial - which could perfectly fit the training data. There will be no error in the training predictions, which seems great!\n", 33 | "\n", 34 | "But do we *really* think the polynomial is making good predictions on *all* possible data points? Look at how it behaves in between the training examples. It's very likely that on *new* data - that is, when the model needs to generalize - the linear model will perform much better than the polynomial model. This is because the polynomial model overfit to the data.\n", 35 | "\n", 36 | "So, it's common in machine learning to follow a \"parsimony principle\". Specifically, we aim to choose simpler models that can still be predictive, because simpler models are less likely to overfit, and thus generalize decently well.\n", 37 | "\n", 38 | "Regularization is often thought of in terms of the **bias-variance tradeoff**. Specifically, prediction errors often break down in terms of two components: bias and variance. The linear model exhibits higher bias, since it exhibits large errors on the training examples. But the polynomial model has higher variance - it's more likely to give wildly different predictions for training samples close together.\n", 39 | "\n", 40 | "We don't always have to use linear regression in the spirit of opting for simpler models. Sometimes, it's good to use the complicated model, particularly if it makes sense in a specific context.
This is where **regularization** is useful: a technique we can use to make a model less prone to overfitting during training. It's important to note that regularization is more of a concept than it is a specific, standardized technique. There are many approaches used for regularizing. Today, we're going to cover the usage of **penalty terms** to regularize linear models." 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "id": "08518305-c290-46bd-8998-c728d68ada22", 46 | "metadata": {}, 47 | "source": [ 48 | "---\n", 49 | "### Challenge 1: Warm-Up\n", 50 | "\n", 51 | "Before we get started, let's warm up by importing our data and performing a train/test split. We've provided the import code for you. Go ahead and split the data into train/test sets using an 80/20 split, and a random state of 23.\n", 52 | "\n", 53 | "---" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "id": "32a8b441-f363-45ac-8abf-236df98f8612", 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "import pandas as pd\n", 64 | "import numpy as np\n", 65 | "import matplotlib.pyplot as plt\n", 66 | "\n", 67 | "from sklearn.metrics import mean_squared_error\n", 68 | "from sklearn.model_selection import train_test_split" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "id": "dccf9b24-e59a-41e9-8a82-2c2054363427", 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "# Import data\n", 79 | "data = pd.read_csv('../data/auto-mpg.csv')\n", 80 | "# Remove the response variable and car name\n", 81 | "X = data.drop(columns=['car name', 'mpg'])\n", 82 | "# Assign response variable to its own variable\n", 83 | "y = data['mpg'].astype(np.float64)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "id": "6cecbfb1-0121-42f7-81ab-180c2acd805c", 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "# YOUR CODE HERE\n" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "id": "89d536fb-a981-461f-a100-99dd469f6be4", 99 | "metadata": {}, 100 | "source": [ 101 | "## Ridge Regression\n", 102 | "\n", 103 | "Recall the formulation of a linear model. We have the parameters we are trying to estimate, given in the model:\n", 104 | "\n", 105 | "$$Y = \\beta_0 + \\beta_1 X_1 + \\ldots + \\beta_P X_P$$\n", 106 | "\n", 107 | "We do this by minimizing the following objective function:\n", 108 | "\n", 109 | "$$\n", 110 | "\\begin{align}\n", 111 | "\\text{MSE} = L(\\beta) &= \\frac{1}{N}\\sum_{i=1}^{N}(y_i - \\hat{y}_i)^2 \\\\\n", 112 | "&= \\frac{1}{N}\\sum_{i=1}^{N}\\left(y_i - \\beta_0 - \\sum_{j=1}^P \\beta_j X_j\\right)^2\n", 113 | "\\end{align}\n", 114 | "$$\n", 115 | "\n", 116 | "We're going to regularize this model. We're not going to change the actual linear model - that's the top equation - but we will change how we choose the $\\beta$ parameters. Specifically, we're going to do **ridge regression** (also called $\\ell_2$ regularization and Tikhonov regularization). Instead of using the least squares objective function, specified in the second equation, we're going to use the following objective function: \n", 117 | "\n", 118 | "$$ L(\\beta) = \\sum_{i=1}^N (y_i - \\hat y_i)^2 + \\alpha \\sum_{j=1}^P \\beta_j^2 $$ \n", 119 | "\n", 120 | "What's the difference? There's a second term added on, which is equal to the sum of the squares of the $\\beta$ values. What does this mean?\n", 121 | "\n", 122 | "Our goal is for the loss, $L(\\beta)$, to be as small as possible.
The first term says we can make that small if we make our errors, $y_i - \\hat y_i$, small. The second term says that we increase the loss if the $\\beta$ values get too large. There's a tradeoff here: if we make the $\\beta$ values all zero to accommodate the second term, then the first term will be large. So, in ridge regression, we try to minimize the errors, while trying hard not to make the coefficients too big.\n", 123 | "\n", 124 | "Also, note that ridge regression requires a **hyperparameter**, called $\\alpha$ (sometimes $\\lambda$). This hyperparameter indicates how much regularization should be done. In other words, how much do we care about the coefficient penalty term vs. how much do we care about the sum of squared errors term? The higher the value of $\\alpha$, the more regularization, and the smaller the resulting coefficients will be. On the other hand, if we use an $\\alpha$ value of 0, we get the same solution as ordinary least squares (OLS) regression.\n", 125 | "\n", 126 | "Why does ridge regression serve as a good regularizer? The penalty actually does several things, which are beneficial for our model:\n", 127 | "1. **Multicollinearity:** Ridge regression was devised largely to combat multicollinearity, which occurs when features are highly correlated with each other. Ordinary least squares struggles in these scenarios, because multicollinearity can cause a huge increase in variance: it makes the parameter estimates unstable. Adding the penalty term stabilizes the parameter estimates, at a small cost in bias. This results in better generalization performance.\n", 128 | "2. **Low Number of Samples:** The most common scenario where you might overfit is when you have many features, but not many samples. Adding the penalty term stabilizes the model in these scenarios. There's not a great intuition for this without diving into the math, so you can just take it at face value. \n", 129 | "3. **Shrinkage:** The $\\ell_2$ penalty results in shrinkage, or a small reduction in the size of the parameters. This is effectively a bias, but helps regularize by reducing the variance that often comes with overfit models." 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "id": "107971a3-075d-4cc0-8fe3-c9fefe582ed0", 135 | "metadata": {}, 136 | "source": [ 137 | "## Ridge Regression in Practice\n", 138 | "\n", 139 | "As with linear regression, `scikit-learn` makes it easy to fit a ridge regression. We simply use the `Ridge` class from `scikit-learn`. This time, however, we're going to specify some arguments when we create the ridge regression object.
The most important one is the regularization penalty, $\\alpha$, which we need to choose:" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "id": "862640b7-4dee-487b-8308-d8c21aa61bae", 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "from sklearn.linear_model import Ridge\n", 150 | "# Create model\n", 151 | "ridge = Ridge(\n", 152 | " # Regularization penalty\n", 153 | " alpha=10,\n", 154 | " random_state=1)\n", 155 | "# Fit model to the training data\n", 156 | "ridge.fit(X_train, y_train)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "id": "fa21a66c-d694-410e-a198-ac73df50eb0f", 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "# Run predictions\n", 167 | "y_train_pred_ridge = ridge.predict(X_train)\n", 168 | "y_test_pred_ridge = ridge.predict(X_test)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "id": "a98614ab-de89-401f-b166-0eb216a2fe0f", 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "# Evaluate model\n", 179 | "print(f'Training R^2: {ridge.score(X_train, y_train)}')\n", 180 | "print(f'Test R^2: {ridge.score(X_test, y_test)}')\n", 181 | "print(f'Train RMSE: {mean_squared_error(y_train, y_train_pred_ridge, squared=False)}')\n", 182 | "print(f'Test RMSE: {mean_squared_error(y_test, y_test_pred_ridge, squared=False)}')" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "id": "08c82e6e-a112-4c22-a048-6ca4c533a986", 188 | "metadata": {}, 189 | "source": [ 190 | "---\n", 191 | "### Challenge 2: Benchmarking\n", 192 | "\n", 193 | "Re-run ordinary least squares on the data using `LinearRegression`. Then, create a new ridge regression where the `alpha` penalty is set equal to zero. How do the performances of these models compare to each other? How do they compare with the original ridge regression? Be sure to compare both the training performances and test performances.\n", 194 | "\n", 195 | "---" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "id": "d77e1c87-b0a7-44ef-ba8f-7246f97b57d5", 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "from sklearn.linear_model import LinearRegression\n", 206 | "# YOUR CODE HERE\n", 207 | "# Create models\n", 208 | "\n", 209 | "# Fit models\n", 210 | "\n", 211 | "# Run predictions\n", 212 | "\n", 213 | "# Evaluate models\n" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "id": "02c5045a-a835-4de2-aa86-36d9511baef1", 219 | "metadata": {}, 220 | "source": [ 221 | "Based on your experiments, you probably found that ridge regression resulted in worse training performance, but slightly better generalization performance! So the regularization can help, particularly in this case where we know the features are correlated with each other." 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "id": "c4a8103a-e35e-4369-b694-1232e71f5981", 227 | "metadata": {}, 228 | "source": [ 229 | "## Choosing Hyperparameters: Validation Sets\n", 230 | "\n", 231 | "The issue with our analysis thus far is that we don't know what $\\alpha$ value we should use. Since hyperparameters are chosen *before* we fit the model, we can't just choose them based on the training data. So, how should we go about conducting **hyperparameter search**: identifying the best hyperparameter(s) to use?\n", 232 | "\n", 233 | "Let's think back to our original goal. We want a model that generalizes to unseen data.
So, ideally, the choice of the hyperparameter should be such that the performance on unseen data is the best. We can't use the test set for this, but what if we had another set of held-out data? \n", 234 | "\n", 235 | "This is the basis for a **validation set**. If we had an extra held-out dataset, we could try a bunch of hyperparameters on the training set, and see which one results in a model that performs the best on the validation set. We then would choose that hyperparameter, and use it to refit the model on both the training data and validation data. We could then, finally, evaluate on the test set.\n", 236 | "\n", 237 | "![validation](../images/validation.png)\n", 238 | "\n", 239 | "So, you'll often see a dataset not only split up into training/test sets, but training/validation/test sets, particularly when you need to choose a hyperparameter.\n", 240 | "\n", 241 | "### Cross-Validation\n", 242 | "\n", 243 | "We just formulated the process of choosing a hyperparameter with a single validation set. However, there are many ways to perform validation. The most common way is **cross-validation**. Cross-validation is motivated by the concern that we may not choose the best hyperparameter if we're only validating on a small fraction of the data. If the validation set, just by chance, contains unrepresentative samples, we may bias our model in favor of those samples, and limit its generalizability.\n", 244 | "\n", 245 | "So, during cross-validation, we effectively validate on the *entire* dataset, by breaking it up into folds. Here's the process:\n", 246 | "\n", 247 | "1. Perform a train/test split, as you normally would.\n", 248 | "2. Choose a number of folds - the most common is $K=5$ - and split up your training data into those equally sized \"folds\".\n", 249 | "3. For *each* hyperparameter, we're going to fit $K$ models. Let's assume $K=5$. The first model will be fit on Folds 2-5, and validated on Fold 1. The second model will be fit on Folds 1, 3-5, and validated on Fold 2. This process continues for all 5 splits.\n", 250 | "4. Each hyperparameter's performance is summarized by the average predictive performance on all 5 held-out folds. We then choose the hyperparameter that had the best average performance.\n", 251 | "5. We can then refit a new model to the entire training set, using our chosen hyperparameter. That's our final model - evaluate it on the test set!\n", 252 | "\n", 253 | "![cross-validation](https://scikit-learn.org/stable/_images/grid_search_cross_validation.png)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "id": "6f1edb4d-60bf-45a6-a128-5f1097244cd5", 259 | "metadata": {}, 260 | "source": [ 261 | "## Cross-Validation in Practice\n", 262 | "\n", 263 | "You guessed it: `scikit-learn` makes it really easy to fit a model with cross-validation. We'll use the `RidgeCV` class. Check out the [documentation](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeCV.html) for details about it.\n", 264 | "\n", 265 | "`RidgeCV` is going to need to know a few things from us: which hyperparameter values do we want to try? How many folds should we use? We'll specify these when creating the model object."
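To make the fold mechanics concrete, below is a minimal sketch of the search loop that `RidgeCV` automates, assuming the `X_train`/`y_train` split from Challenge 1 (the candidate grid here is illustrative). It is for intuition only; the `RidgeCV` call that follows does all of this in one step.

```python
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

# For each candidate alpha, estimate generalization performance with
# 5-fold cross-validation on the training set, then keep the best one.
candidate_alphas = np.logspace(-1, 3, 10)
mean_scores = [
    cross_val_score(Ridge(alpha=a), X_train, y_train, cv=5).mean()
    for a in candidate_alphas
]
best_alpha = candidate_alphas[np.argmax(mean_scores)]
print(f'Best alpha by cross-validation: {best_alpha}')
```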
266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "id": "1f632ebb-b5ae-45aa-a7de-2139454ca79e", 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "from sklearn.linear_model import RidgeCV\n", 276 | "# Create ridge model, with CV\n", 277 | "ridge_cv = RidgeCV(\n", 278 | " # Which alpha values to test\n", 279 | " alphas=np.logspace(-1, 3, 100),\n", 280 | " # Number of folds\n", 281 | " cv=5)\n", 282 | "# Fit model\n", 283 | "ridge_cv.fit(X_train, y_train)\n", 284 | "# Evaluate model\n", 285 | "print(ridge_cv.score(X_train, y_train))\n", 286 | "print(ridge_cv.score(X_test, y_test))" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "id": "1ac60684-031b-43a3-b6f4-81f8b85e00da", 292 | "metadata": {}, 293 | "source": [ 294 | "We can also access the best $\\alpha$ value:" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "id": "f3237a9d-7083-4681-a1de-b39dc6457a53", 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "ridge_cv.alpha_" 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "id": "5bf18376-c4a0-4f74-8f4d-d9d52b0ffede", 310 | "metadata": {}, 311 | "source": [ 312 | "As well as the coefficients:" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "id": "85f569bf-38be-479d-8dea-5b8ef6e56ea6", 319 | "metadata": {}, 320 | "outputs": [], 321 | "source": [ 322 | "ridge_cv.coef_" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "id": "06a73083-aa63-4b5d-97c0-2ee8668993cb", 328 | "metadata": { 329 | "tags": [] 330 | }, 331 | "source": [ 332 | "## Bonus Material: Lasso Regression\n", 333 | "\n", 334 | "**Lasso regression** (also called $\\ell_1$ regularization) is another form of regularized regression that penalizes the coefficients. Rather than taking a squared penalty of the coefficients, Lasso uses an absolute value penalty: \n", 335 | "\n", 336 | "$$ L(\\beta) = \\sum_{i=1}^N (y_i - \\hat y_i)^2 + \\alpha \\sum_{j=1}^P |\\beta_j| $$ \n", 337 | "\n", 338 | "This has a similar effect of making the coefficients smaller, but also has a tendency to force some coefficients to be set *exactly equal to 0*. This leads to what are called **sparser** models, and is another way to reduce overfitting introduced by more complex models.\n", 339 | "\n", 340 | "Setting some coefficients exactly equal to zero has the added benefit of performing **feature selection**: it can exactly identify if some features are not worth including in the model, because their coefficients are set exactly to 0 (meaning that their values would have no impact on prediction)." 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "id": "57f36182-7007-4403-8ad8-f79271a0bd54", 346 | "metadata": {}, 347 | "source": [ 348 | "---\n", 349 | "### Challenge 3: Performing a Lasso Fit\n", 350 | "\n", 351 | "Below, we've imported the `Lasso` object from `scikit-learn` for you. Just like `Ridge`, it needs to know what the strength of the regularization penalty is before fitting to the data. \n", 352 | "\n", 353 | "Fit several Lasso models, with different regularization strengths. Try one with a small but non-zero regularization strength, and try one with a very large regularization strength. Look at the coefficients.
What do you notice?\n", 354 | "\n", 355 | "---" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "id": "80ac5a53-31e1-4238-8314-ebedb5200079", 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "from sklearn.linear_model import Lasso\n", 366 | "# YOUR CODE HERE\n" 367 | ] 368 | } 369 | ], 370 | "metadata": { 371 | "kernelspec": { 372 | "display_name": "Python 3 (ipykernel)", 373 | "language": "python", 374 | "name": "python3" 375 | }, 376 | "language_info": { 377 | "codemirror_mode": { 378 | "name": "ipython", 379 | "version": 3 380 | }, 381 | "file_extension": ".py", 382 | "mimetype": "text/x-python", 383 | "name": "python", 384 | "nbconvert_exporter": "python", 385 | "pygments_lexer": "ipython3", 386 | "version": "3.9.12" 387 | } 388 | }, 389 | "nbformat": 4, 390 | "nbformat_minor": 5 391 | } 392 | -------------------------------------------------------------------------------- /lessons/03_preprocessing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "52dcf6e0-34d7-487a-afc7-0404106c4741", 6 | "metadata": {}, 7 | "source": [ 8 | "# Python Machine Learning: Preprocessing\n", 9 | "\n", 10 | "Preprocessing is an essential step of the machine learning workflow and is important for the performance of models. This notebook will introduce the major steps of preprocessing for machine learning. \n" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "id": "86cd8e28-1334-4520-b2d9-1b510ddb5819", 16 | "metadata": { 17 | "tags": [] 18 | }, 19 | "source": [ 20 | "## Load Data" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "d1b25be6-01f4-4555-b8ae-66956d67ace5", 26 | "metadata": {}, 27 | "source": [ 28 | "For today, we will be working with the `penguins` data set. This data set is from [Kaggle](https://www.kaggle.com/parulpandey/penguin-dataset-the-new-iris) and includes some penguins of three different species, their location, and some measurements for each penguin.\n", 29 | "\n", 30 | "First, let's import some packages we'll need." 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "id": "f0142813-ac28-4ead-9996-39b2ada322ca", 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "import warnings\n", 41 | "\n", 42 | "import pandas as pd\n", 43 | "import numpy as np\n", 44 | "import matplotlib.pyplot as plt\n", 45 | "from sklearn.model_selection import train_test_split" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "id": "e769ae58", 51 | "metadata": {}, 52 | "source": [ 53 | "Now, let's load in the data from the `data` subfolder of this directory.\n", 54 | "\n", 55 | "**Question:** How many columns are there in this data set? How many rows?" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "id": "a612a6fb-fd37-4603-a430-2c018c5d7f29", 62 | "metadata": { 63 | "scrolled": true 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "data = pd.read_csv('../data/penguins.csv')\n", 68 | "data" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "id": "35f79ca2-f223-4a2d-b5a1-edd1e2df3d96", 74 | "metadata": {}, 75 | "source": [ 76 | "Below is the information for each of the columns:\n", 77 | "1. **species**: Species of penguin [Adelie, Chinstrap, Gentoo]\n", 78 | "2. **island**: Island where the penguin was found [Torgersen, Biscoe, Dream]\n", 79 | "3. **culmen_length_mm**: Length of upper part of penguin's bill (millimeters)\n", 80 | "4.
**culmen_depth_mm**: Height of upper part of bill (millimeters)\n", 81 | "5. **flipper_length_mm**: Length of penguin flipper (millimeters)\n", 82 | "6. **body_mass_g**: Body mass of the penguin (grams)\n", 83 | "7. **sex**: Biological sex of the penguin [MALE, FEMALE]\n", 84 | "\n", 85 | "\n", 86 | "**Question:** Which of the columns are continuous? Which are categorical?\n", 87 | "\n", 88 | "\n", 89 | "We will need to treat the numeric and categorical data differently in preprocessing.\n" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "id": "75343925-7865-43e6-bba1-f7fff9a673c1", 95 | "metadata": {}, 96 | "source": [ 97 | "## Missing Data Preprocessing\n", 98 | "\n", 99 | "First, let's check to see if there are any missing values in the data set. Missing values are represented by `NaN`. \n", 100 | "\n", 101 | "**Question:** In this case, what do the missing values represent?" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "id": "0fbb04bc-4a44-493f-85d6-739adb1c7d8d", 108 | "metadata": { 109 | "scrolled": true 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "data.isnull().sum()" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "id": "fd318fc2", 119 | "metadata": {}, 120 | "source": [ 121 | "It is also possible to have non-`NaN` missing values. For example, let's take a look at the `sex` column." 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "id": "2d613dce", 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "data['sex'].unique()" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "id": "eed852c0", 137 | "metadata": {}, 138 | "source": [ 139 | "In this case, the `.` represents a missing value, so let's replace those with `np.nan` objects." 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "id": "d980a391", 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "data.replace('.', np.nan, inplace=True)\n", 150 | "\n", 151 | "data['sex'].unique()" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "id": "737bbd99-c5ba-474b-a194-0003ae520a04", 157 | "metadata": {}, 158 | "source": [ 159 | "### Imputation\n", 160 | "\n", 161 | "One way to handle missing values is to fill them in with a best guess. This is called **imputation**. Here we'll impute any missing values using the average, or mean, of all the data that does exist, as that's the best guess for a data point if all we have is the data itself. To do that we'll use the `SimpleImputer` to assign the mean to all missing values in the data.\n", 162 | "\n", 163 | "There are also other strategies that can be used to impute missing data ([see documentation](https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html)).\n", 164 | "\n", 165 | "Let's see how the `SimpleImputer` works on a subset of the data.
" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "id": "af30fe06-eb35-48af-88a2-b4cbd74e1335", 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "from sklearn.impute import SimpleImputer\n", 176 | "\n", 177 | "imputer = SimpleImputer(missing_values=np.nan,\n", 178 | " strategy='mean', \n", 179 | " copy=True)\n", 180 | "imputed = imputer.fit_transform(data[['body_mass_g','flipper_length_mm']])\n" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "id": "04e085f8", 186 | "metadata": {}, 187 | "source": [ 188 | "Now let's check that the previously null values have been filled in. " 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "id": "bc7157f2", 195 | "metadata": { 196 | "scrolled": false 197 | }, 198 | "outputs": [], 199 | "source": [ 200 | "print(imputed[data[data['body_mass_g'].isna()].index])" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "id": "de080754", 206 | "metadata": {}, 207 | "source": [ 208 | "### Dropping Null Values" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "id": "a2f21878", 214 | "metadata": {}, 215 | "source": [ 216 | "Another option option is to use `pd.dropna()` to drop `Null` values from the `DataFrame`. This should almost always be used with the `subset` argument which restricts the function to only dropping values that are null in a certain column(s)." 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "id": "db11f7e0", 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "data = data.dropna(subset='sex')\n", 227 | "\n", 228 | "# Now this line will return an empty dataframe\n", 229 | "data[data['sex'].isna()]" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "id": "173e3fff-ded3-4c7a-9dfe-a3b9ff62a566", 235 | "metadata": {}, 236 | "source": [ 237 | "## Categorical Data Processing\n", 238 | "\n", 239 | "As we saw earlier, the `penguins` dataset contains both categorical and continuous features, which will each need to be preprocessed in different ways. First, we want to transform the categorical variables from strings to **indicator variables**. Indicator variables have one column per level, For example, the island variable will change from Biscoe/Dream/Torgersen --> Biscoe (1/0), Dream (1/0), and Torgerson (1/0). For each set of indicator variables, there should be a 1 in exactly one column." 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "id": "fb9bc33e-2b97-4b31-83d1-985dec1e5950", 245 | "metadata": {}, 246 | "source": [ 247 | " Let's make a list of the categorical variable names to be transformed into indicator variables." 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "id": "3113d6a3-474c-4b57-9804-8040c38117a8", 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "# Define the variable names that are categorical for use later\n", 258 | "cat_var_names = ['island', 'sex']\n", 259 | "data_cat = data[cat_var_names]\n", 260 | "data_cat.head()" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "id": "61a2d7ba-036f-49e2-ab9e-dc06086eaed6", 266 | "metadata": {}, 267 | "source": [ 268 | "### Categorical Variable Encoding (One-hot & Dummy)\n", 269 | "\n", 270 | "Many machine learning algorithms require that categorical data be encoded numerically in some fashion. 
There are two main ways to do so:\n", 271 | "\n", 272 | "\n", 273 | "- **One-hot-encoding**, which creates `k` new variables for a single categorical variable with `k` categories (or levels), where each new variable is coded with a `1` for the observations that contain that category, and a `0` for each observation that doesn't. \n", 274 | "- **Dummy encoding**, which creates `k-1` new variables for a categorical variable with `k` categories.\n", 275 | "\n", 276 | "However, with some machine learning algorithms we can run into the so-called [\"Dummy Variable Trap\"](https://www.algosome.com/articles/dummy-variable-trap-regression.html) when using One-Hot-Encoding on multiple categorical variables within the same set of features. This occurs because each set of one-hot-encoded variables can be added together across columns to create a single column of all `1`s, and so the encoded columns are multicollinear when multiple one-hot-encoded variables exist within a given model. This can lead to misleading results. \n", 277 | "\n", 278 | "To resolve this, we can simply add an intercept term to our model (which is all `1`s) and remove the first one-hot-encoded variable for each categorical variable, resulting in `k-1` so-called \"Dummy Variables\". \n", 279 | "\n", 280 | "Luckily the `OneHotEncoder` from `sklearn` can perform both one-hot and dummy encoding simply by setting the `drop` parameter (`drop = 'first'` for Dummy Encoding and `drop = None` for One Hot Encoding). \n", 281 | "\n", 282 | "**Question:** How many total columns will there be in the output?" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "id": "a9384a9e-453f-4b62-8bbf-7866b8ac441c", 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "from sklearn.preprocessing import OneHotEncoder\n", 293 | "dummy_e = OneHotEncoder(categories='auto', drop='first', sparse=False)\n", 294 | "dummy_e.fit(data_cat);\n", 295 | "dummy_e.categories_" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "id": "e4091b24-0e57-47e3-a58a-d88826ab5c8b", 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "temp = dummy_e.transform(data_cat)" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "id": "fec19bc9-6aee-48d1-b043-04ab71e4208b", 311 | "metadata": { 312 | "tags": [] 313 | }, 314 | "source": [ 315 | "## Continuous Data Preprocessing\n", 316 | "\n", 317 | "For numeric data, we don't need to create indicator variables; instead, we need to normalize our variables, which helps improve the performance of many machine learning models.\n", 318 | "\n", 319 | " Let's subset out the continuous variables to be normalized." 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "id": "06511352-4ba4-4bb5-8da4-82430ac080a9", 326 | "metadata": { 327 | "tags": [] 328 | }, 329 | "outputs": [], 330 | "source": [ 331 | "data_num = data.drop(columns=cat_var_names + ['species'])\n", 332 | "data_num.head()" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "id": "a13162f8-71d0-4f34-8edb-2b95516b4fa0", 338 | "metadata": {}, 339 | "source": [ 340 | "### Normalization\n", 341 | "\n", 342 | "[Normalization](https://en.wikipedia.org/wiki/Normalization_(statistics)) is a transformation that puts data into some known \"normal\" scale. We use normalization to improve the performance of many machine learning algorithms (see [here](https://en.wikipedia.org/wiki/Feature_scaling)).
There are many forms of normalization, but perhaps the most useful to machine learning algorithms is called the \"z-score\", also known as the standard score. \n", 343 | "\n", 344 | "To z-score normalize the data, we simply subtract the mean of the data, and divide by the standard deviation. This results in data with a mean of `0` and a standard deviation of `1`.\n", 345 | "\n", 346 | "We'll use the `StandardScaler` from `sklearn` to do normalization." 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "id": "19f872ea-59e4-46a6-b366-578f6d0716a7", 353 | "metadata": { 354 | "scrolled": true 355 | }, 356 | "outputs": [], 357 | "source": [ 358 | "from sklearn.preprocessing import StandardScaler\n", 359 | "norm_e = StandardScaler()\n", 360 | "norm_e.fit_transform(data_num).mean(axis=0)\n" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "id": "f71c20c9", 366 | "metadata": {}, 367 | "source": [ 368 | "To check that the normalization works, let's look at the mean and standard deviation of the resulting columns. \n", 369 | "\n", 370 | "**Question:** What should the mean and standard deviation be?" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "id": "1ac3fe89", 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [ 380 | "print('mean:',norm_e.fit_transform(data_num).mean(axis=0))\n", 381 | "print('std:',norm_e.fit_transform(data_num).std(axis=0))" 382 | ] 383 | }, 384 | { 385 | "cell_type": "markdown", 386 | "id": "202c54f4", 387 | "metadata": {}, 388 | "source": [ 389 | "---\n", 390 | "## Challenge 1: Fitting preprocessing functions\n", 391 | "\n", 392 | "The simple imputer, normalization, and one-hot-encoding rely on sklearn functions that are fit to a data set. \n", 393 | "\n", 394 | "1) What is being fit for each of the three functions?\n", 395 | " 1) One Hot Encoding\n", 396 | " 2) Standard Scaler\n", 397 | " 3) Simple Imputer\n", 398 | " \n", 399 | "*YOUR ANSWER HERE*\n", 400 | "\n", 401 | "When we are preprocessing data we have a few options: \n", 402 | "1) Fit on the whole data set\n", 403 | "2) Fit on the training data\n", 404 | "3) Fit on the testing data\n", 405 | "\n", 406 | "Which of the above methods would you use and why?\n", 407 | "\n", 408 | "*YOUR ANSWER HERE*\n", 409 | "\n", 410 | "---\n" 411 | ] 412 | }, 413 | { 414 | "cell_type": "markdown", 415 | "id": "03d7c3bf-c215-4de8-830d-c933ed52c505", 416 | "metadata": {}, 417 | "source": [ 418 | "## Combine it all together\n", 419 | "\n", 420 | "Now let's combine what we've learned to preprocess the entire dataset.\n", 421 | "\n", 422 | "First we will reload the data set to start with a clean copy."
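(An aside before the manual walkthrough: scikit-learn can also bundle the encoding, imputation, and scaling steps into a single object. The sketch below, using `ColumnTransformer` and `Pipeline`, is shown only for reference; this lesson keeps the steps separate so each one is visible. The column lists match the penguins data used here.)

```python
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Route categorical and numerical columns through their own steps.
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(drop='first'), ['island', 'sex']),
    ('num', Pipeline([('impute', SimpleImputer(strategy='mean')),
                      ('scale', StandardScaler())]),
     ['culmen_length_mm', 'culmen_depth_mm',
      'flipper_length_mm', 'body_mass_g']),
])
# Fit on the training split only, then transform both splits:
# X_train_pre = preprocessor.fit_transform(X_train)
# X_test_pre = preprocessor.transform(X_test)
```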
423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": null, 428 | "id": "4b097530", 429 | "metadata": {}, 430 | "outputs": [], 431 | "source": [ 432 | "data = pd.read_csv('../data/penguins.csv')\n", 433 | "data.replace('.', np.nan, inplace=True)\n", 434 | "data = data.dropna(subset='sex')\n" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": null, 440 | "id": "cea1cd98", 441 | "metadata": {}, 442 | "outputs": [], 443 | "source": [ 444 | "# Perform the train-test split\n", 445 | "y = data['species']\n", 446 | "X = data.drop('species', axis=1, inplace=False)\n", 447 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, stratify=y)\n", 448 | "print(X_train.shape)\n" 449 | ] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "id": "bbadb45c", 454 | "metadata": {}, 455 | "source": [ 456 | "We want to fit our preprocessing functions on the training data using `fit_transform`, then `transform` the test data. This more closely resembles what the workflow would look like if you are bringing in brand new test data." 457 | ] 458 | }, 459 | { 460 | "cell_type": "markdown", 461 | "id": "ae2be342-483d-4d5b-b3ba-105b60e2cfeb", 462 | "metadata": {}, 463 | "source": [ 464 | "First, we will subset out the categorical and numerical features separately. " 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": null, 470 | "id": "af05022a-a041-4d01-b189-5ceb5e1e0468", 471 | "metadata": {}, 472 | "outputs": [], 473 | "source": [ 474 | "# Get the categorical and numerical variable column names\n", 475 | "cat_var = ['island', 'sex']\n", 476 | "num_var = ['culmen_length_mm', 'culmen_depth_mm',\n", 477 | " 'flipper_length_mm', 'body_mass_g']\n", 478 | "# Subset the training data\n", 479 | "X_train_cat = X_train[cat_var]\n", 480 | "X_train_num = X_train[num_var]\n", 481 | "\n", 482 | "# Subset the test data\n", 483 | "X_test_cat = X_test[cat_var]\n", 484 | "X_test_num = X_test[num_var]" 485 | ] 486 | }, 487 | { 488 | "cell_type": "markdown", 489 | "id": "9b746b78-8d31-40e9-819e-2273278c2f88", 490 | "metadata": {}, 491 | "source": [ 492 | "Now, let's process the categorical data with **Dummy encoding**." 493 | ] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "execution_count": null, 498 | "id": "c45d20a3-73b9-490c-9f81-23e37fc09a2d", 499 | "metadata": {}, 500 | "outputs": [], 501 | "source": [ 502 | "warnings.filterwarnings('ignore')\n", 503 | "\n", 504 | "# Categorical feature encoding\n", 505 | "X_train_dummy = dummy_e.fit_transform(X_train_cat)\n", 506 | "X_test_dummy = dummy_e.transform(X_test_cat)\n", 507 | "\n", 508 | "\n", 509 | "# Check the shape\n", 510 | "X_train_dummy.shape, X_test_dummy.shape" 511 | ] 512 | }, 513 | { 514 | "cell_type": "markdown", 515 | "id": "0ae07768", 516 | "metadata": {}, 517 | "source": [ 518 | "Now, let's process the numerical data by imputing any missing values and normalizing the results.
519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": null, 524 | "id": "127c7fc4-fd8e-4deb-832a-8e02d82909d6", 525 | "metadata": {}, 526 | "outputs": [], 527 | "source": [ 528 | "# Numerical feature standardization\n", 529 | "\n", 530 | "# Impute the data\n", 531 | "X_train_imp = imputer.fit_transform(X_train_num)\n", 532 | "X_test_imp = imputer.transform(X_test_num)\n", 533 | "\n", 534 | "# Check for missing values\n", 535 | "print(np.isnan(X_train_imp).any(), np.isnan(X_test_imp).any())\n", 536 | "\n", 537 | "# Normalize the imputed data\n", 538 | "X_train_norm = norm_e.fit_transform(X_train_imp)\n", 539 | "X_test_norm = norm_e.transform(X_test_imp)\n", 540 | "\n", 541 | "X_train_norm.shape, X_test_norm.shape" 542 | ] 543 | }, 544 | { 545 | "cell_type": "markdown", 546 | "id": "f309dc2b-bdf8-420c-a3f3-fe93c854c3eb", 547 | "metadata": {}, 548 | "source": [ 549 | "Now that we've processed the numerical and categorical data separately, we can put the two arrays back together." 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": null, 555 | "id": "5a97ace9-bd20-49c0-bae9-bd629a8b7a29", 556 | "metadata": {}, 557 | "outputs": [], 558 | "source": [ 559 | "X_train = np.hstack((X_train_dummy, X_train_norm))\n", 560 | "X_test = np.hstack((X_test_dummy, X_test_norm))\n", 561 | "\n", 562 | "X_train.shape, X_test.shape" 563 | ] 564 | }, 565 | { 566 | "cell_type": "markdown", 567 | "id": "eab00968", 568 | "metadata": {}, 569 | "source": [ 570 | "---\n", 571 | "## Challenge 2: Order of Preprocessing\n", 572 | "\n", 573 | "In the preprocessing, we did the following steps: \n", 574 | "\n", 575 | "1) Dropping null values\n", 576 | "2) One-hot-encoding\n", 577 | "3) Imputation\n", 578 | "4) Normalization\n", 579 | "\n", 580 | "Now, consider that we change the order of the steps in the following ways.
What effect might that have on the algorithms?\n", 581 | "**Hint**: Try copying the code from above and trying it out!\n", 582 | "\n", 583 | "- One-Hot-Encoding before Null Values\n", 584 | "- Normalization before Null Values\n", 585 | "\n", 586 | "**Bonus:** Are there any other switches in order that might affect preprocessing?\n", 587 | "\n", 588 | "---" 589 | ] 590 | }, 591 | { 592 | "cell_type": "code", 593 | "execution_count": null, 594 | "id": "d36e3bd7", 595 | "metadata": {}, 596 | "outputs": [], 597 | "source": [ 598 | "# YOUR CODE HERE" 599 | ] 600 | }, 601 | { 602 | "cell_type": "markdown", 603 | "id": "92c4ecff-fb89-4f71-a7ef-70aa43ccc691", 604 | "metadata": {}, 605 | "source": [ 606 | "Finally, let's save our results as separate `.csv` files, so we won't have to run the preprocessing again.\n", 607 | "\n", 608 | "First we will convert them to DataFrames, add column names, and save them as `.csv` files." 609 | ] 610 | }, 611 | { 612 | "cell_type": "code", 613 | "execution_count": null, 614 | "id": "1f18fab4", 615 | "metadata": {}, 616 | "outputs": [], 617 | "source": [ 618 | "X_train = pd.DataFrame(X_train)\n", 619 | "X_train.columns = ['Dream','Torgersen', 'Male',\n", 620 | " 'culmen_length_mm', 'culmen_depth_mm',\n", 621 | " 'flipper_length_mm', 'body_mass_g']\n", 622 | "\n", 623 | "X_test = pd.DataFrame(X_test)\n", 624 | "\n", 625 | "X_test.columns = ['Dream','Torgersen', 'Male',\n", 626 | " 'culmen_length_mm', 'culmen_depth_mm',\n", 627 | " 'flipper_length_mm', 'body_mass_g']\n", 628 | "y_train = pd.DataFrame(y_train)\n", 629 | "y_train.columns = ['species']\n", 630 | "\n", 631 | "y_test = pd.DataFrame(y_test)\n", 632 | "y_test.columns = ['species']\n", 633 | "\n", 634 | "X_train.to_csv('../data/penguins_X_train.csv')\n", 635 | "X_test.to_csv('../data/penguins_X_test.csv')\n", 636 | "y_train.to_csv('../data/penguins_y_train.csv')\n", 637 | "y_test.to_csv('../data/penguins_y_test.csv')\n" 638 | ] 639 | }, 640 | { 641 | "cell_type": "markdown", 642 | "id": "2a6de745", 643 | "metadata": {}, 644 | "source": [ 645 | "Although we will now move on to classification, all of the choices we make in the preprocessing pipeline are extremely important to machine learning." 646 | ] 647 | }, 648 | { 649 | "cell_type": "markdown", 650 | "id": "06995721", 651 | "metadata": {}, 652 | "source": [ 653 | "---\n", 654 | "## Challenge 3: Preprocessing and regularization\n", 655 | "\n", 656 | "We are preprocessing data in preparation for a classification task down the line. However, preprocessing also applies to regression. \n", 657 | "\n", 658 | "Consider the regularization task applied in the previous notebook.
How might the preprocessing steps affect the performance of regularization?\n", 659 | "\n", 660 | "---" 661 | ] 662 | }, 663 | { 664 | "cell_type": "code", 665 | "execution_count": null, 666 | "id": "b0895317", 667 | "metadata": {}, 668 | "outputs": [], 669 | "source": [ 670 | "# YOUR CODE HERE" 671 | ] 672 | } 673 | ], 674 | "metadata": { 675 | "kernelspec": { 676 | "display_name": "Python 3 (ipykernel)", 677 | "language": "python", 678 | "name": "python3" 679 | }, 680 | "language_info": { 681 | "codemirror_mode": { 682 | "name": "ipython", 683 | "version": 3 684 | }, 685 | "file_extension": ".py", 686 | "mimetype": "text/x-python", 687 | "name": "python", 688 | "nbconvert_exporter": "python", 689 | "pygments_lexer": "ipython3", 690 | "version": "3.9.12" 691 | } 692 | }, 693 | "nbformat": 4, 694 | "nbformat_minor": 5 695 | } 696 | -------------------------------------------------------------------------------- /lessons/04_classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Python Machine Learning: Classification" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "A common task in computational research is to classify an object based on a set of features. In supervised machine learning, we can give an algorithm a dataset of training examples that say \"here are specific features, and this is the target class it belongs to\". With enough training examples, a model can be built that recognizes important features in determining an object's class. This model can then be used to predict the class of an object given its known features.\n", 15 | "\n", 16 | "\n", 17 | "First, let's import the packages that we need for this notebook." 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import pandas as pd\n", 27 | "import numpy as np\n", 28 | "import matplotlib.pyplot as plt\n", 29 | "import seaborn as sb\n", 30 | "\n", 31 | "from sklearn.tree import DecisionTreeClassifier, plot_tree\n", 32 | "from sklearn.linear_model import LogisticRegression\n", 33 | "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n", 34 | "from sklearn.model_selection import train_test_split, cross_val_score, KFold\n", 35 | "from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score, precision_score, f1_score" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## Penguin Data" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "Let's say that we are studying penguins in Antarctica. We have body measurements for a set of penguins of three different species: Adelie, Chinstrap, and Gentoo. We are interested in being able to differentiate between these three species based on the measurements. First, let's take a look at our data set. \n", 50 | "\n", 51 | "\n", 52 | "Now, let's load in our preprocessed `penguins` data set.
\n" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "X_train = pd.read_csv('../data/penguins_X_train.csv')\n", 62 | "X_test = pd.read_csv('../data/penguins_X_test.csv')\n", 63 | "y_train = pd.read_csv('../data/penguins_y_train.csv')\n", 64 | "y_test = pd.read_csv('../data/penguins_y_test.csv')" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "Let's start with just two penguin species: Adelie and Gentoo. " 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "X_train = X_train[y_train['species'].isin(['Adelie','Gentoo'])].reset_index()\n", 81 | "X_test = X_test[y_test['species'].isin(['Adelie','Gentoo'])].reset_index()\n", 82 | "y_train = y_train[y_train['species'].isin(['Adelie','Gentoo'])].reset_index()\n", 83 | "y_test = y_test[y_test['species'].isin(['Adelie','Gentoo'])].reset_index()" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "## Null Accuracy\n", 91 | "\n", 92 | "Let's say that we wanted to assign a species to each unknown measured penguin. One way to do this is to assign all observations to the majority classes. The code below shows the proportion of each species in the training data.\n", 93 | "\n", 94 | "**Question:** If we want to maximize accuracy, which species label would we assign to all observations? " 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": { 101 | "scrolled": true 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "y_train.value_counts('species')/sum(y_train.value_counts('species'))" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "This accuracy is our **baseline model**, and is the number that we will try to improve on with classification." 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "Let's get to know our dataset by conducting some exploratory data analysis. We'll be using some rudimentary data analysis to see there's a relationship between the independent variables across species." 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "Let's say that we decide that body mass might be a good way to differentiate between Adelie and Gentoo penguins. We can look at a plot of the histogram to see how the distribution of this variable changes between species.\n", 127 | "\n", 128 | "**Question**: Where would you place a line to minimize the overlap in the distribution? " 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "scrolled": true 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "sb.histplot(data=X_train.loc[y_train['species'].isin(['Adelie','Gentoo'])],\n", 140 | " x = 'body_mass_g',\n", 141 | " hue = y_train['species'],kde=True,bins=20)\n", 142 | "#plt.axvline(.28,color= 'red')" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "Now let's apply this same decision boundary to the test data. \n", 150 | "\n", 151 | "**Question:** Is this still the best boundary?" 
152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "sb.histplot(data=X_test.loc[y_test['species'].isin(['Gentoo','Adelie'])],\n", 161 | " x = 'body_mass_g',\n", 162 | " hue = y_test['species'],kde=True,bins=20)\n", 163 | "#plt.axvline(.28,color= 'red')" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "This is the basic goal of classification. Based on your boundary criteria, you would **classify** each of the penguins. However, there would be some error involved. We can be more confident in our classification at the far ends of the distribution, and less confident where the distributions overlap. \n" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "Now let's figure out how to separate out these groups mathematically. For this, we will start by using an algorithm called Logistic Regression." 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "## Logistic Regression\n", 185 | "\n", 186 | "Logistic regression is a supervised classification algorithm that is used to predict a binary outcome. Similar to linear regression, this model uses coefficients or betas to make its predictions. However, unlike linear regression, its predictions range from 0 to 1, where values near 0 and 1 indicate a confident prediction of class A or class B, respectively. Predictions in the middle of that range indicate less confidence.\n", 187 | "\n", 188 | "The function for the logistic regression is:\n", 189 | "$$ p(x) = \\frac{1}{1 + e^{-(\\beta_0+\\beta_1x_1 + \\ldots)}}$$\n", 190 | "\n", 191 | "where $\\beta$ are the learned parameters and $x$ are the input features.\n" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "Let's train a logistic regression model on the variable `body_mass_g`." 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "### Modeling with Logistic Regression" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "\n", 213 | "Logistic regression uses the same general steps as many other `sklearn` algorithms:\n", 214 | "1. Initialize Model\n", 215 | "2. Fit model on training data\n", 216 | "3. Evaluate on training and testing datasets" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "#1) Initialize Model\n", 226 | "lr = LogisticRegression(max_iter=170)\n", 227 | "\n", 228 | "#2) Fit model\n", 229 | "lr.fit(X_train['body_mass_g'].values.reshape(-1, 1), y_train['species'])\n", 230 | "\n", 231 | "#3) Evaluate \n", 232 | "train_score = lr.score(X_train['body_mass_g'].values.reshape(-1, 1), y_train['species'])\n", 233 | "test_score = lr.score(X_test['body_mass_g'].values.reshape(-1, 1), y_test['species'])\n", 234 | "\n", 235 | "print(\"Training score:\", train_score.round(3), \"Testing score:\", test_score.round(3))" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "**Question:** How well did the model do compared to baseline?"
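To tie the fitted model back to the logistic function defined above, here is a quick sketch (using the `lr` model and the imports already in scope) that computes $p(x)$ by hand from `lr.intercept_` and `lr.coef_` and checks it against `predict_proba`:

```python
# Compute p(x) = 1 / (1 + e^-(b0 + b1*x)) manually and compare it with
# the probability scikit-learn reports for the second class (Gentoo).
x = X_test['body_mass_g'].values
manual_p = 1 / (1 + np.exp(-(lr.intercept_[0] + lr.coef_[0][0] * x)))
sklearn_p = lr.predict_proba(x.reshape(-1, 1))[:, 1]
print(np.allclose(manual_p, sklearn_p))  # Should print True
```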
243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": {}, 248 | "source": [ 249 | "## Multivariate Logistic Regression\n", 250 | "\n", 251 | "\n", 252 | "The logistic regression did a pretty good job of classifying the penguins. However, we have more than just body mass on which to base our species decision. For example, let's look at the combination of culmen depth and body mass in our data by using a scatterplot.\n", 253 | "\n", 254 | "In two-dimensional space, the intuition is that we want to draw a line that separates the classes. \n", 255 | "\n", 256 | "**Question:** Is it possible to draw a line that separates the groups? If it is, this is a **linearly separable** problem." 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": { 263 | "scrolled": true 264 | }, 265 | "outputs": [], 266 | "source": [ 267 | "sb.scatterplot(data=X_train.loc[y_train['species'].isin(['Adelie','Gentoo'])],\n", 268 | " x = 'culmen_depth_mm',\n", 269 | " y = 'body_mass_g',\n", 270 | " hue = y_train['species'])" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": [ 277 | "Let's retrain the logistic model with two variables." 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [ 286 | "lr = LogisticRegression(max_iter=170)\n", 287 | "lr.fit(X_train[['body_mass_g','culmen_depth_mm']], y_train['species'])\n", 288 | "\n", 289 | "train_score = lr.score(X_train[['body_mass_g','culmen_depth_mm']], y_train['species'])\n", 290 | "test_score = lr.score(X_test[['body_mass_g','culmen_depth_mm']], y_test['species'])\n", 291 | "\n", 292 | "print(\"Training score = {}, testing score = {}\".format(train_score.round(3), test_score.round(3)))" 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": {}, 298 | "source": [ 299 | "While this doesn't happen often in real life, we got a perfect score! We could add more features to the model, but there isn't a need since our model is already performing perfectly. Now let's take a look at the coefficients of the model. We reference the `lr.coef_` attribute to see the coefficients." 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": null, 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [ 308 | "\n", 309 | "coef = pd.Series(index=['body_mass_g','culmen_depth_mm'], data=lr.coef_[0])\n", 310 | "\n", 311 | "coef.sort_values()" 312 | ] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "metadata": {}, 317 | "source": [ 318 | "**Question:** What do you think the *magnitude* and *sign* of the coefficients mean about how these variables are related to each category?\n", 319 | "**Hint:** Refer back to the scatter plot!"
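]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"One lens on that question (a sketch, with the illustrative name `odds_ratios`): exponentiating a logistic regression coefficient gives an odds ratio, the multiplicative change in the odds of the positive class for a one-unit increase in that feature. Values above 1 push a penguin toward the positive class (`lr.classes_[1]`); values below 1 push it away.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np  # in case it isn't already imported at the top of the notebook\n",
"\n",
"# Odds ratios: the multiplicative effect of a one-unit feature increase\n",
"# on the odds of the positive class (the second label in lr.classes_)\n",
"odds_ratios = np.exp(coef)\n",
"\n",
"print('Positive class:', lr.classes_[1])\n",
"print(odds_ratios.sort_values())"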
320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": {}, 325 | "source": [ 326 | "## Model evaluation" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": {}, 332 | "source": [ 333 | "We've already covered accuracy, but there is a whole litany of other ways to evaluate the performance of a classification model.\n", 334 | "\n", 335 | "In a binary classification task, there are four major types of predictions:\n", 336 | "\n", 337 | "[Confusion Matrix (Wikipedia)](https://en.wikipedia.org/wiki/Confusion_matrix): \n", 338 | "- true positive (TP): A test result that correctly indicates the presence of a condition or characteristic\n", 339 | "- true negative (TN): A test result that correctly indicates the absence of a condition or characteristic\n", 340 | "- false positive (FP): A test result which wrongly indicates that a particular condition or attribute is present\n", 341 | "- false negative (FN): A test result which wrongly indicates that a particular condition or attribute is absent\n", 342 | "\n", 343 | "\n", 344 | "Accuracy, the most common metric used with classification, can be characterized as:\n", 345 | "\n", 346 | "$$ Accuracy= \\frac{\\sum{\\text{True Positives}}+\\sum{\\text{True Negatives}}}{\\sum{\\text{Total Population}}}$$" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": {}, 352 | "source": [ 353 | "We can combine the prediction measures above to create three helpful metrics for evaluating classification: **precision**, **recall**, and **specificity**. " 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "\n", 361 | "1. **Precision**: \n", 362 | "$$\\frac{\\sum{\\text{True Positives}}}{\\sum{\\text{Predicted Positives}}}$$\n", 363 | "2. **Recall** (or **Sensitivity**): \n", 364 | "$$\\frac{\\sum{\\text{True Positives}}}{\\sum{\\text{Condition Positives}}}$$ \n", 365 | "3. **Specificity** (like recall for negative examples): \n", 366 | "$$\\frac{\\sum{\\text{True Negatives}}}{\\sum{\\text{Condition Negatives}}}$$\n" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": {}, 372 | "source": [ 373 | "Let's make a confusion matrix and derive the recall and precision scores." 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "metadata": {}, 379 | "source": [ 380 | "First, let's go back to the original (not perfect) model so we can see what these rates look like.\n", 381 | "\n", 382 | "We will retrain the model and make predictions on the test set." 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": null, 388 | "metadata": {}, 389 | "outputs": [], 390 | "source": [ 391 | "lr.fit(X_train['body_mass_g'].values.reshape(-1, 1), y_train['species'])\n", 392 | "preds = lr.predict(X_test[['body_mass_g']])" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": null, 398 | "metadata": {}, 399 | "outputs": [], 400 | "source": [ 401 | "# Pass y_test and preds into confusion_matrix\n", 402 | "confusion_matrix(y_test['species'], preds)" 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": {}, 408 | "source": [ 409 | "## Challenge 1: Model Evaluation\n", 410 | "\n", 411 | "1). What are the TP, FP, TN, FN in these model results?\n", 412 | "\n", 413 | "2). What is the precision and recall for this model?\n", 414 | "\n", 415 | "3). Which is more important, precision or recall?"
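]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you want to check your answers numerically, here is a short sketch using `sklearn.metrics` (assuming the `preds` computed above, and treating 'Gentoo' as the positive class; swap `pos_label` to take the other species' perspective):\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import precision_score, recall_score\n",
"\n",
"# pos_label picks which species counts as the 'positive' class\n",
"print('Precision:', precision_score(y_test['species'], preds, pos_label='Gentoo').round(3))\n",
"print('Recall:', recall_score(y_test['species'], preds, pos_label='Gentoo').round(3))"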
416 | ] 417 | }, 418 | { 419 | "cell_type": "markdown", 420 | "metadata": {}, 421 | "source": [ 422 | "Depending on your task, metrics other than accuracy might be more useful for understanding your model's performance. At the very least, examining the confusion matrix is a great way to get a better sense of how your model is performing across classes." 423 | ] 424 | }, 425 | { 426 | "cell_type": "markdown", 427 | "metadata": {}, 428 | "source": [ 429 | "## Decision Trees\n", 430 | "\n", 431 | "Let's now include all three species of penguin that we want to differentiate between. We can turn to other models that can handle two or more classes for classification. One such example is the Decision Tree Classifier. In terms of logic, this is like a flow chart.\n", 432 | "\n", 433 | "\n", 434 | "In this flow chart, the observation is a lamp that doesn't work, the features are information about how the lamp doesn't work, and the class is the action taken at the end.\n", 435 | "\n", 436 | "![Alt](https://upload.wikimedia.org/wikipedia/commons/9/91/LampFlowchart.svg)" 437 | ] 438 | }, 439 | { 440 | "cell_type": "markdown", 441 | "metadata": {}, 442 | "source": [ 443 | "While the ultimate goal of classification remains the same, machine learning algorithms vary widely in terms of *how* they go about this task. The neat thing about `sklearn` is that many algorithms use the same syntax, which makes comparing their performance on a task fairly straightforward. However, each model will have different underlying parameters and methods to identify the optimal split. When you are using a new model, it is helpful to read up on how the model works. \n", 444 | "\n", 445 | "The documentation is a great way to do that.\n", 446 | "Read the [documentation](https://scikit-learn.org/stable/modules/tree.html#tree) for the Decision Tree and let's try to answer the following questions:\n", 447 | "\n", 448 | "1). What are two advantages and two disadvantages of the Decision Tree?\n", 449 | "2). What measure do Decision Trees use to determine the optimal split?\n", 450 | "3). How do you import the Decision Tree from sklearn?\n", 451 | "\n", 452 | "**Decision Trees** are a classification/regression supervised learning algorithm that uses a series of splits to make its predictions.\n", 453 | "\n", 454 | "Decision Trees learn from the data by picking the feature-threshold combination that maximizes the information gain of the target variable. In other words, it chooses a splitting point that produces the purest possible class proportions in the target variable. The goal of the model is to keep splitting until all the data in a terminal node or leaf are exclusively one class.\n", 455 | "\n", 456 | "The model iterates through a set of candidate values for each feature, calculates the impurity of each resulting split, and the split that produces the lowest impurity (equivalently, the highest information gain) is the designated split." 457 | ] 458 | }, 459 | { 460 | "cell_type": "markdown", 461 | "metadata": {}, 462 | "source": [ 463 | "**Parameters**\n", 464 | "\n", 465 | "There are many [parameters](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier) for the Decision Tree Classifier. A few relevant to this notebook are described here:\n", 466 | "\n", 467 | "**criterion**: The function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain.\n", 468 | "\n", 469 | "**splitter**: The strategy used to choose the split at each node. 
Supported strategies are “best” to choose the best split and “random” to choose the best random split.\n", 470 | "\n", 471 | "**max_depth**: The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.\n", 472 | "\n", 473 | "**min_samples_split**: The minimum number of samples required to split an internal node.\n", 474 | "\n", 475 | "**min_samples_leaf**: The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression.\n", 476 | "\n", 477 | "**max_features**: The number of features to consider when looking for the best split." 478 | ] 479 | }, 480 | { 481 | "cell_type": "markdown", 482 | "metadata": {}, 483 | "source": [ 484 | "Now let's train a decision tree model on the penguins data set. We are going to start with a default DT model, meaning we're not going to pass in any parameters of our own. Like we did before, we are going to fit a model and then evaluate it on the training and testing datasets. Let's start with a single x-feature." 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": null, 490 | "metadata": {}, 491 | "outputs": [], 492 | "source": [ 493 | "# Initialize model\n", 494 | "dt = DecisionTreeClassifier()\n", 495 | "\n", 496 | "# Fit model on the dataset\n", 497 | "dt.fit(X_train[['body_mass_g']], y_train['species'])\n", 498 | "\n", 499 | "# Derive the training accuracy score\n", 500 | "dt.score(X_train[['body_mass_g']], y_train['species'])" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "metadata": {}, 507 | "outputs": [], 508 | "source": [ 509 | "# Test score\n", 510 | "dt.score(X_test[['body_mass_g']], y_test['species'])" 511 | ] 512 | }, 513 | { 514 | "cell_type": "markdown", 515 | "metadata": {}, 516 | "source": [ 517 | "**Question:** Our testing score is considerably lower. When the testing score is lower than the training score, what does that mean?" 518 | ] 519 | }, 520 | { 521 | "cell_type": "markdown", 522 | "metadata": {}, 523 | "source": [ 524 | "We can take advantage of some of the parameters of the decision tree in order to help prevent overfitting of the model. Let's try a model in which we impose some constraints on the tree.\n", 525 | "\n", 526 | "**Question:** From the documentation, what is one parameter that might help?" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "metadata": {}, 533 | "outputs": [], 534 | "source": [ 535 | "# Initialize\n", 536 | "dt = DecisionTreeClassifier(max_depth=2)\n", 537 | "# Fit \n", 538 | "dt.fit(X_train[['body_mass_g']], y_train['species'])\n", 539 | "\n", 540 | "# Evaluate\n", 541 | "train_score = dt.score(X_train[['body_mass_g']], y_train['species'])\n", 542 | "test_score = dt.score(X_test[['body_mass_g']], y_test['species'])\n", 543 | "\n", 544 | "print(\"Our training score is {} and our testing score is {}\".format(train_score.round(3), test_score.round(3)))" 545 | ] 546 | }, 547 | { 548 | "cell_type": "markdown", 549 | "metadata": {}, 550 | "source": [ 551 | "The gap between the two scores is considerably smaller. Arguably, we no longer have an overfit model. However, we could likely improve on the accuracy of this model by including more features, as shown in the quick sketch below."
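]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sketch of that last point (the `dt2` name is just illustrative, and any pair of columns would do), here is the same constrained tree fit on two features instead of one:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Same constrained tree, now with a second feature\n",
"dt2 = DecisionTreeClassifier(max_depth=2)\n",
"dt2.fit(X_train[['body_mass_g', 'culmen_depth_mm']], y_train['species'])\n",
"\n",
"train_score = dt2.score(X_train[['body_mass_g', 'culmen_depth_mm']], y_train['species'])\n",
"test_score = dt2.score(X_test[['body_mass_g', 'culmen_depth_mm']], y_test['species'])\n",
"\n",
"print('Our training score is {} and our testing score is {}'.format(train_score.round(3), test_score.round(3)))"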
552 | ] 553 | }, 554 | { 555 | "cell_type": "markdown", 556 | "metadata": {}, 557 | "source": [ 558 | "### Tree Visualization\n", 559 | "\n", 560 | "One big advantage of the Decision Tree is that it can be visualized no matter how many features are involved.\n", 561 | "\n", 562 | "Let's retrain it with a small `max_depth`." 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": null, 568 | "metadata": {}, 569 | "outputs": [], 570 | "source": [ 571 | "dt = DecisionTreeClassifier(max_depth=2)\n", 572 | "dt.fit(X_train[['body_mass_g']], y_train['species'])" 573 | ] 574 | }, 575 | { 576 | "cell_type": "markdown", 577 | "metadata": {}, 578 | "source": [ 579 | "**Question:** What is the first criterion used to split the decision tree? " 580 | ] 581 | }, 582 | { 583 | "cell_type": "code", 584 | "execution_count": null, 585 | "metadata": {}, 586 | "outputs": [], 587 | "source": [ 588 | "plt.figure(figsize=(28, 20))\n", 589 | "plot_tree(dt, feature_names=['body_mass_g'], class_names=[\"Adelie\", \"Chinstrap\",\"Gentoo\"], \n", 590 | " filled = True, proportion=True, fontsize=18\n", 591 | " );" 592 | ] 593 | }, 594 | { 595 | "cell_type": "markdown", 596 | "metadata": {}, 597 | "source": [ 598 | "Using the tree, how would we make predictions about the following penguins?\n", 599 | "\n", 600 | "\n", 601 | " - Penguin A: Body Mass of .5\n", 602 | " - Penguin B: Body Mass of 0" 603 | ] 604 | }, 605 | { 606 | "cell_type": "markdown", 607 | "metadata": {}, 608 | "source": [ 609 | "## Challenge 2: Classification with SVM\n", 610 | "\n", 611 | "Now let's try another new model. The [Support Vector Machine](https://scikit-learn.org/stable/modules/svm.html#classification) is another class of machine learning algorithm that is used for classification. \n", 612 | "\n", 613 | "Choose two features of the data set to train your model on. Then, using the documentation for the support vector machine, follow the steps to:\n", 614 | "- Initialize the model\n", 615 | "- Fit it to the training data\n", 616 | "- Evaluate the model on both the training and testing data\n", 617 | "\n", 618 | "Is your model underfit? Is it overfit?\n", 619 | "\n", 620 | "How does SVM fit in with the **linearly separable** problem identified in the scatter plots above?"
621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": null, 626 | "metadata": {}, 627 | "outputs": [], 628 | "source": [ 629 | "## YOUR CODE HERE\n", 630 | "from sklearn.svm import SVC\n", 631 | "X_train_subset = X_train[['feature1','feature2']]\n", 632 | "X_test_subset = X_test[['feature1','feature2']]\n", 633 | "y_train_subset = y_train['species']\n", 634 | "y_test_subset = y_test['species']\n", 635 | "\n", 636 | "##1) Initialize SVM\n", 637 | "\n", 638 | "##2) Train SVM on Training data \n", 639 | "\n", 640 | "##3) Evaluate SVM on Training and Test Data" 641 | ] 642 | }, 643 | { 644 | "cell_type": "code", 645 | "execution_count": null, 646 | "metadata": {}, 647 | "outputs": [], 648 | "source": [] 649 | } 650 | ], 651 | "metadata": { 652 | "anaconda-cloud": {}, 653 | "hide_input": false, 654 | "kernelspec": { 655 | "display_name": "Python 3 (ipykernel)", 656 | "language": "python", 657 | "name": "python3" 658 | }, 659 | "language_info": { 660 | "codemirror_mode": { 661 | "name": "ipython", 662 | "version": 3 663 | }, 664 | "file_extension": ".py", 665 | "mimetype": "text/x-python", 666 | "name": "python", 667 | "nbconvert_exporter": "python", 668 | "pygments_lexer": "ipython3", 669 | "version": "3.9.12" 670 | }, 671 | "toc": { 672 | "base_numbering": 1, 673 | "nav_menu": {}, 674 | "number_sections": false, 675 | "sideBar": true, 676 | "skip_h1_title": false, 677 | "title_cell": "Table of Contents", 678 | "title_sidebar": "Contents", 679 | "toc_cell": false, 680 | "toc_position": {}, 681 | "toc_section_display": "block", 682 | "toc_window_display": true 683 | }, 684 | "varInspector": { 685 | "cols": { 686 | "lenName": 16, 687 | "lenType": 16, 688 | "lenVar": 40 689 | }, 690 | "kernels_config": { 691 | "python": { 692 | "delete_cmd_postfix": "", 693 | "delete_cmd_prefix": "del ", 694 | "library": "var_list.py", 695 | "varRefreshCmd": "print(var_dic_list())" 696 | }, 697 | "r": { 698 | "delete_cmd_postfix": ") ", 699 | "delete_cmd_prefix": "rm(", 700 | "library": "var_list.r", 701 | "varRefreshCmd": "cat(var_dic_list()) " 702 | } 703 | }, 704 | "types_to_exclude": [ 705 | "module", 706 | "function", 707 | "builtin_function_or_method", 708 | "instance", 709 | "_Feature" 710 | ], 711 | "window_display": false 712 | } 713 | }, 714 | "nbformat": 4, 715 | "nbformat_minor": 4 716 | } 717 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib 2 | numpy>=1.16.3 3 | pandas>=0.24.2 4 | scipy>=1.3.1 5 | scikit-learn>=0.22.0 6 | tpot 7 | -------------------------------------------------------------------------------- /solutions/01_regression_solutions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "a027a99e-1ac6-4336-b87a-f0d5d79e22e2", 6 | "metadata": {}, 7 | "source": [ 8 | "# Python Machine Learning: Regression Solutions" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "8546cbf5-1c72-40c5-be75-234d1c3c9f3b", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import pandas as pd\n", 19 | "import numpy as np\n", 20 | "import matplotlib.pyplot as plt\n", 21 | "\n", 22 | "from sklearn.linear_model import LinearRegression\n", 23 | "from sklearn.model_selection import train_test_split\n", 24 | "\n", 25 | "%matplotlib inline" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | 
"execution_count": null, 31 | "id": "772d7956-975b-4489-8336-40dc93e3f528", 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "data = pd.read_csv('../data/auto-mpg.csv')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "id": "59c90903-4781-4a14-a73f-5322e7003705", 41 | "metadata": {}, 42 | "source": [ 43 | "---\n", 44 | "### Challenge 1: More EDA\n", 45 | "\n", 46 | "Create the following plots, or examine the following distributions, while exploring your data:\n", 47 | "\n", 48 | "1. A histogram of the displacement.\n", 49 | "2. A histogram of the horsepower.\n", 50 | "3. A histogram of the weight.\n", 51 | "4. A histogram of the acceleration.\n", 52 | "5. What are the unique model years, and their counts?\n", 53 | "6. What are the unique origin values, and their counts?\n", 54 | "\n", 55 | "---" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "id": "859bccf7-82fa-4095-a6ff-523ef9eb7759", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "ax = data['displacement'].hist(grid=False, bins=np.linspace(75, 450, 15))\n", 66 | "ax.set_xlabel('Displacement')\n", 67 | "ax.set_ylabel('Frequency')\n", 68 | "plt.show()" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "id": "631de034-f513-4199-9e76-e2a1388d0475", 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "ax = data['horsepower'].hist(grid=False, bins=np.linspace(45, 230, 15))\n", 79 | "ax.set_xlabel('Horsepower')\n", 80 | "ax.set_ylabel('Frequency')\n", 81 | "plt.show()" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "id": "0b5c0f99-584f-4d52-ad12-051eeb238067", 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "ax = data['weight'].hist(grid=False)\n", 92 | "ax.set_xlabel('Weight')\n", 93 | "ax.set_ylabel('Frequency')\n", 94 | "plt.show()" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "id": "95c88602-8d09-4b1c-ab93-d7a0329cee4f", 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "ax = data['acceleration'].hist(grid=False)\n", 105 | "ax.set_xlabel('Acceleration')\n", 106 | "ax.set_ylabel('Frequency')\n", 107 | "plt.show()" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "id": "e40bdacb-9b47-491a-995c-961430fcb4b2", 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "data['model year'].value_counts().sort_index()" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "id": "d56a338a-1929-4c19-a7bc-3beeb7045335", 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "data['origin'].value_counts().sort_index()" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "id": "c391bc78-fb9c-441c-8c04-e6708645c157", 133 | "metadata": {}, 134 | "source": [ 135 | "---\n", 136 | "### Challenge 2: Mean Absolute Error\n", 137 | "\n", 138 | "Another commonly used metric in regression is the **Mean Absolute Error (MAE)**. As the name suggests, this can be calculated by taking the mean of the absolute errors. Calculate the mean absolute error on the training and test data with your trained model. 
We've imported the MAE for you below:\n", 139 | "\n", 140 | "---" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "id": "5b6f6d56-5967-468c-bcd2-0ceb8819e630", 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "# Remove the response variable and car name\n", 151 | "X = data.drop(columns=['car name', 'mpg'])\n", 152 | "# Assign response variable to its own variable\n", 153 | "y = data['mpg'].astype(np.float64)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "id": "edc3dbcb-9610-4342-96a3-5a4b7d400a15", 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "id": "f6eb59e4-1597-468e-b18d-ef5ecc519caf", 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "model = LinearRegression()" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "id": "0994de85-ae86-43aa-9fe1-0ded209edbc9", 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "model.fit(X_train, y_train)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "id": "8c54289e-f6d0-4892-84bb-8728d8591402", 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "y_train_pred = model.predict(X_train)\n", 194 | "y_test_pred = model.predict(X_test)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "id": "1a7e56aa-35d8-4066-9fe1-29de73c359c3", 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "from sklearn.metrics import mean_absolute_error\n", 205 | "print(mean_absolute_error(y_train, y_train_pred))\n", 206 | "print(mean_absolute_error(y_test, y_test_pred))" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "id": "c4205dbf-87e5-4bbc-97e2-f80c3bde8530", 212 | "metadata": {}, 213 | "source": [ 214 | "---\n", 215 | "### Challenge 3: Feature Engineering\n", 216 | "\n", 217 | "You might notice that the `origin` variable has only three values. So, it's really a categorical variable, where each sample has one of three origins. In this scenario, we've treated it like a continuous variable. \n", 218 | "\n", 219 | "How can we properly treat this variable as categorical? This is a question of preprocessing and **feature engineering**.\n", 220 | "\n", 221 | "What we can do is replace the `origin` feature with two binary variables. The first tells us whether origin is equal to 2. The second tells us whether origin is equal to 3. If both are false, that means origin is equal to 1.\n", 222 | "\n", 223 | "By fitting a linear regression with these two binary features rather than treating `origin` as continuous, we can get a better sense for how the origin impacts the MPG.\n", 224 | "\n", 225 | "Create two new binary features corresponding to origin, and then recreate the training and test data. Then, fit a linear model to the new data. 
What do you find about the performance and new coefficients?\n", 226 | "\n", 227 | "---" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "id": "651f4a11-aa7f-45d5-84de-d3c6f8b551bd", 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "data['origin_2'] = (data['origin'] == 2).astype('int')\n", 238 | "data['origin_3'] = (data['origin'] == 3).astype('int')" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "id": "0ba5b282-fb1f-4550-a2e6-ce156ae4bb51", 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "# Remove the response variable and car name\n", 249 | "X = data.drop(columns=['car name', 'mpg', 'origin'])\n", 250 | "# Assign response variable to its own variable\n", 251 | "y = data['mpg'].astype(np.float64)" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "id": "b633c0f1-de8a-46ad-a573-7b37b50089a9", 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "# Split\n", 262 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)\n", 263 | "# Fit model\n", 264 | "model = LinearRegression()\n", 265 | "model.fit(X_train, y_train)\n", 266 | "# Evaluate model\n", 267 | "print(model.score(X_test, y_test))\n", 268 | "print(model.coef_)" 269 | ] 270 | } 271 | ], 272 | "metadata": { 273 | "kernelspec": { 274 | "display_name": "nlp", 275 | "language": "python", 276 | "name": "nlp" 277 | }, 278 | "language_info": { 279 | "codemirror_mode": { 280 | "name": "ipython", 281 | "version": 3 282 | }, 283 | "file_extension": ".py", 284 | "mimetype": "text/x-python", 285 | "name": "python", 286 | "nbconvert_exporter": "python", 287 | "pygments_lexer": "ipython3", 288 | "version": "3.9.7" 289 | } 290 | }, 291 | "nbformat": 4, 292 | "nbformat_minor": 5 293 | } 294 | -------------------------------------------------------------------------------- /solutions/02_regularization_solutions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "cb96b065-4e7f-4836-8697-fd8137f80185", 6 | "metadata": {}, 7 | "source": [ 8 | "# Python Machine Learning: Regularization Solutions" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "c2e63187-653f-4b31-9ed5-567a5f20b280", 14 | "metadata": {}, 15 | "source": [ 16 | "---\n", 17 | "### Challenge 1: Warm-Up\n", 18 | "\n", 19 | "Before we get started, let's warm up by importing our data and performing a train test split. We've provided the importing code for you. 
Go ahead and split the data into train/test sets using an 80/20 split, and a random state of 23.\n", 20 | "\n", 21 | "---" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "id": "fcc3bf13-a08f-43df-a10b-bde060265645", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "import pandas as pd\n", 32 | "import numpy as np\n", 33 | "\n", 34 | "from sklearn.linear_model import Lasso, LinearRegression, Ridge, RidgeCV\n", 35 | "from sklearn.metrics import mean_squared_error\n", 36 | "from sklearn.model_selection import train_test_split" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "id": "3daed737-14ca-48e7-bd17-86f63c9cea5e", 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "# Import data\n", 47 | "data = pd.read_csv('../data/auto-mpg.csv')\n", 48 | "# Remove the response variable and car name\n", 49 | "X = data.drop(columns=['car name', 'mpg'])\n", 50 | "# Assign response variable to its own variable\n", 51 | "y = data['mpg'].astype(np.float64)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "id": "74f27e92-06ac-4136-bf0c-77fa8fdc5a32", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "id": "799213f6-7e18-43fe-ae78-aa65e66e26f7", 67 | "metadata": {}, 68 | "source": [ 69 | "---\n", 70 | "### Challenge 2: Benchmarking\n", 71 | "\n", 72 | "Re-run the ordinary least squares on the data using `LinearRegression`. Then, create a new ridge regression where the `alpha` penalty is set equal to zero. How do the performances of these models compare to each other? How do they compare with the original ridge regression? 
Be sure to compare both the training performances and test performances.\n", 73 | "\n", 74 | "---" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "id": "58941489-36cd-4ad2-8b0d-25cc83151519", 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "from sklearn.linear_model import Ridge\n", 85 | "# Create models\n", 86 | "ridge = Ridge(\n", 87 | " # Regularization penalty\n", 88 | " alpha=10,\n", 89 | " random_state=1)\n", 90 | "# Fit object\n", 91 | "ridge.fit(X_train, y_train)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "id": "ecb16cc3-6328-451a-917c-7d47a0a39cbd", 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "# Linear regression\n", 102 | "ols = LinearRegression()\n", 103 | "ols.fit(X_train, y_train)\n", 104 | "# Ridge, no penalty\n", 105 | "ridge2 = Ridge(alpha=0, random_state=2) \n", 106 | "ridge2.fit(X_train, y_train)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "id": "5116c014-77cc-4456-85b4-383d80087808", 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "# Evaluate\n", 117 | "print(f'Training R^2, Original Ridge: {ridge.score(X_train, y_train)}')\n", 118 | "print(f'Test R^2, Original Ridge: {ridge.score(X_test, y_test)}')\n", 119 | "print(f'Training R^2, OLS: {ols.score(X_train, y_train)}')\n", 120 | "print(f'Test R^2, OLS: {ols.score(X_test, y_test)}')\n", 121 | "print(f'Training R^2, Ridge with no penalty: {ridge2.score(X_train, y_train)}')\n", 122 | "print(f'Test R^2, Ridge with no penalty: {ridge2.score(X_test, y_test)}')" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "id": "9ce10e26-c494-4a21-9274-7e70801c2594", 128 | "metadata": {}, 129 | "source": [ 130 | "- Ridge with no penalty is the same as OLS.\n", 131 | "- Ridge regression with a penalty has slightly worse training performance, but slightly better test performance." 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "id": "27cc54c3-f936-4c95-ac6b-57c07bbc371f", 137 | "metadata": {}, 138 | "source": [ 139 | "---\n", 140 | "### Challenge 3: Performing a Lasso Fit\n", 141 | "\n", 142 | "Below, we've imported the `Lasso` object from `scikit-learn` for you. Just like `Ridge`, it needs to know what the strength of the regularization penalty is before fitting to the data. \n", 143 | "\n", 144 | "Fit several Lasso models, with different regularization strengths. Try one with a regularization strength of zero, try one with a small but non-zero regularization strength, and try one with a very large regularization strength. Look at the coefficients. 
What do you notice?\n", 145 | "\n", 146 | "---" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "id": "263cd73b-3b2d-4161-b3d5-a24f1bd7488a", 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "lasso1 = Lasso(alpha=0.01)\n", 157 | "lasso1.fit(X_train, y_train)\n", 158 | "lasso1.coef_" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "id": "efc7b978-efb3-4766-8f95-8c16c51bbcd7", 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "lasso2 = Lasso(alpha=10)\n", 169 | "lasso2.fit(X_train, y_train)\n", 170 | "lasso2.coef_" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "id": "9c31efb4-933d-4fd5-9bfc-d4ec2b98ac46", 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "lasso3 = Lasso(alpha=10000)\n", 181 | "lasso3.fit(X_train, y_train)\n", 182 | "lasso3.coef_" 183 | ] 184 | } 185 | ], 186 | "metadata": { 187 | "kernelspec": { 188 | "display_name": "nlp", 189 | "language": "python", 190 | "name": "nlp" 191 | }, 192 | "language_info": { 193 | "codemirror_mode": { 194 | "name": "ipython", 195 | "version": 3 196 | }, 197 | "file_extension": ".py", 198 | "mimetype": "text/x-python", 199 | "name": "python", 200 | "nbconvert_exporter": "python", 201 | "pygments_lexer": "ipython3", 202 | "version": "3.9.7" 203 | } 204 | }, 205 | "nbformat": 4, 206 | "nbformat_minor": 5 207 | } 208 | -------------------------------------------------------------------------------- /solutions/03_preprocessing_solutions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "51dbac37", 6 | "metadata": {}, 7 | "source": [ 8 | "## Challenge 1: Fitting preprocessing functions\n", 9 | "\n", 10 | "The simple imputer, normalization and one-hot-encoding rely on sklearn functions that are fit to a data set. \n", 11 | "\n", 12 | "1) What is being fit for each of the three functions?\n", 13 | "\n", 14 | "**Solution:**\n", 15 | "\n", 16 | " 1) One Hot Encoding - Levels for each categorical variable\n", 17 | " \n", 18 | " 2) Standard Scaler - Mean / std deviation for each column\n", 19 | " \n", 20 | " 3) Simple Imputer - Mean for each column\n", 21 | " \n", 22 | "\n", 23 | "When we are preprocessing data we have a few options: \n", 24 | "1) Fit on the whole data set\n", 25 | "2) Fit on the training data\n", 26 | "3) Fit on the testing data\n", 27 | "\n", 28 | "Which of the above methods would you use and why?\n", 29 | "\n", 30 | "**Solution:** Best practice is to fit on the training data. This avoids **data leakage** or influence of test data information on training data." 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "7d9b2692", 36 | "metadata": {}, 37 | "source": [ 38 | "## Challenge 2: Order of Preprocessing\n", 39 | "\n", 40 | "In the preprocessing we did the following steps: \n", 41 | "\n", 42 | "1) Null values\n", 43 | "2) One-hot-encoding\n", 44 | "3) Imputation\n", 45 | "4) Normalization\n", 46 | "\n", 47 | "Now, consider that we change the order of the steps in the following ways. 
What effect might that have on the algorithms?\n", 48 | "**Hint**: Try copying the code from above and trying it out!\n", 49 | "\n", 50 | "- One-Hot-Encoding before Null Values - This will include null values as levels in one-hot-encoding\n", 51 | "- Normalization before Null values - This may cause errors due to null values.\n", 52 | "\n", 53 | "**Bonus:** Are there any other switches in order that might affect preprocessing?\n" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "id": "bcde87a2", 59 | "metadata": {}, 60 | "source": [ 61 | "## Challenge 3: Preprocessing and regularization\n", 62 | "\n", 63 | "We are preprocessing data in preparation for a classification task down the line. However, preprocessing also applies to regression. \n", 64 | "\n", 65 | "Consider the regularization task applied in the previous notebook. How might the preprocessing steps affect the performance of regularization?" 66 | ] 67 | } 68 | ], 69 | "metadata": { 70 | "kernelspec": { 71 | "display_name": "Python 3 (ipykernel)", 72 | "language": "python", 73 | "name": "python3" 74 | }, 75 | "language_info": { 76 | "codemirror_mode": { 77 | "name": "ipython", 78 | "version": 3 79 | }, 80 | "file_extension": ".py", 81 | "mimetype": "text/x-python", 82 | "name": "python", 83 | "nbconvert_exporter": "python", 84 | "pygments_lexer": "ipython3", 85 | "version": "3.9.12" 86 | } 87 | }, 88 | "nbformat": 4, 89 | "nbformat_minor": 5 90 | } 91 | -------------------------------------------------------------------------------- /solutions/04_classification_solutions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "599e3581", 6 | "metadata": {}, 7 | "source": [ 8 | "## Challenge 1: Model Evaluation\n", 9 | "\n", 10 | "1). What are the TP, FP, TN, FN in these model results?\n", 11 | "\n", 12 | "- TP: 26\n", 13 | "- FP: 3\n", 14 | "- TN: 34\n", 15 | "- FN: 4\n", 16 | "\n", 17 | "\n", 18 | "2). What is the precision and recall for this model?\n", 19 | "\n", 20 | "- **precision**: 26 / 29 ≈ 0.897\n", 21 | "- **recall**: 26 / 30 ≈ 0.867\n", 22 | "\n", 23 | "3). Which is more important, precision or recall?\n", 24 | "\n", 25 | "**Solution:** It depends on the problem: precision matters more when false positives are costly, while recall matters more when false negatives are costly." 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "id": "824b97aa", 31 | "metadata": {}, 32 | "source": [ 33 | "## Challenge 2: Classification with SVM\n", 34 | "\n", 35 | "Now let's try another new model. The [Support Vector Machine](https://scikit-learn.org/stable/modules/svm.html#classification) is another class of machine learning algorithm that is used for classification. \n", 36 | "\n", 37 | "Choose two features of the data set to train your model on. Then, using the documentation for the support vector machine, follow the steps to:\n", 38 | "- Initialize the model\n", 39 | "- Fit it to the training data\n", 40 | "- Evaluate the model on both the training and testing data\n", 41 | "\n", 42 | "Is your model underfit? Is it overfit? \n", 43 | "\n", 44 | "How does SVM fit in with the **linearly separable** problem identified in the scatter plots above?"
45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "id": "6ac4d9a3", 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "## YOUR CODE HERE\n", 55 | "from sklearn.svm import SVC\n", 56 | "X_train_subset = X_train[['body_mass_g','culmen_depth_mm']]\n", 57 | "X_test_subset = X_test[['body_mass_g','culmen_depth_mm']]\n", 58 | "y_train_subset = y_train['species']\n", 59 | "y_test_subset = y_test['species']\n", 60 | "\n", 61 | "##1) Initialize SVM\n", 62 | "model = SVC()\n", 63 | "\n", 64 | "##2) Train SVM on Training data \n", 65 | "model.fit(X_train_subset,y_train_subset)\n", 66 | "##3) Evaluate SVM on Training and Test Data\n", 67 | "print('Training score:', model.score(X_train_subset,y_train_subset))\n", 68 | "print('Testing score:', model.score(X_test_subset,y_test_subset))" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "id": "a031ab81", 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [] 78 | } 79 | ], 80 | "metadata": { 81 | "kernelspec": { 82 | "display_name": "Python 3 (ipykernel)", 83 | "language": "python", 84 | "name": "python3" 85 | }, 86 | "language_info": { 87 | "codemirror_mode": { 88 | "name": "ipython", 89 | "version": 3 90 | }, 91 | "file_extension": ".py", 92 | "mimetype": "text/x-python", 93 | "name": "python", 94 | "nbconvert_exporter": "python", 95 | "pygments_lexer": "ipython3", 96 | "version": "3.9.12" 97 | }, 98 | "nbformat": 4, 99 | "nbformat_minor": 5 100 | } 101 | 102 | --------------------------------------------------------------------------------